From 3bef198f1b17d1bb89260bad947ef084c0a2d1a6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Dec 2020 12:17:16 -0800 Subject: [PATCH 001/307] JFS: more checks for invalid superblock syzbot is feeding invalid superblock data to JFS for mount testing. JFS does not check several of the fields -- just assumes that they are good since the JFS_MAGIC and version fields are good. In this case (syzbot reproducer), we have s_l2bsize == 0xda0c, pad == 0xf045, and s_state == 0x50, all of which are invalid IMO. Having s_l2bsize == 0xda0c causes this UBSAN warning: UBSAN: shift-out-of-bounds in fs/jfs/jfs_mount.c:373:25 shift exponent -9716 is negative s_l2bsize can be tested for correctness. pad can be tested for non-0 and punted. s_state can be tested for its valid values and punted. Do those 3 tests and if any of them fails, report the superblock as invalid/corrupt and let fsck handle it. With this patch, chkSuper() says this when JFS_DEBUG is enabled: jfs_mount: Mount Failure: superblock is corrupt! Mount JFS Failure: -22 jfs_mount failed w/return code = -22 The obvious problem with this method is that next week there could be another syzbot test that uses different fields for invalid values, this making this like a game of whack-a-mole. 
syzkaller link: https://syzkaller.appspot.com/bug?extid=36315852ece4132ec193 Reported-by: syzbot+36315852ece4132ec193@syzkaller.appspotmail.com Reported-by: kernel test robot # v2 Signed-off-by: Randy Dunlap Signed-off-by: Dave Kleikamp Cc: jfs-discussion@lists.sourceforge.net --- fs/jfs/jfs_filsys.h | 1 + fs/jfs/jfs_mount.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 1e899298f7f0..b5d702df7111 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -268,5 +268,6 @@ * fsck() must be run to repair */ #define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ +#define FM_STATE_MAX 0x0000000f /* max value of s_state */ #endif /* _H_JFS_FILSYS */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 2935d4c776ec..5d7d7170c03c 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_filsys.h" @@ -366,6 +367,15 @@ static int chkSuper(struct super_block *sb) sbi->bsize = bsize; sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + /* check some fields for possible corruption */ + if (sbi->l2bsize != ilog2((u32)bsize) || + j_sb->pad != 0 || + le32_to_cpu(j_sb->s_state) > FM_STATE_MAX) { + rc = -EINVAL; + jfs_err("jfs_mount: Mount Failure: superblock is corrupt!"); + goto out; + } + /* * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer * cache. From 2ee5f8f05949735fa2f4c463a5e13fcb3660c719 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 8 Dec 2020 17:41:42 +0100 Subject: [PATCH 002/307] units: Add Watt units As there are the temperature units, let's add the Watt macros definition. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- include/linux/units.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/units.h b/include/linux/units.h index aaf716364ec3..92c234e71cab 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -4,6 +4,10 @@ #include +#define MILLIWATT_PER_WATT 1000L +#define MICROWATT_PER_MILLIWATT 1000L +#define MICROWATT_PER_WATT 1000000L + #define ABSOLUTE_ZERO_MILLICELSIUS -273150 static inline long milli_kelvin_to_millicelsius(long t) From f5ad1c747956d501516610ee7900f4a6d57ee2f5 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 8 Dec 2020 17:41:43 +0100 Subject: [PATCH 003/307] Documentation/powercap/dtpm: Add documentation for dtpm The dynamic thermal and power management is a technique to dynamically adjust the power consumption of different devices in order to ensure a global thermal constraint. An userspace daemon is usually monitoring the temperature and the power to take immediate action on the device. The DTPM framework provides an unified API to userspace to act on the power. Document this framework. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- Documentation/power/index.rst | 1 + Documentation/power/powercap/dtpm.rst | 212 ++++++++++++++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 Documentation/power/powercap/dtpm.rst diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst index ced8a8007434..a0f5244fb427 100644 --- a/Documentation/power/index.rst +++ b/Documentation/power/index.rst @@ -30,6 +30,7 @@ Power Management userland-swsusp powercap/powercap + powercap/dtpm regulator/consumer regulator/design diff --git a/Documentation/power/powercap/dtpm.rst b/Documentation/power/powercap/dtpm.rst new file mode 100644 index 000000000000..a38dee3d815b --- /dev/null +++ b/Documentation/power/powercap/dtpm.rst @@ -0,0 +1,212 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +========================================== +Dynamic Thermal Power Management framework +========================================== + +On the embedded world, the complexity of the SoC leads to an +increasing number of hotspots which need to be monitored and mitigated +as a whole in order to prevent the temperature to go above the +normative and legally stated 'skin temperature'. + +Another aspect is to sustain the performance for a given power budget, +for example virtual reality where the user can feel dizziness if the +performance is capped while a big CPU is processing something else. Or +reduce the battery charging because the dissipated power is too high +compared with the power consumed by other devices. + +The user space is the most adequate place to dynamically act on the +different devices by limiting their power given an application +profile: it has the knowledge of the platform. + +The Dynamic Thermal Power Management (DTPM) is a technique acting on +the device power by limiting and/or balancing a power budget among +different devices. + +The DTPM framework provides an unified interface to act on the +device power. + +Overview +======== + +The DTPM framework relies on the powercap framework to create the +powercap entries in the sysfs directory and implement the backend +driver to do the connection with the power manageable device. + +The DTPM is a tree representation describing the power constraints +shared between devices, not their physical positions. + +The nodes of the tree are a virtual description aggregating the power +characteristics of the children nodes and their power limitations. + +The leaves of the tree are the real power manageable devices. 
+ +For instance:: + + SoC + | + `-- pkg + | + |-- pd0 (cpu0-3) + | + `-- pd1 (cpu4-5) + +The pkg power will be the sum of pd0 and pd1 power numbers:: + + SoC (400mW - 3100mW) + | + `-- pkg (400mW - 3100mW) + | + |-- pd0 (100mW - 700mW) + | + `-- pd1 (300mW - 2400mW) + +When the nodes are inserted in the tree, their power characteristics are propagated to the parents:: + + SoC (600mW - 5900mW) + | + |-- pkg (400mW - 3100mW) + | | + | |-- pd0 (100mW - 700mW) + | | + | `-- pd1 (300mW - 2400mW) + | + `-- pd2 (200mW - 2800mW) + +Each node has a weight on a 2^10 basis reflecting the percentage of power consumption among its siblings:: + + SoC (w=1024) + | + |-- pkg (w=538) + | | + | |-- pd0 (w=231) + | | + | `-- pd1 (w=793) + | + `-- pd2 (w=486) + + Note that the sum of the weights at the same level is equal to 1024. + +When a power limitation is applied to a node, it is distributed among the children according to their weights. For example, if we set a power limitation of 3200mW at the 'SoC' root node, the resulting tree will be:: + + SoC (w=1024) <--- power_limit = 3200mW + | + |-- pkg (w=538) --> power_limit = 1681mW + | | + | |-- pd0 (w=231) --> power_limit = 379mW + | | + | `-- pd1 (w=793) --> power_limit = 1302mW + | + `-- pd2 (w=486) --> power_limit = 1519mW + + +Flat description +---------------- + +A root node is created and it is the parent of all the nodes. This +description is the simplest one and it is supposed to give to user +space a flat representation of all the devices supporting the power +limitation without any power limitation distribution. + +Hierarchical description +------------------------ + +The different devices supporting the power limitation are represented +hierarchically. There is one root node, all intermediate nodes are +grouping the child nodes which can be intermediate nodes also or real +devices. + +The intermediate nodes aggregate the power information and allow +setting the power limit given the weight of the nodes.
+ +User space API +============== + +As stated in the overview, the DTPM framework is built on top of the +powercap framework. Thus the sysfs interface is the same, please refer +to the powercap documentation for further details. + + * power_uw: Instantaneous power consumption. If the node is an + intermediate node, then the power consumption will be the sum of all + children power consumption. + + * max_power_range_uw: The power range resulting of the maximum power + minus the minimum power. + + * name: The name of the node. This is implementation dependent. Even + if it is not recommended for the user space, several nodes can have + the same name. + + * constraint_X_name: The name of the constraint. + + * constraint_X_max_power_uw: The maximum power limit to be applicable + to the node. + + * constraint_X_power_limit_uw: The power limit to be applied to the + node. If the value contained in constraint_X_max_power_uw is set, + the constraint will be removed. + + * constraint_X_time_window_us: The meaning of this file will depend + on the constraint number. + +Constraints +----------- + + * Constraint 0: The power limitation is immediately applied, without + limitation in time. + +Kernel API +========== + +Overview +-------- + +The DTPM framework has no power limiting backend support. It is +generic and provides a set of API to let the different drivers to +implement the backend part for the power limitation and create the +power constraints tree. + +It is up to the platform to provide the initialization function to +allocate and link the different nodes of the tree. + +A special macro has the role of declaring a node and the corresponding +initialization function via a description structure. This one contains +an optional parent field allowing to hook different devices to an +already existing tree at boot time. 
+ +For instance:: + + struct dtpm_descr my_descr = { + .name = "my_name", + .init = my_init_func, + }; + + DTPM_DECLARE(my_descr); + +The nodes of the DTPM tree are described with the dtpm structure. Adding +a new power limitable device is done in three steps: + + * Allocate the dtpm node + * Set the power number of the dtpm node + * Register the dtpm node + +The registration of the dtpm node is done with the powercap +ops. Basically, it must implement the callbacks to get and set the +power and the limit. + +Alternatively, if the node to be inserted is an intermediate one, then +a simple function to insert it as a future parent is available. + +If a device has its power characteristics changing, then the tree must +be updated with the new power numbers and weights. + +Nomenclature +------------ + + * dtpm_alloc() : Allocate and initialize a dtpm structure + + * dtpm_register() : Add the dtpm node to the tree + + * dtpm_unregister() : Remove the dtpm node from the tree + + * dtpm_update_power() : Update the power characteristics of the dtpm node From a20d0ef97abf486a917aff066c457bdb930425af Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 8 Dec 2020 17:41:44 +0100 Subject: [PATCH 004/307] powercap/drivers/dtpm: Add API for dynamic thermal power management On the embedded world, the complexity of the SoC leads to an increasing number of hotspots which need to be monitored and mitigated as a whole in order to prevent the temperature to go above the normative and legally stated 'skin temperature'. Another aspect is to sustain the performance for a given power budget, for example virtual reality where the user can feel dizziness if the GPU performance is capped while a big CPU is processing something else. Or reduce the battery charging because the dissipated power is too high compared with the power consumed by other devices.
The userspace is the most adequate place to dynamically act on the different devices by limiting their power given an application profile: it has the knowledge of the platform. These userspace daemons are in charge of the Dynamic Thermal Power Management (DTPM). Nowadays, the dtpm daemons are abusing the thermal framework as they act on the cooling device state to force a specific and arbitrary state without taking care of the governor decisions. Given the closed loop of some governors that can confuse the logic or directly enter in a decision conflict. As the number of cooling device support is limited today to the CPU and the GPU, the dtpm daemons have little control on the power dissipation of the system. The out of tree solutions are hacking around here and there in the drivers, in the frameworks to have control on the devices. The common solution is to declare them as cooling devices. There is no unification of the power limitation unit, opaque states are used. This patch provides a way to create a hierarchy of constraints using the powercap framework. The devices which are registered as power limit-able devices are represented in this hierarchy as a tree. They are linked together with intermediate nodes which are just there to propagate the constraint to the children. The leaves of the tree are the real devices, the intermediate nodes are virtual, aggregating the children constraints and power characteristics. Each node have a weight on a 2^10 basis, in order to reflect the percentage of power distribution of the children's node. This percentage is used to dispatch the power limit to the children. The weight is computed against the max power of the siblings. This simple approach allows to do a fair distribution of the power limit. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Tested-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/Kconfig | 6 + drivers/powercap/Makefile | 1 + drivers/powercap/dtpm.c | 473 ++++++++++++++++++++++++++++++ include/asm-generic/vmlinux.lds.h | 11 + include/linux/dtpm.h | 75 +++++ 5 files changed, 566 insertions(+) create mode 100644 drivers/powercap/dtpm.c create mode 100644 include/linux/dtpm.h diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index bc228725346b..cc1953bd8bed 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -43,4 +43,10 @@ config IDLE_INJECT CPUs for power capping. Idle period can be injected synchronously on a set of specified CPUs or alternatively on a per CPU basis. + +config DTPM + bool "Power capping for Dynamic Thermal Power Management" + help + This enables support for the power capping for the dynamic + thermal power management userspace engine. endif diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile index 7255c94ec61c..6482ac52054d 100644 --- a/drivers/powercap/Makefile +++ b/drivers/powercap/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_DTPM) += dtpm.o obj-$(CONFIG_POWERCAP) += powercap_sys.o obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c new file mode 100644 index 000000000000..5b6857e9b064 --- /dev/null +++ b/drivers/powercap/dtpm.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020 Linaro Limited + * + * Author: Daniel Lezcano + * + * The powercap based Dynamic Thermal Power Management framework + * provides to the userspace a consistent API to set the power limit + * on some devices. + * + * DTPM defines the functions to create a tree of constraints. Each + * parent node is a virtual description of the aggregation of the + * children. It propagates the constraints set at its level to its + * children and collect the children power information. 
The leaves of + * the tree are the real devices which have the ability to get their + * current power consumption and set their power limit. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +/* Bit number (not a mask) in dtpm->flags, as expected by set_bit()/test_bit() */ +#define DTPM_POWER_LIMIT_FLAG 0 + +static const char *constraint_name[] = { + "Instantaneous", +}; + +static DEFINE_MUTEX(dtpm_lock); +static struct powercap_control_type *pct; +static struct dtpm *root; + +static int get_time_window_us(struct powercap_zone *pcz, int cid, u64 *window) +{ + return -ENOSYS; +} + +static int set_time_window_us(struct powercap_zone *pcz, int cid, u64 window) +{ + return -ENOSYS; +} + +static int get_max_power_range_uw(struct powercap_zone *pcz, u64 *max_power_uw) +{ + struct dtpm *dtpm = to_dtpm(pcz); + + mutex_lock(&dtpm_lock); + *max_power_uw = dtpm->power_max - dtpm->power_min; + mutex_unlock(&dtpm_lock); + + return 0; +} + +static int __get_power_uw(struct dtpm *dtpm, u64 *power_uw) +{ + struct dtpm *child; + u64 power; + int ret = 0; + + if (dtpm->ops) { + *power_uw = dtpm->ops->get_power_uw(dtpm); + return 0; + } + + *power_uw = 0; + + list_for_each_entry(child, &dtpm->children, sibling) { + ret = __get_power_uw(child, &power); + if (ret) + break; + *power_uw += power; + } + + return ret; +} + +static int get_power_uw(struct powercap_zone *pcz, u64 *power_uw) +{ + struct dtpm *dtpm = to_dtpm(pcz); + int ret; + + mutex_lock(&dtpm_lock); + ret = __get_power_uw(dtpm, power_uw); + mutex_unlock(&dtpm_lock); + + return ret; +} + +/* + * Recompute the weight of every node below @dtpm. The weight of a + * child is the share, on a 2^10 basis, of its power_max in the + * power_max of its parent. + */ +static void __dtpm_rebalance_weight(struct dtpm *dtpm) +{ + struct dtpm *child; + + list_for_each_entry(child, &dtpm->children, sibling) { + + /* + * A parent without any power yet (e.g. only intermediate + * nodes registered so far) must not be used as a divisor: + * keep the current weight of the child in that case. + */ + if (dtpm->power_max) + child->weight = DIV_ROUND_CLOSEST( + child->power_max * 1024, dtpm->power_max); + + /* Log after the update so the new weight is reported */ + pr_debug("Setting weight '%d' for '%s'\n", + child->weight, child->zone.name); + + __dtpm_rebalance_weight(child); + } +} + +static void __dtpm_sub_power(struct dtpm *dtpm) +{ + struct dtpm *parent = dtpm->parent; + + while (parent) 
{ + parent->power_min -= dtpm->power_min; + parent->power_max -= dtpm->power_max; + parent->power_limit -= dtpm->power_limit; + parent = parent->parent; + } + + __dtpm_rebalance_weight(root); +} + +static void __dtpm_add_power(struct dtpm *dtpm) +{ + struct dtpm *parent = dtpm->parent; + + while (parent) { + parent->power_min += dtpm->power_min; + parent->power_max += dtpm->power_max; + parent->power_limit += dtpm->power_limit; + parent = parent->parent; + } + + __dtpm_rebalance_weight(root); +} + +/** + * dtpm_update_power - Update the power on the dtpm + * @dtpm: a pointer to a dtpm structure to update + * @power_min: a u64 representing the new power_min value + * @power_max: a u64 representing the new power_max value + * + * Function to update the power values of the dtpm node specified in + * parameter. These new values will be propagated to the tree. + * + * Return: zero on success (including when the values are unchanged), + * -EINVAL if the values are inconsistent (@power_max < @power_min) + */ +int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max) +{ + int ret = 0; + + mutex_lock(&dtpm_lock); + + /* Nothing to do: leave through the common exit to drop the lock */ + if (power_min == dtpm->power_min && power_max == dtpm->power_max) + goto unlock; + + if (power_max < power_min) { + ret = -EINVAL; + goto unlock; + } + + /* Remove the old contribution from the parents before changing it */ + __dtpm_sub_power(dtpm); + + dtpm->power_min = power_min; + dtpm->power_max = power_max; + /* Without a user constraint, the limit follows the max power */ + if (!test_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags)) + dtpm->power_limit = power_max; + + __dtpm_add_power(dtpm); + +unlock: + mutex_unlock(&dtpm_lock); + + return ret; +} + +/** + * dtpm_release_zone - Cleanup when the node is released + * @pcz: a pointer to a powercap_zone structure + * + * Do some housecleaning and update the weight on the tree. The + * release will be denied if the node has children. This function must + * be called by the specific release callback of the different + * backends.
+ * + * Return: 0 on success, -EBUSY if there are children + */ +int dtpm_release_zone(struct powercap_zone *pcz) +{ + struct dtpm *dtpm = to_dtpm(pcz); + struct dtpm *parent = dtpm->parent; + + mutex_lock(&dtpm_lock); + + if (!list_empty(&dtpm->children)) { + /* Do not leave with dtpm_lock held on the refusal path */ + mutex_unlock(&dtpm_lock); + return -EBUSY; + } + + if (parent) + list_del(&dtpm->sibling); + + __dtpm_sub_power(dtpm); + + /* The node is freed below: do not keep a stale root pointer */ + if (root == dtpm) + root = NULL; + + mutex_unlock(&dtpm_lock); + + if (dtpm->ops) + dtpm->ops->release(dtpm); + + kfree(dtpm); + + return 0; +} + +static int __get_power_limit_uw(struct dtpm *dtpm, int cid, u64 *power_limit) +{ + *power_limit = dtpm->power_limit; + return 0; +} + +static int get_power_limit_uw(struct powercap_zone *pcz, + int cid, u64 *power_limit) +{ + struct dtpm *dtpm = to_dtpm(pcz); + int ret; + + mutex_lock(&dtpm_lock); + ret = __get_power_limit_uw(dtpm, cid, power_limit); + mutex_unlock(&dtpm_lock); + + return ret; +} + +/* + * Set the power limit on the nodes, the power limit is distributed + * given the weight of the children. + * + * The dtpm node lock must be held when calling this function. + */ +static int __set_power_limit_uw(struct dtpm *dtpm, int cid, u64 power_limit) +{ + struct dtpm *child; + int ret = 0; + u64 power; + + /* + * A max power limitation means we remove the power limit, + * otherwise we set a constraint and flag the dtpm node. + */ + if (power_limit == dtpm->power_max) { + clear_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags); + } else { + set_bit(DTPM_POWER_LIMIT_FLAG, &dtpm->flags); + } + + pr_debug("Setting power limit for '%s': %llu uW\n", + dtpm->zone.name, power_limit); + + /* + * Only leaves of the dtpm tree has ops to get/set the power + */ + if (dtpm->ops) { + dtpm->power_limit = dtpm->ops->set_power_uw(dtpm, power_limit); + } else { + dtpm->power_limit = 0; + + list_for_each_entry(child, &dtpm->children, sibling) { + + /* + * Integer division rounding will inevitably + * lead to a different min or max value when + * set several times.
In order to restore the + * initial value, we force the child's min or + * max power every time if the constraint is + * at the boundaries. + */ + if (power_limit == dtpm->power_max) { + power = child->power_max; + } else if (power_limit == dtpm->power_min) { + power = child->power_min; + } else { + power = DIV_ROUND_CLOSEST( + power_limit * child->weight, 1024); + } + + pr_debug("Setting power limit for '%s': %llu uW\n", + child->zone.name, power); + + ret = __set_power_limit_uw(child, cid, power); + if (!ret) + ret = __get_power_limit_uw(child, cid, &power); + + if (ret) + break; + + dtpm->power_limit += power; + } + } + + return ret; +} + +static int set_power_limit_uw(struct powercap_zone *pcz, + int cid, u64 power_limit) +{ + struct dtpm *dtpm = to_dtpm(pcz); + int ret; + + mutex_lock(&dtpm_lock); + + /* + * Don't allow values outside of the power range previously + * set when initializing the power numbers. + */ + power_limit = clamp_val(power_limit, dtpm->power_min, dtpm->power_max); + + ret = __set_power_limit_uw(dtpm, cid, power_limit); + + pr_debug("%s: power limit: %llu uW, power max: %llu uW\n", + dtpm->zone.name, dtpm->power_limit, dtpm->power_max); + + mutex_unlock(&dtpm_lock); + + return ret; +} + +static const char *get_constraint_name(struct powercap_zone *pcz, int cid) +{ + return constraint_name[cid]; +} + +static int get_max_power_uw(struct powercap_zone *pcz, int id, u64 *max_power) +{ + struct dtpm *dtpm = to_dtpm(pcz); + + mutex_lock(&dtpm_lock); + *max_power = dtpm->power_max; + mutex_unlock(&dtpm_lock); + + return 0; +} + +static struct powercap_zone_constraint_ops constraint_ops = { + .set_power_limit_uw = set_power_limit_uw, + .get_power_limit_uw = get_power_limit_uw, + .set_time_window_us = set_time_window_us, + .get_time_window_us = get_time_window_us, + .get_max_power_uw = get_max_power_uw, + .get_name = get_constraint_name, +}; + +static struct powercap_zone_ops zone_ops = { + .get_max_power_range_uw = get_max_power_range_uw, + 
.get_power_uw = get_power_uw, + .release = dtpm_release_zone, +}; + +/** + * dtpm_alloc - Allocate and initialize a dtpm struct + * @ops: a pointer to the dtpm_ops callbacks of the node, NULL for a + * node without backend (intermediate node) + * + * The node is allocated zeroed, with empty children/sibling lists and + * a default weight of 1024. + * + * Return: a struct dtpm pointer, NULL in case of error + */ +struct dtpm *dtpm_alloc(struct dtpm_ops *ops) +{ + struct dtpm *dtpm; + + dtpm = kzalloc(sizeof(*dtpm), GFP_KERNEL); + if (dtpm) { + INIT_LIST_HEAD(&dtpm->children); + INIT_LIST_HEAD(&dtpm->sibling); + dtpm->weight = 1024; + dtpm->ops = ops; + } + + return dtpm; +} + +/** + * dtpm_unregister - Unregister a dtpm node from the hierarchy tree + * @dtpm: a pointer to a dtpm structure corresponding to the node to be removed + * + * Call the underlying powercap unregister function. That will call + * the release callback of the powercap zone. + */ +void dtpm_unregister(struct dtpm *dtpm) +{ + /* + * Log first: unregistering the zone may run its release callback + * (dtpm_release_zone), which frees @dtpm, so neither the node nor + * its zone name may be touched afterwards. + */ + pr_info("Unregistered dtpm node '%s'\n", dtpm->zone.name); + + powercap_unregister_zone(pct, &dtpm->zone); +} + +/** + * dtpm_register - Register a dtpm node in the hierarchy tree + * @name: a string specifying the name of the node + * @dtpm: a pointer to a dtpm structure corresponding to the new node + * @parent: a pointer to a dtpm structure corresponding to the parent node + * + * Create a dtpm node in the tree. If no parent is specified, the node + * is the root node of the hierarchy. If the root node already exists, + * then the registration will fail. The powercap controller must be + * initialized before calling this function. + * + * The dtpm structure must be initialized with the power numbers + * before calling this function. + * + * Return: zero on success, a negative value in case of error: + * -EAGAIN: the function is called before the framework is initialized.
+ * -EBUSY: the root node is already inserted + * -EINVAL: * there is no root node yet and @parent is specified + * * not all ops are defined + * * parent has ops which are reserved for leaves + * Other negative values are reported back from the powercap framework + */ +int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent) +{ + struct powercap_zone *pcz; + + if (!pct) + return -EAGAIN; + + if (root && !parent) + return -EBUSY; + + if (!root && parent) + return -EINVAL; + + if (parent && parent->ops) + return -EINVAL; + + if (!dtpm) + return -EINVAL; + + if (dtpm->ops && !(dtpm->ops->set_power_uw && + dtpm->ops->get_power_uw && + dtpm->ops->release)) + return -EINVAL; + + pcz = powercap_register_zone(&dtpm->zone, pct, name, + parent ? &parent->zone : NULL, + &zone_ops, MAX_DTPM_CONSTRAINTS, + &constraint_ops); + if (IS_ERR(pcz)) + return PTR_ERR(pcz); + + mutex_lock(&dtpm_lock); + + if (parent) { + list_add_tail(&dtpm->sibling, &parent->children); + dtpm->parent = parent; + } else { + root = dtpm; + } + + __dtpm_add_power(dtpm); + + pr_info("Registered dtpm node '%s' / %llu-%llu uW, \n", + dtpm->zone.name, dtpm->power_min, dtpm->power_max); + + mutex_unlock(&dtpm_lock); + + return 0; +} + +/* + * Register the "dtpm" powercap control type, then run the init + * callback of every descriptor declared with DTPM_DECLARE(). + */ +static int __init dtpm_init(void) +{ + struct powercap_control_type *control_type; + struct dtpm_descr **dtpm_descr; + + /* + * powercap_register_control_type() reports a failure with an + * ERR_PTR(), not with NULL. Only publish the pointer in 'pct' + * once it is known to be valid, as dtpm_register() tests 'pct' + * against NULL. + */ + control_type = powercap_register_control_type(NULL, "dtpm", NULL); + if (IS_ERR(control_type)) { + pr_err("Failed to register control type\n"); + return PTR_ERR(control_type); + } + + pct = control_type; + + for_each_dtpm_table(dtpm_descr) + (*dtpm_descr)->init(*dtpm_descr); + + return 0; +} +late_initcall(dtpm_init); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b2b3d81b1535..b3e4e0740089 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -316,6 +316,16 @@ #define THERMAL_TABLE(name) #endif +#ifdef CONFIG_DTPM +#define DTPM_TABLE() \ + .
= ALIGN(8); \ + __dtpm_table = .; \ + KEEP(*(__dtpm_table)) \ + __dtpm_table_end = .; +#else +#define DTPM_TABLE() +#endif + #define KERNEL_DTB() \ STRUCT_ALIGN(); \ __dtb_start = .; \ @@ -733,6 +743,7 @@ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(timer) \ THERMAL_TABLE(governor) \ + DTPM_TABLE() \ EARLYCON_TABLE() \ LSM_TABLE() \ EARLY_LSM_TABLE() \ diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h new file mode 100644 index 000000000000..7a1d0b50e334 --- /dev/null +++ b/include/linux/dtpm.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Linaro Ltd + * + * Author: Daniel Lezcano + */ +#ifndef ___DTPM_H__ +#define ___DTPM_H__ + +#include + +#define MAX_DTPM_DESCR 8 +#define MAX_DTPM_CONSTRAINTS 1 + +struct dtpm { + struct powercap_zone zone; + struct dtpm *parent; + struct list_head sibling; + struct list_head children; + struct dtpm_ops *ops; + unsigned long flags; + u64 power_limit; + u64 power_max; + u64 power_min; + int weight; + void *private; +}; + +struct dtpm_ops { + u64 (*set_power_uw)(struct dtpm *, u64); + u64 (*get_power_uw)(struct dtpm *); + void (*release)(struct dtpm *); +}; + +struct dtpm_descr; + +typedef int (*dtpm_init_t)(struct dtpm_descr *); + +struct dtpm_descr { + struct dtpm *parent; + const char *name; + dtpm_init_t init; +}; + +/* Init section thermal table */ +extern struct dtpm_descr *__dtpm_table[]; +extern struct dtpm_descr *__dtpm_table_end[]; + +#define DTPM_TABLE_ENTRY(name) \ + static typeof(name) *__dtpm_table_entry_##name \ + __used __section("__dtpm_table") = &name + +#define DTPM_DECLARE(name) DTPM_TABLE_ENTRY(name) + +#define for_each_dtpm_table(__dtpm) \ + for (__dtpm = __dtpm_table; \ + __dtpm < __dtpm_table_end; \ + __dtpm++) + +static inline struct dtpm *to_dtpm(struct powercap_zone *zone) +{ + return container_of(zone, struct dtpm, zone); +} + +int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max); + +int dtpm_release_zone(struct powercap_zone *pcz); + 
+struct dtpm *dtpm_alloc(struct dtpm_ops *ops); + +void dtpm_unregister(struct dtpm *dtpm); + +int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent); + +#endif From 0e8f68d7f04856a9e2ad4817b477fa35124888bd Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 8 Dec 2020 17:41:45 +0100 Subject: [PATCH 005/307] powercap/drivers/dtpm: Add CPU energy model based support With the powercap dtpm controller, we are able to plug devices with power limitation features in the tree. The following patch introduces the CPU power limitation based on the energy model and the performance states. The power limitation is done at the performance domain level. If some CPUs are unplugged, the corresponding power will be subtracted from the performance domain total power. It is up to the platform to initialize the dtpm tree and add the CPU. Here is an example to create a simple tree with one root node called "pkg" and the CPU's performance domains. static int dtpm_register_pkg(struct dtpm_descr *descr) { struct dtpm *pkg; int ret; pkg = dtpm_alloc(NULL); if (!pkg) return -ENOMEM; ret = dtpm_register(descr->name, pkg, descr->parent); if (ret) return ret; return dtpm_register_cpu(pkg); } static struct dtpm_descr descr = { .name = "pkg", .init = dtpm_register_pkg, }; DTPM_DECLARE(descr); Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Tested-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/Kconfig | 7 + drivers/powercap/Makefile | 1 + drivers/powercap/dtpm_cpu.c | 257 ++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + include/linux/dtpm.h | 2 + 5 files changed, 268 insertions(+) create mode 100644 drivers/powercap/dtpm_cpu.c diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index cc1953bd8bed..20b4325c6161 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -49,4 +49,11 @@ config DTPM help This enables support for the power capping for the dynamic thermal power management userspace engine. + +config DTPM_CPU + bool "Add CPU power capping based on the energy model" + depends on DTPM && ENERGY_MODEL + help + This enables support for CPU power limitation based on + energy model. endif diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile index 6482ac52054d..fabcf388a8d3 100644 --- a/drivers/powercap/Makefile +++ b/drivers/powercap/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_DTPM) += dtpm.o +obj-$(CONFIG_DTPM_CPU) += dtpm_cpu.o obj-$(CONFIG_POWERCAP) += powercap_sys.o obj-$(CONFIG_INTEL_RAPL_CORE) += intel_rapl_common.o obj-$(CONFIG_INTEL_RAPL) += intel_rapl_msr.o diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c new file mode 100644 index 000000000000..6933c783c6b4 --- /dev/null +++ b/drivers/powercap/dtpm_cpu.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2020 Linaro Limited + * + * Author: Daniel Lezcano + * + * The DTPM CPU is based on the energy model. It hooks the CPU in the + * DTPM tree which in turns update the power number by propagating the + * power number from the CPU energy model information to the parents. + * + * The association between the power and the performance state, allows + * to set the power of the CPU at the OPP granularity. + * + * The CPU hotplug is supported and the power numbers will be updated + * if a CPU is hot plugged / unplugged. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static struct dtpm *__parent; + +static DEFINE_PER_CPU(struct dtpm *, dtpm_per_cpu); + +struct dtpm_cpu { + struct freq_qos_request qos_req; + int cpu; +}; + +/* + * When a new CPU is inserted at hotplug or boot time, add the power + * contribution and update the dtpm tree. + */ +static int power_add(struct dtpm *dtpm, struct em_perf_domain *em) +{ + u64 power_min, power_max; + + power_min = em->table[0].power; + power_min *= MICROWATT_PER_MILLIWATT; + power_min += dtpm->power_min; + + power_max = em->table[em->nr_perf_states - 1].power; + power_max *= MICROWATT_PER_MILLIWATT; + power_max += dtpm->power_max; + + return dtpm_update_power(dtpm, power_min, power_max); +} + +/* + * When a CPU is unplugged, remove its power contribution from the + * dtpm tree. + */ +static int power_sub(struct dtpm *dtpm, struct em_perf_domain *em) +{ + u64 power_min, power_max; + + power_min = em->table[0].power; + power_min *= MICROWATT_PER_MILLIWATT; + power_min = dtpm->power_min - power_min; + + power_max = em->table[em->nr_perf_states - 1].power; + power_max *= MICROWATT_PER_MILLIWATT; + power_max = dtpm->power_max - power_max; + + return dtpm_update_power(dtpm, power_min, power_max); +} + +static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit) +{ + struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct em_perf_domain *pd; + struct cpumask cpus; + unsigned long freq; + u64 power; + int i, nr_cpus; + + pd = em_cpu_get(dtpm_cpu->cpu); + + cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); + + nr_cpus = cpumask_weight(&cpus); + + for (i = 0; i < pd->nr_perf_states; i++) { + + power = pd->table[i].power * MICROWATT_PER_MILLIWATT * nr_cpus; + + if (power > power_limit) + break; + } + + freq = pd->table[i - 1].frequency; + + freq_qos_update_request(&dtpm_cpu->qos_req, freq); + + power_limit = pd->table[i - 1].power * + MICROWATT_PER_MILLIWATT * nr_cpus; + + return power_limit; +} + 
+static u64 get_pd_power_uw(struct dtpm *dtpm) +{ + struct dtpm_cpu *dtpm_cpu = dtpm->private; + struct em_perf_domain *pd; + struct cpumask cpus; + unsigned long freq; + int i, nr_cpus; + + pd = em_cpu_get(dtpm_cpu->cpu); + freq = cpufreq_quick_get(dtpm_cpu->cpu); + cpumask_and(&cpus, cpu_online_mask, to_cpumask(pd->cpus)); + nr_cpus = cpumask_weight(&cpus); + + for (i = 0; i < pd->nr_perf_states; i++) { + + if (pd->table[i].frequency < freq) + continue; + + return pd->table[i].power * + MICROWATT_PER_MILLIWATT * nr_cpus; + } + + return 0; +} + +static void pd_release(struct dtpm *dtpm) +{ + struct dtpm_cpu *dtpm_cpu = dtpm->private; + + if (freq_qos_request_active(&dtpm_cpu->qos_req)) + freq_qos_remove_request(&dtpm_cpu->qos_req); + + kfree(dtpm_cpu); +} + +static struct dtpm_ops dtpm_ops = { + .set_power_uw = set_pd_power_limit, + .get_power_uw = get_pd_power_uw, + .release = pd_release, +}; + +static int cpuhp_dtpm_cpu_offline(unsigned int cpu) +{ + struct cpufreq_policy *policy; + struct em_perf_domain *pd; + struct dtpm *dtpm; + + policy = cpufreq_cpu_get(cpu); + + if (!policy) + return 0; + + pd = em_cpu_get(cpu); + if (!pd) + return -EINVAL; + + dtpm = per_cpu(dtpm_per_cpu, cpu); + + power_sub(dtpm, pd); + + if (cpumask_weight(policy->cpus) != 1) + return 0; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(dtpm_per_cpu, cpu) = NULL; + + dtpm_unregister(dtpm); + + return 0; +} + +static int cpuhp_dtpm_cpu_online(unsigned int cpu) +{ + struct dtpm *dtpm; + struct dtpm_cpu *dtpm_cpu; + struct cpufreq_policy *policy; + struct em_perf_domain *pd; + char name[CPUFREQ_NAME_LEN]; + int ret = -ENOMEM; + + policy = cpufreq_cpu_get(cpu); + + if (!policy) + return 0; + + pd = em_cpu_get(cpu); + if (!pd) + return -EINVAL; + + dtpm = per_cpu(dtpm_per_cpu, cpu); + if (dtpm) + return power_add(dtpm, pd); + + dtpm = dtpm_alloc(&dtpm_ops); + if (!dtpm) + return -EINVAL; + + dtpm_cpu = kzalloc(sizeof(dtpm_cpu), GFP_KERNEL); + if (!dtpm_cpu) + goto out_kfree_dtpm; + + 
dtpm->private = dtpm_cpu; + dtpm_cpu->cpu = cpu; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(dtpm_per_cpu, cpu) = dtpm; + + sprintf(name, "cpu%d", dtpm_cpu->cpu); + + ret = dtpm_register(name, dtpm, __parent); + if (ret) + goto out_kfree_dtpm_cpu; + + ret = power_add(dtpm, pd); + if (ret) + goto out_dtpm_unregister; + + ret = freq_qos_add_request(&policy->constraints, + &dtpm_cpu->qos_req, FREQ_QOS_MAX, + pd->table[pd->nr_perf_states - 1].frequency); + if (ret) + goto out_power_sub; + + return 0; + +out_power_sub: + power_sub(dtpm, pd); + +out_dtpm_unregister: + dtpm_unregister(dtpm); + dtpm_cpu = NULL; + dtpm = NULL; + +out_kfree_dtpm_cpu: + for_each_cpu(cpu, policy->related_cpus) + per_cpu(dtpm_per_cpu, cpu) = NULL; + kfree(dtpm_cpu); + +out_kfree_dtpm: + kfree(dtpm); + return ret; +} + +int dtpm_register_cpu(struct dtpm *parent) +{ + __parent = parent; + + return cpuhp_setup_state(CPUHP_AP_DTPM_CPU_ONLINE, + "dtpm_cpu:online", + cpuhp_dtpm_cpu_online, + cpuhp_dtpm_cpu_offline); +} diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 0042ef362511..ee09a39627d6 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -193,6 +193,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, + CPUHP_AP_DTPM_CPU_ONLINE, CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/dtpm.h b/include/linux/dtpm.h index 7a1d0b50e334..e80a332e3d8a 100644 --- a/include/linux/dtpm.h +++ b/include/linux/dtpm.h @@ -72,4 +72,6 @@ void dtpm_unregister(struct dtpm *dtpm); int dtpm_register(const char *name, struct dtpm *dtpm, struct dtpm *parent); +int dtpm_register_cpu(struct dtpm *parent); + #endif From 8e0cbf356377fabac47a027dd176cd1cacc5fc01 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Tue, 29 Dec 2020 19:18:25 -0500 Subject: [PATCH 006/307] Documentation: Add documentation for new platform_profile sysfs attribute On modern systems the platform 
performance, temperature, fan and other hardware related characteristics are often dynamically configurable. The profile is often automatically adjusted to the load by some automatic-mechanism (which may very well live outside the kernel). These auto platform-adjustment mechanisms often can be configured with one of several 'platform-profiles', with either a bias towards low-power consumption or towards performance (and higher power consumption and thermals). Introduce a new platform_profile sysfs API which offers a generic API for selecting the performance-profile of these automatic-mechanisms. Co-developed-by: Hans de Goede Signed-off-by: Mark Pearson Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-platform_profile | 24 +++++++++++ Documentation/userspace-api/index.rst | 1 + .../userspace-api/sysfs-platform_profile.rst | 42 +++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-platform_profile create mode 100644 Documentation/userspace-api/sysfs-platform_profile.rst diff --git a/Documentation/ABI/testing/sysfs-platform_profile b/Documentation/ABI/testing/sysfs-platform_profile new file mode 100644 index 000000000000..9d6b89b66cca --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform_profile @@ -0,0 +1,24 @@ +What: /sys/firmware/acpi/platform_profile_choices +Date: October 2020 +Contact: Hans de Goede +Description: This file contains a space-separated list of profiles supported for this device. + + Drivers must use the following standard profile-names: + + ============ ============================================ + low-power Low power consumption + cool Cooler operation + quiet Quieter operation + balanced Balance between low power consumption and performance + performance High performance operation + ============ ============================================ + + Userspace may expect drivers to offer more than one of these + standard profile names. 
+ +What: /sys/firmware/acpi/platform_profile +Date: October 2020 +Contact: Hans de Goede +Description: Reading this file gives the current selected profile for this + device. Writing this file with one of the strings from + platform_profile_choices changes the profile to the new value. diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst index acd2cc2a538d..d29b020e5622 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -24,6 +24,7 @@ place where this information is gathered. ioctl/index iommu media/index + sysfs-platform_profile .. only:: subproject and html diff --git a/Documentation/userspace-api/sysfs-platform_profile.rst b/Documentation/userspace-api/sysfs-platform_profile.rst new file mode 100644 index 000000000000..c33a71263d9e --- /dev/null +++ b/Documentation/userspace-api/sysfs-platform_profile.rst @@ -0,0 +1,42 @@ +===================================================================== +Platform Profile Selection (e.g. /sys/firmware/acpi/platform_profile) +===================================================================== + +On modern systems the platform performance, temperature, fan and other +hardware related characteristics are often dynamically configurable. The +platform configuration is often automatically adjusted to the current +conditions by some automatic mechanism (which may very well live outside +the kernel). + +These auto platform adjustment mechanisms often can be configured with +one of several platform profiles, with either a bias towards low power +operation or towards performance. + +The purpose of the platform_profile attribute is to offer a generic sysfs +API for selecting the platform profile of these automatic mechanisms. + +Note that this API is only for selecting the platform profile, it is +NOT a goal of this API to allow monitoring the resulting performance +characteristics. 
Monitoring performance is best done with device/vendor +specific tools such as e.g. turbostat. + +Specifically when selecting a high performance profile the actual achieved +performance may be limited by various factors such as: the heat generated +by other components, room temperature, free air flow at the bottom of a +laptop, etc. It is explicitly NOT a goal of this API to let userspace know +about any sub-optimal conditions which are impeding reaching the requested +performance level. + +Since numbers on their own cannot represent the multiple variables that a +profile will adjust (power consumption, heat generation, etc) this API +uses strings to describe the various profiles. To make sure that userspace +gets a consistent experience the sysfs-platform_profile ABI document defines +a fixed set of profile names. Drivers *must* map their internal profile +representation onto this fixed set. + +If there is no good match when mapping then a new profile name may be +added. Drivers which wish to introduce new profile names must: + + 1. Explain why the existing profile names cannot be used. + 2. Add the new profile name, along with a clear description of the + expected behaviour, to the sysfs-platform_profile ABI documentation. From a2ff95e018f1d2bc816f3078d5110a655e355f18 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Tue, 29 Dec 2020 19:18:26 -0500 Subject: [PATCH 007/307] ACPI: platform: Add platform profile support This is the initial implementation of the platform-profile feature. It provides the details discussed and outlined in the sysfs-platform_profile document. Many modern systems have the ability to modify the operating profile to control aspects like fan speed, temperature and power levels. This module provides a common sysfs interface that platform modules can register against to control their individual profile options. Signed-off-by: Mark Pearson Reviewed-by: Hans de Goede [ rjw: Use full words in enum values names ] Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/Kconfig | 17 +++ drivers/acpi/Makefile | 1 + drivers/acpi/platform_profile.c | 181 +++++++++++++++++++++++++++++++ include/linux/platform_profile.h | 39 +++++++ 4 files changed, 238 insertions(+) create mode 100644 drivers/acpi/platform_profile.c create mode 100644 include/linux/platform_profile.h diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index edf1558c1105..5ddff93e38c2 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -326,6 +326,23 @@ config ACPI_THERMAL To compile this driver as a module, choose M here: the module will be called thermal. +config ACPI_PLATFORM_PROFILE + tristate "ACPI Platform Profile Driver" + default m + help + This driver adds support for platform-profiles on platforms that + support it. + + Platform-profiles can be used to control the platform behaviour. For + example whether to operate in a lower power mode, in a higher + power performance mode or between the two. + + This driver provides the sysfs interface and is used as the registration + point for platform specific drivers. + + Which profiles are supported is determined on a per-platform basis and + should be obtained from the platform specific driver. 
+ config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 076894a3330f..52b627c7f977 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -79,6 +79,7 @@ obj-$(CONFIG_ACPI_PCI_SLOT) += pci_slot.o obj-$(CONFIG_ACPI_PROCESSOR) += processor.o obj-$(CONFIG_ACPI) += container.o obj-$(CONFIG_ACPI_THERMAL) += thermal.o +obj-$(CONFIG_ACPI_PLATFORM_PROFILE) += platform_profile.o obj-$(CONFIG_ACPI_NFIT) += nfit/ obj-$(CONFIG_ACPI_NUMA) += numa/ obj-$(CONFIG_ACPI) += acpi_memhotplug.o diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c new file mode 100644 index 000000000000..91be50a32cc8 --- /dev/null +++ b/drivers/acpi/platform_profile.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* Platform profile sysfs interface */ + +#include +#include +#include +#include +#include +#include + +static const struct platform_profile_handler *cur_profile; +static DEFINE_MUTEX(profile_lock); + +static const char * const profile_names[] = { + [PLATFORM_PROFILE_LOW_POWER] = "low-power", + [PLATFORM_PROFILE_COOL] = "cool", + [PLATFORM_PROFILE_QUIET] = "quiet", + [PLATFORM_PROFILE_BALANCED] = "balanced", + [PLATFORM_PROFILE_PERFORMANCE] = "performance", +}; +static_assert(ARRAY_SIZE(profile_names) == PLATFORM_PROFILE_LAST); + +static ssize_t platform_profile_choices_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int len = 0; + int err, i; + + err = mutex_lock_interruptible(&profile_lock); + if (err) + return err; + + if (!cur_profile) { + mutex_unlock(&profile_lock); + return -ENODEV; + } + + for_each_set_bit(i, cur_profile->choices, PLATFORM_PROFILE_LAST) { + if (len == 0) + len += sysfs_emit_at(buf, len, "%s", profile_names[i]); + else + len += sysfs_emit_at(buf, len, " %s", profile_names[i]); + } + len += sysfs_emit_at(buf, len, "\n"); + mutex_unlock(&profile_lock); + return len; +} + +static ssize_t 
platform_profile_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + enum platform_profile_option profile = PLATFORM_PROFILE_BALANCED; + int err; + + err = mutex_lock_interruptible(&profile_lock); + if (err) + return err; + + if (!cur_profile) { + mutex_unlock(&profile_lock); + return -ENODEV; + } + + err = cur_profile->profile_get(&profile); + mutex_unlock(&profile_lock); + if (err) + return err; + + /* Check that profile is valid index */ + if (WARN_ON((profile < 0) || (profile >= ARRAY_SIZE(profile_names)))) + return -EIO; + + return sysfs_emit(buf, "%s\n", profile_names[profile]); +} + +static ssize_t platform_profile_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int err, i; + + err = mutex_lock_interruptible(&profile_lock); + if (err) + return err; + + if (!cur_profile) { + mutex_unlock(&profile_lock); + return -ENODEV; + } + + /* Scan for a matching profile */ + i = sysfs_match_string(profile_names, buf); + if (i < 0) { + mutex_unlock(&profile_lock); + return -EINVAL; + } + + /* Check that platform supports this profile choice */ + if (!test_bit(i, cur_profile->choices)) { + mutex_unlock(&profile_lock); + return -EOPNOTSUPP; + } + + err = cur_profile->profile_set(i); + mutex_unlock(&profile_lock); + if (err) + return err; + return count; +} + +static DEVICE_ATTR_RO(platform_profile_choices); +static DEVICE_ATTR_RW(platform_profile); + +static struct attribute *platform_profile_attrs[] = { + &dev_attr_platform_profile_choices.attr, + &dev_attr_platform_profile.attr, + NULL +}; + +static const struct attribute_group platform_profile_group = { + .attrs = platform_profile_attrs +}; + +void platform_profile_notify(void) +{ + if (!cur_profile) + return; + sysfs_notify(acpi_kobj, NULL, "platform_profile"); +} +EXPORT_SYMBOL_GPL(platform_profile_notify); + +int platform_profile_register(const struct platform_profile_handler *pprof) +{ + int err; + + mutex_lock(&profile_lock); + /* We can only 
have one active profile */ + if (cur_profile) { + mutex_unlock(&profile_lock); + return -EEXIST; + } + + /* Sanity check the profile handler field are set */ + if (!pprof || bitmap_empty(pprof->choices, PLATFORM_PROFILE_LAST) || + !pprof->profile_set || !pprof->profile_get) { + mutex_unlock(&profile_lock); + return -EINVAL; + } + + err = sysfs_create_group(acpi_kobj, &platform_profile_group); + if (err) { + mutex_unlock(&profile_lock); + return err; + } + + cur_profile = pprof; + mutex_unlock(&profile_lock); + return 0; +} +EXPORT_SYMBOL_GPL(platform_profile_register); + +int platform_profile_remove(void) +{ + mutex_lock(&profile_lock); + if (!cur_profile) { + mutex_unlock(&profile_lock); + return -ENODEV; + } + + sysfs_remove_group(acpi_kobj, &platform_profile_group); + cur_profile = NULL; + mutex_unlock(&profile_lock); + return 0; +} +EXPORT_SYMBOL_GPL(platform_profile_remove); + +MODULE_AUTHOR("Mark Pearson "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h new file mode 100644 index 000000000000..3623d7108421 --- /dev/null +++ b/include/linux/platform_profile.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Platform profile sysfs interface + * + * See Documentation/ABI/testing/sysfs-platform_profile.rst for more + * information. + */ + +#ifndef _PLATFORM_PROFILE_H_ +#define _PLATFORM_PROFILE_H_ + +#include + +/* + * If more options are added please update profile_names + * array in platform-profile.c and sysfs-platform-profile.rst + * documentation. 
+ */ + +enum platform_profile_option { + PLATFORM_PROFILE_LOW_POWER, + PLATFORM_PROFILE_COOL, + PLATFORM_PROFILE_QUIET, + PLATFORM_PROFILE_BALANCED, + PLATFORM_PROFILE_PERFORMANCE, + PLATFORM_PROFILE_LAST, /*must always be last */ +}; + +struct platform_profile_handler { + unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + int (*profile_get)(enum platform_profile_option *profile); + int (*profile_set)(enum platform_profile_option profile); +}; + +int platform_profile_register(const struct platform_profile_handler *pprof); +int platform_profile_remove(void); +void platform_profile_notify(void); + +#endif /*_PLATFORM_PROFILE_H_*/ From 8f50db4b5c79af2ba54f5fbe8a5173fd7f37a493 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 30 Dec 2020 16:37:44 +0100 Subject: [PATCH 008/307] powercap/drivers/dtpm: Fix __udivdi3 and __aeabi_uldivmod unresolved symbols 32-bit architectures do not support u64 divisions, so the macro DIV_ROUND_CLOSEST is not adequate as the compiler will replace the division with a call to a function that does not exist on the platform, leading to unresolved references to symbols. Fix this by using the compatible macros: DIV64_U64_ROUND_CLOSEST and DIV_ROUND_CLOSEST_ULL. Fixes: a20d0ef97abf ("powercap/drivers/dtpm: Add API for dynamic thermal power management") Reported-by: kernel test robot Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/dtpm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 5b6857e9b064..0abcc439d728 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -99,8 +99,8 @@ static void __dtpm_rebalance_weight(struct dtpm *dtpm) pr_debug("Setting weight '%d' for '%s'\n", child->weight, child->zone.name); - child->weight = DIV_ROUND_CLOSEST(child->power_max * 1024, - dtpm->power_max); + child->weight = DIV64_U64_ROUND_CLOSEST( + child->power_max * 1024, dtpm->power_max); __dtpm_rebalance_weight(child); } @@ -272,7 +272,7 @@ static int __set_power_limit_uw(struct dtpm *dtpm, int cid, u64 power_limit) } else if (power_limit == dtpm->power_min) { power = child->power_min; } else { - power = DIV_ROUND_CLOSEST( + power = DIV_ROUND_CLOSEST_ULL( power_limit * child->weight, 1024); } From 1e8f44f159b31fe31ad2f40f96575b6ad6df2fe9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 11 Mar 2020 17:22:19 -0400 Subject: [PATCH 009/307] do_tmpfile(): don't mess with finish_open() use vfs_open() instead Signed-off-by: Al Viro --- fs/namei.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 78443a85480a..a3b3ca62ef5c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3325,10 +3325,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ error = may_open(&path, 0, op->open_flag); - if (error) - goto out2; - file->f_path.mnt = path.mnt; - error = finish_open(file, child, NULL); + if (!error) + error = vfs_open(&path, file); out2: mnt_drop_write(path.mnt); out: From 26ddb45e13a3e09f5f5517a3c9d6510208181516 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 9 Dec 2020 17:09:28 -0500 Subject: [PATCH 010/307] fs/namei.c: Remove unlikely of status being -ECHILD in lookup_fast() Running my yearly branch profiling code, it 
detected a 100% wrong branch condition in namei.c for lookup_fast(). The code in question has: status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; if (unlazy_child(nd, dentry, seq)) return ERR_PTR(-ECHILD); if (unlikely(status == -ECHILD)) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); If the status of the d_revalidate() is greater than zero, then the function finishes. Otherwise, if it is an "unlazy_child" it returns with -ECHILD. After the above two checks, the status is compared to -ECHILD, as that is what is returned if the original d_revalidate() needed to be done in a non-rcu mode. Especially this path is called in a condition of: if (nd->flags & LOOKUP_RCU) { And most of the d_revalidate() functions have: if (flags & LOOKUP_RCU) return -ECHILD; It appears that that is the only case that this if statement is triggered on two of my machines, running in production. As it is dependent on what filesystem mix is configured in the running kernel, simply remove the unlikely() from the if statement. Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Al Viro --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index a3b3ca62ef5c..3345a9f38ccb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1495,7 +1495,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, return dentry; if (unlazy_child(nd, dentry, seq)) return ERR_PTR(-ECHILD); - if (unlikely(status == -ECHILD)) + if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { From e36cffed20a324e116f329a94061ae30dd26fb51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Dec 2020 09:19:08 -0700 Subject: [PATCH 011/307] fs: make unlazy_walk() error handling consistent Most callers check for non-zero return, and assume it's -ECHILD (which it always will be). One caller uses the actual error return. 
Clean this up and make it fully consistent, by having unlazy_walk() return a bool instead. Rename it to try_to_unlazy() and return true on success and false on failure. That's easier to read. No functional changes in this patch. Cc: Al Viro Signed-off-by: Jens Axboe Signed-off-by: Al Viro --- fs/namei.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 3345a9f38ccb..21fd06753504 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -669,17 +669,17 @@ static bool legitimize_root(struct nameidata *nd) */ /** - * unlazy_walk - try to switch to ref-walk mode. + * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * Returns: 0 on success, -ECHILD on failure + * Returns: true on success, false on failure * - * unlazy_walk attempts to legitimize the current nd->path and nd->root + * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. - * Nothing should touch nameidata between unlazy_walk() failure and + * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). 
*/ -static int unlazy_walk(struct nameidata *nd) +static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; @@ -694,14 +694,14 @@ static int unlazy_walk(struct nameidata *nd) goto out; rcu_read_unlock(); BUG_ON(nd->inode != parent->d_inode); - return 0; + return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: rcu_read_unlock(); - return -ECHILD; + return false; } /** @@ -792,7 +792,7 @@ static int complete_walk(struct nameidata *nd) */ if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; - if (unlikely(unlazy_walk(nd))) + if (!try_to_unlazy(nd)) return -ECHILD; } @@ -1466,7 +1466,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, unsigned seq; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { - if (unlazy_walk(nd)) + if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } @@ -1567,10 +1567,8 @@ static inline int may_lookup(struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); - if (err != -ECHILD) + if (err != -ECHILD || !try_to_unlazy(nd)) return err; - if (unlazy_walk(nd)) - return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); } @@ -1592,7 +1590,7 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, seq); - if (unlazy_walk(nd) != 0 || !grabbed_link) + if (!try_to_unlazy(nd) != 0 || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) @@ -1634,7 +1632,7 @@ static const char *pick_link(struct nameidata *nd, struct path *link, touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { - if (unlikely(unlazy_walk(nd))) + if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } @@ -1651,11 +1649,8 @@ static const char *pick_link(struct nameidata *nd, struct path *link, get = 
inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); - if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); + if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); - } } else { res = get(link->dentry, inode, &last->done); } @@ -2195,7 +2190,7 @@ OK: } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd)) + if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; @@ -3129,7 +3124,6 @@ static const char *open_last_lookups(struct nameidata *nd, struct inode *inode; struct dentry *dentry; const char *res; - int error; nd->flags |= op->intent; @@ -3153,9 +3147,8 @@ static const char *open_last_lookups(struct nameidata *nd, } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { - error = unlazy_walk(nd); - if (unlikely(error)) - return ERR_PTR(error); + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ @@ -3164,9 +3157,7 @@ static const char *open_last_lookups(struct nameidata *nd, } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { - error = mnt_want_write(nd->path.mnt); - if (!error) - got_write = true; + got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be From ae66db45fd309fd1c6d4e846dfc8414dfec7d6ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jan 2021 00:08:41 -0500 Subject: [PATCH 012/307] saner calling conventions for unlazy_child() same as for the previous commit - instead of 0/-ECHILD make it return true/false, rename to try_to_unlazy_next(). 
Signed-off-by: Al Viro --- fs/namei.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 21fd06753504..2ee219497460 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -705,19 +705,19 @@ out: } /** - * unlazy_child - try to switch to ref-walk mode. + * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data - * @dentry: child of nd->path.dentry - * @seq: seq number to check dentry against - * Returns: 0 on success, -ECHILD on failure + * @dentry: next dentry to step into + * @seq: seq number to check @dentry against + * Returns: true on success, false on failure * - * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry - * for ref-walk mode. @dentry must be a path found by a do_lookup call on - * @nd. Must be called from rcu-walk context. - * Nothing should touch nameidata between unlazy_child() failure and + * Similar to try_to_unlazy(), but here we have the next dentry already + * picked by rcu-walk and want to legitimize that in addition to the current + * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. + * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). 
*/ -static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq) +static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsigned seq) { BUG_ON(!(nd->flags & LOOKUP_RCU)); @@ -747,7 +747,7 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se if (unlikely(!legitimize_root(nd))) goto out_dput; rcu_read_unlock(); - return 0; + return true; out2: nd->path.mnt = NULL; @@ -755,11 +755,11 @@ out1: nd->path.dentry = NULL; out: rcu_read_unlock(); - return -ECHILD; + return false; out_dput: rcu_read_unlock(); dput(dentry); - return -ECHILD; + return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) @@ -1372,7 +1372,7 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ENOENT; if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 0; - if (unlazy_child(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry, seq)) return -ECHILD; // *path might've been clobbered by __follow_mount_rcu() path->mnt = nd->path.mnt; @@ -1493,7 +1493,7 @@ static struct dentry *lookup_fast(struct nameidata *nd, status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; - if (unlazy_child(nd, dentry, seq)) + if (!try_to_unlazy_next(nd, dentry, seq)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ From 6c6ec2b0a3e0381d886d531bd1471dfdb1509237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Dec 2020 09:19:09 -0700 Subject: [PATCH 013/307] fs: add support for LOOKUP_CACHED io_uring always punts opens to async context, since there's no control over whether the lookup blocks or not. Add LOOKUP_CACHED to support just doing the fast RCU based lookups, which we know will not block. If we can do a cached path resolution of the filename, then we don't have to always punt lookups for a worker. During path resolution, we always do LOOKUP_RCU first. 
If that fails and we terminate LOOKUP_RCU, then fail a LOOKUP_CACHED attempt as well. Cc: Al Viro Signed-off-by: Jens Axboe Signed-off-by: Al Viro --- fs/namei.c | 9 +++++++++ include/linux/namei.h | 1 + 2 files changed, 10 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 2ee219497460..4cae88733a5c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -686,6 +686,8 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; + if (nd->flags & LOOKUP_CACHED) + goto out1; if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) @@ -722,6 +724,8 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; + if (nd->flags & LOOKUP_CACHED) + goto out2; if (unlikely(!legitimize_links(nd))) goto out2; if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) @@ -792,6 +796,7 @@ static int complete_walk(struct nameidata *nd) */ if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED))) nd->root.mnt = NULL; + nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } @@ -2204,6 +2209,10 @@ static const char *path_init(struct nameidata *nd, unsigned flags) int error; const char *s = nd->name->name; + /* LOOKUP_CACHED requires RCU, ask caller to retry */ + if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) + return ERR_PTR(-EAGAIN); + if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) diff --git a/include/linux/namei.h b/include/linux/namei.h index a4bb992623c4..b9605b2b46e7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT}; #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ #define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ #define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. 
*/ +#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) From 99668f618062816ca7ba639b007eb145b9d3d41e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Dec 2020 09:19:10 -0700 Subject: [PATCH 014/307] fs: expose LOOKUP_CACHED through openat2() RESOLVE_CACHED Now that we support non-blocking path resolution internally, expose it via openat2() in the struct open_how ->resolve flags. This allows applications using openat2() to limit path resolution to the extent that it is already cached. If the lookup cannot be satisfied in a non-blocking manner, openat2(2) will return -1/-EAGAIN. Cc: Al Viro Signed-off-by: Jens Axboe Signed-off-by: Al Viro --- fs/open.c | 6 ++++++ include/linux/fcntl.h | 2 +- include/uapi/linux/openat2.h | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/open.c b/fs/open.c index 1e06e443a565..ca5444733acd 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1091,6 +1091,12 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) lookup_flags |= LOOKUP_BENEATH; if (how->resolve & RESOLVE_IN_ROOT) lookup_flags |= LOOKUP_IN_ROOT; + if (how->resolve & RESOLVE_CACHED) { + /* Don't bother even trying for create/truncate/tmpfile open */ + if (flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + return -EAGAIN; + lookup_flags |= LOOKUP_CACHED; + } op->lookup_flags = lookup_flags; return 0; diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index 921e750843e6..766fcd973beb 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -19,7 +19,7 @@ /* List of all valid flags for the how->resolve argument: */ #define VALID_RESOLVE_FLAGS \ (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \ - RESOLVE_BENEATH | RESOLVE_IN_ROOT) + RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_CACHED) /* List of all open_how "versions". 
*/ #define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */ diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h index 58b1eb711360..a5feb7604948 100644 --- a/include/uapi/linux/openat2.h +++ b/include/uapi/linux/openat2.h @@ -35,5 +35,9 @@ struct open_how { #define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." be scoped inside the dirfd (similar to chroot(2)). */ +#define RESOLVE_CACHED 0x20 /* Only complete if resolution can be + completed through cached lookup. May + return -EAGAIN if that's not + possible. */ #endif /* _UAPI_LINUX_OPENAT2_H */ From 8a00dd0012f383fc0c39b169b694dc15236cec7c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Jun 2020 20:14:02 -0400 Subject: [PATCH 015/307] binfmt_elf: partially sanitize PRSTATUS_SIZE and SET_PR_FPVALID On 64bit architectures that support 32bit processes there are two possible layouts for NT_PRSTATUS note in ELF coredumps. For one thing, several fields are 64bit for native processes and 32bit for compat ones (pr_sigpend, etc.). For another, the register dump is obviously different - the size and number of registers are not going to be the same for 32bit and 64bit variants of processor. Usually that's handled by having two structures - elf_prstatus for native layout and compat_elf_prstatus for 32bit one. 32bit processes are handled by fs/compat_binfmt_elf.c, which defines a macro called 'elf_prstatus' that expands to compat_elf_prstatus. Then it includes fs/binfmt_elf.c, which makes all references to struct elf_prstatus to be textually replaced with struct compat_elf_prstatus. Ugly and somewhat brittle, but it works. However, amd64 is worse - there are _three_ possible layouts. One for native 64bit processes, another for i386 (32bit) processes and yet another for x32 (32bit address space with full 64bit registers). Both i386 and x32 processes are handled by fs/compat_binfmt_elf.c, with usual compat_binfmt_elf.c trickery. 
However, the layouts for i386 and x32 are not identical - they have the common beginning, but the register dump part (pr_reg) is bigger on x32. Worse, pr_reg is not the last field - it's followed by int pr_fpvalid, so that field ends up at different offsets for i386 and x32 layouts. Fortunately, there's not much code that cares about any of that - it's all encapsulated in fill_thread_core_info(). Since x32 variant is bigger, we define compat_elf_prstatus to match that layout. That way i386 processes have enough space to fit their layout into. Moreover, since these layouts are identical prior to pr_reg, we don't need to distinguish x32 and i386 cases when we are setting the fields prior to pr_reg. Filling pr_reg itself is done by calling ->get() method of appropriate regset, and that method knows what layout (and size) to use. We do need to distinguish x32 and i386 cases only for two things: setting ->pr_fpvalid (offset differs for x32 and i386) and choosing the right size for our note. The way it's done is Not Nice, for the lack of more accurate printable description. There are two macros (PRSTATUS_SIZE and SET_PR_FPVALID), that default essentially to sizeof(struct elf_prstatus) and (S)->pr_fpvalid = 1. On x86 asm/compat.h provides its own variants. Unfortunately, quite a few things go wrong there: * PRSTATUS_SIZE doesn't use the normal test for process being an x32 one; it compares the size reported by regset with the size of pr_reg. * it hardcodes the sizes of x32 and i386 variants (296 and 144 resp.), so if some change in includes leads to asm/compat.h pulled in by fs/binfmt_elf.c we are in trouble - it will end up using the size of x32 variant for 64bit processes. * it's in the wrong place; asm/compat.h couldn't define the structure for i386 layout, since it lacks quite a few types needed for it. Hardcoded sizes are largely due to that. 
The proper fix would be to have an explicitly defined i386 variant of structure and have PRSTATUS_SIZE/SET_PR_FPVALID check for TIF_X32 to choose the variant that should be used. Unfortunately, that requires some manipulations of headers; we'll do that later in the series, but for now let's go with the minimal variant - rename PRSTATUS_SIZE in asm/compat.h to COMPAT_PRSTATUS_SIZE, have fs/compat_binfmt_elf.c define PRSTATUS_SIZE to COMPAT_PRSTATUS_SIZE and use the normal TIF_X32 check in that macro. The size of i386 variant is kept hardcoded for now. Similar story for SET_PR_FPVALID. Signed-off-by: Al Viro --- arch/x86/include/asm/compat.h | 11 +++++++---- fs/binfmt_elf.c | 13 +++++-------- fs/compat_binfmt_elf.c | 8 ++++++++ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..15cf0f831dee 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -165,10 +165,13 @@ struct compat_shmid64_ds { typedef struct user_regs_struct compat_elf_gregset_t; /* Full regset -- prstatus on x32, otherwise on ia32 */ -#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296) -#define SET_PR_FPVALID(S, V, R) \ - do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ - while (0) +#define COMPAT_PRSTATUS_SIZE (user_64bit_mode(task_pt_regs(current)) \ + ? sizeof(struct compat_elf_prstatus) \ + : 144) +#define COMPAT_SET_PR_FPVALID(S) \ + (*(user_64bit_mode(task_pt_regs(current)) \ + ? 
&(S)->pr_fpvalid \ + : (int *)((void *)(S) + 140)) = 1) #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 950bc177238a..8380478d3d92 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1717,11 +1717,11 @@ static void do_thread_regset_writeback(struct task_struct *task, } #ifndef PRSTATUS_SIZE -#define PRSTATUS_SIZE(S, R) sizeof(S) +#define PRSTATUS_SIZE sizeof(struct elf_prstatus) #endif #ifndef SET_PR_FPVALID -#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V)) +#define SET_PR_FPVALID(S) ((S)->pr_fpvalid = 1) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, @@ -1729,7 +1729,6 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, long signr, size_t *total) { unsigned int i; - int regset0_size; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1738,13 +1737,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * We assume that regset 0 is NT_PRSTATUS. */ fill_prstatus(&t->prstatus, t->task, signr); - regset0_size = regset_get(t->task, &view->regsets[0], + regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - if (regset0_size < 0) - return 0; fill_note(&t->notes[0], "CORE", NT_PRSTATUS, - PRSTATUS_SIZE(t->prstatus, regset0_size), &t->prstatus); + PRSTATUS_SIZE, &t->prstatus); *total += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); @@ -1772,7 +1769,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, continue; if (is_fpreg) - SET_PR_FPVALID(&t->prstatus, 1, regset0_size); + SET_PR_FPVALID(&t->prstatus); fill_note(&t->notes[i], is_fpreg ? 
"CORE" : "LINUX", note_type, ret, data); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..962df845ed51 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -95,6 +95,14 @@ #define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE #endif +#ifdef COMPAT_PRSTATUS_SIZE +#define PRSTATUS_SIZE COMPAT_PRSTATUS_SIZE +#endif + +#ifdef COMPAT_SET_PR_FPVALID +#define SET_PR_FPVALID(S) COMPAT_SET_PR_FPVALID(S) +#endif + #ifdef COMPAT_ELF_PLAT_INIT #undef ELF_PLAT_INIT #define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT From f40d81231b2ddfac41d5bf09462b260b256e15ba Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 4 Jan 2021 13:59:12 +0000 Subject: [PATCH 016/307] PM / devfreq: Correct spelling in a comment The device attribute exposed in sysfs is called 'polling_interval'. Align the comment. Signed-off-by: Lukasz Luba Signed-off-by: Chanwoo Choi --- drivers/devfreq/governor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index 2a52f97b542d..70f44b3ca42e 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -40,7 +40,7 @@ /* * Definition of governor attribute flags except for common sysfs attributes * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL - * : Indicate polling_interal sysfs attribute + * : Indicate polling_interval sysfs attribute * - DEVFREQ_GOV_ATTR_TIMER * : Indicate timer sysfs attribute */ From ec894883de5336e28313e531e2f3a8b86f1a8a1a Mon Sep 17 00:00:00 2001 From: pierre Kuo Date: Wed, 16 Dec 2020 10:25:38 +0800 Subject: [PATCH 017/307] PM / devfreq: Replace devfreq->dev.parent as dev in devfreq_add_device In devfreq_add_device, replace devfreq->dev.parent as dev to keep code simple. 
Signed-off-by: pierre Kuo Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 6aa10de792b3..94cc25fd68da 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -893,13 +893,13 @@ struct devfreq *devfreq_add_device(struct device *dev, goto err_devfreq; devfreq->nb_min.notifier_call = qos_min_notifier_call; - err = dev_pm_qos_add_notifier(devfreq->dev.parent, &devfreq->nb_min, + err = dev_pm_qos_add_notifier(dev, &devfreq->nb_min, DEV_PM_QOS_MIN_FREQUENCY); if (err) goto err_devfreq; devfreq->nb_max.notifier_call = qos_max_notifier_call; - err = dev_pm_qos_add_notifier(devfreq->dev.parent, &devfreq->nb_max, + err = dev_pm_qos_add_notifier(dev, &devfreq->nb_max, DEV_PM_QOS_MAX_FREQUENCY); if (err) goto err_devfreq; From fb7791e213a64495ec2336869b868fcd8af14346 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Mon, 4 Jan 2021 15:57:18 -0800 Subject: [PATCH 018/307] cpupower: add Makefile dependencies for install targets This allows building cpupower in parallel rather than serially. 
Signed-off-by: Ivan Babrou Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 8 ++++---- tools/power/cpupower/bench/Makefile | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index c7bcddbd486d..3b1594447f29 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -270,14 +270,14 @@ clean: $(MAKE) -C bench O=$(OUTPUT) clean -install-lib: +install-lib: libcpupower $(INSTALL) -d $(DESTDIR)${libdir} $(CP) $(OUTPUT)libcpupower.so* $(DESTDIR)${libdir}/ $(INSTALL) -d $(DESTDIR)${includedir} $(INSTALL_DATA) lib/cpufreq.h $(DESTDIR)${includedir}/cpufreq.h $(INSTALL_DATA) lib/cpuidle.h $(DESTDIR)${includedir}/cpuidle.h -install-tools: +install-tools: $(OUTPUT)cpupower $(INSTALL) -d $(DESTDIR)${bindir} $(INSTALL_PROGRAM) $(OUTPUT)cpupower $(DESTDIR)${bindir} $(INSTALL) -d $(DESTDIR)${bash_completion_dir} @@ -293,14 +293,14 @@ install-man: $(INSTALL_DATA) -D man/cpupower-info.1 $(DESTDIR)${mandir}/man1/cpupower-info.1 $(INSTALL_DATA) -D man/cpupower-monitor.1 $(DESTDIR)${mandir}/man1/cpupower-monitor.1 -install-gmo: +install-gmo: create-gmo $(INSTALL) -d $(DESTDIR)${localedir} for HLANG in $(LANGUAGES); do \ echo '$(INSTALL_DATA) -D $(OUTPUT)po/$$HLANG.gmo $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo'; \ $(INSTALL_DATA) -D $(OUTPUT)po/$$HLANG.gmo $(DESTDIR)${localedir}/$$HLANG/LC_MESSAGES/cpupower.mo; \ done; -install-bench: +install-bench: compile-bench @#DESTDIR must be set from outside to survive @sbindir=$(sbindir) bindir=$(bindir) docdir=$(docdir) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) install diff --git a/tools/power/cpupower/bench/Makefile b/tools/power/cpupower/bench/Makefile index f68b4bc55273..d9d9923af85c 100644 --- a/tools/power/cpupower/bench/Makefile +++ b/tools/power/cpupower/bench/Makefile @@ -27,7 +27,7 @@ $(OUTPUT)cpufreq-bench: $(OBJS) all: $(OUTPUT)cpufreq-bench -install: +install: $(OUTPUT)cpufreq-bench mkdir -p 
$(DESTDIR)/$(sbindir) mkdir -p $(DESTDIR)/$(bindir) mkdir -p $(DESTDIR)/$(docdir) From f2485a2dc9f0f30fbdd013ad5772975100c71360 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 00:08:44 -0400 Subject: [PATCH 019/307] elf_prstatus: collect the common part (everything before pr_reg) into a struct Preparation for doing i386 compat elf_prstatus sanely - rather than duplicating the beginning of compat_elf_prstatus, take these fields into a separate structure (compat_elf_prstatus_common), so that it could be reused. Due to the incestuous relationship between binfmt_elf.c and compat_binfmt_elf.c we need the same shape change done to native struct elf_prstatus, gathering the fields prior to pr_reg into a new structure (struct elf_prstatus_common). Fortunately, offset of pr_reg is always a multiple of 16 with no padding right before it, so it's possible to turn all the stuff prior to it into a single member without disturbing the layout. [build fix from Geert Uytterhoeven folded in] Signed-off-by: Al Viro --- arch/ia64/kernel/crash.c | 2 +- arch/mips/kernel/binfmt_elfn32.c | 7 ++++++- arch/mips/kernel/binfmt_elfo32.c | 6 +++++- arch/powerpc/platforms/powernv/opal-core.c | 6 +++--- arch/s390/kernel/crash_dump.c | 2 +- fs/binfmt_elf.c | 8 ++++---- fs/binfmt_elf_fdpic.c | 22 +++++----------------- fs/compat_binfmt_elf.c | 1 + include/linux/elfcore-compat.h | 7 ++++++- include/linux/elfcore.h | 7 ++++++- kernel/kexec_core.c | 2 +- 11 files changed, 39 insertions(+), 31 deletions(-) diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index fec70d662d0c..4f47741005d2 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -43,7 +43,7 @@ crash_save_this_cpu(void) elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg); memset(prstatus, 0, sizeof(*prstatus)); - prstatus->pr_pid = current->pid; + prstatus->common.pr_pid = current->pid; ia64_dump_cpu_regs(dst); cfm = dst[43]; diff --git a/arch/mips/kernel/binfmt_elfn32.c 
b/arch/mips/kernel/binfmt_elfn32.c index 6ee3f7218c67..136dc0c9300d 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -44,7 +44,8 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #define elf_prstatus elf_prstatus32 -struct elf_prstatus32 +#define elf_prstatus_common elf_prstatus32_common +struct elf_prstatus32_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -58,6 +59,10 @@ struct elf_prstatus32 struct old_timeval32 pr_stime; /* System time */ struct old_timeval32 pr_cutime;/* Cumulative user time */ struct old_timeval32 pr_cstime;/* Cumulative system time */ +}; +struct elf_prstatus32 +{ + struct elf_prstatus32_common common: elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. */ }; diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 6dd103d3cebb..b1f4b8f1dee7 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -49,7 +49,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #define elf_prstatus elf_prstatus32 -struct elf_prstatus32 +struct elf_prstatus32_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -63,6 +63,10 @@ struct elf_prstatus32 struct old_timeval32 pr_stime; /* System time */ struct old_timeval32 pr_cutime;/* Cumulative user time */ struct old_timeval32 pr_cstime;/* Cumulative system time */ +}; +struct elf_prstatus32 +{ + struct elf_prstatus32_common common: elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. 
*/ }; diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 23571f0b555a..0d9ba70f7251 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -119,8 +119,8 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir, * As a PIR value could also be '0', add an offset of '100' * to every PIR to avoid misinterpretations in GDB. */ - prstatus->pr_pid = cpu_to_be32(100 + pir); - prstatus->pr_ppid = cpu_to_be32(1); + prstatus->common.pr_pid = cpu_to_be32(100 + pir); + prstatus->common.pr_ppid = cpu_to_be32(1); /* * Indicate SIGUSR1 for crash initiated from kernel. @@ -130,7 +130,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir, short sig; sig = kernel_initiated ? SIGUSR1 : SIGTERM; - prstatus->pr_cursig = cpu_to_be16(sig); + prstatus->common.pr_cursig = cpu_to_be16(sig); } } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 205b2e2648aa..0e36dfc9ccd6 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -365,7 +365,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); - nt_prstatus.pr_pid = cpu; + nt_prstatus.common.pr_pid = cpu; /* Prepare fpregset (floating point) note */ memset(&nt_fpregset, 0, sizeof(nt_fpregset)); memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8380478d3d92..4c1550b13899 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1495,7 +1495,7 @@ static void fill_note(struct memelfnote *note, const char *name, int type, * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. 
*/ -static void fill_prstatus(struct elf_prstatus *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1736,7 +1736,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * than being the whole note contents. We fill the reset in here. * We assume that regset 0 is NT_PRSTATUS. */ - fill_prstatus(&t->prstatus, t->task, signr); + fill_prstatus(&t->prstatus.common, t->task, signr); regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); @@ -1958,7 +1958,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) struct task_struct *p = t->thread; t->num_notes = 0; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), @@ -2037,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, siginfo->si_signo); + fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index be4062b8ba75..03d81a14bcbf 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1191,18 +1191,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf_prstatus_fdpic { - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned long pr_sigpend; /* Set of pending signals */ - unsigned long pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct __kernel_old_timeval pr_utime; /* User time */ - struct __kernel_old_timeval 
pr_stime; /* System time */ - struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ - struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ + struct elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ /* When using FDPIC, the loadmap addresses need to be communicated * to GDB in order for GDB to do the necessary relocations. The @@ -1301,7 +1290,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ -static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1332,9 +1321,6 @@ static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, } prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); - - prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; - prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, @@ -1405,7 +1391,9 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ if (!t) return t; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); + t->prstatus.pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + t->prstatus.pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 962df845ed51..feb48a5c2d44 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -50,6 +50,7 @@ * which requires asm/elf.h to define compat_elf_gregset_t et al. 
*/ #define elf_prstatus compat_elf_prstatus +#define elf_prstatus_common compat_elf_prstatus_common #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_kernel_old_timeval diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index 10485f0c9740..4aeda5f1f038 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -17,7 +17,7 @@ struct compat_elf_siginfo compat_int_t si_errno; }; -struct compat_elf_prstatus +struct compat_elf_prstatus_common { struct compat_elf_siginfo pr_info; short pr_cursig; @@ -31,6 +31,11 @@ struct compat_elf_prstatus struct old_timeval32 pr_stime; struct old_timeval32 pr_cutime; struct old_timeval32 pr_cstime; +}; + +struct compat_elf_prstatus +{ + struct compat_elf_prstatus_common common; compat_elf_gregset_t pr_reg; compat_int_t pr_fpvalid; }; diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index de51c1bef27d..2aaa15779d50 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -29,7 +29,7 @@ struct elf_siginfo * the SVR4 structure, but more Linuxy, with things that Linux does * not support and which gdb doesn't really use excluded. */ -struct elf_prstatus +struct elf_prstatus_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -43,6 +43,11 @@ struct elf_prstatus struct __kernel_old_timeval pr_stime; /* System time */ struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ +}; + +struct elf_prstatus +{ + struct elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. 
*/ }; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4f8efc278aa7..80905e5aa8ae 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; + prstatus.common.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); From 7facdc426f86c67e579e49e100943cbccc43e1c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 23:03:25 -0400 Subject: [PATCH 020/307] [amd64] clean PRSTATUS_SIZE/SET_PR_FPVALID up properly To get rid of hardcoded size/offset in those macros we need to have a definition of i386 variant of struct elf_prstatus. However, we can't do that in asm/compat.h - the types needed for that are not there and adding an include of asm/user32.h into asm/compat.h would cause a lot of mess. That could be conveniently done in elfcore-compat.h, but currently there is nowhere to put arch-dependent parts of it - no asm/elfcore-compat.h. So we introduce a new file (asm/elfcore-compat.h, present on architectures that have CONFIG_ARCH_HAS_ELFCORE_COMPAT set, currently only on x86), have it pulled by linux/elfcore-compat.h and move the definitions there. As a side benefit, we don't need to worry about accidental inclusion of that file into binfmt_elf.c itself, so we don't need the dance with COMPAT_PRSTATUS_SIZE, etc. - only fs/compat_binfmt_elf.c will see that header. 
Signed-off-by: Al Viro --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/compat.h | 14 ------------ arch/x86/include/asm/elfcore-compat.h | 31 +++++++++++++++++++++++++++ fs/compat_binfmt_elf.c | 8 ------- include/linux/elfcore-compat.h | 18 ++++++++++------ 6 files changed, 46 insertions(+), 29 deletions(-) create mode 100644 arch/x86/include/asm/elfcore-compat.h diff --git a/arch/Kconfig b/arch/Kconfig index 78c6f05b10f9..a17ced73b23c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1105,6 +1105,9 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_HAS_ELFCORE_COMPAT + bool + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7b6dd10b162a..302a6b453c91 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB + select ARCH_HAS_ELFCORE_COMPAT config FORCE_DYNAMIC_FTRACE def_bool y diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 15cf0f831dee..be09c7eac89f 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -159,20 +159,6 @@ struct compat_shmid64_ds { compat_ulong_t __unused5; }; -/* - * The type of struct elf_prstatus.pr_reg in compatible core dumps. - */ -typedef struct user_regs_struct compat_elf_gregset_t; - -/* Full regset -- prstatus on x32, otherwise on ia32 */ -#define COMPAT_PRSTATUS_SIZE (user_64bit_mode(task_pt_regs(current)) \ - ? sizeof(struct compat_elf_prstatus) \ - : 144) -#define COMPAT_SET_PR_FPVALID(S) \ - (*(user_64bit_mode(task_pt_regs(current)) \ - ? 
&(S)->pr_fpvalid \ - : (int *)((void *)(S) + 140)) = 1) - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) diff --git a/arch/x86/include/asm/elfcore-compat.h b/arch/x86/include/asm/elfcore-compat.h new file mode 100644 index 000000000000..f1b6c7a8d8fc --- /dev/null +++ b/arch/x86/include/asm/elfcore-compat.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_ELFCORE_COMPAT_H +#define _ASM_X86_ELFCORE_COMPAT_H + +#include + +/* + * On amd64 we have two 32bit ABIs - i386 and x32. The latter + * has bigger registers, so we use it for compat_elf_regset_t. + * The former uses i386_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID + * are used to choose the size and location of ->pr_fpvalid of + * the layout actually used. + */ +typedef struct user_regs_struct compat_elf_gregset_t; + +struct i386_elf_prstatus +{ + struct compat_elf_prstatus_common common; + struct user_regs_struct32 pr_reg; + compat_int_t pr_fpvalid; +}; + +#define PRSTATUS_SIZE \ + (user_64bit_mode(task_pt_regs(current)) \ + ? sizeof(struct compat_elf_prstatus) \ + : sizeof(struct i386_elf_prstatus)) +#define SET_PR_FPVALID(S) \ + (*(user_64bit_mode(task_pt_regs(current)) \ + ? 
&(S)->pr_fpvalid \ + : &((struct i386_elf_prstatus *)(S))->pr_fpvalid) = 1) + +#endif diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index feb48a5c2d44..a6321415aba0 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -96,14 +96,6 @@ #define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE #endif -#ifdef COMPAT_PRSTATUS_SIZE -#define PRSTATUS_SIZE COMPAT_PRSTATUS_SIZE -#endif - -#ifdef COMPAT_SET_PR_FPVALID -#define SET_PR_FPVALID(S) COMPAT_SET_PR_FPVALID(S) -#endif - #ifdef COMPAT_ELF_PLAT_INIT #undef ELF_PLAT_INIT #define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index 4aeda5f1f038..e272c3d452ce 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -33,13 +33,6 @@ struct compat_elf_prstatus_common struct old_timeval32 pr_cstime; }; -struct compat_elf_prstatus -{ - struct compat_elf_prstatus_common common; - compat_elf_gregset_t pr_reg; - compat_int_t pr_fpvalid; -}; - struct compat_elf_prpsinfo { char pr_state; @@ -54,4 +47,15 @@ struct compat_elf_prpsinfo char pr_psargs[ELF_PRARGSZ]; }; +#ifdef CONFIG_ARCH_HAS_ELFCORE_COMPAT +#include +#endif + +struct compat_elf_prstatus +{ + struct compat_elf_prstatus_common common; + compat_elf_gregset_t pr_reg; + compat_int_t pr_fpvalid; +}; + #endif /* _LINUX_ELFCORE_COMPAT_H */ From 85f2ada718a81b282ee78a96d0ab1450543612e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jan 2021 18:34:30 -0500 Subject: [PATCH 021/307] x32: make X32, !IA32_EMULATION setups able to execute x32 binaries It's really trivial - the only wrinkle is making sure that compiler knows that ia32-related side of COMPAT_ARCH_DLINFO is dead code on such configs (we don't get there without having passed compat_elf_check_arch(), and on such configs that'll fail for ia32 binary). 
Signed-off-by: Al Viro --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/elf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 302a6b453c91..a2182d22b5fa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2864,7 +2864,6 @@ config IA32_EMULATION depends on X86_64 select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF - select COMPAT_BINFMT_ELF select COMPAT_OLD_SIGACTION help Include code to run legacy 32-bit programs under a @@ -2900,6 +2899,7 @@ config COMPAT_32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 + select COMPAT_BINFMT_ELF if BINFMT_ELF if COMPAT config COMPAT_FOR_U64_ALIGNMENT diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 66bdfe838d61..9224d40cdefe 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -364,7 +364,7 @@ do { \ #define COMPAT_ARCH_DLINFO \ if (exec->e_machine == EM_X86_64) \ ARCH_DLINFO_X32; \ -else \ +else if (IS_ENABLED(CONFIG_IA32_EMULATION)) \ ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) From 6835501e789a94760f34efffff0e4706e3ee1d71 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jun 2020 19:22:46 -0400 Subject: [PATCH 022/307] mips binfmt_elf*32.c: use elfcore-compat.h ... rather than duplicating declarations from it. 
Signed-off-by: Al Viro --- arch/mips/kernel/binfmt_elfn32.c | 37 ++++---------------------------- arch/mips/kernel/binfmt_elfo32.c | 36 ++++--------------------------- 2 files changed, 8 insertions(+), 65 deletions(-) diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index a11c291b9241..720bbf272744 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -42,46 +42,17 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #include #include +#include #define elf_prstatus elf_prstatus32 -#define elf_prstatus_common elf_prstatus32_common -struct elf_prstatus32_common -{ - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned int pr_sigpend; /* Set of pending signals */ - unsigned int pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct old_timeval32 pr_utime; /* User time */ - struct old_timeval32 pr_stime; /* System time */ - struct old_timeval32 pr_cutime;/* Cumulative user time */ - struct old_timeval32 pr_cstime;/* Cumulative system time */ -}; +#define elf_prstatus_common compat_elf_prstatus_common struct elf_prstatus32 { - struct elf_prstatus32_common common: + struct compat_elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. 
*/ }; - -#define elf_prpsinfo elf_prpsinfo32 -struct elf_prpsinfo32 -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned int pr_flag; /* flags */ - __kernel_uid_t pr_uid; - __kernel_gid_t pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; +#define elf_prpsinfo compat_elf_prpsinfo #define elf_caddr_t u32 #define init_elf_binfmt init_elfn32_binfmt diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index afe8940d4952..6fcab231c962 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -47,45 +47,17 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #include #include +#include #define elf_prstatus elf_prstatus32 -struct elf_prstatus32_common -{ - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned int pr_sigpend; /* Set of pending signals */ - unsigned int pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct old_timeval32 pr_utime; /* User time */ - struct old_timeval32 pr_stime; /* System time */ - struct old_timeval32 pr_cutime;/* Cumulative user time */ - struct old_timeval32 pr_cstime;/* Cumulative system time */ -}; +#define elf_prstatus_common compat_elf_prstatus_common struct elf_prstatus32 { - struct elf_prstatus32_common common: + struct compat_elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. 
*/ }; - -#define elf_prpsinfo elf_prpsinfo32 -struct elf_prpsinfo32 -{ - char pr_state; /* numeric process state */ - char pr_sname; /* char for pr_state */ - char pr_zomb; /* zombie */ - char pr_nice; /* nice val */ - unsigned int pr_flag; /* flags */ - __kernel_uid_t pr_uid; - __kernel_gid_t pr_gid; - pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; - /* Lots missing */ - char pr_fname[16]; /* filename of executable */ - char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ -}; +#define elf_prpsinfo compat_elf_prpsinfo #define elf_caddr_t u32 #define init_elf_binfmt init_elf32_binfmt From c3cd7564819a7c1761b3b91770b6083cb29b2620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Jun 2020 18:09:06 -0400 Subject: [PATCH 023/307] mips: kill unused definitions in binfmt_elf[on]32.c elf_caddr_t: unused since 2002 jiffies_to_timeval: unused since 2015 TASK_SIZE: used only downstream of SET_PERSONALITY2(), and after that point the normal definition results in TASK_SIZE32 just fine. Signed-off-by: Al Viro --- arch/mips/kernel/binfmt_elfn32.c | 18 ------------------ arch/mips/kernel/binfmt_elfo32.c | 18 ------------------ 2 files changed, 36 deletions(-) diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 720bbf272744..4c5544ae8fa4 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -54,28 +54,10 @@ struct elf_prstatus32 }; #define elf_prpsinfo compat_elf_prpsinfo -#define elf_caddr_t u32 #define init_elf_binfmt init_elfn32_binfmt -#define jiffies_to_timeval jiffies_to_old_timeval32 -static __inline__ void -jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. 
- */ - u64 nsec = (u64)jiffies * TICK_NSEC; - u32 rem; - value->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} - #define ELF_CORE_EFLAGS EF_MIPS_ABI2 -#undef TASK_SIZE -#define TASK_SIZE TASK_SIZE32 - #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 6fcab231c962..6ba7501e7079 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -59,26 +59,8 @@ struct elf_prstatus32 }; #define elf_prpsinfo compat_elf_prpsinfo -#define elf_caddr_t u32 #define init_elf_binfmt init_elf32_binfmt -#define jiffies_to_timeval jiffies_to_old_timeval32 -static inline void -jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) -{ - /* - * Convert jiffies to nanoseconds and separate with - * one divide. - */ - u64 nsec = (u64)jiffies * TICK_NSEC; - u32 rem; - value->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); - value->tv_usec = rem / NSEC_PER_USEC; -} - -#undef TASK_SIZE -#define TASK_SIZE TASK_SIZE32 - #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 From fd624c712dfcb6bd6d34018bf879cb4fc6ef84f9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 23:33:11 -0400 Subject: [PATCH 024/307] mips: KVM_GUEST makes no sense for 64bit builds... it's always been about MIPS32 Signed-off-by: Al Viro --- arch/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 0a17bedf4f0d..04aecf51e376 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2182,7 +2182,7 @@ endchoice config KVM_GUEST bool "KVM Guest Kernel" depends on CPU_MIPS32_R2 - depends on BROKEN_ON_SMP + depends on !64BIT && BROKEN_ON_SMP help Select this option if building a guest kernel for KVM (Trap & Emulate) mode. 
From 056f280f3b63f68073dd8d332bf2a0132deccd82 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 23:37:24 -0400 Subject: [PATCH 025/307] mips compat: don't bother with ELF_ET_DYN_BASE normal mips one is just fine - it's only used after we'd done SET_PERSONALITY2() and by that point TASK_SIZE will yield the right value Signed-off-by: Al Viro --- arch/mips/include/asm/elf.h | 2 -- arch/mips/kernel/binfmt_elfn32.c | 4 ---- arch/mips/kernel/binfmt_elfo32.c | 8 -------- 3 files changed, 14 deletions(-) diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index 71c7622025d1..d29e43e4f9b1 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -469,9 +469,7 @@ extern const char *__elf_base_platform; the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. */ -#ifndef ELF_ET_DYN_BASE #define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) -#endif /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 4c5544ae8fa4..08bc05fd9882 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -34,10 +34,6 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; */ #define elf_check_arch elfn32_check_arch -#define TASK32_SIZE 0x7fff8000UL -#undef ELF_ET_DYN_BASE -#define ELF_ET_DYN_BASE (TASK32_SIZE / 3 * 2) - #include #include #include diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 6ba7501e7079..f5ee6b43b49c 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -34,14 +34,6 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; */ #define elf_check_arch elfo32_check_arch -#ifdef CONFIG_KVM_GUEST -#define TASK32_SIZE 0x3fff8000UL -#else -#define TASK32_SIZE 0x7fff8000UL -#endif -#undef ELF_ET_DYN_BASE -#define ELF_ET_DYN_BASE (TASK32_SIZE 
/ 3 * 2) - #include #include From 2fb33bec053b01e616fab921aab4d4775d374e8f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 23:42:04 -0400 Subject: [PATCH 026/307] mips: don't bother with ELF_CORE_EFLAGS mips coredumps are regset-based, so ELF_CORE_EFLAGS is not used at all - user_..._view.e_flags is. Signed-off-by: Al Viro --- arch/mips/kernel/binfmt_elfn32.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 08bc05fd9882..573f2a177da6 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -52,8 +52,6 @@ struct elf_prstatus32 #define init_elf_binfmt init_elfn32_binfmt -#define ELF_CORE_EFLAGS EF_MIPS_ABI2 - #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 From 0bb87f051e4282afb5f472807c7244b21cf515c7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 14 Jun 2020 00:18:12 -0400 Subject: [PATCH 027/307] mips compat: switch to compat_binfmt_elf.c Like amd64, mips has two 32bit ABIs - o32 and n32. Unlike amd64, it does not use compat_binfmt_elf.c for either of those; each of those ABIs has a binfmt handler of its own, both very similar to fs/compat_binfmt_elf.c. And the same techniques as we use on amd64 can be used to make fs/compat_binfmt_elf.c handle both. * merge elfo32_check_arch() with elfn32_check_arch(), make that serve as compat_elf_check_arch(). Note that SET_PERSONALITY2() is already the same for all ABI variants - it looks at the elf header to choose the flags to set. * add asm/elfcore-compat.h, using the bigger (n32) variant of elf32_prstatus as compat_elf_prstatus there. * make PRSTATUS_SIZE() and SET_PR_FPVALID() choose the right layout, same as done for amd64. test_thread_flag(TIF_32BIT_REGS) is used as the predicate. Voila - we are rid of binfmt_elf{n,o}32.c; fs/compat_binfmt_elf.c is used, same as for all other ELF-supporting 64bit architectures that need 32bit compat.
Signed-off-by: Al Viro --- arch/mips/Kconfig | 8 ++-- arch/mips/include/asm/elf.h | 54 ++++++++------------- arch/mips/include/asm/elfcore-compat.h | 29 +++++++++++ arch/mips/kernel/Makefile | 4 +- arch/mips/kernel/binfmt_elfn32.c | 65 ------------------------- arch/mips/kernel/binfmt_elfo32.c | 66 -------------------------- arch/mips/kernel/scall64-n64.S | 2 +- 7 files changed, 54 insertions(+), 174 deletions(-) create mode 100644 arch/mips/include/asm/elfcore-compat.h delete mode 100644 arch/mips/kernel/binfmt_elfn32.c delete mode 100644 arch/mips/kernel/binfmt_elfo32.c diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 04aecf51e376..a46423f1cabc 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -92,6 +92,7 @@ config MIPS select SET_FS select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS + select ARCH_HAS_ELFCORE_COMPAT config MIPS_FIXUP_BIGPHYS_ADDR bool @@ -3277,6 +3278,7 @@ config MIPS32_O32 select ARCH_WANT_OLD_COMPAT_IPC select COMPAT select MIPS32_COMPAT + select COMPAT_BINFMT_ELF select SYSVIPC_COMPAT if SYSVIPC help Select this option if you want to run o32 binaries. These are pure @@ -3290,6 +3292,7 @@ config MIPS32_N32 depends on 64BIT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select COMPAT + select COMPAT_BINFMT_ELF select MIPS32_COMPAT select SYSVIPC_COMPAT if SYSVIPC help @@ -3300,11 +3303,6 @@ config MIPS32_N32 If unsure, say N. 
-config BINFMT_ELF32 - bool - default y if MIPS32_O32 || MIPS32_N32 - select ELFCORE - menu "Power management options" config ARCH_HIBERNATION_POSSIBLE diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h index d29e43e4f9b1..dc8d2863752c 100644 --- a/arch/mips/include/asm/elf.h +++ b/arch/mips/include/asm/elf.h @@ -201,7 +201,6 @@ struct mips_elf_abiflags_v0 { uint32_t flags2; }; -#ifndef ELF_ARCH /* ELF register definitions */ #define ELF_NGREG 45 #define ELF_NFPREG 33 @@ -219,7 +218,7 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); /* * This is used to ensure we don't load something for the wrong architecture. */ -#define elf_check_arch elfo32_check_arch +#define elf_check_arch elf32_check_arch /* * These are used to set parameters in the core dumps. @@ -235,7 +234,8 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); /* * This is used to ensure we don't load something for the wrong architecture. */ -#define elf_check_arch elfn64_check_arch +#define elf_check_arch elf64_check_arch +#define compat_elf_check_arch elf32_check_arch /* * These are used to set parameters in the core dumps. @@ -257,8 +257,6 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); #endif #define ELF_ARCH EM_MIPS -#endif /* !defined(ELF_ARCH) */ - /* * In order to be sure that we don't attempt to execute an O32 binary which * requires 64 bit FP (FR=1) on a system which does not support it we refuse @@ -277,9 +275,9 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); #define vmcore_elf64_check_arch mips_elf_check_machine /* - * Return non-zero if HDR identifies an o32 ELF binary. + * Return non-zero if HDR identifies an o32 or n32 ELF binary. 
*/ -#define elfo32_check_arch(hdr) \ +#define elf32_check_arch(hdr) \ ({ \ int __res = 1; \ struct elfhdr *__h = (hdr); \ @@ -288,21 +286,26 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); __res = 0; \ if (__h->e_ident[EI_CLASS] != ELFCLASS32) \ __res = 0; \ - if ((__h->e_flags & EF_MIPS_ABI2) != 0) \ - __res = 0; \ - if (((__h->e_flags & EF_MIPS_ABI) != 0) && \ - ((__h->e_flags & EF_MIPS_ABI) != EF_MIPS_ABI_O32)) \ - __res = 0; \ - if (__h->e_flags & __MIPS_O32_FP64_MUST_BE_ZERO) \ - __res = 0; \ - \ + if ((__h->e_flags & EF_MIPS_ABI2) != 0) { \ + if (!IS_ENABLED(CONFIG_MIPS32_N32) || \ + (__h->e_flags & EF_MIPS_ABI)) \ + __res = 0; \ + } else { \ + if (IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_MIPS32_O32)) \ + __res = 0; \ + if (((__h->e_flags & EF_MIPS_ABI) != 0) && \ + ((__h->e_flags & EF_MIPS_ABI) != EF_MIPS_ABI_O32)) \ + __res = 0; \ + if (__h->e_flags & __MIPS_O32_FP64_MUST_BE_ZERO) \ + __res = 0; \ + } \ __res; \ }) /* * Return non-zero if HDR identifies an n64 ELF binary. */ -#define elfn64_check_arch(hdr) \ +#define elf64_check_arch(hdr) \ ({ \ int __res = 1; \ struct elfhdr *__h = (hdr); \ @@ -315,25 +318,6 @@ void mips_dump_regs64(u64 *uregs, const struct pt_regs *regs); __res; \ }) -/* - * Return non-zero if HDR identifies an n32 ELF binary. 
- */ -#define elfn32_check_arch(hdr) \ -({ \ - int __res = 1; \ - struct elfhdr *__h = (hdr); \ - \ - if (!mips_elf_check_machine(__h)) \ - __res = 0; \ - if (__h->e_ident[EI_CLASS] != ELFCLASS32) \ - __res = 0; \ - if (((__h->e_flags & EF_MIPS_ABI2) == 0) || \ - ((__h->e_flags & EF_MIPS_ABI) != 0)) \ - __res = 0; \ - \ - __res; \ -}) - struct mips_abi; extern struct mips_abi mips_abi; diff --git a/arch/mips/include/asm/elfcore-compat.h b/arch/mips/include/asm/elfcore-compat.h new file mode 100644 index 000000000000..2f0f0103c75b --- /dev/null +++ b/arch/mips/include/asm/elfcore-compat.h @@ -0,0 +1,29 @@ +#ifndef _ASM_MIPS_ELFCORE_COMPAT_H +#define _ASM_MIPS_ELFCORE_COMPAT_H + +/* + * On mips we have two 32bit ABIs - o32 and n32. The latter + * has bigger registers, so we use it for compat_elf_regset_t. + * The former uses o32_elf_prstatus and PRSTATUS_SIZE/SET_PR_FPVALID + * are used to choose the size and location of ->pr_fpvalid of + * the layout actually used. + */ +typedef elf_gregset_t compat_elf_gregset_t; + +struct o32_elf_prstatus +{ + struct compat_elf_prstatus_common common; + unsigned int pr_reg[ELF_NGREG]; + compat_int_t pr_fpvalid; +}; + +#define PRSTATUS_SIZE \ + (!test_thread_flag(TIF_32BIT_REGS) \ + ? sizeof(struct compat_elf_prstatus) \ + : sizeof(struct o32_elf_prstatus)) +#define SET_PR_FPVALID(S) \ + (*(!test_thread_flag(TIF_32BIT_REGS) \ + ? 
&(S)->pr_fpvalid \ + : &((struct o32_elf_prstatus *)(S))->pr_fpvalid) = 1) + +#endif diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 2a05b923f579..943eaeef73e9 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -80,8 +80,8 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_32BIT) += scall32-o32.o obj-$(CONFIG_64BIT) += scall64-n64.o obj-$(CONFIG_MIPS32_COMPAT) += linux32.o ptrace32.o signal32.o -obj-$(CONFIG_MIPS32_N32) += binfmt_elfn32.o scall64-n32.o signal_n32.o -obj-$(CONFIG_MIPS32_O32) += binfmt_elfo32.o scall64-o32.o signal_o32.o +obj-$(CONFIG_MIPS32_N32) += scall64-n32.o signal_n32.o +obj-$(CONFIG_MIPS32_O32) += scall64-o32.o signal_o32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_PROC_FS) += proc.o diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c deleted file mode 100644 index 573f2a177da6..000000000000 --- a/arch/mips/kernel/binfmt_elfn32.c +++ /dev/null @@ -1,65 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Support for n32 Linux/MIPS ELF binaries. - * Author: Ralf Baechle (ralf@linux-mips.org) - * - * Copyright (C) 1999, 2001 Ralf Baechle - * Copyright (C) 1999, 2001 Silicon Graphics, Inc. - * - * Heavily inspired by the 32-bit Sparc compat code which is - * Copyright (C) 1995, 1996, 1997, 1998 David S. Miller (davem@redhat.com) - * Copyright (C) 1995, 1996, 1997, 1998 Jakub Jelinek (jj@ultra.linux.cz) - */ - -#define ELF_ARCH EM_MIPS -#define ELF_CLASS ELFCLASS32 -#ifdef __MIPSEB__ -#define ELF_DATA ELFDATA2MSB; -#else /* __MIPSEL__ */ -#define ELF_DATA ELFDATA2LSB; -#endif - -/* ELF register definitions */ -#define ELF_NGREG 45 -#define ELF_NFPREG 33 - -typedef unsigned long elf_greg_t; -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef double elf_fpreg_t; -typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; - -/* - * This is used to ensure we don't load something for the wrong architecture. 
- */ -#define elf_check_arch elfn32_check_arch - -#include -#include -#include -#include -#include - -#define elf_prstatus elf_prstatus32 -#define elf_prstatus_common compat_elf_prstatus_common -struct elf_prstatus32 -{ - struct compat_elf_prstatus_common common; - elf_gregset_t pr_reg; /* GP registers */ - int pr_fpvalid; /* True if math co-processor being used. */ -}; -#define elf_prpsinfo compat_elf_prpsinfo - -#define init_elf_binfmt init_elfn32_binfmt - -#undef ns_to_kernel_old_timeval -#define ns_to_kernel_old_timeval ns_to_old_timeval32 - -/* - * Some data types as stored in coredump. - */ -#define user_long_t compat_long_t -#define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_external copy_siginfo_to_external32 - -#include "../../../fs/binfmt_elf.c" diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c deleted file mode 100644 index f5ee6b43b49c..000000000000 --- a/arch/mips/kernel/binfmt_elfo32.c +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Support for o32 Linux/MIPS ELF binaries. - * Author: Ralf Baechle (ralf@linux-mips.org) - * - * Copyright (C) 1999, 2001 Ralf Baechle - * Copyright (C) 1999, 2001 Silicon Graphics, Inc. - * - * Heavily inspired by the 32-bit Sparc compat code which is - * Copyright (C) 1995, 1996, 1997, 1998 David S. Miller (davem@redhat.com) - * Copyright (C) 1995, 1996, 1997, 1998 Jakub Jelinek (jj@ultra.linux.cz) - */ - -#define ELF_ARCH EM_MIPS -#define ELF_CLASS ELFCLASS32 -#ifdef __MIPSEB__ -#define ELF_DATA ELFDATA2MSB; -#else /* __MIPSEL__ */ -#define ELF_DATA ELFDATA2LSB; -#endif - -/* ELF register definitions */ -#define ELF_NGREG 45 -#define ELF_NFPREG 33 - -typedef unsigned int elf_greg_t; -typedef elf_greg_t elf_gregset_t[ELF_NGREG]; - -typedef double elf_fpreg_t; -typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; - -/* - * This is used to ensure we don't load something for the wrong architecture. 
- */ -#define elf_check_arch elfo32_check_arch - -#include - -#include -#include -#include -#include - -#define elf_prstatus elf_prstatus32 -#define elf_prstatus_common compat_elf_prstatus_common -struct elf_prstatus32 -{ - struct compat_elf_prstatus_common common; - elf_gregset_t pr_reg; /* GP registers */ - int pr_fpvalid; /* True if math co-processor being used. */ -}; -#define elf_prpsinfo compat_elf_prpsinfo - -#define init_elf_binfmt init_elf32_binfmt - -#undef ns_to_kernel_old_timeval -#define ns_to_kernel_old_timeval ns_to_old_timeval32 - -/* - * Some data types as stored in coredump. - */ -#define user_long_t compat_long_t -#define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_external copy_siginfo_to_external32 - -#include "../../../fs/binfmt_elf.c" diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S index 23b2e2b1609c..5e9c497ce099 100644 --- a/arch/mips/kernel/scall64-n64.S +++ b/arch/mips/kernel/scall64-n64.S @@ -20,7 +20,7 @@ #include #include -#ifndef CONFIG_BINFMT_ELF32 +#ifndef CONFIG_MIPS32_COMPAT /* Neither O32 nor N32, so define handle_sys here */ #define handle_sys64 handle_sys #endif From 41026c343540e33627e23c8a91ebb679a7c0f89c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Dec 2020 23:56:34 -0500 Subject: [PATCH 028/307] Kconfig: regularize selection of CONFIG_BINFMT_ELF with mips converted to use of fs/compat_binfmt_elf.c, there's no need to keep selects of that thing all over arch/* - we can simply turn it into def_bool y if COMPAT && BINFMT_ELF (in fs/Kconfig.binfmt) and get rid of all selects. Several architectures got those selects wrong (e.g. you could end up with sparc64 sans BINFMT_ELF, with select violating dependencies, etc.) Randy Dunlap has spotted some of those; IMO this is simpler than his fix, but it depends upon the stuff that would need to be backported, so we might end up using his variant for -stable.
Signed-off-by: Al Viro --- arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 2 -- arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/x86/Kconfig | 1 - fs/Kconfig.binfmt | 2 +- 8 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 05e17351e4f3..ed48fd42ab33 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1215,7 +1215,6 @@ config ARM64_TAGGED_ADDR_ABI menuconfig COMPAT bool "Kernel support for 32-bit EL0" depends on ARM64_4K_PAGES || EXPERT - select COMPAT_BINFMT_ELF if BINFMT_ELF select HAVE_UID16 select OLD_SIGSUSPEND3 select COMPAT_OLD_SIGACTION diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a46423f1cabc..f29ec95e3458 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3278,7 +3278,6 @@ config MIPS32_O32 select ARCH_WANT_OLD_COMPAT_IPC select COMPAT select MIPS32_COMPAT - select COMPAT_BINFMT_ELF select SYSVIPC_COMPAT if SYSVIPC help Select this option if you want to run o32 binaries. 
These are pure @@ -3292,7 +3291,6 @@ config MIPS32_N32 depends on 64BIT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select COMPAT - select COMPAT_BINFMT_ELF select MIPS32_COMPAT select SYSVIPC_COMPAT if SYSVIPC help diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 78b17621ee4a..26daf57b9df6 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -336,7 +336,6 @@ source "kernel/Kconfig.hz" config COMPAT def_bool y depends on 64BIT - select COMPAT_BINFMT_ELF if BINFMT_ELF config SYSVIPC_COMPAT def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 107bb4319e0e..d26a89cd8908 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -282,7 +282,6 @@ config COMPAT bool "Enable support for 32bit binaries" depends on PPC64 default y if !CPU_LITTLE_ENDIAN - select COMPAT_BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index e84bdd15150b..73b61fe7231e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -423,7 +423,6 @@ config 64BIT config COMPAT def_bool y prompt "Kernel support for 31 bit emulation" - select COMPAT_BINFMT_ELF if BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION select HAVE_UID16 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index c9c34dc52b7d..1a2b5649d267 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -494,7 +494,6 @@ config COMPAT bool depends on SPARC64 default y - select COMPAT_BINFMT_ELF select HAVE_UID16 select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a2182d22b5fa..6d130d1c440b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2899,7 +2899,6 @@ config COMPAT_32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 - select COMPAT_BINFMT_ELF if BINFMT_ELF if COMPAT config COMPAT_FOR_U64_ALIGNMENT diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 885da6d983b4..b32f5df68ae9 100644 --- a/fs/Kconfig.binfmt 
+++ b/fs/Kconfig.binfmt @@ -29,7 +29,7 @@ config BINFMT_ELF latest version). config COMPAT_BINFMT_ELF - bool + def_bool y depends on COMPAT && BINFMT_ELF select ELFCORE From 492ed38192fccb92022b7a6d3b2751a09a3494c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Dec 2020 14:03:08 -0500 Subject: [PATCH 029/307] compat_binfmt_elf: don't bother with undef of ELF_ARCH It's not used anywhere downstream (and never had been, AFAICS). Theoretically, fs/binfmt_elf.c does use it, but only in the non-regset coredump handling and all biarch architectures end up with that ifdefed out. Signed-off-by: Al Viro --- fs/compat_binfmt_elf.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index a6321415aba0..e40bdbdc094f 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -62,7 +62,6 @@ * differ from the native ones, or omitted when they match. */ -#undef ELF_ARCH #undef elf_check_arch #define elf_check_arch compat_elf_check_arch From e565d89e4aa07e3f20ac5e8757b1da24b5878e69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 4 Dec 2020 14:06:04 -0500 Subject: [PATCH 030/307] get rid of COMPAT_ELF_EXEC_PAGESIZE not defined by any architecture (and never had been) Signed-off-by: Al Viro --- fs/compat_binfmt_elf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index e40bdbdc094f..95e72d271b95 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -90,11 +90,6 @@ #define ELF_ET_DYN_BASE COMPAT_ELF_ET_DYN_BASE #endif -#ifdef COMPAT_ELF_EXEC_PAGESIZE -#undef ELF_EXEC_PAGESIZE -#define ELF_EXEC_PAGESIZE COMPAT_ELF_EXEC_PAGESIZE -#endif - #ifdef COMPAT_ELF_PLAT_INIT #undef ELF_PLAT_INIT #define ELF_PLAT_INIT COMPAT_ELF_PLAT_INIT From 2185c23071e2c1f26fbccb323aa831732540cfcc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Jan 2021 11:36:35 +0300 Subject: [PATCH 031/307] powercap/drivers/dtpm: Fix a double shift bug The DTPM_POWER_LIMIT_FLAG is used 
for test_bit() etc which take a bit number so it should be bit 0. But currently it's set to BIT(0) then that is double shifted equivalent to BIT(BIT(0)). This doesn't cause a run time problem because it's done consistently. Fixes: a20d0ef97abf ("powercap/drivers/dtpm: Add API for dynamic thermal power management") Signed-off-by: Dan Carpenter Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/powercap/dtpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 0abcc439d728..d49df0569cd4 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -24,7 +24,7 @@ #include #include -#define DTPM_POWER_LIMIT_FLAG BIT(0) +#define DTPM_POWER_LIMIT_FLAG 0 static const char *constraint_name[] = { "Instantaneous", From 0fe1329b7b518f67c8f1760711eb0eaf90433fd3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Jan 2021 11:41:09 +0300 Subject: [PATCH 032/307] powercap/drivers/dtpm: Fix some missing unlock bugs We need to unlock on these paths before returning. Fixes: a20d0ef97abf ("powercap/drivers/dtpm: Add API for dynamic thermal power management") Signed-off-by: Dan Carpenter Acked-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/dtpm.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index d49df0569cd4..470a1182b868 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -147,13 +147,17 @@ static void __dtpm_add_power(struct dtpm *dtpm) */ int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max) { + int ret = 0; + mutex_lock(&dtpm_lock); if (power_min == dtpm->power_min && power_max == dtpm->power_max) - return 0; + goto unlock; - if (power_max < power_min) - return -EINVAL; + if (power_max < power_min) { + ret = -EINVAL; + goto unlock; + } __dtpm_sub_power(dtpm); @@ -164,9 +168,10 @@ int dtpm_update_power(struct dtpm *dtpm, u64 power_min, u64 power_max) __dtpm_add_power(dtpm); +unlock: mutex_unlock(&dtpm_lock); - return 0; + return ret; } /** @@ -187,8 +192,10 @@ int dtpm_release_zone(struct powercap_zone *pcz) mutex_lock(&dtpm_lock); - if (!list_empty(&dtpm->children)) + if (!list_empty(&dtpm->children)) { + mutex_unlock(&dtpm_lock); return -EBUSY; + } if (parent) list_del(&dtpm->sibling); From f8f706ad75abbc65fee365853e7b24731223fd6d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 6 Jan 2021 12:03:08 +0300 Subject: [PATCH 033/307] powercap/drivers/dtpm: Fix an IS_ERR() vs NULL check The powercap_register_control_type() function never returns NULL, it returns error pointers on error so update this check. Fixes: a20d0ef97abf ("powercap/drivers/dtpm: Add API for dynamic thermal power management") Signed-off-by: Dan Carpenter Acked-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/dtpm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/powercap/dtpm.c b/drivers/powercap/dtpm.c index 470a1182b868..5a51cd34a7e8 100644 --- a/drivers/powercap/dtpm.c +++ b/drivers/powercap/dtpm.c @@ -467,9 +467,9 @@ static int __init dtpm_init(void) struct dtpm_descr **dtpm_descr; pct = powercap_register_control_type(NULL, "dtpm", NULL); - if (!pct) { + if (IS_ERR(pct)) { pr_err("Failed to register control type\n"); - return -EINVAL; + return PTR_ERR(pct); } for_each_dtpm_table(dtpm_descr) From 66e713fbbbc6c259559d4937a3b016d36ab529ff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 4 Jan 2021 12:10:53 +0000 Subject: [PATCH 034/307] powercap/drivers/dtpm: Fix size of object being allocated The kzalloc allocation for dtpm_cpu is currently allocating the size of the pointer and not the size of the structure. Fix this by using the correct sizeof argument. Addresses-Coverity: ("Wrong sizeof argument") Fixes: 0e8f68d7f048 ("powercap/drivers/dtpm: Add CPU energy model based support") Signed-off-by: Colin Ian King Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/powercap/dtpm_cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 6933c783c6b4..51c366938acd 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -200,7 +200,7 @@ static int cpuhp_dtpm_cpu_online(unsigned int cpu) if (!dtpm) return -EINVAL; - dtpm_cpu = kzalloc(sizeof(dtpm_cpu), GFP_KERNEL); + dtpm_cpu = kzalloc(sizeof(*dtpm_cpu), GFP_KERNEL); if (!dtpm_cpu) goto out_kfree_dtpm; From 9dd04ec6bc6fa7b310e5595f2ad9bef13eacd3a0 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 7 Jan 2021 19:42:15 +0100 Subject: [PATCH 035/307] cpufreq: intel_pstate: Always read hwp_cap_cached with READ_ONCE() Because intel_pstate_get_hwp_max() which updates hwp_cap_cached may run in parallel with the readers of it, annotate all of the read accesses to it with READ_ONCE(). Signed-off-by: Rafael J. Wysocki Tested-by: Chen Yu --- drivers/cpufreq/intel_pstate.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index be05e038d956..74bf54e6c993 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -914,7 +914,7 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu) } value &= ~GENMASK_ULL(31, 0); - min_perf = HWP_LOWEST_PERF(cpu->hwp_cap_cached); + min_perf = HWP_LOWEST_PERF(READ_ONCE(cpu->hwp_cap_cached)); /* Set hwp_max = hwp_min */ value |= HWP_MAX_PERF(min_perf); @@ -1750,6 +1750,7 @@ static int hwp_boost_hold_time_ns = 3 * NSEC_PER_MSEC; static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu) { u64 hwp_req = READ_ONCE(cpu->hwp_req_cached); + u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached); u32 max_limit = (hwp_req & 0xff00) >> 8; u32 min_limit = (hwp_req & 0xff); u32 boost_level1; @@ -1776,14 +1777,14 @@ static inline void intel_pstate_hwp_boost_up(struct cpudata *cpu) cpu->hwp_boost_min = min_limit; /* level at half way mark between min and guranteed */ - boost_level1 = (HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) + min_limit) >> 1; + boost_level1 = (HWP_GUARANTEED_PERF(hwp_cap) + min_limit) >> 1; if (cpu->hwp_boost_min < boost_level1) cpu->hwp_boost_min = boost_level1; - else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) - cpu->hwp_boost_min = HWP_GUARANTEED_PERF(cpu->hwp_cap_cached); - else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(cpu->hwp_cap_cached) && - max_limit != HWP_GUARANTEED_PERF(cpu->hwp_cap_cached)) + else if (cpu->hwp_boost_min < HWP_GUARANTEED_PERF(hwp_cap)) + 
cpu->hwp_boost_min = HWP_GUARANTEED_PERF(hwp_cap); + else if (cpu->hwp_boost_min == HWP_GUARANTEED_PERF(hwp_cap) && + max_limit != HWP_GUARANTEED_PERF(hwp_cap)) cpu->hwp_boost_min = max_limit; else return; From a45ee4d4e13b0e35a8ec7ea0bf9267243d57b302 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Jan 2021 19:43:30 +0100 Subject: [PATCH 036/307] cpufreq: intel_pstate: Change intel_pstate_get_hwp_max() argument All of the callers of intel_pstate_get_hwp_max() access the struct cpudata object that corresponds to the given CPU already and the function itself needs to access that object (in order to update hwp_cap_cached), so modify the code to pass a struct cpudata pointer to it instead of the CPU number. Signed-off-by: Rafael J. Wysocki Tested-by: Chen Yu --- drivers/cpufreq/intel_pstate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 74bf54e6c993..3eb63daf2523 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -819,13 +819,13 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { NULL, }; -static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max, +static void intel_pstate_get_hwp_max(struct cpudata *cpu, int *phy_max, int *current_max) { u64 cap; - rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); - WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap); + rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap); + WRITE_ONCE(cpu->hwp_cap_cached, cap); if (global.no_turbo || global.turbo_disabled) *current_max = HWP_GUARANTEED_PERF(cap); else @@ -1213,7 +1213,7 @@ static void update_qos_request(enum freq_qos_req_type type) continue; if (hwp_active) - intel_pstate_get_hwp_max(i, &turbo_max, &max_state); + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); else turbo_max = cpu->pstate.turbo_pstate; @@ -1723,7 +1723,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (hwp_active && !hwp_mode_bdw) 
{ unsigned int phy_max, current_max; - intel_pstate_get_hwp_max(cpu->cpu, &phy_max, &current_max); + intel_pstate_get_hwp_max(cpu, &phy_max, &current_max); cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; cpu->pstate.turbo_pstate = phy_max; } else { @@ -2208,7 +2208,7 @@ static void intel_pstate_update_perf_limits(struct cpudata *cpu, * rather than pure ratios. */ if (hwp_active) { - intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state); + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); } else { max_state = global.no_turbo || global.turbo_disabled ? cpu->pstate.max_pstate : cpu->pstate.turbo_pstate; @@ -2323,7 +2323,7 @@ static void intel_pstate_verify_cpu_policy(struct cpudata *cpu, if (hwp_active) { int max_state, turbo_max; - intel_pstate_get_hwp_max(cpu->cpu, &turbo_max, &max_state); + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); max_freq = max_state * cpu->pstate.scaling; } else { max_freq = intel_pstate_get_max_freq(cpu); @@ -2710,7 +2710,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) if (hwp_active) { u64 value; - intel_pstate_get_hwp_max(policy->cpu, &turbo_max, &max_state); + intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state); policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP; rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value); WRITE_ONCE(cpu->hwp_req_cached, value); From 597ffbc8d085870e071807b514a6ed45809f81a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Jan 2021 19:44:18 +0100 Subject: [PATCH 037/307] cpufreq: intel_pstate: Rename two functions Rename intel_cpufreq_adjust_hwp() and intel_cpufreq_adjust_perf_ctl() to intel_cpufreq_hwp_update() and intel_cpufreq_perf_ctl_update(), respectively, to avoid possible confusion with the ->adjust_perf() callback function, intel_cpufreq_adjust_perf(). Signed-off-by: Rafael J. 
Wysocki Tested-by: Chen Yu --- drivers/cpufreq/intel_pstate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3eb63daf2523..86873f4c6a72 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2527,7 +2527,7 @@ static void intel_cpufreq_trace(struct cpudata *cpu, unsigned int trace_type, in fp_toint(cpu->iowait_boost * 100)); } -static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 min, u32 max, +static void intel_cpufreq_hwp_update(struct cpudata *cpu, u32 min, u32 max, u32 desired, bool fast_switch) { u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev; @@ -2551,7 +2551,7 @@ static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 min, u32 max, wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value); } -static void intel_cpufreq_adjust_perf_ctl(struct cpudata *cpu, +static void intel_cpufreq_perf_ctl_update(struct cpudata *cpu, u32 target_pstate, bool fast_switch) { if (fast_switch) @@ -2573,10 +2573,10 @@ static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, int max_pstate = policy->strict_target ? 
target_pstate : cpu->max_perf_ratio; - intel_cpufreq_adjust_hwp(cpu, target_pstate, max_pstate, 0, + intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0, fast_switch); } else if (target_pstate != old_pstate) { - intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, fast_switch); + intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch); } cpu->pstate.current_pstate = target_pstate; @@ -2674,7 +2674,7 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum, target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate); - intel_cpufreq_adjust_hwp(cpu, min_pstate, max_pstate, target_pstate, true); + intel_cpufreq_hwp_update(cpu, min_pstate, max_pstate, target_pstate, true); cpu->pstate.current_pstate = target_pstate; intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate); From 6f67e060083a84a4cc364eab6ae40c717165fb0c Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 12 Jan 2021 13:21:27 +0800 Subject: [PATCH 038/307] cpufreq: intel_pstate: Get per-CPU max freq via MSR_HWP_CAPABILITIES if available Currently, when turbo is disabled (either by BIOS or by the user), the intel_pstate driver reads the max non-turbo frequency from the package-wide MSR_PLATFORM_INFO(0xce) register. However, on asymmetric platforms it is possible in theory that small and big core with HWP enabled might have different max non-turbo CPU frequency, because MSR_HWP_CAPABILITIES is per-CPU scope according to Intel Software Developer Manual. The turbo max freq is already per-CPU in current code, so make similar change to the max non-turbo frequency as well. Reported-by: Wendy Wang Signed-off-by: Chen Yu [ rjw: Subject and changelog edits ] Cc: 4.18+ # 4.18+: a45ee4d4e13b: cpufreq: intel_pstate: Change intel_pstate_get_hwp_max() argument Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 86873f4c6a72..6f2ff2775664 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1714,11 +1714,9 @@ static void intel_pstate_max_within_limits(struct cpudata *cpu) static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { cpu->pstate.min_pstate = pstate_funcs.get_min(); - cpu->pstate.max_pstate = pstate_funcs.get_max(); cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); - cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; if (hwp_active && !hwp_mode_bdw) { unsigned int phy_max, current_max; @@ -1726,9 +1724,12 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) intel_pstate_get_hwp_max(cpu, &phy_max, &current_max); cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; cpu->pstate.turbo_pstate = phy_max; + cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(READ_ONCE(cpu->hwp_cap_cached)); } else { cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + cpu->pstate.max_pstate = pstate_funcs.get_max(); } + cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; if (pstate_funcs.get_aperf_mperf_shift) cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift(); From 763ec5daaea835e5604d08364c9081e7304b7c2b Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 02:18:25 +0300 Subject: [PATCH 039/307] cpufreq: tegra20: Use resource-managed API Switch cpufreq-tegra20 driver to use resource-managed API. This removes the need to get opp_table pointer using dev_pm_opp_get_opp_table() in order to release OPP table that was requested by dev_pm_opp_set_supported_hw(), making the code a bit more straightforward. 
Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra20-cpufreq.c | 45 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/cpufreq/tegra20-cpufreq.c b/drivers/cpufreq/tegra20-cpufreq.c index 8c893043953e..e8db3d75be25 100644 --- a/drivers/cpufreq/tegra20-cpufreq.c +++ b/drivers/cpufreq/tegra20-cpufreq.c @@ -32,6 +32,16 @@ static bool cpu0_node_has_opp_v2_prop(void) return ret; } +static void tegra20_cpufreq_put_supported_hw(void *opp_table) +{ + dev_pm_opp_put_supported_hw(opp_table); +} + +static void tegra20_cpufreq_dt_unregister(void *cpufreq_dt) +{ + platform_device_unregister(cpufreq_dt); +} + static int tegra20_cpufreq_probe(struct platform_device *pdev) { struct platform_device *cpufreq_dt; @@ -68,42 +78,31 @@ static int tegra20_cpufreq_probe(struct platform_device *pdev) return err; } + err = devm_add_action_or_reset(&pdev->dev, + tegra20_cpufreq_put_supported_hw, + opp_table); + if (err) + return err; + cpufreq_dt = platform_device_register_simple("cpufreq-dt", -1, NULL, 0); err = PTR_ERR_OR_ZERO(cpufreq_dt); if (err) { dev_err(&pdev->dev, "failed to create cpufreq-dt device: %d\n", err); - goto err_put_supported_hw; + return err; } - platform_set_drvdata(pdev, cpufreq_dt); - - return 0; - -err_put_supported_hw: - dev_pm_opp_put_supported_hw(opp_table); - - return err; -} - -static int tegra20_cpufreq_remove(struct platform_device *pdev) -{ - struct platform_device *cpufreq_dt; - struct opp_table *opp_table; - - cpufreq_dt = platform_get_drvdata(pdev); - platform_device_unregister(cpufreq_dt); - - opp_table = dev_pm_opp_get_opp_table(get_cpu_device(0)); - dev_pm_opp_put_supported_hw(opp_table); - dev_pm_opp_put_opp_table(opp_table); + err = devm_add_action_or_reset(&pdev->dev, + tegra20_cpufreq_dt_unregister, + cpufreq_dt); + if (err) + return err; return 0; } static struct platform_driver tegra20_cpufreq_driver = { .probe = tegra20_cpufreq_probe, - .remove = 
tegra20_cpufreq_remove, .driver = { .name = "tegra20-cpufreq", }, From 266991721c15f9feb5c4b839cb1bdde4a2b20030 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Wed, 13 Jan 2021 14:52:41 +0800 Subject: [PATCH 040/307] cpufreq: qcom-hw: enable boost support At least on sdm850, the 2956800 khz is detected as a boost frequency in function qcom_cpufreq_hw_read_lut(). Let's enable boost support by calling cpufreq_enable_boost_support(), so that we can get the boost frequency by switching it on via 'boost' sysfs entry like below. $ echo 1 > /sys/devices/system/cpu/cpufreq/boost Signed-off-by: Shawn Guo Tested-by: Steev Klimaszewski Reviewed-by: Bjorn Andersson Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 9ed5341dc515..acc645b85e79 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -347,6 +347,12 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) dev_pm_opp_of_register_em(cpu_dev, policy->cpus); + if (policy_has_boost_freq(policy)) { + ret = cpufreq_enable_boost_support(); + if (ret) + dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); + } + return 0; error: devm_iounmap(dev, base); From 05f456286fd489558c72a4711d22a5612c965685 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 17 Jan 2021 15:26:35 +0100 Subject: [PATCH 041/307] cpufreq: brcmstb-avs-cpufreq: Free resources in error path If 'cpufreq_register_driver()' fails, we must release the resources allocated in 'brcm_avs_prepare_init()' as already done in the remove function. To do that, introduce a new function 'brcm_avs_prepare_uninit()' in order to avoid code duplication. This also makes the code more readable (IMHO). 
Fixes: de322e085995 ("cpufreq: brcmstb-avs-cpufreq: AVS CPUfreq driver for Broadcom STB SoCs") Signed-off-by: Christophe JAILLET [ Viresh: Updated Subject ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/brcmstb-avs-cpufreq.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 3e31e5d28b79..e25ccb744187 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -597,6 +597,16 @@ unmap_base: return ret; } +static void brcm_avs_prepare_uninit(struct platform_device *pdev) +{ + struct private_data *priv; + + priv = platform_get_drvdata(pdev); + + iounmap(priv->avs_intr_base); + iounmap(priv->base); +} + static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; @@ -732,21 +742,22 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev) brcm_avs_driver.driver_data = pdev; - return cpufreq_register_driver(&brcm_avs_driver); + ret = cpufreq_register_driver(&brcm_avs_driver); + if (ret) + brcm_avs_prepare_uninit(pdev); + + return ret; } static int brcm_avs_cpufreq_remove(struct platform_device *pdev) { - struct private_data *priv; int ret; ret = cpufreq_unregister_driver(&brcm_avs_driver); if (ret) return ret; - priv = platform_get_drvdata(pdev); - iounmap(priv->base); - iounmap(priv->avs_intr_base); + brcm_avs_prepare_uninit(pdev); return 0; } From 3657f729b6fb5f2c0bf693742de2dcd49c572aa1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 17 Jan 2021 15:26:44 +0100 Subject: [PATCH 042/307] cpufreq: brcmstb-avs-cpufreq: Fix resource leaks in ->remove() If 'cpufreq_unregister_driver()' fails, just WARN and continue, so that other resources are freed. 
Fixes: de322e085995 ("cpufreq: brcmstb-avs-cpufreq: AVS CPUfreq driver for Broadcom STB SoCs") Signed-off-by: Christophe JAILLET [ Viresh: Updated Subject ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/brcmstb-avs-cpufreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index e25ccb744187..4153150e20db 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -754,8 +754,7 @@ static int brcm_avs_cpufreq_remove(struct platform_device *pdev) int ret; ret = cpufreq_unregister_driver(&brcm_avs_driver); - if (ret) - return ret; + WARN_ON(ret); brcm_avs_prepare_uninit(pdev); From 3dfaea3811f8b6a89a347e8da9ab862cdf3e30fe Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Fri, 15 Jan 2021 10:48:18 -0800 Subject: [PATCH 043/307] ACPICA: Fix exception code class checks ACPICA commit 1a3a549286ea9db07d7ec700e7a70dd8bcc4354e The macros to classify different AML exception codes are broken. For instance, ACPI_ENV_EXCEPTION(Status) will always evaluate to zero due to #define AE_CODE_ENVIRONMENTAL 0x0000 #define ACPI_ENV_EXCEPTION(Status) (Status & AE_CODE_ENVIRONMENTAL) Similarly, ACPI_AML_EXCEPTION(Status) will evaluate to a non-zero value for error codes of type AE_CODE_PROGRAMMER, AE_CODE_ACPI_TABLES, as well as AE_CODE_AML, and not just AE_CODE_AML as the name suggests. This commit fixes those checks. Fixes: d46b6537f0ce ("ACPICA: AML Parser: ignore all exceptions resulting from incorrect AML during table load") Link: https://github.com/acpica/acpica/commit/1a3a5492 Signed-off-by: Maximilian Luz Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acexcep.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 2fc624a61769..f8a4afb0279a 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -59,11 +59,11 @@ struct acpi_exception_info { #define AE_OK (acpi_status) 0x0000 -#define ACPI_ENV_EXCEPTION(status) (status & AE_CODE_ENVIRONMENTAL) -#define ACPI_AML_EXCEPTION(status) (status & AE_CODE_AML) -#define ACPI_PROG_EXCEPTION(status) (status & AE_CODE_PROGRAMMER) -#define ACPI_TABLE_EXCEPTION(status) (status & AE_CODE_ACPI_TABLES) -#define ACPI_CNTL_EXCEPTION(status) (status & AE_CODE_CONTROL) +#define ACPI_ENV_EXCEPTION(status) (((status) & AE_CODE_MASK) == AE_CODE_ENVIRONMENTAL) +#define ACPI_AML_EXCEPTION(status) (((status) & AE_CODE_MASK) == AE_CODE_AML) +#define ACPI_PROG_EXCEPTION(status) (((status) & AE_CODE_MASK) == AE_CODE_PROGRAMMER) +#define ACPI_TABLE_EXCEPTION(status) (((status) & AE_CODE_MASK) == AE_CODE_ACPI_TABLES) +#define ACPI_CNTL_EXCEPTION(status) (((status) & AE_CODE_MASK) == AE_CODE_CONTROL) /* * Environmental exceptions From 25d866c46c1d58083fa783703484f8473d61db54 Mon Sep 17 00:00:00 2001 From: Maximilian Luz Date: Fri, 15 Jan 2021 10:48:19 -0800 Subject: [PATCH 044/307] ACPICA: Clean up exception code class checks ACPICA commit 5a8390fbd4c5c60da0b6d4ba53b5ee34fda9a0cb With the exception code class check macros fixed in the previous commit, let us now use those to simplify exception class checks across ACPICA. Link: https://github.com/acpica/acpica/commit/5a8390fb Signed-off-by: Maximilian Luz Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/dbobject.c | 2 +- drivers/acpi/acpica/dsdebug.c | 2 +- drivers/acpi/acpica/psloop.c | 3 +-- drivers/acpi/acpica/psparse.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/dbobject.c b/drivers/acpi/acpica/dbobject.c index 4b4c530a0654..95ab91b35f29 100644 --- a/drivers/acpi/acpica/dbobject.c +++ b/drivers/acpi/acpica/dbobject.c @@ -47,7 +47,7 @@ acpi_db_dump_method_info(acpi_status status, struct acpi_walk_state *walk_state) /* Ignore control codes, they are not errors */ - if ((status & AE_CODE_MASK) == AE_CODE_CONTROL) { + if (ACPI_CNTL_EXCEPTION(status)) { return; } diff --git a/drivers/acpi/acpica/dsdebug.c b/drivers/acpi/acpica/dsdebug.c index 63bc5f19fb82..2c22e3eff535 100644 --- a/drivers/acpi/acpica/dsdebug.c +++ b/drivers/acpi/acpica/dsdebug.c @@ -100,7 +100,7 @@ acpi_ds_dump_method_stack(acpi_status status, /* Ignore control codes, they are not errors */ - if ((status & AE_CODE_MASK) == AE_CODE_CONTROL) { + if (ACPI_CNTL_EXCEPTION(status)) { return_VOID; } diff --git a/drivers/acpi/acpica/psloop.c b/drivers/acpi/acpica/psloop.c index 3cf0687b9915..1ba17cf16c41 100644 --- a/drivers/acpi/acpica/psloop.c +++ b/drivers/acpi/acpica/psloop.c @@ -264,8 +264,7 @@ acpi_status acpi_ps_parse_loop(struct acpi_walk_state *walk_state) ACPI_TO_POINTER (TRUE)); if (ACPI_FAILURE(status) - && ((status & AE_CODE_MASK) != - AE_CODE_CONTROL)) { + && !ACPI_CNTL_EXCEPTION(status)) { if (status == AE_AML_NO_RETURN_VALUE) { ACPI_EXCEPTION((AE_INFO, status, "Invoked method did not return a value")); diff --git a/drivers/acpi/acpica/psparse.c b/drivers/acpi/acpica/psparse.c index bd3caf735be3..06490a137982 100644 --- a/drivers/acpi/acpica/psparse.c +++ b/drivers/acpi/acpica/psparse.c @@ -383,7 +383,7 @@ acpi_ps_next_parse_state(struct acpi_walk_state *walk_state, default: status = callback_status; - if ((callback_status & AE_CODE_MASK) == AE_CODE_CONTROL) { + if (ACPI_CNTL_EXCEPTION(callback_status)) { status = 
AE_OK; } break; From 7114ebffd330bfc5a95b9832a70b6bd857d26fd8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jan 2021 14:16:44 +0100 Subject: [PATCH 045/307] cpufreq: remove tango driver The tango platform is getting removed, so the driver is no longer needed. Cc: Marc Gonzalez Cc: Mans Rullgard Signed-off-by: Arnd Bergmann [ Viresh: Update cpufreq-dt-platdev.c as well ] Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 5 ---- drivers/cpufreq/Makefile | 1 - drivers/cpufreq/cpufreq-dt-platdev.c | 2 -- drivers/cpufreq/tango-cpufreq.c | 38 ---------------------------- 4 files changed, 46 deletions(-) delete mode 100644 drivers/cpufreq/tango-cpufreq.c diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 1f73fa75b1a0..e65e0a43be64 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -289,11 +289,6 @@ config ARM_STI_CPUFREQ this config option if you wish to add CPUFreq support for STi based SoCs. -config ARM_TANGO_CPUFREQ - bool - depends on CPUFREQ_DT && ARCH_TANGO - default y - config ARM_TEGRA20_CPUFREQ tristate "Tegra20/30 CPUFreq support" depends on ARCH_TEGRA && CPUFREQ_DT diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index f1b7e3dd6e5d..1ab9b1536304 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -79,7 +79,6 @@ obj-$(CONFIG_ARM_SCPI_CPUFREQ) += scpi-cpufreq.o obj-$(CONFIG_ARM_SPEAR_CPUFREQ) += spear-cpufreq.o obj-$(CONFIG_ARM_STI_CPUFREQ) += sti-cpufreq.o obj-$(CONFIG_ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM) += sun50i-cpufreq-nvmem.o -obj-$(CONFIG_ARM_TANGO_CPUFREQ) += tango-cpufreq.o obj-$(CONFIG_ARM_TEGRA20_CPUFREQ) += tegra20-cpufreq.o obj-$(CONFIG_ARM_TEGRA124_CPUFREQ) += tegra124-cpufreq.o obj-$(CONFIG_ARM_TEGRA186_CPUFREQ) += tegra186-cpufreq.o diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index bd2db0188cbb..3ba2f716fe97 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ 
b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -141,8 +141,6 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "st,stih410", }, { .compatible = "st,stih418", }, - { .compatible = "sigma,tango4", }, - { .compatible = "ti,am33xx", }, { .compatible = "ti,am43", }, { .compatible = "ti,dra7", }, diff --git a/drivers/cpufreq/tango-cpufreq.c b/drivers/cpufreq/tango-cpufreq.c deleted file mode 100644 index 89a7f860bfe8..000000000000 --- a/drivers/cpufreq/tango-cpufreq.c +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include -#include -#include - -static const struct of_device_id machines[] __initconst = { - { .compatible = "sigma,tango4" }, - { /* sentinel */ } -}; - -static int __init tango_cpufreq_init(void) -{ - struct device *cpu_dev = get_cpu_device(0); - unsigned long max_freq; - struct clk *cpu_clk; - void *res; - - if (!of_match_node(machines, of_root)) - return -ENODEV; - - cpu_clk = clk_get(cpu_dev, NULL); - if (IS_ERR(cpu_clk)) - return -ENODEV; - - max_freq = clk_get_rate(cpu_clk); - - dev_pm_opp_add(cpu_dev, max_freq / 1, 0); - dev_pm_opp_add(cpu_dev, max_freq / 2, 0); - dev_pm_opp_add(cpu_dev, max_freq / 3, 0); - dev_pm_opp_add(cpu_dev, max_freq / 5, 0); - dev_pm_opp_add(cpu_dev, max_freq / 9, 0); - - res = platform_device_register_data(NULL, "cpufreq-dt", -1, NULL, 0); - - return PTR_ERR_OR_ZERO(res); -} -device_initcall(tango_cpufreq_init); From c1a7c2ce7c37a9c51228848647b9908b1cb532d1 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 21 Jan 2021 16:23:51 -0800 Subject: [PATCH 046/307] ACPICA: fix -Wfallthrough ACPICA commit 4b9135f5774caa796ddf826448811e8e7f08ef2f GCC 7.1 gained -Wimplicit-fallthrough to warn on implicit fallthrough, as well as __attribute__((__fallthrough__)) and comments to explicitly denote that cases of fallthrough were intentional. Clang also supports this warning and statement attribute, but not the comment form. Robert Moore provides additional context about the lint comments being removed. 
They were for "an old version of PC-Lint, which we don't use anymore." Drop those. This will help us enable -Wimplicit-fallthrough throughout the Linux kernel. Suggested-by: Robert Moore Reported-by: Jon Hunter Link: https://github.com/acpica/acpica/commit/4b9135f5 Signed-off-by: Nick Desaulniers Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/dscontrol.c | 2 +- drivers/acpi/acpica/dswexec.c | 3 +-- drivers/acpi/acpica/dswload.c | 2 +- drivers/acpi/acpica/dswload2.c | 2 +- drivers/acpi/acpica/exfldio.c | 2 +- drivers/acpi/acpica/exresop.c | 4 ++-- drivers/acpi/acpica/exstore.c | 4 ++-- drivers/acpi/acpica/hwgpe.c | 2 +- drivers/acpi/acpica/utdelete.c | 2 +- include/acpi/actypes.h | 6 ++++++ include/acpi/platform/acgcc.h | 15 +++++++++++++++ 11 files changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/acpica/dscontrol.c b/drivers/acpi/acpica/dscontrol.c index 4b5b6e859f62..b58ffc7acdb9 100644 --- a/drivers/acpi/acpica/dscontrol.c +++ b/drivers/acpi/acpica/dscontrol.c @@ -62,7 +62,7 @@ acpi_ds_exec_begin_control_op(struct acpi_walk_state *walk_state, } } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case AML_IF_OP: /* diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c index 1d4f8c81028c..4a9799246fae 100644 --- a/drivers/acpi/acpica/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -598,8 +598,7 @@ acpi_status acpi_ds_exec_end_op(struct acpi_walk_state *walk_state) break; } - /* Fall through */ - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case AML_INT_EVAL_SUBTREE_OP: diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c index 27069325b6de..dd97c86f8e41 100644 --- a/drivers/acpi/acpica/dswload.c +++ b/drivers/acpi/acpica/dswload.c @@ -224,7 +224,7 @@ acpi_ds_load1_begin_op(struct acpi_walk_state *walk_state, break; } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; default: diff --git a/drivers/acpi/acpica/dswload2.c b/drivers/acpi/acpica/dswload2.c 
index edadbe146506..d9a3dfca7555 100644 --- a/drivers/acpi/acpica/dswload2.c +++ b/drivers/acpi/acpica/dswload2.c @@ -214,7 +214,7 @@ acpi_ds_load2_begin_op(struct acpi_walk_state *walk_state, break; } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; default: diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index ade35ff1c7ba..cde24e0fa6a8 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -434,7 +434,7 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, * region_field case and write the datum to the Operation Region */ - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case ACPI_TYPE_LOCAL_REGION_FIELD: /* diff --git a/drivers/acpi/acpica/exresop.c b/drivers/acpi/acpica/exresop.c index 4d1b22971d58..4a0f8b8bfe62 100644 --- a/drivers/acpi/acpica/exresop.c +++ b/drivers/acpi/acpica/exresop.c @@ -198,7 +198,7 @@ acpi_ex_resolve_operands(u16 opcode, target_op = AML_DEBUG_OP; - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case ACPI_REFCLASS_ARG: case ACPI_REFCLASS_LOCAL: @@ -264,7 +264,7 @@ acpi_ex_resolve_operands(u16 opcode, * Else not a string - fall through to the normal Reference * case below */ - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case ARGI_REFERENCE: /* References: */ case ARGI_INTEGER_REF: diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index 3adc0a29d890..8fe33051275d 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -96,7 +96,7 @@ acpi_ex_store(union acpi_operand_object *source_desc, return_ACPI_STATUS(AE_OK); } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; default: @@ -422,7 +422,7 @@ acpi_ex_store_object_to_node(union acpi_operand_object *source_desc, break; } - /* Fallthrough */ + ACPI_FALLTHROUGH; case ACPI_TYPE_DEVICE: case ACPI_TYPE_EVENT: diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index b13a4ed5bc63..0c84300e915c 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -167,7 +167,7 
@@ acpi_hw_low_set_gpe(struct acpi_gpe_event_info *gpe_event_info, u32 action) return (AE_BAD_PARAMETER); } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case ACPI_GPE_ENABLE: diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c index 4c0d4e434196..624a26794d55 100644 --- a/drivers/acpi/acpica/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -112,7 +112,7 @@ static void acpi_ut_delete_internal_obj(union acpi_operand_object *object) gpe_block); } - /*lint -fallthrough */ + ACPI_FALLTHROUGH; case ACPI_TYPE_PROCESSOR: case ACPI_TYPE_THERMAL: diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 647cb11d0a0a..2a32593691bc 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -1286,4 +1286,10 @@ typedef enum { #define ACPI_OPT_END -1 +/* Definitions for explicit fallthrough */ + +#ifndef ACPI_FALLTHROUGH +#define ACPI_FALLTHROUGH do {} while(0) +#endif + #endif /* __ACTYPES_H__ */ diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 7d63d03cf507..91f7a02c798a 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -54,4 +54,19 @@ typedef __builtin_va_list va_list; #define ACPI_USE_NATIVE_MATH64 +/* GCC did not support __has_attribute until 5.1. */ + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +/* + * Explictly mark intentional explicit fallthrough to silence + * -Wimplicit-fallthrough in GCC 7.1+. + */ + +#if __has_attribute(__fallthrough__) +#define ACPI_FALLTHROUGH __attribute__((__fallthrough__)) +#endif + #endif /* __ACGCC_H__ */ From c01df543c3a24a84e89b015827cf55cb0e613fa3 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 15 Jan 2021 10:48:21 -0800 Subject: [PATCH 047/307] ACPICA: add type casts for string functions Detected by gcc 10.2.0. ACPICA commit 608559800e1ad48b819744aeb1866d94335e2655 Link: https://github.com/acpica/acpica/commit/60855980 Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpica/dbinput.c | 4 ++-- drivers/acpi/acpica/utstrsuppt.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/acpica/dbinput.c b/drivers/acpi/acpica/dbinput.c index 2952856b8a67..b8a48923064f 100644 --- a/drivers/acpi/acpica/dbinput.c +++ b/drivers/acpi/acpica/dbinput.c @@ -473,7 +473,7 @@ char *acpi_db_get_next_token(char *string, /* Remove any spaces at the beginning, ignore blank lines */ - while (*string && isspace(*string)) { + while (*string && isspace((int)*string)) { string++; } @@ -571,7 +571,7 @@ char *acpi_db_get_next_token(char *string, /* Find end of token */ - while (*string && !isspace(*string)) { + while (*string && !isspace((int)*string)) { string++; } break; diff --git a/drivers/acpi/acpica/utstrsuppt.c b/drivers/acpi/acpica/utstrsuppt.c index 2d91003fcf26..199982a6fb16 100644 --- a/drivers/acpi/acpica/utstrsuppt.c +++ b/drivers/acpi/acpica/utstrsuppt.c @@ -104,7 +104,7 @@ acpi_status acpi_ut_convert_decimal_string(char *string, u64 *return_value_ptr) * 1) Runtime: terminate with no error, per the ACPI spec * 2) Compiler: return an error */ - if (!isdigit(*string)) { + if (!isdigit((int)*string)) { #ifdef ACPI_ASL_COMPILER status = AE_BAD_DECIMAL_CONSTANT; #endif @@ -158,7 +158,7 @@ acpi_status acpi_ut_convert_hex_string(char *string, u64 *return_value_ptr) * 1) Runtime: terminate with no error, per the ACPI spec * 2) Compiler: return an error */ - if (!isxdigit(*string)) { + if (!isxdigit((int)*string)) { #ifdef ACPI_ASL_COMPILER status = AE_BAD_HEX_CONSTANT; #endif From 7c9e83b6ad7101b924ca6404898c147b436bcc00 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 15 Jan 2021 10:48:22 -0800 Subject: [PATCH 048/307] ACPICA: Update version to 20201217 ACPICA commit 830dcc2b4fd2de8f0c63f1c366f51da276fe3d85 Version 20201217. Link: https://github.com/acpica/acpica/commit/830dcc2b Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. 
Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 0bba8b8c350e..be76e40769cb 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20201113 +#define ACPI_CA_VERSION 0x20201217 #include #include From 9e30007088688a840f522ffe45a7920e4fa6dfe8 Mon Sep 17 00:00:00 2001 From: Al Stone Date: Fri, 15 Jan 2021 10:48:23 -0800 Subject: [PATCH 049/307] ACPICA: Remove the MTMR (Mid-Timer) table ACPICA commit 2c39dcccda4dc250a44379ae086b8b1a3fdad115 This table is no longer in use, and is not officially defined in the ACPI specification. Link: https://github.com/acpica/acpica/commit/2c39dccc Signed-off-by: Al Stone Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl2.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index ec66779cb193..94bfc0c2a893 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -33,7 +33,6 @@ #define ACPI_SIG_MPST "MPST" /* Memory Power State Table */ #define ACPI_SIG_MSCT "MSCT" /* Maximum System Characteristics Table */ #define ACPI_SIG_MSDM "MSDM" /* Microsoft Data Management Table */ -#define ACPI_SIG_MTMR "MTMR" /* MID Timer table */ #define ACPI_SIG_NFIT "NFIT" /* NVDIMM Firmware Interface Table */ #define ACPI_SIG_PCCT "PCCT" /* Platform Communications Channel Table */ #define ACPI_SIG_PDTT "PDTT" /* Platform Debug Trigger Table */ @@ -935,29 +934,6 @@ struct acpi_table_msdm { struct acpi_table_header header; /* Common ACPI table header */ }; -/******************************************************************************* - * - * MTMR - MID Timer Table - * Version 1 - * - * Conforms to "Simple Firmware Interface Specification", - * Draft 0.8.2, Oct 19, 2010 - * NOTE: The ACPI MTMR is equivalent to 
the SFI MTMR table. - * - ******************************************************************************/ - -struct acpi_table_mtmr { - struct acpi_table_header header; /* Common ACPI table header */ -}; - -/* MTMR entry */ - -struct acpi_mtmr_entry { - struct acpi_generic_address physical_address; - u32 frequency; - u32 irq; -}; - /******************************************************************************* * * NFIT - NVDIMM Interface Table (ACPI 6.0+) From 9a5c7de7a5d1da11ef95dbffa7e7ebef45317b78 Mon Sep 17 00:00:00 2001 From: Al Stone Date: Fri, 15 Jan 2021 10:48:24 -0800 Subject: [PATCH 050/307] ACPICA: Remove the VRTC table ACPICA commit 4534cc3700f73c88e2f6a0e0f0b9efe4fc644757 The VRTC table is no longer in use and is not defined by the ACPI specification. Remove the table from the known, allowed tables. Link: https://github.com/acpica/acpica/commit/4534cc37 Signed-off-by: Al Stone Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl3.h | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h index bdcac69fa6bd..d90c3e1978e6 100644 --- a/include/acpi/actbl3.h +++ b/include/acpi/actbl3.h @@ -33,7 +33,6 @@ #define ACPI_SIG_TCPA "TCPA" /* Trusted Computing Platform Alliance table */ #define ACPI_SIG_TPM2 "TPM2" /* Trusted Platform Module 2.0 H/W interface table */ #define ACPI_SIG_UEFI "UEFI" /* Uefi Boot Optimization Table */ -#define ACPI_SIG_VRTC "VRTC" /* Virtual Real Time Clock Table */ #define ACPI_SIG_WAET "WAET" /* Windows ACPI Emulated devices Table */ #define ACPI_SIG_WDAT "WDAT" /* Watchdog Action Table */ #define ACPI_SIG_WDDT "WDDT" /* Watchdog Timer Description Table */ @@ -484,28 +483,6 @@ struct acpi_table_uefi { u16 data_offset; /* Offset of remaining data in table */ }; -/******************************************************************************* - * - * VRTC - Virtual Real Time Clock Table - * Version 1 - * - * Conforms 
to "Simple Firmware Interface Specification", - * Draft 0.8.2, Oct 19, 2010 - * NOTE: The ACPI VRTC is equivalent to The SFI MRTC table. - * - ******************************************************************************/ - -struct acpi_table_vrtc { - struct acpi_table_header header; /* Common ACPI table header */ -}; - -/* VRTC entry */ - -struct acpi_vrtc_entry { - struct acpi_generic_address physical_address; - u32 irq; -}; - /******************************************************************************* * * WAET - Windows ACPI Emulated devices Table From 4441e55d5051368685b4c75b5157d752f940ee06 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 15 Jan 2021 10:48:25 -0800 Subject: [PATCH 051/307] ACPICA: Updated all copyrights to 2021 This affects all ACPICA source code modules. ACPICA commit c570953c914437e621dd5f160f26ddf352e0d2f4 Link: https://github.com/acpica/acpica/commit/c570953c Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/acapps.h | 4 ++-- drivers/acpi/acpica/accommon.h | 2 +- drivers/acpi/acpica/acconvert.h | 2 +- drivers/acpi/acpica/acdebug.h | 2 +- drivers/acpi/acpica/acdispat.h | 2 +- drivers/acpi/acpica/acevents.h | 2 +- drivers/acpi/acpica/acglobal.h | 2 +- drivers/acpi/acpica/achware.h | 2 +- drivers/acpi/acpica/acinterp.h | 2 +- drivers/acpi/acpica/aclocal.h | 2 +- drivers/acpi/acpica/acmacros.h | 2 +- drivers/acpi/acpica/acnamesp.h | 2 +- drivers/acpi/acpica/acobject.h | 2 +- drivers/acpi/acpica/acopcode.h | 2 +- drivers/acpi/acpica/acparser.h | 2 +- drivers/acpi/acpica/acpredef.h | 2 +- drivers/acpi/acpica/acresrc.h | 2 +- drivers/acpi/acpica/acstruct.h | 2 +- drivers/acpi/acpica/actables.h | 2 +- drivers/acpi/acpica/acutils.h | 2 +- drivers/acpi/acpica/amlcode.h | 2 +- drivers/acpi/acpica/amlresrc.h | 2 +- drivers/acpi/acpica/dbhistry.c | 2 +- drivers/acpi/acpica/dsargs.c | 2 +- drivers/acpi/acpica/dscontrol.c | 2 +- drivers/acpi/acpica/dsdebug.c | 2 +- 
drivers/acpi/acpica/dsfield.c | 2 +- drivers/acpi/acpica/dsinit.c | 2 +- drivers/acpi/acpica/dsmethod.c | 2 +- drivers/acpi/acpica/dsobject.c | 2 +- drivers/acpi/acpica/dsopcode.c | 2 +- drivers/acpi/acpica/dspkginit.c | 2 +- drivers/acpi/acpica/dswexec.c | 2 +- drivers/acpi/acpica/dswload.c | 2 +- drivers/acpi/acpica/dswload2.c | 2 +- drivers/acpi/acpica/dswscope.c | 2 +- drivers/acpi/acpica/dswstate.c | 2 +- drivers/acpi/acpica/evevent.c | 2 +- drivers/acpi/acpica/evglock.c | 2 +- drivers/acpi/acpica/evgpe.c | 2 +- drivers/acpi/acpica/evgpeblk.c | 2 +- drivers/acpi/acpica/evgpeinit.c | 2 +- drivers/acpi/acpica/evgpeutil.c | 2 +- drivers/acpi/acpica/evhandler.c | 2 +- drivers/acpi/acpica/evmisc.c | 2 +- drivers/acpi/acpica/evregion.c | 2 +- drivers/acpi/acpica/evrgnini.c | 2 +- drivers/acpi/acpica/evxface.c | 2 +- drivers/acpi/acpica/evxfevnt.c | 2 +- drivers/acpi/acpica/evxfgpe.c | 2 +- drivers/acpi/acpica/evxfregn.c | 2 +- drivers/acpi/acpica/exconcat.c | 2 +- drivers/acpi/acpica/exconfig.c | 2 +- drivers/acpi/acpica/exconvrt.c | 2 +- drivers/acpi/acpica/excreate.c | 2 +- drivers/acpi/acpica/exdebug.c | 2 +- drivers/acpi/acpica/exdump.c | 2 +- drivers/acpi/acpica/exfield.c | 2 +- drivers/acpi/acpica/exfldio.c | 2 +- drivers/acpi/acpica/exmisc.c | 2 +- drivers/acpi/acpica/exmutex.c | 2 +- drivers/acpi/acpica/exnames.c | 2 +- drivers/acpi/acpica/exoparg1.c | 2 +- drivers/acpi/acpica/exoparg2.c | 2 +- drivers/acpi/acpica/exoparg3.c | 2 +- drivers/acpi/acpica/exoparg6.c | 2 +- drivers/acpi/acpica/exprep.c | 2 +- drivers/acpi/acpica/exregion.c | 2 +- drivers/acpi/acpica/exresnte.c | 2 +- drivers/acpi/acpica/exresolv.c | 2 +- drivers/acpi/acpica/exresop.c | 2 +- drivers/acpi/acpica/exserial.c | 2 +- drivers/acpi/acpica/exstore.c | 2 +- drivers/acpi/acpica/exstoren.c | 2 +- drivers/acpi/acpica/exstorob.c | 2 +- drivers/acpi/acpica/exsystem.c | 2 +- drivers/acpi/acpica/extrace.c | 2 +- drivers/acpi/acpica/exutils.c | 2 +- drivers/acpi/acpica/hwacpi.c | 2 +- 
drivers/acpi/acpica/hwesleep.c | 2 +- drivers/acpi/acpica/hwgpe.c | 2 +- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/acpi/acpica/hwtimer.c | 2 +- drivers/acpi/acpica/hwvalid.c | 2 +- drivers/acpi/acpica/hwxface.c | 2 +- drivers/acpi/acpica/hwxfsleep.c | 2 +- drivers/acpi/acpica/nsarguments.c | 2 +- drivers/acpi/acpica/nsconvert.c | 2 +- drivers/acpi/acpica/nsdump.c | 2 +- drivers/acpi/acpica/nsdumpdv.c | 2 +- drivers/acpi/acpica/nsinit.c | 2 +- drivers/acpi/acpica/nsload.c | 2 +- drivers/acpi/acpica/nsparse.c | 2 +- drivers/acpi/acpica/nspredef.c | 2 +- drivers/acpi/acpica/nsprepkg.c | 2 +- drivers/acpi/acpica/nsrepair.c | 2 +- drivers/acpi/acpica/nsrepair2.c | 2 +- drivers/acpi/acpica/nsutils.c | 2 +- drivers/acpi/acpica/nswalk.c | 2 +- drivers/acpi/acpica/nsxfname.c | 2 +- drivers/acpi/acpica/psargs.c | 2 +- drivers/acpi/acpica/psloop.c | 2 +- drivers/acpi/acpica/psobject.c | 2 +- drivers/acpi/acpica/psopcode.c | 2 +- drivers/acpi/acpica/psopinfo.c | 2 +- drivers/acpi/acpica/psparse.c | 2 +- drivers/acpi/acpica/psscope.c | 2 +- drivers/acpi/acpica/pstree.c | 2 +- drivers/acpi/acpica/psutils.c | 2 +- drivers/acpi/acpica/pswalk.c | 2 +- drivers/acpi/acpica/psxface.c | 2 +- drivers/acpi/acpica/tbdata.c | 2 +- drivers/acpi/acpica/tbfadt.c | 2 +- drivers/acpi/acpica/tbfind.c | 2 +- drivers/acpi/acpica/tbinstal.c | 2 +- drivers/acpi/acpica/tbprint.c | 2 +- drivers/acpi/acpica/tbutils.c | 2 +- drivers/acpi/acpica/tbxface.c | 2 +- drivers/acpi/acpica/tbxfload.c | 2 +- drivers/acpi/acpica/tbxfroot.c | 2 +- drivers/acpi/acpica/utaddress.c | 2 +- drivers/acpi/acpica/utalloc.c | 2 +- drivers/acpi/acpica/utascii.c | 2 +- drivers/acpi/acpica/utbuffer.c | 2 +- drivers/acpi/acpica/utcache.c | 2 +- drivers/acpi/acpica/utcopy.c | 2 +- drivers/acpi/acpica/utdebug.c | 2 +- drivers/acpi/acpica/utdecode.c | 2 +- drivers/acpi/acpica/uteval.c | 2 +- drivers/acpi/acpica/utglobal.c | 2 +- drivers/acpi/acpica/uthex.c | 2 +- drivers/acpi/acpica/utids.c | 2 +- drivers/acpi/acpica/utinit.c | 
2 +- drivers/acpi/acpica/utlock.c | 2 +- drivers/acpi/acpica/utobject.c | 2 +- drivers/acpi/acpica/utosi.c | 2 +- drivers/acpi/acpica/utpredef.c | 2 +- drivers/acpi/acpica/utprint.c | 2 +- drivers/acpi/acpica/uttrack.c | 2 +- drivers/acpi/acpica/utuuid.c | 2 +- drivers/acpi/acpica/utxface.c | 2 +- drivers/acpi/acpica/utxfinit.c | 2 +- include/acpi/acbuffer.h | 2 +- include/acpi/acconfig.h | 2 +- include/acpi/acexcep.h | 2 +- include/acpi/acnames.h | 2 +- include/acpi/acoutput.h | 2 +- include/acpi/acpi.h | 2 +- include/acpi/acpiosxf.h | 2 +- include/acpi/acpixf.h | 2 +- include/acpi/acrestyp.h | 2 +- include/acpi/actbl.h | 2 +- include/acpi/actbl1.h | 2 +- include/acpi/actbl2.h | 2 +- include/acpi/actbl3.h | 2 +- include/acpi/actypes.h | 2 +- include/acpi/acuuid.h | 2 +- include/acpi/platform/acenv.h | 2 +- include/acpi/platform/acenvex.h | 2 +- include/acpi/platform/acgcc.h | 2 +- include/acpi/platform/acgccex.h | 2 +- include/acpi/platform/acintel.h | 2 +- include/acpi/platform/aclinux.h | 2 +- include/acpi/platform/aclinuxex.h | 2 +- tools/power/acpi/common/cmfsize.c | 2 +- tools/power/acpi/common/getopt.c | 2 +- tools/power/acpi/os_specific/service_layers/oslinuxtbl.c | 2 +- tools/power/acpi/os_specific/service_layers/osunixdir.c | 2 +- tools/power/acpi/os_specific/service_layers/osunixmap.c | 2 +- tools/power/acpi/os_specific/service_layers/osunixxf.c | 2 +- tools/power/acpi/tools/acpidump/acpidump.h | 2 +- tools/power/acpi/tools/acpidump/apdump.c | 2 +- tools/power/acpi/tools/acpidump/apfiles.c | 2 +- tools/power/acpi/tools/acpidump/apmain.c | 2 +- 174 files changed, 175 insertions(+), 175 deletions(-) diff --git a/drivers/acpi/acpica/acapps.h b/drivers/acpi/acpica/acapps.h index 173447d50acf..725e2f65cdca 100644 --- a/drivers/acpi/acpica/acapps.h +++ b/drivers/acpi/acpica/acapps.h @@ -3,7 +3,7 @@ * * Module Name: acapps - common include for ACPI applications/tools * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ @@ -17,7 +17,7 @@ /* Common info for tool signons */ #define ACPICA_NAME "Intel ACPI Component Architecture" -#define ACPICA_COPYRIGHT "Copyright (c) 2000 - 2020 Intel Corporation" +#define ACPICA_COPYRIGHT "Copyright (c) 2000 - 2021 Intel Corporation" #if ACPI_MACHINE_WIDTH == 64 #define ACPI_WIDTH " (64-bit version)" diff --git a/drivers/acpi/acpica/accommon.h b/drivers/acpi/acpica/accommon.h index 94e18bb76556..be3826f46f88 100644 --- a/drivers/acpi/acpica/accommon.h +++ b/drivers/acpi/acpica/accommon.h @@ -3,7 +3,7 @@ * * Name: accommon.h - Common include files for generation of ACPICA source * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acconvert.h b/drivers/acpi/acpica/acconvert.h index cf85d66da6e7..53b41c7a6119 100644 --- a/drivers/acpi/acpica/acconvert.h +++ b/drivers/acpi/acpica/acconvert.h @@ -3,7 +3,7 @@ * * Module Name: acapps - common include for ACPI applications/tools * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acdebug.h b/drivers/acpi/acpica/acdebug.h index f8a3abdfe250..3ccc7b2a76f1 100644 --- a/drivers/acpi/acpica/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h @@ -3,7 +3,7 @@ * * Name: acdebug.h - ACPI/AML debugger * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/acdispat.h b/drivers/acpi/acpica/acdispat.h index 7ba6e308f146..3170a24fe505 100644 --- a/drivers/acpi/acpica/acdispat.h +++ b/drivers/acpi/acpica/acdispat.h @@ -3,7 +3,7 @@ * * Name: acdispat.h - dispatcher (parser to interpreter interface) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acevents.h b/drivers/acpi/acpica/acevents.h index 79f292687bd6..82a75964343b 100644 --- a/drivers/acpi/acpica/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -3,7 +3,7 @@ * * Name: acevents.h - Event subcomponent prototypes and defines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h index 2fee91f57b21..d41b810e367c 100644 --- a/drivers/acpi/acpica/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -3,7 +3,7 @@ * * Name: acglobal.h - Declarations for global variables * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 6ab92e28330d..810de0b4c125 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -3,7 +3,7 @@ * * Name: achware.h -- hardware specific interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/acinterp.h b/drivers/acpi/acpica/acinterp.h index a6d896cda2a5..816a16e1fc4c 100644 --- a/drivers/acpi/acpica/acinterp.h +++ b/drivers/acpi/acpica/acinterp.h @@ -3,7 +3,7 @@ * * Name: acinterp.h - Interpreter subcomponent prototypes and defines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index f83b98fa13ac..be57436182a1 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -3,7 +3,7 @@ * * Name: aclocal.h - Internal data types used across the ACPI subsystem * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acmacros.h b/drivers/acpi/acpica/acmacros.h index 168904ba3086..93bd2d19c156 100644 --- a/drivers/acpi/acpica/acmacros.h +++ b/drivers/acpi/acpica/acmacros.h @@ -3,7 +3,7 @@ * * Name: acmacros.h - C macros for the entire subsystem. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h index 40f6a3c33a15..199aabac3790 100644 --- a/drivers/acpi/acpica/acnamesp.h +++ b/drivers/acpi/acpica/acnamesp.h @@ -3,7 +3,7 @@ * * Name: acnamesp.h - Namespace subcomponent prototypes and defines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h index 9f0219a8cb98..af47a3ffd2a4 100644 --- a/drivers/acpi/acpica/acobject.h +++ b/drivers/acpi/acpica/acobject.h @@ -3,7 +3,7 @@ * * Name: acobject.h - Definition of union acpi_operand_object (Internal object only) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acopcode.h b/drivers/acpi/acpica/acopcode.h index 8825394be9ab..c3f12ee9fc6f 100644 --- a/drivers/acpi/acpica/acopcode.h +++ b/drivers/acpi/acpica/acopcode.h @@ -3,7 +3,7 @@ * * Name: acopcode.h - AML opcode information for the AML parser and interpreter * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acparser.h b/drivers/acpi/acpica/acparser.h index bc00b85c0a8f..8e40e5909458 100644 --- a/drivers/acpi/acpica/acparser.h +++ b/drivers/acpi/acpica/acparser.h @@ -3,7 +3,7 @@ * * Module Name: acparser.h - AML Parser subcomponent prototypes and defines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acpredef.h b/drivers/acpi/acpica/acpredef.h index 57ea2276790f..15cf904f0751 100644 --- a/drivers/acpi/acpica/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h @@ -3,7 +3,7 @@ * * Name: acpredef - Information table for ACPI predefined methods and objects * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/acresrc.h b/drivers/acpi/acpica/acresrc.h index 6de8a1650d3d..0cb975a3e01d 100644 --- a/drivers/acpi/acpica/acresrc.h +++ b/drivers/acpi/acpica/acresrc.h @@ -3,7 +3,7 @@ * * Name: acresrc.h - Resource Manager function prototypes * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acstruct.h b/drivers/acpi/acpica/acstruct.h index 4c900c108f3f..e3beb096c46d 100644 --- a/drivers/acpi/acpica/acstruct.h +++ b/drivers/acpi/acpica/acstruct.h @@ -3,7 +3,7 @@ * * Name: acstruct.h - Internal structs * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h index 734624facda3..e2d0046799a2 100644 --- a/drivers/acpi/acpica/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -3,7 +3,7 @@ * * Name: actables.h - ACPI table management * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h index 7c89b470ec81..be6de7149e67 100644 --- a/drivers/acpi/acpica/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -3,7 +3,7 @@ * * Name: acutils.h -- prototypes for the common (subsystem-wide) procedures * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/amlcode.h b/drivers/acpi/acpica/amlcode.h index 1d541bbac4a3..d6b088c5001f 100644 --- a/drivers/acpi/acpica/amlcode.h +++ b/drivers/acpi/acpica/amlcode.h @@ -5,7 +5,7 @@ * Declarations and definitions contained herein are derived * directly from the ACPI specification. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h index e5234e001acf..a9d91a3c2994 100644 --- a/drivers/acpi/acpica/amlresrc.h +++ b/drivers/acpi/acpica/amlresrc.h @@ -3,7 +3,7 @@ * * Module Name: amlresrc.h - AML resource descriptors * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dbhistry.c b/drivers/acpi/acpica/dbhistry.c index f5fba14461a6..fd813c5d3952 100644 --- a/drivers/acpi/acpica/dbhistry.c +++ b/drivers/acpi/acpica/dbhistry.c @@ -3,7 +3,7 @@ * * Module Name: dbhistry - debugger HISTORY command * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsargs.c b/drivers/acpi/acpica/dsargs.c index ad17f62e51d9..6630d6536fb0 100644 --- a/drivers/acpi/acpica/dsargs.c +++ b/drivers/acpi/acpica/dsargs.c @@ -4,7 +4,7 @@ * Module Name: dsargs - Support for execution of dynamic arguments for static * objects (regions, fields, buffer fields, etc.) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/dscontrol.c b/drivers/acpi/acpica/dscontrol.c index b58ffc7acdb9..a152f03135cd 100644 --- a/drivers/acpi/acpica/dscontrol.c +++ b/drivers/acpi/acpica/dscontrol.c @@ -4,7 +4,7 @@ * Module Name: dscontrol - Support for execution control opcodes - * if/else/while/return * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsdebug.c b/drivers/acpi/acpica/dsdebug.c index 2c22e3eff535..b9b03d629930 100644 --- a/drivers/acpi/acpica/dsdebug.c +++ b/drivers/acpi/acpica/dsdebug.c @@ -3,7 +3,7 @@ * * Module Name: dsdebug - Parser/Interpreter interface - debugging * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c index fa768b3a989e..a16817767969 100644 --- a/drivers/acpi/acpica/dsfield.c +++ b/drivers/acpi/acpica/dsfield.c @@ -3,7 +3,7 @@ * * Module Name: dsfield - Dispatcher field routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsinit.c b/drivers/acpi/acpica/dsinit.c index 9be2a309424c..ba6f882e83bc 100644 --- a/drivers/acpi/acpica/dsinit.c +++ b/drivers/acpi/acpica/dsinit.c @@ -3,7 +3,7 @@ * * Module Name: dsinit - Object initialization namespace walk * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index cf67caff878a..8e011e59b9b4 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -3,7 +3,7 @@ * * Module Name: dsmethod - Parser/Interpreter interface - control method parsing * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsobject.c b/drivers/acpi/acpica/dsobject.c index c0a14a6a2c20..3c0c31157e7e 100644 --- a/drivers/acpi/acpica/dsobject.c +++ b/drivers/acpi/acpica/dsobject.c @@ -3,7 +3,7 @@ * * Module Name: dsobject - Dispatcher object management routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index d9c26e720cb7..639635291ab7 100644 --- a/drivers/acpi/acpica/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -3,7 +3,7 @@ * * Module Name: dsopcode - Dispatcher support for regions and fields * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dspkginit.c b/drivers/acpi/acpica/dspkginit.c index d869568d55c2..e642d65bcc66 100644 --- a/drivers/acpi/acpica/dspkginit.c +++ b/drivers/acpi/acpica/dspkginit.c @@ -3,7 +3,7 @@ * * Module Name: dspkginit - Completion of deferred package initialization * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/dswexec.c b/drivers/acpi/acpica/dswexec.c index 4a9799246fae..41ba7773fd10 100644 --- a/drivers/acpi/acpica/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -4,7 +4,7 @@ * Module Name: dswexec - Dispatcher method execution callbacks; * dispatch to interpreter. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dswload.c b/drivers/acpi/acpica/dswload.c index dd97c86f8e41..a377638e44f9 100644 --- a/drivers/acpi/acpica/dswload.c +++ b/drivers/acpi/acpica/dswload.c @@ -3,7 +3,7 @@ * * Module Name: dswload - Dispatcher first pass namespace load callbacks * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dswload2.c b/drivers/acpi/acpica/dswload2.c index d9a3dfca7555..3625952c3957 100644 --- a/drivers/acpi/acpica/dswload2.c +++ b/drivers/acpi/acpica/dswload2.c @@ -3,7 +3,7 @@ * * Module Name: dswload2 - Dispatcher second pass namespace load callbacks * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/dswscope.c b/drivers/acpi/acpica/dswscope.c index 9c397642fed7..9c123af08bc1 100644 --- a/drivers/acpi/acpica/dswscope.c +++ b/drivers/acpi/acpica/dswscope.c @@ -3,7 +3,7 @@ * * Module Name: dswscope - Scope stack manipulation * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/dswstate.c b/drivers/acpi/acpica/dswstate.c index 809a0c0536b5..fbe2ba05c82a 100644 --- a/drivers/acpi/acpica/dswstate.c +++ b/drivers/acpi/acpica/dswstate.c @@ -3,7 +3,7 @@ * * Module Name: dswstate - Dispatcher parse tree walk management routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evevent.c b/drivers/acpi/acpica/evevent.c index 9efca54c51ac..35385148fedb 100644 --- a/drivers/acpi/acpica/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -3,7 +3,7 @@ * * Module Name: evevent - Fixed Event handling and dispatch * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evglock.c b/drivers/acpi/acpica/evglock.c index 0ced84ae13e4..de4eea606ccd 100644 --- a/drivers/acpi/acpica/evglock.c +++ b/drivers/acpi/acpica/evglock.c @@ -3,7 +3,7 @@ * * Module Name: evglock - Global Lock support * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index 06b9c8dd11c9..c5a06882bdf6 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -3,7 +3,7 @@ * * Module Name: evgpe - General Purpose Event handling and dispatch * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index f5298be4273a..e5f8245c2d93 100644 --- a/drivers/acpi/acpica/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -3,7 +3,7 @@ * * Module Name: evgpeblk - GPE block creation and initialization. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 6d82d30d8f7b..b0724d6e6e80 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -3,7 +3,7 @@ * * Module Name: evgpeinit - System GPE initialization and update * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evgpeutil.c b/drivers/acpi/acpica/evgpeutil.c index 738873e876ca..2e74308d7725 100644 --- a/drivers/acpi/acpica/evgpeutil.c +++ b/drivers/acpi/acpica/evgpeutil.c @@ -3,7 +3,7 @@ * * Module Name: evgpeutil - GPE utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evhandler.c b/drivers/acpi/acpica/evhandler.c index 5884eba047f7..ea9485e6a475 100644 --- a/drivers/acpi/acpica/evhandler.c +++ b/drivers/acpi/acpica/evhandler.c @@ -3,7 +3,7 @@ * * Module Name: evhandler - Support for Address Space handlers * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/evmisc.c b/drivers/acpi/acpica/evmisc.c index ce1eda6beb84..f14ebcd610ab 100644 --- a/drivers/acpi/acpica/evmisc.c +++ b/drivers/acpi/acpica/evmisc.c @@ -3,7 +3,7 @@ * * Module Name: evmisc - Miscellaneous event manager support functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c index a8a4c8c9b9ef..3ed7d9ae95cf 100644 --- a/drivers/acpi/acpica/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -3,7 +3,7 @@ * * Module Name: evregion - Operation Region support * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evrgnini.c b/drivers/acpi/acpica/evrgnini.c index 89be3ccdad53..984c172453bf 100644 --- a/drivers/acpi/acpica/evrgnini.c +++ b/drivers/acpi/acpica/evrgnini.c @@ -3,7 +3,7 @@ * * Module Name: evrgnini- ACPI address_space (op_region) init * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index e4e012297eee..ff5cf5b0705a 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -3,7 +3,7 @@ * * Module Name: evxface - External interfaces for ACPI events * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index 1a15b0087379..5445a361c621 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -3,7 +3,7 @@ * * Module Name: evxfevnt - External Interfaces, ACPI event disable/enable * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 3be60673e461..a6d53cf86450 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -3,7 +3,7 @@ * * Module Name: evxfgpe - External Interfaces for General Purpose Events (GPEs) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index da97fd0c6b51..7672d70da850 100644 --- a/drivers/acpi/acpica/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -4,7 +4,7 @@ * Module Name: evxfregn - External Interfaces, ACPI Operation Regions and * Address Spaces. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exconcat.c b/drivers/acpi/acpica/exconcat.c index 43711412722f..2d220d470c60 100644 --- a/drivers/acpi/acpica/exconcat.c +++ b/drivers/acpi/acpica/exconcat.c @@ -3,7 +3,7 @@ * * Module Name: exconcat - Concatenate-type AML operators * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exconfig.c b/drivers/acpi/acpica/exconfig.c index 68efd704e2dc..0cd9b3738e76 100644 --- a/drivers/acpi/acpica/exconfig.c +++ b/drivers/acpi/acpica/exconfig.c @@ -3,7 +3,7 @@ * * Module Name: exconfig - Namespace reconfiguration (Load/Unload opcodes) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exconvrt.c b/drivers/acpi/acpica/exconvrt.c index 50c7aad2e86d..6b7498371eb0 100644 --- a/drivers/acpi/acpica/exconvrt.c +++ b/drivers/acpi/acpica/exconvrt.c @@ -3,7 +3,7 @@ * * Module Name: exconvrt - Object conversion routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/excreate.c b/drivers/acpi/acpica/excreate.c index a17482428b46..80b52ad55775 100644 --- a/drivers/acpi/acpica/excreate.c +++ b/drivers/acpi/acpica/excreate.c @@ -3,7 +3,7 @@ * * Module Name: excreate - Named object creation * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exdebug.c b/drivers/acpi/acpica/exdebug.c index a5223dcaee70..6a01e38b7d5a 100644 --- a/drivers/acpi/acpica/exdebug.c +++ b/drivers/acpi/acpica/exdebug.c @@ -3,7 +3,7 @@ * * Module Name: exdebug - Support for stores to the AML Debug Object * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exdump.c b/drivers/acpi/acpica/exdump.c index 47a4d9a40d6b..2aea44ecc37d 100644 --- a/drivers/acpi/acpica/exdump.c +++ b/drivers/acpi/acpica/exdump.c @@ -3,7 +3,7 @@ * * Module Name: exdump - Interpreter debug output routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exfield.c b/drivers/acpi/acpica/exfield.c index 3323a2ba6a31..32f03ee81785 100644 --- a/drivers/acpi/acpica/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -3,7 +3,7 @@ * * Module Name: exfield - AML execution - field_unit read/write * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index cde24e0fa6a8..bdc7a30d1217 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -3,7 +3,7 @@ * * Module Name: exfldio - Aml Field I/O * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exmisc.c b/drivers/acpi/acpica/exmisc.c index 717e3998fd77..ad19f914641b 100644 --- a/drivers/acpi/acpica/exmisc.c +++ b/drivers/acpi/acpica/exmisc.c @@ -3,7 +3,7 @@ * * Module Name: exmisc - ACPI AML (p-code) execution - specific opcodes * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exmutex.c b/drivers/acpi/acpica/exmutex.c index 9ff247cba571..6237ae8284b1 100644 --- a/drivers/acpi/acpica/exmutex.c +++ b/drivers/acpi/acpica/exmutex.c @@ -3,7 +3,7 @@ * * Module Name: exmutex - ASL Mutex Acquire/Release functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exnames.c b/drivers/acpi/acpica/exnames.c index 74f8b0d0452b..5283603d078d 100644 --- a/drivers/acpi/acpica/exnames.c +++ b/drivers/acpi/acpica/exnames.c @@ -3,7 +3,7 @@ * * Module Name: exnames - interpreter/scanner name load/execute * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index a46d685a3ffc..b639e930d642 100644 --- a/drivers/acpi/acpica/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -3,7 +3,7 @@ * * Module Name: exoparg1 - AML execution - opcodes with 1 argument * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exoparg2.c b/drivers/acpi/acpica/exoparg2.c index 03241d18ac1d..10323ab186da 100644 --- a/drivers/acpi/acpica/exoparg2.c +++ b/drivers/acpi/acpica/exoparg2.c @@ -3,7 +3,7 @@ * * Module Name: exoparg2 - AML execution - opcodes with 2 arguments * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exoparg3.c b/drivers/acpi/acpica/exoparg3.c index c8d0d75fc450..140aae009690 100644 --- a/drivers/acpi/acpica/exoparg3.c +++ b/drivers/acpi/acpica/exoparg3.c @@ -3,7 +3,7 @@ * * Module Name: exoparg3 - AML execution - opcodes with 3 arguments * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exoparg6.c b/drivers/acpi/acpica/exoparg6.c index 55d0fa056fe7..2cf9f37a0ba8 100644 --- a/drivers/acpi/acpica/exoparg6.c +++ b/drivers/acpi/acpica/exoparg6.c @@ -3,7 +3,7 @@ * * Module Name: exoparg6 - AML execution - opcodes with 6 arguments * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exprep.c b/drivers/acpi/acpica/exprep.c index 4a0f03157e08..d8c55dde191b 100644 --- a/drivers/acpi/acpica/exprep.c +++ b/drivers/acpi/acpica/exprep.c @@ -3,7 +3,7 @@ * * Module Name: exprep - ACPI AML field prep utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exregion.c b/drivers/acpi/acpica/exregion.c index 4914dbc44517..82b713a9a193 100644 --- a/drivers/acpi/acpica/exregion.c +++ b/drivers/acpi/acpica/exregion.c @@ -3,7 +3,7 @@ * * Module Name: exregion - ACPI default op_region (address space) handlers * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exresnte.c b/drivers/acpi/acpica/exresnte.c index 3e4018678c09..d80b76455c50 100644 --- a/drivers/acpi/acpica/exresnte.c +++ b/drivers/acpi/acpica/exresnte.c @@ -3,7 +3,7 @@ * * Module Name: exresnte - AML Interpreter object resolution * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exresolv.c b/drivers/acpi/acpica/exresolv.c index 912a078c60a4..fa6a96242835 100644 --- a/drivers/acpi/acpica/exresolv.c +++ b/drivers/acpi/acpica/exresolv.c @@ -3,7 +3,7 @@ * * Module Name: exresolv - AML Interpreter object resolution * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exresop.c b/drivers/acpi/acpica/exresop.c index 4a0f8b8bfe62..cbe2c88b1dc2 100644 --- a/drivers/acpi/acpica/exresop.c +++ b/drivers/acpi/acpica/exresop.c @@ -3,7 +3,7 @@ * * Module Name: exresop - AML Interpreter operand/object resolution * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exserial.c b/drivers/acpi/acpica/exserial.c index 760bc7cef55a..8e8d95f7947b 100644 --- a/drivers/acpi/acpica/exserial.c +++ b/drivers/acpi/acpica/exserial.c @@ -3,7 +3,7 @@ * * Module Name: exserial - field_unit support for serial address spaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/exstore.c b/drivers/acpi/acpica/exstore.c index 8fe33051275d..12f4210ea085 100644 --- a/drivers/acpi/acpica/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -3,7 +3,7 @@ * * Module Name: exstore - AML Interpreter object store support * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exstoren.c b/drivers/acpi/acpica/exstoren.c index 8c34f4e2ab8f..08469d37e73e 100644 --- a/drivers/acpi/acpica/exstoren.c +++ b/drivers/acpi/acpica/exstoren.c @@ -4,7 +4,7 @@ * Module Name: exstoren - AML Interpreter object store support, * Store to Node (namespace object) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exstorob.c b/drivers/acpi/acpica/exstorob.c index dc66696080a5..a82628683329 100644 --- a/drivers/acpi/acpica/exstorob.c +++ b/drivers/acpi/acpica/exstorob.c @@ -3,7 +3,7 @@ * * Module Name: exstorob - AML object store support, store to object * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exsystem.c b/drivers/acpi/acpica/exsystem.c index f329b01672bb..1281c07112de 100644 --- a/drivers/acpi/acpica/exsystem.c +++ b/drivers/acpi/acpica/exsystem.c @@ -3,7 +3,7 @@ * * Module Name: exsystem - Interface to OS services * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/extrace.c b/drivers/acpi/acpica/extrace.c index 832a47885b99..8846f483fb02 100644 --- a/drivers/acpi/acpica/extrace.c +++ b/drivers/acpi/acpica/extrace.c @@ -3,7 +3,7 @@ * * Module Name: extrace - Support for interpreter execution tracing * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/exutils.c b/drivers/acpi/acpica/exutils.c index 8fefa6feac2f..4d41a866f633 100644 --- a/drivers/acpi/acpica/exutils.c +++ b/drivers/acpi/acpica/exutils.c @@ -3,7 +3,7 @@ * * Module Name: exutils - interpreter/scanner utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwacpi.c b/drivers/acpi/acpica/hwacpi.c index 9b9aac27ff7e..96f55f079988 100644 --- a/drivers/acpi/acpica/hwacpi.c +++ b/drivers/acpi/acpica/hwacpi.c @@ -3,7 +3,7 @@ * * Module Name: hwacpi - ACPI Hardware Initialization/Mode Interface * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwesleep.c b/drivers/acpi/acpica/hwesleep.c index d9be5d0545d4..803402aefaeb 100644 --- a/drivers/acpi/acpica/hwesleep.c +++ b/drivers/acpi/acpica/hwesleep.c @@ -4,7 +4,7 @@ * Name: hwesleep.c - ACPI Hardware Sleep/Wake Support functions for the * extended FADT-V5 sleep registers. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 0c84300e915c..0770aa176cd5 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -3,7 +3,7 @@ * * Module Name: hwgpe - Low level GPE enable/disable/clear functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 317ae870336b..14baa13bf848 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -4,7 +4,7 @@ * Name: hwsleep.c - ACPI Hardware Sleep/Wake Support functions for the * original/legacy sleep/PM registers. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwtimer.c b/drivers/acpi/acpica/hwtimer.c index 07473ddfa9a9..63deadde9f48 100644 --- a/drivers/acpi/acpica/hwtimer.c +++ b/drivers/acpi/acpica/hwtimer.c @@ -3,7 +3,7 @@ * * Name: hwtimer.c - ACPI Power Management Timer Interface * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwvalid.c b/drivers/acpi/acpica/hwvalid.c index b2ca7dfd3fc9..e15badf4077a 100644 --- a/drivers/acpi/acpica/hwvalid.c +++ b/drivers/acpi/acpica/hwvalid.c @@ -3,7 +3,7 @@ * * Module Name: hwvalid - I/O request validation * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c index 134dbfadcd15..fb27aaad0dee 100644 --- a/drivers/acpi/acpica/hwxface.c +++ b/drivers/acpi/acpica/hwxface.c @@ -3,7 +3,7 @@ * * Module Name: hwxface - Public ACPICA hardware interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/hwxfsleep.c b/drivers/acpi/acpica/hwxfsleep.c index a4b66f4b2714..89b12afed564 100644 --- a/drivers/acpi/acpica/hwxfsleep.c +++ b/drivers/acpi/acpica/hwxfsleep.c @@ -3,7 +3,7 @@ * * Name: hwxfsleep.c - ACPI Hardware Sleep/Wake External Interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsarguments.c b/drivers/acpi/acpica/nsarguments.c index 6bbc7d350a16..c8a2747005c5 100644 --- a/drivers/acpi/acpica/nsarguments.c +++ b/drivers/acpi/acpica/nsarguments.c @@ -3,7 +3,7 @@ * * Module Name: nsarguments - Validation of args for ACPI predefined methods * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsconvert.c b/drivers/acpi/acpica/nsconvert.c index c86c82939ebb..597d0eed23c1 100644 --- a/drivers/acpi/acpica/nsconvert.c +++ b/drivers/acpi/acpica/nsconvert.c @@ -4,7 +4,7 @@ * Module Name: nsconvert - Object conversions for objects returned by * predefined methods * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsdump.c b/drivers/acpi/acpica/nsdump.c index 994f0b556c60..2f66f3ed1810 100644 --- a/drivers/acpi/acpica/nsdump.c +++ b/drivers/acpi/acpica/nsdump.c @@ -3,7 +3,7 @@ * * Module Name: nsdump - table dumping routines for debug * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsdumpdv.c b/drivers/acpi/acpica/nsdumpdv.c index b691fe20e384..d3dc6761bcdd 100644 --- a/drivers/acpi/acpica/nsdumpdv.c +++ b/drivers/acpi/acpica/nsdumpdv.c @@ -3,7 +3,7 @@ * * Module Name: nsdump - table dumping routines for debug * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsinit.c b/drivers/acpi/acpica/nsinit.c index e16f6a0c2c3f..4db81f8ba29b 100644 --- a/drivers/acpi/acpica/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -3,7 +3,7 @@ * * Module Name: nsinit - namespace initialization * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsload.c b/drivers/acpi/acpica/nsload.c index 9ba17891edb6..7d77956ed790 100644 --- a/drivers/acpi/acpica/nsload.c +++ b/drivers/acpi/acpica/nsload.c @@ -3,7 +3,7 @@ * * Module Name: nsload - namespace loading/expanding/contracting procedures * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsparse.c b/drivers/acpi/acpica/nsparse.c index 7e74a765e785..778f80e624be 100644 --- a/drivers/acpi/acpica/nsparse.c +++ b/drivers/acpi/acpica/nsparse.c @@ -3,7 +3,7 @@ * * Module Name: nsparse - namespace interface to AML parser * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nspredef.c b/drivers/acpi/acpica/nspredef.c index 167a1c2495ab..e4e5f32da7dc 100644 --- a/drivers/acpi/acpica/nspredef.c +++ b/drivers/acpi/acpica/nspredef.c @@ -3,7 +3,7 @@ * * Module Name: nspredef - Validation of ACPI predefined methods and objects * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsprepkg.c b/drivers/acpi/acpica/nsprepkg.c index 1875b1cba202..6742b50836f7 100644 --- a/drivers/acpi/acpica/nsprepkg.c +++ b/drivers/acpi/acpica/nsprepkg.c @@ -3,7 +3,7 @@ * * Module Name: nsprepkg - Validation of package objects for predefined names * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsrepair.c b/drivers/acpi/acpica/nsrepair.c index 90db2d85e7f5..499067daa22c 100644 --- a/drivers/acpi/acpica/nsrepair.c +++ b/drivers/acpi/acpica/nsrepair.c @@ -3,7 +3,7 @@ * * Module Name: nsrepair - Repair for objects returned by predefined methods * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index d2c8d8279e7a..918513da1b26 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -4,7 +4,7 @@ * Module Name: nsrepair2 - Repair for objects returned by specific * predefined methods * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsutils.c b/drivers/acpi/acpica/nsutils.c index e66abdab8f31..83d0f276da4d 100644 --- a/drivers/acpi/acpica/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -4,7 +4,7 @@ * Module Name: nsutils - Utilities for accessing ACPI namespace, accessing * parents and siblings and Scope manipulation * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nswalk.c b/drivers/acpi/acpica/nswalk.c index b7f3e8603ad8..915c2433463d 100644 --- a/drivers/acpi/acpica/nswalk.c +++ b/drivers/acpi/acpica/nswalk.c @@ -3,7 +3,7 @@ * * Module Name: nswalk - Functions for walking the ACPI namespace * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/nsxfname.c b/drivers/acpi/acpica/nsxfname.c index 0e6aba81605b..03487546da5a 100644 --- a/drivers/acpi/acpica/nsxfname.c +++ b/drivers/acpi/acpica/nsxfname.c @@ -4,7 +4,7 @@ * Module Name: nsxfname - Public interfaces to the ACPI subsystem * ACPI Namespace oriented interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/psargs.c b/drivers/acpi/acpica/psargs.c index 3b40db4ad9f3..b9ff535aa02e 100644 --- a/drivers/acpi/acpica/psargs.c +++ b/drivers/acpi/acpica/psargs.c @@ -3,7 +3,7 @@ * * Module Name: psargs - Parse AML opcode arguments * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psloop.c b/drivers/acpi/acpica/psloop.c index 1ba17cf16c41..4b51dd939f29 100644 --- a/drivers/acpi/acpica/psloop.c +++ b/drivers/acpi/acpica/psloop.c @@ -3,7 +3,7 @@ * * Module Name: psloop - Main AML parse loop * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psobject.c b/drivers/acpi/acpica/psobject.c index 2480c26c5171..e4420cd6d281 100644 --- a/drivers/acpi/acpica/psobject.c +++ b/drivers/acpi/acpica/psobject.c @@ -3,7 +3,7 @@ * * Module Name: psobject - Support for parse objects * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psopcode.c b/drivers/acpi/acpica/psopcode.c index 28af49263ebf..3e80eb1a5f35 100644 --- a/drivers/acpi/acpica/psopcode.c +++ b/drivers/acpi/acpica/psopcode.c @@ -3,7 +3,7 @@ * * Module Name: psopcode - Parser/Interpreter opcode information table * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/psopinfo.c b/drivers/acpi/acpica/psopinfo.c index ab9327f6a63c..476b00a121f3 100644 --- a/drivers/acpi/acpica/psopinfo.c +++ b/drivers/acpi/acpica/psopinfo.c @@ -3,7 +3,7 @@ * * Module Name: psopinfo - AML opcode information functions and dispatch tables * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psparse.c b/drivers/acpi/acpica/psparse.c index 06490a137982..7eb7a81619a3 100644 --- a/drivers/acpi/acpica/psparse.c +++ b/drivers/acpi/acpica/psparse.c @@ -3,7 +3,7 @@ * * Module Name: psparse - Parser top level AML parse routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psscope.c b/drivers/acpi/acpica/psscope.c index fceb311995e9..3f2eada44942 100644 --- a/drivers/acpi/acpica/psscope.c +++ b/drivers/acpi/acpica/psscope.c @@ -3,7 +3,7 @@ * * Module Name: psscope - Parser scope stack management routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/pstree.c b/drivers/acpi/acpica/pstree.c index c8aef0694864..ffb2a7bfc6d7 100644 --- a/drivers/acpi/acpica/pstree.c +++ b/drivers/acpi/acpica/pstree.c @@ -3,7 +3,7 @@ * * Module Name: pstree - Parser op tree manipulation/traversal/search * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/psutils.c b/drivers/acpi/acpica/psutils.c index 00efae2f95ba..e6596051d548 100644 --- a/drivers/acpi/acpica/psutils.c +++ b/drivers/acpi/acpica/psutils.c @@ -3,7 +3,7 @@ * * Module Name: psutils - Parser miscellaneous utilities (Parser only) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/pswalk.c b/drivers/acpi/acpica/pswalk.c index 0fe3adf6b0e5..7018a789debc 100644 --- a/drivers/acpi/acpica/pswalk.c +++ b/drivers/acpi/acpica/pswalk.c @@ -3,7 +3,7 @@ * * Module Name: pswalk - Parser routines to walk parsed op tree(s) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/psxface.c b/drivers/acpi/acpica/psxface.c index 1bbfc8def388..fd0f28c7af1e 100644 --- a/drivers/acpi/acpica/psxface.c +++ b/drivers/acpi/acpica/psxface.c @@ -3,7 +3,7 @@ * * Module Name: psxface - Parser external interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c index 523b1e9b98d4..ebbca109edcb 100644 --- a/drivers/acpi/acpica/tbdata.c +++ b/drivers/acpi/acpica/tbdata.c @@ -3,7 +3,7 @@ * * Module Name: tbdata - Table manager data structure functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 907edc5edba7..5174abfa8af9 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -3,7 +3,7 @@ * * Module Name: tbfadt - FADT table utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbfind.c b/drivers/acpi/acpica/tbfind.c index 56d81e490a5c..2c2c2b1f5a28 100644 --- a/drivers/acpi/acpica/tbfind.c +++ b/drivers/acpi/acpica/tbfind.c @@ -3,7 +3,7 @@ * * Module Name: tbfind - find table * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 0bb15add2245..8d1e5b572493 100644 --- a/drivers/acpi/acpica/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -3,7 +3,7 @@ * * Module Name: tbinstal - ACPI table installation and removal * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbprint.c b/drivers/acpi/acpica/tbprint.c index 0b3494ad9a70..254823d494a2 100644 --- a/drivers/acpi/acpica/tbprint.c +++ b/drivers/acpi/acpica/tbprint.c @@ -3,7 +3,7 @@ * * Module Name: tbprint - Table output utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbutils.c b/drivers/acpi/acpica/tbutils.c index dfe1ac3ae34a..4b9b329a5a92 100644 --- a/drivers/acpi/acpica/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -3,7 +3,7 @@ * * Module Name: tbutils - ACPI Table utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbxface.c b/drivers/acpi/acpica/tbxface.c index 7490429ddbf6..e6f51fedaf1a 100644 --- a/drivers/acpi/acpica/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -3,7 +3,7 @@ * * Module Name: tbxface - ACPI table-oriented external interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbxfload.c b/drivers/acpi/acpica/tbxfload.c index bcba993d4dac..38623049b962 100644 --- a/drivers/acpi/acpica/tbxfload.c +++ b/drivers/acpi/acpica/tbxfload.c @@ -3,7 +3,7 @@ * * Module Name: tbxfload - Table load/unload external interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index 0edc6ef5d46d..9fec3df6c3ba 100644 --- a/drivers/acpi/acpica/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -3,7 +3,7 @@ * * Module Name: tbxfroot - Find the root ACPI table (RSDT) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/utaddress.c b/drivers/acpi/acpica/utaddress.c index 99fa48722cf6..7001f4b113f1 100644 --- a/drivers/acpi/acpica/utaddress.c +++ b/drivers/acpi/acpica/utaddress.c @@ -3,7 +3,7 @@ * * Module Name: utaddress - op_region address range check * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utalloc.c b/drivers/acpi/acpica/utalloc.c index 303ab51b4fcf..796fd9b33a7d 100644 --- a/drivers/acpi/acpica/utalloc.c +++ b/drivers/acpi/acpica/utalloc.c @@ -3,7 +3,7 @@ * * Module Name: utalloc - local memory allocation routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utascii.c b/drivers/acpi/acpica/utascii.c index d78656d960e8..e1b55575d5fb 100644 --- a/drivers/acpi/acpica/utascii.c +++ b/drivers/acpi/acpica/utascii.c @@ -3,7 +3,7 @@ * * Module Name: utascii - Utility ascii functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utbuffer.c b/drivers/acpi/acpica/utbuffer.c index f2ec427f4e29..8ab90f78825b 100644 --- a/drivers/acpi/acpica/utbuffer.c +++ b/drivers/acpi/acpica/utbuffer.c @@ -3,7 +3,7 @@ * * Module Name: utbuffer - Buffer dump routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/utcache.c b/drivers/acpi/acpica/utcache.c index 1b03a2747401..814145019f95 100644 --- a/drivers/acpi/acpica/utcache.c +++ b/drivers/acpi/acpica/utcache.c @@ -3,7 +3,7 @@ * * Module Name: utcache - local cache allocation routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utcopy.c b/drivers/acpi/acpica/utcopy.c index 41bdd0278dd8..d9877153f400 100644 --- a/drivers/acpi/acpica/utcopy.c +++ b/drivers/acpi/acpica/utcopy.c @@ -3,7 +3,7 @@ * * Module Name: utcopy - Internal to external object translation utilities * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c index 0c8cb0612414..09245945f319 100644 --- a/drivers/acpi/acpica/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -3,7 +3,7 @@ * * Module Name: utdebug - Debug print/trace routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utdecode.c b/drivers/acpi/acpica/utdecode.c index ed9aedf604a1..bcd3871079d7 100644 --- a/drivers/acpi/acpica/utdecode.c +++ b/drivers/acpi/acpica/utdecode.c @@ -3,7 +3,7 @@ * * Module Name: utdecode - Utility decoding routines (value-to-string) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/uteval.c b/drivers/acpi/acpica/uteval.c index 8180d1a458f5..d2503920c620 100644 --- a/drivers/acpi/acpica/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -3,7 +3,7 @@ * * Module Name: uteval - Object evaluation * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utglobal.c b/drivers/acpi/acpica/utglobal.c index e6dcbdc3fc6e..59a48371a7bc 100644 --- a/drivers/acpi/acpica/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -3,7 +3,7 @@ * * Module Name: utglobal - Global variables for the ACPI subsystem * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/uthex.c b/drivers/acpi/acpica/uthex.c index 0e02f12513dc..b1e94c094f9a 100644 --- a/drivers/acpi/acpica/uthex.c +++ b/drivers/acpi/acpica/uthex.c @@ -3,7 +3,7 @@ * * Module Name: uthex -- Hex/ASCII support functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utids.c b/drivers/acpi/acpica/utids.c index 3e68864ef242..08e9f316cbde 100644 --- a/drivers/acpi/acpica/utids.c +++ b/drivers/acpi/acpica/utids.c @@ -3,7 +3,7 @@ * * Module Name: utids - support for device Ids - HID, UID, CID, SUB, CLS * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/utinit.c b/drivers/acpi/acpica/utinit.c index fdbc397c038d..7b606a1e6986 100644 --- a/drivers/acpi/acpica/utinit.c +++ b/drivers/acpi/acpica/utinit.c @@ -3,7 +3,7 @@ * * Module Name: utinit - Common ACPI subsystem initialization * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utlock.c b/drivers/acpi/acpica/utlock.c index 46be549539e7..923dd15e7a16 100644 --- a/drivers/acpi/acpica/utlock.c +++ b/drivers/acpi/acpica/utlock.c @@ -3,7 +3,7 @@ * * Module Name: utlock - Reader/Writer lock interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utobject.c b/drivers/acpi/acpica/utobject.c index bbec04c291d2..84a210b49e3a 100644 --- a/drivers/acpi/acpica/utobject.c +++ b/drivers/acpi/acpica/utobject.c @@ -3,7 +3,7 @@ * * Module Name: utobject - ACPI object create/delete/size/cache routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utosi.c b/drivers/acpi/acpica/utosi.c index 0a01c08dad8a..7b8e8bf1e824 100644 --- a/drivers/acpi/acpica/utosi.c +++ b/drivers/acpi/acpica/utosi.c @@ -3,7 +3,7 @@ * * Module Name: utosi - Support for the _OSI predefined control method * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/utpredef.c b/drivers/acpi/acpica/utpredef.c index dd277f7e9f10..a6f87a88c30e 100644 --- a/drivers/acpi/acpica/utpredef.c +++ b/drivers/acpi/acpica/utpredef.c @@ -3,7 +3,7 @@ * * Module Name: utpredef - support functions for predefined names * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utprint.c b/drivers/acpi/acpica/utprint.c index 681c11f4af4e..e37d612e8db5 100644 --- a/drivers/acpi/acpica/utprint.c +++ b/drivers/acpi/acpica/utprint.c @@ -3,7 +3,7 @@ * * Module Name: utprint - Formatted printing routines * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/uttrack.c b/drivers/acpi/acpica/uttrack.c index d366be431a84..2ce85fcfeb5b 100644 --- a/drivers/acpi/acpica/uttrack.c +++ b/drivers/acpi/acpica/uttrack.c @@ -3,7 +3,7 @@ * * Module Name: uttrack - Memory allocation tracking routines (debug only) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utuuid.c b/drivers/acpi/acpica/utuuid.c index b8039954b0d1..090e44b6b6c7 100644 --- a/drivers/acpi/acpica/utuuid.c +++ b/drivers/acpi/acpica/utuuid.c @@ -3,7 +3,7 @@ * * Module Name: utuuid -- UUID support functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/drivers/acpi/acpica/utxface.c b/drivers/acpi/acpica/utxface.c index ca7c9f0144ef..3285c1a92e40 100644 --- a/drivers/acpi/acpica/utxface.c +++ b/drivers/acpi/acpica/utxface.c @@ -3,7 +3,7 @@ * * Module Name: utxface - External interfaces, miscellaneous utility functions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/drivers/acpi/acpica/utxfinit.c b/drivers/acpi/acpica/utxfinit.c index 653e3bb20036..91016366de1d 100644 --- a/drivers/acpi/acpica/utxfinit.c +++ b/drivers/acpi/acpica/utxfinit.c @@ -3,7 +3,7 @@ * * Module Name: utxfinit - External interfaces for ACPICA initialization * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acbuffer.h b/include/acpi/acbuffer.h index 531c1e9a7d10..18197c16149f 100644 --- a/include/acpi/acbuffer.h +++ b/include/acpi/acbuffer.h @@ -3,7 +3,7 @@ * * Name: acbuffer.h - Support for buffers returned by ACPI predefined names * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index a225eff499c8..e92f84fa8c68 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -3,7 +3,7 @@ * * Name: acconfig.h - Global configuration constants * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index f8a4afb0279a..ea3b1c41bc79 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -3,7 +3,7 @@ * * Name: acexcep.h - Exception codes returned by the ACPI subsystem * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index 8922edb32730..a2bc381c7ce7 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -3,7 +3,7 @@ * * Name: acnames.h - Global names and strings * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index c5d900c0ecda..1538a6853822 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -3,7 +3,7 @@ * * Name: acoutput.h -- debug output * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h index e3e8051d4812..6f6282a862bc 100644 --- a/include/acpi/acpi.h +++ b/include/acpi/acpi.h @@ -3,7 +3,7 @@ * * Name: acpi.h - Master public include file used to interface to ACPICA * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index 33bb8c9a089d..690c369b717a 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -5,7 +5,7 @@ * interfaces must be implemented by OSL to interface the * ACPI components to the host operating system. 
* - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index be76e40769cb..be76ba3166cf 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -3,7 +3,7 @@ * * Name: acpixf.h - External interfaces to the ACPI subsystem * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h index d3521894ce6a..9bccac9becd7 100644 --- a/include/acpi/acrestyp.h +++ b/include/acpi/acrestyp.h @@ -3,7 +3,7 @@ * * Name: acrestyp.h - Defines, types, and structures for resource descriptors * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 5007c41f4d54..f9cda909f92c 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -3,7 +3,7 @@ * * Name: actbl.h - Basic ACPI Table Definitions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 43549547ed3e..ea1c2998d54e 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -3,7 +3,7 @@ * * Name: actbl1.h - Additional ACPI table definitions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 94bfc0c2a893..d6478c430c99 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -3,7 +3,7 @@ * * Name: actbl2.h - ACPI Table Definitions (tables not in ACPI spec) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h index d90c3e1978e6..df5f4b27f3aa 100644 --- a/include/acpi/actbl3.h +++ b/include/acpi/actbl3.h @@ -3,7 +3,7 @@ * * Name: actbl3.h - ACPI Table Definitions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 2a32593691bc..92c71dfce0d5 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -3,7 +3,7 @@ * * Name: actypes.h - Common data types for the entire ACPI subsystem * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/acuuid.h b/include/acpi/acuuid.h index fb7d8d1fd93c..a5c2ca019a12 100644 --- a/include/acpi/acuuid.h +++ b/include/acpi/acuuid.h @@ -3,7 +3,7 @@ * * Name: acuuid.h - ACPI-related UUID/GUID definitions * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h index 8f6b2654c0b3..e8958e0d1646 100644 --- a/include/acpi/platform/acenv.h +++ b/include/acpi/platform/acenv.h @@ -3,7 +3,7 @@ * * Name: acenv.h - Host and compiler configuration * - * Copyright (C) 2000 - 2020, Intel Corp. 
+ * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/acenvex.h b/include/acpi/platform/acenvex.h index c3facf5f8495..277fe2fa4d9b 100644 --- a/include/acpi/platform/acenvex.h +++ b/include/acpi/platform/acenvex.h @@ -3,7 +3,7 @@ * * Name: acenvex.h - Extra host and compiler configuration * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/acgcc.h b/include/acpi/platform/acgcc.h index 91f7a02c798a..0cd4f61d4248 100644 --- a/include/acpi/platform/acgcc.h +++ b/include/acpi/platform/acgcc.h @@ -3,7 +3,7 @@ * * Name: acgcc.h - GCC specific defines, etc. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/acgccex.h b/include/acpi/platform/acgccex.h index 7c88fd1de955..738d52865e0a 100644 --- a/include/acpi/platform/acgccex.h +++ b/include/acpi/platform/acgccex.h @@ -3,7 +3,7 @@ * * Name: acgccex.h - Extra GCC specific defines, etc. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/acintel.h b/include/acpi/platform/acintel.h index e7fd5e71be62..550fe9a8cd6c 100644 --- a/include/acpi/platform/acintel.h +++ b/include/acpi/platform/acintel.h @@ -3,7 +3,7 @@ * * Name: acintel.h - VC specific defines, etc. * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 72f52a1342a0..b3ffb9bbf664 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -3,7 +3,7 @@ * * Name: aclinux.h - OS specific defines, etc. for Linux * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h index 04f88f2de781..5f642b07ad64 100644 --- a/include/acpi/platform/aclinuxex.h +++ b/include/acpi/platform/aclinuxex.h @@ -3,7 +3,7 @@ * * Name: aclinuxex.h - Extra OS specific defines, etc. for Linux * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/common/cmfsize.c b/tools/power/acpi/common/cmfsize.c index d1d18ff5c911..9ea2c0aeb86c 100644 --- a/tools/power/acpi/common/cmfsize.c +++ b/tools/power/acpi/common/cmfsize.c @@ -3,7 +3,7 @@ * * Module Name: cfsize - Common get file size function * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/common/getopt.c b/tools/power/acpi/common/getopt.c index c3708f30ab3a..3c265bc917a1 100644 --- a/tools/power/acpi/common/getopt.c +++ b/tools/power/acpi/common/getopt.c @@ -3,7 +3,7 @@ * * Module Name: getopt * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c index 11c5046dce16..ccabdbaae6a4 100644 --- a/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c +++ b/tools/power/acpi/os_specific/service_layers/oslinuxtbl.c @@ -3,7 +3,7 @@ * * Module Name: oslinuxtbl - Linux OSL for obtaining ACPI tables * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/os_specific/service_layers/osunixdir.c b/tools/power/acpi/os_specific/service_layers/osunixdir.c index fd05ddee240f..edd99274cd12 100644 --- a/tools/power/acpi/os_specific/service_layers/osunixdir.c +++ b/tools/power/acpi/os_specific/service_layers/osunixdir.c @@ -3,7 +3,7 @@ * * Module Name: osunixdir - Unix directory access interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/os_specific/service_layers/osunixmap.c b/tools/power/acpi/os_specific/service_layers/osunixmap.c index c565546e85bc..fee0022560d5 100644 --- a/tools/power/acpi/os_specific/service_layers/osunixmap.c +++ b/tools/power/acpi/os_specific/service_layers/osunixmap.c @@ -3,7 +3,7 @@ * * Module Name: osunixmap - Unix OSL for file mappings * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c index 5b2fd968535f..0861728da562 100644 --- a/tools/power/acpi/os_specific/service_layers/osunixxf.c +++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c @@ -3,7 +3,7 @@ * * Module Name: osunixxf - UNIX OSL interfaces * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/tools/acpidump/acpidump.h b/tools/power/acpi/tools/acpidump/acpidump.h index 26a5eae9f87f..e0ebc1dab1cc 100644 --- a/tools/power/acpi/tools/acpidump/acpidump.h +++ b/tools/power/acpi/tools/acpidump/acpidump.h @@ -3,7 +3,7 @@ * * Module Name: acpidump.h - Include file for acpi_dump utility * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index 76433296055d..444e3d78bd89 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -3,7 +3,7 @@ * * Module Name: apdump - Dump routines for ACPI tables (acpidump) * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ diff --git a/tools/power/acpi/tools/acpidump/apfiles.c b/tools/power/acpi/tools/acpidump/apfiles.c index a682bae4e6f6..da0c6e13042b 100644 --- a/tools/power/acpi/tools/acpidump/apfiles.c +++ b/tools/power/acpi/tools/acpidump/apfiles.c @@ -3,7 +3,7 @@ * * Module Name: apfiles - File-related functions for acpidump utility * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. 
* *****************************************************************************/ diff --git a/tools/power/acpi/tools/acpidump/apmain.c b/tools/power/acpi/tools/acpidump/apmain.c index 046e6b8d6baa..a4cf6042fcfd 100644 --- a/tools/power/acpi/tools/acpidump/apmain.c +++ b/tools/power/acpi/tools/acpidump/apmain.c @@ -3,7 +3,7 @@ * * Module Name: apmain - Main module for the acpidump utility * - * Copyright (C) 2000 - 2020, Intel Corp. + * Copyright (C) 2000 - 2021, Intel Corp. * *****************************************************************************/ From df1d4b466bb6a4eccb899ab761f5fbc7b3f95b67 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 15 Jan 2021 10:48:26 -0800 Subject: [PATCH 052/307] ACPICA: Update version to 20210105 ACPICA commit 28cb42013541950cf378582a5a5a5587061498ca Version 20210105. Link: https://github.com/acpica/acpica/commit/28cb4201 Signed-off-by: Bob Moore Signed-off-by: Erik Kaneda Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index be76ba3166cf..370293ee8399 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20201217 +#define ACPI_CA_VERSION 0x20210105 #include #include From 0f39ee8324e75c9d370e84a61323ceb194641a18 Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Thu, 17 Dec 2020 18:15:36 +0530 Subject: [PATCH 053/307] ACPI: Use DEVICE_ATTR_ macros Instead of open coding DEVICE_ATTR(), use the DEVICE_ATTR_RW(), DEVICE_ATTR_RO() and DEVICE_ATTR_WO() macros wherever possible. This required a few functions to be renamed but the functionality itself is unchanged. Signed-off-by: Dwaipayan Ray Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpi_pad.c | 24 ++++++++------------ drivers/acpi/acpi_tad.c | 14 ++++++------ drivers/acpi/bgrt.c | 20 ++++++++--------- drivers/acpi/device_sysfs.c | 44 ++++++++++++++++++------------------- drivers/acpi/dock.c | 26 +++++++++++----------- drivers/acpi/power.c | 9 ++++---- 6 files changed, 66 insertions(+), 71 deletions(-) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index b8745ce48a47..b84ab722feb4 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -261,7 +261,7 @@ static uint32_t acpi_pad_idle_cpus_num(void) return ps_tsk_num; } -static ssize_t acpi_pad_rrtime_store(struct device *dev, +static ssize_t rrtime_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long num; @@ -275,16 +275,14 @@ static ssize_t acpi_pad_rrtime_store(struct device *dev, return count; } -static ssize_t acpi_pad_rrtime_show(struct device *dev, +static ssize_t rrtime_show(struct device *dev, struct device_attribute *attr, char *buf) { return scnprintf(buf, PAGE_SIZE, "%d\n", round_robin_time); } -static DEVICE_ATTR(rrtime, S_IRUGO|S_IWUSR, - acpi_pad_rrtime_show, - acpi_pad_rrtime_store); +static DEVICE_ATTR_RW(rrtime); -static ssize_t acpi_pad_idlepct_store(struct device *dev, +static ssize_t idlepct_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long num; @@ -298,16 +296,14 @@ static ssize_t acpi_pad_idlepct_store(struct device *dev, return count; } -static ssize_t acpi_pad_idlepct_show(struct device *dev, +static ssize_t idlepct_show(struct device *dev, struct device_attribute *attr, char *buf) { return scnprintf(buf, PAGE_SIZE, "%d\n", idle_pct); } -static DEVICE_ATTR(idlepct, S_IRUGO|S_IWUSR, - acpi_pad_idlepct_show, - acpi_pad_idlepct_store); +static DEVICE_ATTR_RW(idlepct); -static ssize_t acpi_pad_idlecpus_store(struct device *dev, +static ssize_t idlecpus_store(struct device *dev, struct device_attribute *attr, const char 
*buf, size_t count) { unsigned long num; @@ -319,16 +315,14 @@ static ssize_t acpi_pad_idlecpus_store(struct device *dev, return count; } -static ssize_t acpi_pad_idlecpus_show(struct device *dev, +static ssize_t idlecpus_show(struct device *dev, struct device_attribute *attr, char *buf) { return cpumap_print_to_pagebuf(false, buf, to_cpumask(pad_busy_cpus_bits)); } -static DEVICE_ATTR(idlecpus, S_IRUGO|S_IWUSR, - acpi_pad_idlecpus_show, - acpi_pad_idlecpus_store); +static DEVICE_ATTR_RW(idlecpus); static int acpi_pad_add_sysfs(struct acpi_device *device) { diff --git a/drivers/acpi/acpi_tad.c b/drivers/acpi/acpi_tad.c index 7d45cce0c3c1..e9b8e8305e23 100644 --- a/drivers/acpi/acpi_tad.c +++ b/drivers/acpi/acpi_tad.c @@ -237,7 +237,7 @@ static ssize_t time_show(struct device *dev, struct device_attribute *attr, rt.tz, rt.daylight); } -static DEVICE_ATTR(time, S_IRUSR | S_IWUSR, time_show, time_store); +static DEVICE_ATTR_RW(time); static struct attribute *acpi_tad_time_attrs[] = { &dev_attr_time.attr, @@ -446,7 +446,7 @@ static ssize_t ac_alarm_show(struct device *dev, struct device_attribute *attr, return acpi_tad_alarm_read(dev, buf, ACPI_TAD_AC_TIMER); } -static DEVICE_ATTR(ac_alarm, S_IRUSR | S_IWUSR, ac_alarm_show, ac_alarm_store); +static DEVICE_ATTR_RW(ac_alarm); static ssize_t ac_policy_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -462,7 +462,7 @@ static ssize_t ac_policy_show(struct device *dev, struct device_attribute *attr, return acpi_tad_policy_read(dev, buf, ACPI_TAD_AC_TIMER); } -static DEVICE_ATTR(ac_policy, S_IRUSR | S_IWUSR, ac_policy_show, ac_policy_store); +static DEVICE_ATTR_RW(ac_policy); static ssize_t ac_status_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -478,7 +478,7 @@ static ssize_t ac_status_show(struct device *dev, struct device_attribute *attr, return acpi_tad_status_read(dev, buf, ACPI_TAD_AC_TIMER); } -static DEVICE_ATTR(ac_status, S_IRUSR | 
S_IWUSR, ac_status_show, ac_status_store); +static DEVICE_ATTR_RW(ac_status); static struct attribute *acpi_tad_attrs[] = { &dev_attr_caps.attr, @@ -505,7 +505,7 @@ static ssize_t dc_alarm_show(struct device *dev, struct device_attribute *attr, return acpi_tad_alarm_read(dev, buf, ACPI_TAD_DC_TIMER); } -static DEVICE_ATTR(dc_alarm, S_IRUSR | S_IWUSR, dc_alarm_show, dc_alarm_store); +static DEVICE_ATTR_RW(dc_alarm); static ssize_t dc_policy_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -521,7 +521,7 @@ static ssize_t dc_policy_show(struct device *dev, struct device_attribute *attr, return acpi_tad_policy_read(dev, buf, ACPI_TAD_DC_TIMER); } -static DEVICE_ATTR(dc_policy, S_IRUSR | S_IWUSR, dc_policy_show, dc_policy_store); +static DEVICE_ATTR_RW(dc_policy); static ssize_t dc_status_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -537,7 +537,7 @@ static ssize_t dc_status_show(struct device *dev, struct device_attribute *attr, return acpi_tad_status_read(dev, buf, ACPI_TAD_DC_TIMER); } -static DEVICE_ATTR(dc_status, S_IRUSR | S_IWUSR, dc_status_show, dc_status_store); +static DEVICE_ATTR_RW(dc_status); static struct attribute *acpi_tad_dc_attrs[] = { &dev_attr_dc_alarm.attr, diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index 251f961c28cc..19bb7f870204 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -15,40 +15,40 @@ static void *bgrt_image; static struct kobject *bgrt_kobj; -static ssize_t show_version(struct device *dev, +static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.version); } -static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); +static DEVICE_ATTR_RO(version); -static ssize_t show_status(struct device *dev, +static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.status); } 
-static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); +static DEVICE_ATTR_RO(status); -static ssize_t show_type(struct device *dev, +static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_type); } -static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static DEVICE_ATTR_RO(type); -static ssize_t show_xoffset(struct device *dev, +static ssize_t xoffset_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_offset_x); } -static DEVICE_ATTR(xoffset, S_IRUGO, show_xoffset, NULL); +static DEVICE_ATTR_RO(xoffset); -static ssize_t show_yoffset(struct device *dev, +static ssize_t yoffset_show(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", bgrt_tab.image_offset_y); } -static DEVICE_ATTR(yoffset, S_IRUGO, show_yoffset, NULL); +static DEVICE_ATTR_RO(yoffset); static ssize_t image_read(struct file *file, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) diff --git a/drivers/acpi/device_sysfs.c b/drivers/acpi/device_sysfs.c index 96869f1538b9..a25f108240e4 100644 --- a/drivers/acpi/device_sysfs.c +++ b/drivers/acpi/device_sysfs.c @@ -333,11 +333,11 @@ int acpi_device_modalias(struct device *dev, char *buf, int size) EXPORT_SYMBOL_GPL(acpi_device_modalias); static ssize_t -acpi_device_modalias_show(struct device *dev, struct device_attribute *attr, char *buf) +modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { return __acpi_device_modalias(to_acpi_device(dev), buf, 1024); } -static DEVICE_ATTR(modalias, 0444, acpi_device_modalias_show, NULL); +static DEVICE_ATTR_RO(modalias); static ssize_t real_power_state_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -366,8 +366,8 @@ static ssize_t power_state_show(struct device *dev, static DEVICE_ATTR_RO(power_state); static ssize_t 
-acpi_eject_store(struct device *d, struct device_attribute *attr, - const char *buf, size_t count) +eject_store(struct device *d, struct device_attribute *attr, + const char *buf, size_t count) { struct acpi_device *acpi_device = to_acpi_device(d); acpi_object_type not_used; @@ -395,28 +395,28 @@ acpi_eject_store(struct device *d, struct device_attribute *attr, return status == AE_NO_MEMORY ? -ENOMEM : -EAGAIN; } -static DEVICE_ATTR(eject, 0200, NULL, acpi_eject_store); +static DEVICE_ATTR_WO(eject); static ssize_t -acpi_device_hid_show(struct device *dev, struct device_attribute *attr, char *buf) +hid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); return sprintf(buf, "%s\n", acpi_device_hid(acpi_dev)); } -static DEVICE_ATTR(hid, 0444, acpi_device_hid_show, NULL); +static DEVICE_ATTR_RO(hid); -static ssize_t acpi_device_uid_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t uid_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); return sprintf(buf, "%s\n", acpi_dev->pnp.unique_id); } -static DEVICE_ATTR(uid, 0444, acpi_device_uid_show, NULL); +static DEVICE_ATTR_RO(uid); -static ssize_t acpi_device_adr_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t adr_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); @@ -425,16 +425,16 @@ static ssize_t acpi_device_adr_show(struct device *dev, else return sprintf(buf, "0x%08llx\n", acpi_dev->pnp.bus_address); } -static DEVICE_ATTR(adr, 0444, acpi_device_adr_show, NULL); +static DEVICE_ATTR_RO(adr); -static ssize_t acpi_device_path_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t path_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); 
return acpi_object_path(acpi_dev->handle, buf); } -static DEVICE_ATTR(path, 0444, acpi_device_path_show, NULL); +static DEVICE_ATTR_RO(path); /* sysfs file that shows description text from the ACPI _STR method */ static ssize_t description_show(struct device *dev, @@ -463,8 +463,8 @@ static ssize_t description_show(struct device *dev, static DEVICE_ATTR_RO(description); static ssize_t -acpi_device_sun_show(struct device *dev, struct device_attribute *attr, - char *buf) { +sun_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); acpi_status status; unsigned long long sun; @@ -475,11 +475,11 @@ acpi_device_sun_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%llu\n", sun); } -static DEVICE_ATTR(sun, 0444, acpi_device_sun_show, NULL); +static DEVICE_ATTR_RO(sun); static ssize_t -acpi_device_hrv_show(struct device *dev, struct device_attribute *attr, - char *buf) { +hrv_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct acpi_device *acpi_dev = to_acpi_device(dev); acpi_status status; unsigned long long hrv; @@ -490,7 +490,7 @@ acpi_device_hrv_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%llu\n", hrv); } -static DEVICE_ATTR(hrv, 0444, acpi_device_hrv_show, NULL); +static DEVICE_ATTR_RO(hrv); static ssize_t status_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c index 24e076f44d23..0937ceab052e 100644 --- a/drivers/acpi/dock.c +++ b/drivers/acpi/dock.c @@ -484,7 +484,7 @@ int dock_notify(struct acpi_device *adev, u32 event) /* * show_docked - read method for "docked" file in sysfs */ -static ssize_t show_docked(struct device *dev, +static ssize_t docked_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dock_station *dock_station = dev->platform_data; @@ -493,25 +493,25 @@ static ssize_t show_docked(struct device *dev, 
acpi_bus_get_device(dock_station->handle, &adev); return snprintf(buf, PAGE_SIZE, "%u\n", acpi_device_enumerated(adev)); } -static DEVICE_ATTR(docked, S_IRUGO, show_docked, NULL); +static DEVICE_ATTR_RO(docked); /* * show_flags - read method for flags file in sysfs */ -static ssize_t show_flags(struct device *dev, +static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct dock_station *dock_station = dev->platform_data; return snprintf(buf, PAGE_SIZE, "%d\n", dock_station->flags); } -static DEVICE_ATTR(flags, S_IRUGO, show_flags, NULL); +static DEVICE_ATTR_RO(flags); /* * write_undock - write method for "undock" file in sysfs */ -static ssize_t write_undock(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t undock_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { int ret; struct dock_station *dock_station = dev->platform_data; @@ -525,13 +525,13 @@ static ssize_t write_undock(struct device *dev, struct device_attribute *attr, acpi_scan_lock_release(); return ret ? 
ret: count; } -static DEVICE_ATTR(undock, S_IWUSR, NULL, write_undock); +static DEVICE_ATTR_WO(undock); /* * show_dock_uid - read method for "uid" file in sysfs */ -static ssize_t show_dock_uid(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t uid_show(struct device *dev, + struct device_attribute *attr, char *buf) { unsigned long long lbuf; struct dock_station *dock_station = dev->platform_data; @@ -542,10 +542,10 @@ static ssize_t show_dock_uid(struct device *dev, return snprintf(buf, PAGE_SIZE, "%llx\n", lbuf); } -static DEVICE_ATTR(uid, S_IRUGO, show_dock_uid, NULL); +static DEVICE_ATTR_RO(uid); -static ssize_t show_dock_type(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t type_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct dock_station *dock_station = dev->platform_data; char *type; @@ -561,7 +561,7 @@ static ssize_t show_dock_type(struct device *dev, return snprintf(buf, PAGE_SIZE, "%s\n", type); } -static DEVICE_ATTR(type, S_IRUGO, show_dock_type, NULL); +static DEVICE_ATTR_RO(type); static struct attribute *dock_attributes[] = { &dev_attr_docked.attr, diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 189a0d4c6d06..3a7d0d703059 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -886,15 +886,16 @@ static void acpi_release_power_resource(struct device *dev) kfree(resource); } -static ssize_t acpi_power_in_use_show(struct device *dev, - struct device_attribute *attr, - char *buf) { +static ssize_t resource_in_use_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ struct acpi_power_resource *resource; resource = to_power_resource(to_acpi_device(dev)); return sprintf(buf, "%u\n", !!resource->ref_count); } -static DEVICE_ATTR(resource_in_use, 0444, acpi_power_in_use_show, NULL); +static DEVICE_ATTR_RO(resource_in_use); static void acpi_power_sysfs_remove(struct acpi_device *device) { From 
b1f4213cfa2a21d07fc34519cb8c6c999f8784b1 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 21 Dec 2020 08:03:02 -0800 Subject: [PATCH 054/307] PNP: add printf attribute to log function Attributing the function allows the compiler to more thoroughly check the use of the function with -Wformat and similar flags. Signed-off-by: Tom Rix Signed-off-by: Rafael J. Wysocki --- drivers/pnp/interface.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c index 187e4a1175b0..602c46893e83 100644 --- a/drivers/pnp/interface.c +++ b/drivers/pnp/interface.c @@ -33,6 +33,7 @@ struct pnp_info_buffer { typedef struct pnp_info_buffer pnp_info_buffer_t; +__printf(2, 3) static int pnp_printf(pnp_info_buffer_t * buffer, char *fmt, ...) { va_list args; From 96228223933bf5ac920f93862c82449ec28247c0 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Mon, 28 Dec 2020 21:50:26 +0800 Subject: [PATCH 055/307] PNP: pnpbios: Use DEFINE_SPINLOCK() for spinlock spinlock can be initialized automatically with DEFINE_SPINLOCK() rather than explicitly calling spin_lock_init(). Signed-off-by: Zheng Yongjun Signed-off-by: Rafael J. 
Wysocki --- drivers/pnp/pnpbios/bioscalls.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index ba5cfc3dbe11..ddc6f2163c8e 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -72,7 +72,7 @@ __visible u32 pnp_bios_fault_esp; __visible u32 pnp_bios_fault_eip; __visible u32 pnp_bios_is_utter_crap = 0; -static spinlock_t pnp_bios_lock; +static DEFINE_SPINLOCK(pnp_bios_lock); /* * Support Functions @@ -473,7 +473,6 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) { int i; - spin_lock_init(&pnp_bios_lock); pnp_bios_callpoint.offset = header->fields.pm16offset; pnp_bios_callpoint.segment = PNP_CS16; From d8f85cc021afbb3697858672e1a11802f2568d91 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 7 Jan 2021 11:17:15 +0000 Subject: [PATCH 056/307] ACPI: CPPC: remove __iomem annotation for cpc_reg's address The cpc_reg address does not represent either an I/O virtual address, nor a field located in iomem. This address is used as an address offset which eventually is given as physical address argument to ioremap or PCC space offset to GET_PCC_VADDR. Therefore, having the __iomem annotation does not make sense. Fix the following sparse warnings by removing the __iomem annotation for cpc_reg's address. 
drivers/acpi/cppc_acpi.c:762:37: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:765:48: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:948:25: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:954:67: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:987:25: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:993:68: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:1120:13: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:1134:13: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:1137:13: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:1182:14: warning: dereference of noderef expression drivers/acpi/cppc_acpi.c:1212:13: warning: dereference of noderef expression Suggested-by: Al Viro Signed-off-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki --- include/acpi/cppc_acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 232838d28f50..c7fc4524e151 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -39,7 +39,7 @@ struct cpc_reg { u8 bit_width; u8 bit_offset; u8 access_width; - u64 __iomem address; + u64 address; } __packed; /* From 1d9b4abefcca19187e219c3132f3b0593992e95e Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 7 Jan 2021 11:17:16 +0000 Subject: [PATCH 057/307] ACPI: CPPC: add __iomem annotation to generic_comm_base pointer pcc_comm_addr is a virtual address to the PCC space and it's annotated with __iomem. Therefore, generic_comm_base, which gets assigned the value of pcc_comm_addr, should be annotated as well. 
This already happens in check_pcc_chan(), but not in send_pcc_cmd(), which results in the following sparse warnings: drivers/acpi/cppc_acpi.c:237:18: warning: cast removes address space '__iomem' of expression drivers/acpi/cppc_acpi.c:299:9: warning: incorrect type in argument 2 (different address spaces) drivers/acpi/cppc_acpi.c:299:9: expected void volatile [noderef] __iomem *addr drivers/acpi/cppc_acpi.c:299:9: got unsigned short * drivers/acpi/cppc_acpi.c:302:9: warning: incorrect type in argument 2 (different address spaces) drivers/acpi/cppc_acpi.c:302:9: expected void volatile [noderef] __iomem *addr drivers/acpi/cppc_acpi.c:302:9: got unsigned short * Signed-off-by: Ionela Voinescu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/cppc_acpi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 75aaf94ae0a9..fd71020f5d5f 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -233,8 +233,8 @@ static int send_pcc_cmd(int pcc_ss_id, u16 cmd) { int ret = -EIO, i; struct cppc_pcc_data *pcc_ss_data = pcc_data[pcc_ss_id]; - struct acpi_pcct_shared_memory *generic_comm_base = - (struct acpi_pcct_shared_memory *)pcc_ss_data->pcc_comm_addr; + struct acpi_pcct_shared_memory __iomem *generic_comm_base = + pcc_ss_data->pcc_comm_addr; unsigned int time_delta; /* From 26692cd93265a5d1227da8400f32efb00f57bf83 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 7 Jan 2021 11:17:17 +0000 Subject: [PATCH 058/307] ACPI: CPPC: initialise vaddr pointers to NULL Properly initialise vaddr pointers in cpc_read() and cpc_write() to NULL instead of 0. This fixes the following sparse warnings: drivers/acpi/cppc_acpi.c:937:31: warning: Using plain integer as NULL pointer drivers/acpi/cppc_acpi.c:982:31: warning: Using plain integer as NULL pointer Signed-off-by: Ionela Voinescu Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/cppc_acpi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index fd71020f5d5f..69057fcd2c04 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -934,7 +934,7 @@ int __weak cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) { int ret_val = 0; - void __iomem *vaddr = 0; + void __iomem *vaddr = NULL; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cpc_reg *reg = &reg_res->cpc_entry.reg; @@ -979,7 +979,7 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val) { int ret_val = 0; - void __iomem *vaddr = 0; + void __iomem *vaddr = NULL; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cpc_reg *reg = &reg_res->cpc_entry.reg; From 731e97e0769805cdebfd7d2b19a7d3f6abcace09 Mon Sep 17 00:00:00 2001 From: Flavio Suligoi Date: Fri, 8 Jan 2021 16:24:47 +0100 Subject: [PATCH 059/307] Documentation: ACPI: add new rule for gpio-line-names The gpio-line-names lists must respect some rules. This patch adds a new rule in documentation, to avoid the use of duplicate names in the same gpiochip. Signed-off-by: Flavio Suligoi Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- Documentation/firmware-guide/acpi/gpio-properties.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/firmware-guide/acpi/gpio-properties.rst b/Documentation/firmware-guide/acpi/gpio-properties.rst index b36aa3e743d8..4e264c16ddff 100644 --- a/Documentation/firmware-guide/acpi/gpio-properties.rst +++ b/Documentation/firmware-guide/acpi/gpio-properties.rst @@ -146,6 +146,7 @@ following rules (see also the examples): other words, it is not mandatory to fill all the GPIO lines - empty names are allowed (two quotation marks ``""`` correspond to an empty name) + - names inside one GPIO controller/expander must be unique Example of a GPIO controller of 16 lines, with an incomplete list with two empty names:: From 67e40054de86aae520ddc2a072d7f6951812a14f Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Fri, 15 Jan 2021 10:22:50 +0800 Subject: [PATCH 060/307] ACPI: configfs: add missing check after configfs_register_default_group() A list_add corruption is reported by Hulk Robot like this: ============== list_add corruption. Call Trace: link_obj+0xc0/0x1c0 link_group+0x21/0x140 configfs_register_subsystem+0xdb/0x380 acpi_configfs_init+0x25/0x1000 [acpi_configfs] do_one_initcall+0x149/0x820 do_init_module+0x1ef/0x720 load_module+0x35c8/0x4380 __do_sys_finit_module+0x10d/0x1a0 do_syscall_64+0x34/0x80 It's because of the missing check after configfs_register_default_group(), where configfs_unregister_subsystem() should be called on failure. Fixes: 612bd01fc6e0 ("ACPI: add support for loading SSDTs via configfs") Reported-by: Hulk Robot Suggested-by: Hanjun Guo Signed-off-by: Qinglang Miao Cc: 4.10+ # 4.10+ Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/acpi_configfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_configfs.c b/drivers/acpi/acpi_configfs.c index cf91f49101ea..3a14859dbb75 100644 --- a/drivers/acpi/acpi_configfs.c +++ b/drivers/acpi/acpi_configfs.c @@ -268,7 +268,12 @@ static int __init acpi_configfs_init(void) acpi_table_group = configfs_register_default_group(root, "table", &acpi_tables_type); - return PTR_ERR_OR_ZERO(acpi_table_group); + if (IS_ERR(acpi_table_group)) { + configfs_unregister_subsystem(&acpi_configfs); + return PTR_ERR(acpi_table_group); + } + + return 0; } module_init(acpi_configfs_init); From 651bc5816c39e57833fea4478c8ecfb72ad47e44 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 15 Jan 2021 15:56:46 -0800 Subject: [PATCH 061/307] intel_idle: remove definition of DEBUG Defining DEBUG should only be done in development. So remove DEBUG. Signed-off-by: Tom Rix Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 28f93b9aa51b..3273360f30f7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -37,7 +37,7 @@ */ /* un-comment DEBUG to enable pr_debug() statements */ -#define DEBUG +/* #define DEBUG */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt From 10aa694ea0d0adfbd97400fb39ea237a273c335f Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 20 Jan 2021 20:03:12 +0530 Subject: [PATCH 062/307] PM: runtime: Fix resposible -> responsible in runtime.c s/resposible/responsible/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap [ rjw: Subject edit ] Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index bfda153b1a41..a46a7e30881b 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1100,7 +1100,7 @@ EXPORT_SYMBOL_GPL(__pm_runtime_resume); * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * - * The caller is resposible for decrementing the runtime PM usage counter of + * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) From 75a8d877d65732b9669a0ebaa36311f12011fdcd Mon Sep 17 00:00:00 2001 From: Nigel Christian Date: Sat, 16 Jan 2021 19:47:05 -0500 Subject: [PATCH 063/307] cpufreq: intel_pstate: Remove repeated word In the comment for trace in passive mode there is an unnecessary "the". Eradicate it. Signed-off-by: Nigel Christian Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 6f2ff2775664..5175ae3cac44 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2498,7 +2498,7 @@ static int intel_cpufreq_verify_policy(struct cpufreq_policy_data *policy) * driver call was via the normal or fast switch path. Various graphs * output from the intel_pstate_tracer.py utility that include core_busy * (or performance or core_avg_perf) have a fixed y-axis from 0 to 100%, - * so we use 10 to indicate the the normal path through the driver, and + * so we use 10 to indicate the normal path through the driver, and * 90 to indicate the fast switch path through the driver. * The scaled_busy field is not used, and is set to 0. 
*/ From 67e3242ee28052daa90e1fa193efc601939fde58 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Wed, 20 Jan 2021 08:50:41 -0700 Subject: [PATCH 064/307] PM: domains: inform PM domain of a device's next wakeup Some devices may have a predictable interrupt pattern while executing usecases. An example would be the VSYNC interrupt associated with display devices. A 60 Hz display could cause an interrupt every 16 ms. If the device were in a PM domain, the domain would need to be powered up for the device to resume and handle the interrupt. Entering a domain idle state saves power only if the residency of the idle state is met. Without knowing the idle duration of the domain, the governor would just choose the deepest idle state that matches the QoS requirements. The domain might be powered off just as the device is expecting to wake up. If devices could inform PM frameworks of their next event, the parent PM domain's idle duration can be determined. So let's add the dev_pm_genpd_set_next_wakeup() API for the device to inform PM domains of the impending wakeup. This information will be used by the domain governor to determine the best idle state given the wakeup. Signed-off-by: Lina Iyer Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 30 ++++++++++++++++++++++++++++++ include/linux/pm_domain.h | 6 ++++++ 2 files changed, 36 insertions(+) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 9a14eedacb92..014033c7c287 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -423,6 +423,35 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state) } EXPORT_SYMBOL_GPL(dev_pm_genpd_set_performance_state); +/** + * dev_pm_genpd_set_next_wakeup - Notify PM framework of an impending wakeup. + * + * @dev: Device to handle + * @next: impending interrupt/wakeup for the device + * + * + * Allow devices to inform of the next wakeup. 
It's assumed that the users + * guarantee that the genpd wouldn't be detached while this routine is getting + * called. Additionally, it's also assumed that @dev isn't runtime suspended + * (RPM_SUSPENDED)." + * Although devices are expected to update the next_wakeup after the end of + * their usecase as well, it is possible the devices themselves may not know + * about that, so stale @next will be ignored when powering off the domain. + */ +void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next) +{ + struct generic_pm_domain_data *gpd_data; + struct generic_pm_domain *genpd; + + genpd = dev_to_genpd_safe(dev); + if (!genpd) + return; + + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + gpd_data->next_wakeup = next; +} +EXPORT_SYMBOL_GPL(dev_pm_genpd_set_next_wakeup); + static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed) { unsigned int state_idx = genpd->state_idx; @@ -1465,6 +1494,7 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev) gpd_data->td.constraint_changed = true; gpd_data->td.effective_constraint_ns = PM_QOS_RESUME_LATENCY_NO_CONSTRAINT_NS; gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; + gpd_data->next_wakeup = KTIME_MAX; spin_lock_irq(&dev->power.lock); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 2ca919ae8d36..735583c0bc6d 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -9,6 +9,7 @@ #define _LINUX_PM_DOMAIN_H #include +#include #include #include #include @@ -191,6 +192,7 @@ struct generic_pm_domain_data { struct notifier_block *power_nb; int cpu; unsigned int performance_state; + ktime_t next_wakeup; void *data; }; @@ -217,6 +219,7 @@ int pm_genpd_remove(struct generic_pm_domain *genpd); int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state); int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb); int dev_pm_genpd_remove_notifier(struct device *dev); +void 
dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next); extern struct dev_power_governor simple_qos_governor; extern struct dev_power_governor pm_domain_always_on_gov; @@ -275,6 +278,9 @@ static inline int dev_pm_genpd_remove_notifier(struct device *dev) return -EOPNOTSUPP; } +static inline void dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next) +{ } + #define simple_qos_governor (*(struct dev_power_governor *)(NULL)) #define pm_domain_always_on_gov (*(struct dev_power_governor *)(NULL)) #endif From c79aa080fb0f60a0e24c87014dc9c2f373e1379b Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Wed, 20 Jan 2021 08:50:42 -0700 Subject: [PATCH 065/307] PM: domains: use device's next wakeup to determine domain idle state Currently, a PM domain's idle state is determined based on whether the QoS requirements are met. However, even entering an idle state may waste power if the minimum residency requirements aren't fulfilled. CPU PM domains use the next timer wakeup for the CPUs in the domain to determine the sleep duration of the domain. This is compared with the idle state residencies to determine the optimal idle state. For other PM domains, determining the sleep length is not that straightforward. But if the device's next_event is available, we can use that to determine the sleep duration of the PM domain. Let's update the domain governor logic to check for idle state residency based on the next wakeup of devices as well as QoS constraints. But since not all domains may contain devices capable of specifying the next wakeup, let's enable this additional check only if specified by the domain's flags when initializing the domain. Signed-off-by: Lina Iyer Reviewed-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/domain_governor.c | 102 ++++++++++++++++++++++++--- include/linux/pm_domain.h | 6 ++ 2 files changed, 99 insertions(+), 9 deletions(-) diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c index 490ed7deb99a..c6c218758f0b 100644 --- a/drivers/base/power/domain_governor.c +++ b/drivers/base/power/domain_governor.c @@ -117,6 +117,55 @@ static bool default_suspend_ok(struct device *dev) return td->cached_suspend_ok; } +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now) +{ + ktime_t domain_wakeup = KTIME_MAX; + ktime_t next_wakeup; + struct pm_domain_data *pdd; + struct gpd_link *link; + + if (!(genpd->flags & GENPD_FLAG_MIN_RESIDENCY)) + return; + + /* + * Devices that have a predictable wakeup pattern, may specify + * their next wakeup. Let's find the next wakeup from all the + * devices attached to this domain and from all the sub-domains. + * It is possible that component's a next wakeup may have become + * stale when we read that here. We will ignore to ensure the domain + * is able to enter its optimal idle state. 
+ */ + list_for_each_entry(pdd, &genpd->dev_list, list_node) { + next_wakeup = to_gpd_data(pdd)->next_wakeup; + if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now)) + if (ktime_before(next_wakeup, domain_wakeup)) + domain_wakeup = next_wakeup; + } + + list_for_each_entry(link, &genpd->parent_links, parent_node) { + next_wakeup = link->child->next_wakeup; + if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now)) + if (ktime_before(next_wakeup, domain_wakeup)) + domain_wakeup = next_wakeup; + } + + genpd->next_wakeup = domain_wakeup; +} + +static bool next_wakeup_allows_state(struct generic_pm_domain *genpd, + unsigned int state, ktime_t now) +{ + ktime_t domain_wakeup = genpd->next_wakeup; + s64 idle_time_ns, min_sleep_ns; + + min_sleep_ns = genpd->states[state].power_off_latency_ns + + genpd->states[state].residency_ns; + + idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now)); + + return idle_time_ns >= min_sleep_ns; +} + static bool __default_power_down_ok(struct dev_pm_domain *pd, unsigned int state) { @@ -201,16 +250,41 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd, } /** - * default_power_down_ok - Default generic PM domain power off governor routine. + * _default_power_down_ok - Default generic PM domain power off governor routine. * @pd: PM domain to check. * * This routine must be executed under the PM domain's lock. */ -static bool default_power_down_ok(struct dev_pm_domain *pd) +static bool _default_power_down_ok(struct dev_pm_domain *pd, ktime_t now) { struct generic_pm_domain *genpd = pd_to_genpd(pd); + int state_idx = genpd->state_count - 1; struct gpd_link *link; + /* + * Find the next wakeup from devices that can determine their own wakeup + * to find when the domain would wakeup and do it for every device down + * the hierarchy. It is not worth while to sleep if the state's residency + * cannot be met. 
+ */ + update_domain_next_wakeup(genpd, now); + if ((genpd->flags & GENPD_FLAG_MIN_RESIDENCY) && (genpd->next_wakeup != KTIME_MAX)) { + /* Let's find out the deepest domain idle state, the devices prefer */ + while (state_idx >= 0) { + if (next_wakeup_allows_state(genpd, state_idx, now)) { + genpd->max_off_time_changed = true; + break; + } + state_idx--; + } + + if (state_idx < 0) { + state_idx = 0; + genpd->cached_power_down_ok = false; + goto done; + } + } + if (!genpd->max_off_time_changed) { genpd->state_idx = genpd->cached_power_down_state_idx; return genpd->cached_power_down_ok; @@ -228,21 +302,30 @@ static bool default_power_down_ok(struct dev_pm_domain *pd) genpd->max_off_time_ns = -1; genpd->max_off_time_changed = false; genpd->cached_power_down_ok = true; - genpd->state_idx = genpd->state_count - 1; - /* Find a state to power down to, starting from the deepest. */ - while (!__default_power_down_ok(pd, genpd->state_idx)) { - if (genpd->state_idx == 0) { + /* + * Find a state to power down to, starting from the state + * determined by the next wakeup. + */ + while (!__default_power_down_ok(pd, state_idx)) { + if (state_idx == 0) { genpd->cached_power_down_ok = false; break; } - genpd->state_idx--; + state_idx--; } +done: + genpd->state_idx = state_idx; genpd->cached_power_down_state_idx = genpd->state_idx; return genpd->cached_power_down_ok; } +static bool default_power_down_ok(struct dev_pm_domain *pd) +{ + return _default_power_down_ok(pd, ktime_get()); +} + static bool always_on_power_down_ok(struct dev_pm_domain *domain) { return false; @@ -254,11 +337,12 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) struct generic_pm_domain *genpd = pd_to_genpd(pd); struct cpuidle_device *dev; ktime_t domain_wakeup, next_hrtimer; + ktime_t now = ktime_get(); s64 idle_duration_ns; int cpu, i; /* Validate dev PM QoS constraints. 
*/ - if (!default_power_down_ok(pd)) + if (!_default_power_down_ok(pd, now)) return false; if (!(genpd->flags & GENPD_FLAG_CPU_DOMAIN)) @@ -280,7 +364,7 @@ static bool cpu_power_down_ok(struct dev_pm_domain *pd) } /* The minimum idle duration is from now - until the next wakeup. */ - idle_duration_ns = ktime_to_ns(ktime_sub(domain_wakeup, ktime_get())); + idle_duration_ns = ktime_to_ns(ktime_sub(domain_wakeup, now)); if (idle_duration_ns <= 0) return false; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 735583c0bc6d..dfcfbcecc34b 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -56,6 +56,10 @@ * * GENPD_FLAG_RPM_ALWAYS_ON: Instructs genpd to always keep the PM domain * powered on except for system suspend. + * + * GENPD_FLAG_MIN_RESIDENCY: Enable the genpd governor to consider its + * components' next wakeup when determining the + * optimal idle state. */ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -63,6 +67,7 @@ #define GENPD_FLAG_ACTIVE_WAKEUP (1U << 3) #define GENPD_FLAG_CPU_DOMAIN (1U << 4) #define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5) +#define GENPD_FLAG_MIN_RESIDENCY (1U << 6) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ @@ -130,6 +135,7 @@ struct generic_pm_domain { unsigned int state); struct gpd_dev_ops dev_ops; s64 max_off_time_ns; /* Maximum allowed "suspended" time. */ + ktime_t next_wakeup; /* Maintained by the domain governor */ bool max_off_time_changed; bool cached_power_down_ok; bool cached_power_down_state_idx; From 079c42a0ed73500f1d11b5564e31d56c52bee21e Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Jan 2021 00:12:30 +0300 Subject: [PATCH 066/307] PM: domains: Make set_performance_state() callback optional Make set_performance_state() callback optional in order to remove the need from power domain drivers to implement a dummy callback. 
If callback isn't implemented by a GENPD driver, then the performance state is passed to the parent domain. Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar [tested on NVIDIA Tegra20/30/124 SoCs] Suggested-by: Ulf Hansson Reviewed-by: Ulf Hansson Signed-off-by: Dmitry Osipenko Reviewed-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 014033c7c287..4878c824e66c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -297,6 +297,18 @@ static int _genpd_reeval_performance_state(struct generic_pm_domain *genpd, return state; } +static int genpd_xlate_performance_state(struct generic_pm_domain *genpd, + struct generic_pm_domain *parent, + unsigned int pstate) +{ + if (!parent->set_performance_state) + return pstate; + + return dev_pm_opp_xlate_performance_state(genpd->opp_table, + parent->opp_table, + pstate); +} + static int _genpd_set_performance_state(struct generic_pm_domain *genpd, unsigned int state, int depth) { @@ -311,13 +323,8 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, list_for_each_entry(link, &genpd->child_links, child_node) { parent = link->parent; - if (!parent->set_performance_state) - continue; - /* Find parent's performance state */ - ret = dev_pm_opp_xlate_performance_state(genpd->opp_table, - parent->opp_table, - state); + ret = genpd_xlate_performance_state(genpd, parent, state); if (unlikely(ret < 0)) goto err; @@ -339,9 +346,11 @@ static int _genpd_set_performance_state(struct generic_pm_domain *genpd, goto err; } - ret = genpd->set_performance_state(genpd, state); - if (ret) - goto err; + if (genpd->set_performance_state) { + ret = genpd->set_performance_state(genpd, state); + if (ret) + goto err; + } genpd->performance_state = state; return 0; @@ -352,9 +361,6 @@ err: 
child_node) { parent = link->parent; - if (!parent->set_performance_state) - continue; - genpd_lock_nested(parent, depth + 1); parent_state = link->prev_performance_state; @@ -399,9 +405,6 @@ int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state) if (!genpd) return -ENODEV; - if (unlikely(!genpd->set_performance_state)) - return -EINVAL; - if (WARN_ON(!dev->power.subsys_data || !dev->power.subsys_data->domain_data)) return -EINVAL; From 18027d6f392ee8d89d9df4dff0a7db4fb2d6f8a5 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Jan 2021 00:12:31 +0300 Subject: [PATCH 067/307] PM: domains: Make of_genpd_add_subdomain() return -EPROBE_DEFER Driver of a power domain provider may not be ready at the time of of_genpd_add_subdomain() invocation. Make this function to return -EPROBE_DEFER instead of -ENOENT in order to remove a need from power domain drivers to handle the error code specially. Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar [tested on NVIDIA Tegra20/30/124 SoCs] Suggested-by: Ulf Hansson Reviewed-by: Ulf Hansson Reviewed-by: Viresh Kumar Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 4878c824e66c..c615abf56c52 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2496,7 +2496,7 @@ int of_genpd_add_subdomain(struct of_phandle_args *parent_spec, out: mutex_unlock(&gpd_list_lock); - return ret; + return ret == -ENOENT ? -EPROBE_DEFER : ret; } EXPORT_SYMBOL_GPL(of_genpd_add_subdomain); From 45fbc464b047b3fbd760c9cb460a50a1ef2cf933 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Jan 2021 00:12:32 +0300 Subject: [PATCH 068/307] PM: domains: Add "performance" column to debug summary Add "performance" column to debug summary which shows performance state of all power domains and their devices.
Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar [tested on NVIDIA Tegra20/30/124 SoCs] Reviewed-by: Ulf Hansson Reviewed-by: Viresh Kumar Signed-off-by: Dmitry Osipenko Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index c615abf56c52..50211a402fa5 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -2985,7 +2985,15 @@ static void rtpm_status_str(struct seq_file *s, struct device *dev) else WARN_ON(1); - seq_puts(s, p); + seq_printf(s, "%-25s ", p); +} + +static void perf_status_str(struct seq_file *s, struct device *dev) +{ + struct generic_pm_domain_data *gpd_data; + + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + seq_put_decimal_ull(s, "", gpd_data->performance_state); } static int genpd_summary_one(struct seq_file *s, @@ -3013,7 +3021,7 @@ static int genpd_summary_one(struct seq_file *s, else snprintf(state, sizeof(state), "%s", status_lookup[genpd->status]); - seq_printf(s, "%-30s %-15s ", genpd->name, state); + seq_printf(s, "%-30s %-50s %u", genpd->name, state, genpd->performance_state); /* * Modifications on the list require holding locks on both @@ -3021,6 +3029,8 @@ static int genpd_summary_one(struct seq_file *s, * Also genpd->name is immutable. 
*/ list_for_each_entry(link, &genpd->parent_links, parent_node) { + if (list_is_first(&link->parent_node, &genpd->parent_links)) + seq_printf(s, "\n%48s", " "); seq_printf(s, "%s", link->child->name); if (!list_is_last(&link->parent_node, &genpd->parent_links)) seq_puts(s, ", "); @@ -3035,6 +3045,7 @@ static int genpd_summary_one(struct seq_file *s, seq_printf(s, "\n %-50s ", kobj_path); rtpm_status_str(s, pm_data->dev); + perf_status_str(s, pm_data->dev); kfree(kobj_path); } @@ -3050,9 +3061,9 @@ static int summary_show(struct seq_file *s, void *data) struct generic_pm_domain *genpd; int ret = 0; - seq_puts(s, "domain status children\n"); + seq_puts(s, "domain status children performance\n"); seq_puts(s, " /device runtime status\n"); - seq_puts(s, "----------------------------------------------------------------------\n"); + seq_puts(s, "----------------------------------------------------------------------------------------------\n"); ret = mutex_lock_interruptible(&gpd_list_lock); if (ret) From 9d56653d14cd5e545599cd9e3013daa17df50cd4 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 25 Jan 2021 12:59:56 +0100 Subject: [PATCH 069/307] ACPI: platform-profile: Drop const qualifier for cur_profile Drop the const qualifier from the static global cur_profile pointer declaration. This is a preparation patch for passing the cur_profile pointer as parameter to the profile_get() and profile_set() callbacks so that drivers dynamically allocating their driver-data struct, with their platform_profile_handler struct embedded, can use this pointer to get to their driver-data. Note this also requires dropping the const from the pprof platform_profile_register() function argument. Dropping this const is not a problem, non of the queued up consumers of platform_profile_register() actually pass in a const pointer. 
Link: https://lore.kernel.org/linux-acpi/5e7a4d87-52ef-e487-9cc2-8e7094beaa08@redhat.com/ Link: https://lore.kernel.org/r/20210114073429.176462-2-jiaxun.yang@flygoat.com Suggested-by: Hans de Goede Signed-off-by: Jiaxun Yang [ hdegoede@redhat.com: Also remove const from platform_profile_register() ] Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/platform_profile.c | 4 ++-- include/linux/platform_profile.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index 91be50a32cc8..f65c61db7921 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -9,7 +9,7 @@ #include #include -static const struct platform_profile_handler *cur_profile; +static struct platform_profile_handler *cur_profile; static DEFINE_MUTEX(profile_lock); static const char * const profile_names[] = { @@ -132,7 +132,7 @@ void platform_profile_notify(void) } EXPORT_SYMBOL_GPL(platform_profile_notify); -int platform_profile_register(const struct platform_profile_handler *pprof) +int platform_profile_register(struct platform_profile_handler *pprof) { int err; diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 3623d7108421..c797fdb3d91a 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -32,7 +32,7 @@ struct platform_profile_handler { int (*profile_set)(enum platform_profile_option profile); }; -int platform_profile_register(const struct platform_profile_handler *pprof); +int platform_profile_register(struct platform_profile_handler *pprof); int platform_profile_remove(void); void platform_profile_notify(void); From c1013ff7a5472db637c56bb6237f8343398c03a7 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 14 Jan 2021 19:46:47 +0100 Subject: [PATCH 070/307] ACPI: scan: Rearrange memory allocation in acpi_device_add() The upfront allocation of new_bus_id is done to avoid allocating memory under acpi_device_lock, but it doesn't really help, because (1) it leads to many unnecessary memory allocations for _ADR devices, (2) kstrdup_const() is run under that lock anyway and (3) it complicates the code. Rearrange acpi_device_add() to allocate memory for a new struct acpi_device_bus_id instance only when necessary, eliminate a redundant local variable from it and reduce the number of labels in there. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede --- drivers/acpi/scan.c | 57 +++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 1db063b02f63..0fb1811772b5 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -623,12 +623,23 @@ void acpi_bus_put_acpi_device(struct acpi_device *adev) put_device(&adev->dev); } +static struct acpi_device_bus_id *acpi_device_bus_id_match(const char *dev_id) +{ + struct acpi_device_bus_id *acpi_device_bus_id; + + /* Find suitable bus_id and instance number in acpi_bus_id_list. 
*/ + list_for_each_entry(acpi_device_bus_id, &acpi_bus_id_list, node) { + if (!strcmp(acpi_device_bus_id->bus_id, dev_id)) + return acpi_device_bus_id; + } + return NULL; +} + int acpi_device_add(struct acpi_device *device, void (*release)(struct device *)) { + struct acpi_device_bus_id *acpi_device_bus_id; int result; - struct acpi_device_bus_id *acpi_device_bus_id, *new_bus_id; - int found = 0; if (device->handle) { acpi_status status; @@ -654,38 +665,26 @@ int acpi_device_add(struct acpi_device *device, INIT_LIST_HEAD(&device->del_list); mutex_init(&device->physical_node_lock); - new_bus_id = kzalloc(sizeof(struct acpi_device_bus_id), GFP_KERNEL); - if (!new_bus_id) { - pr_err(PREFIX "Memory allocation error\n"); - result = -ENOMEM; - goto err_detach; - } - mutex_lock(&acpi_device_lock); - /* - * Find suitable bus_id and instance number in acpi_bus_id_list - * If failed, create one and link it into acpi_bus_id_list - */ - list_for_each_entry(acpi_device_bus_id, &acpi_bus_id_list, node) { - if (!strcmp(acpi_device_bus_id->bus_id, - acpi_device_hid(device))) { - acpi_device_bus_id->instance_no++; - found = 1; - kfree(new_bus_id); - break; + + acpi_device_bus_id = acpi_device_bus_id_match(acpi_device_hid(device)); + if (acpi_device_bus_id) { + acpi_device_bus_id->instance_no++; + } else { + acpi_device_bus_id = kzalloc(sizeof(*acpi_device_bus_id), + GFP_KERNEL); + if (!acpi_device_bus_id) { + result = -ENOMEM; + goto err_unlock; } - } - if (!found) { - acpi_device_bus_id = new_bus_id; acpi_device_bus_id->bus_id = kstrdup_const(acpi_device_hid(device), GFP_KERNEL); if (!acpi_device_bus_id->bus_id) { - pr_err(PREFIX "Memory allocation error for bus id\n"); + kfree(acpi_device_bus_id); result = -ENOMEM; - goto err_free_new_bus_id; + goto err_unlock; } - acpi_device_bus_id->instance_no = 0; list_add_tail(&acpi_device_bus_id->node, &acpi_bus_id_list); } dev_set_name(&device->dev, "%s:%02x", acpi_device_bus_id->bus_id, acpi_device_bus_id->instance_no); @@ -720,13 +719,9 
@@ int acpi_device_add(struct acpi_device *device, list_del(&device->node); list_del(&device->wakeup_list); - err_free_new_bus_id: - if (!found) - kfree(new_bus_id); - + err_unlock: mutex_unlock(&acpi_device_lock); - err_detach: acpi_detach_data(device->handle, acpi_scan_drop_device); return result; } From 5e73c5187cf4f40a5e02b6c8e4dd0fcf9686c006 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Jan 2021 19:47:37 +0100 Subject: [PATCH 071/307] ACPI: scan: Adjust white space in acpi_device_add() Add empty lines in some places in acpi_device_add() to help readability and drop leading spaces before the labels in there. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede --- drivers/acpi/scan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 0fb1811772b5..1510afa7094d 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -694,10 +694,12 @@ int acpi_device_add(struct acpi_device *device, if (device->wakeup.flags.valid) list_add_tail(&device->wakeup_list, &acpi_wakeup_device_list); + mutex_unlock(&acpi_device_lock); if (device->parent) device->dev.parent = &device->parent->dev; + device->dev.bus = &acpi_bus_type; device->dev.release = release; result = device_add(&device->dev); @@ -713,16 +715,19 @@ int acpi_device_add(struct acpi_device *device, return 0; - err: +err: mutex_lock(&acpi_device_lock); + if (device->parent) list_del(&device->node); + list_del(&device->wakeup_list); - err_unlock: +err_unlock: mutex_unlock(&acpi_device_lock); acpi_detach_data(device->handle, acpi_scan_drop_device); + return result; } From 83e2c8fc7ab89458b805e96ab37bccadf84f932b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 18 Jan 2021 20:25:37 +0100 Subject: [PATCH 072/307] ACPI: scan: Rearrange code related to acpi_get_device_data() There are two callers of acpi_get_device_data(), acpi_bus_get_device() and acpi_bus_get_acpi_device(), but only one of them takes the int return value into account. Moreover, the latter knows that it passes a valid return pointer to acpi_get_device_data() and it properly clears that pointer upfront, so it doesn't need acpi_get_device_data() to do that. For this reason, rearrange acpi_get_device_data() to return a struct acpi_device pointer instead of an int and adapt its callers to that. While at it, rename acpi_get_device_data() to handle_to_device(), because the old name does not really reflect the functionality provided by that function. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede --- drivers/acpi/scan.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 1510afa7094d..bcbf0fc215c8 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -578,29 +578,31 @@ static void acpi_scan_drop_device(acpi_handle handle, void *context) mutex_unlock(&acpi_device_del_lock); } -static int acpi_get_device_data(acpi_handle handle, struct acpi_device **device, - void (*callback)(void *)) +static struct acpi_device *handle_to_device(acpi_handle handle, + void (*callback)(void *)) { + struct acpi_device *adev = NULL; acpi_status status; - if (!device) - return -EINVAL; - - *device = NULL; - status = acpi_get_data_full(handle, acpi_scan_drop_device, - (void **)device, callback); - if (ACPI_FAILURE(status) || !*device) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No context for object [%p]\n", - handle)); - return -ENODEV; + (void **)&adev, callback); + if (ACPI_FAILURE(status) || !adev) { + acpi_handle_debug(handle, "No context!\n"); + return NULL; } - return 0; + return adev; } int
acpi_bus_get_device(acpi_handle handle, struct acpi_device **device) { - return acpi_get_device_data(handle, device, NULL); + if (!device) + return -EINVAL; + + *device = handle_to_device(handle, NULL); + if (!*device) + return -ENODEV; + + return 0; } EXPORT_SYMBOL(acpi_bus_get_device); @@ -612,10 +614,7 @@ static void get_acpi_device(void *dev) struct acpi_device *acpi_bus_get_acpi_device(acpi_handle handle) { - struct acpi_device *adev = NULL; - - acpi_get_device_data(handle, &adev, get_acpi_device); - return adev; + return handle_to_device(handle, get_acpi_device); } void acpi_bus_put_acpi_device(struct acpi_device *adev) From 313d64a35d36b4bb00edde418179ff1a5f342070 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jan 2021 23:49:04 -0500 Subject: [PATCH 073/307] do_splice_to(): move the logics for limiting the read length in Both callers have the identical logics limiting the amount of data we try to read into pipe - no more than would fit into that pipe. Move that into do_splice_to() itself. Signed-off-by: Al Viro --- fs/splice.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 866d5c2367b2..c1ca2cc63b43 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -771,11 +771,16 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + unsigned int p_space; int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; + /* Don't try to read more the pipe has space for. 
*/ + p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + len = min_t(size_t, len, p_space << PAGE_SHIFT); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; @@ -856,15 +861,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { - unsigned int p_space; size_t read_len; loff_t pos = sd->pos, prev_pos = pos; - /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - - pipe_occupancy(pipe->head, pipe->tail); - read_len = min_t(size_t, len, p_space << PAGE_SHIFT); - ret = do_splice_to(in, &pos, pipe, read_len, flags); + ret = do_splice_to(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto out_release; @@ -1083,15 +1083,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, pipe_lock(opipe); ret = wait_for_space(opipe, flags); - if (!ret) { - unsigned int p_space; - - /* Don't try to read more the pipe has space for. 
*/ - p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); - len = min_t(size_t, len, p_space << PAGE_SHIFT); - + if (!ret) ret = do_splice_to(in, &offset, opipe, len, flags); - } pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); From faa97c48c33454ac0107db930a491b692dd1dff1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jan 2021 22:23:03 -0500 Subject: [PATCH 074/307] take the guts of file-to-pipe splice into a helper function Signed-off-by: Al Viro --- fs/splice.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index c1ca2cc63b43..74f968c65a93 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1002,6 +1002,23 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); +static long splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags) +{ + long ret; + + pipe_lock(opipe); + ret = wait_for_space(opipe, flags); + if (!ret) + ret = do_splice_to(in, offset, opipe, len, flags); + pipe_unlock(opipe); + if (ret > 0) + wakeup_pipe_readers(opipe); + return ret; +} + /* * Determine where to splice to/from. 
*/ @@ -1081,13 +1098,7 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - pipe_lock(opipe); - ret = wait_for_space(opipe, flags); - if (!ret) - ret = do_splice_to(in, &offset, opipe, len, flags); - pipe_unlock(opipe); - if (ret > 0) - wakeup_pipe_readers(opipe); + ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else From b964bf53e540262f2d12672b3cca10842c0172e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Jan 2021 22:24:28 -0500 Subject: [PATCH 075/307] teach sendfile(2) to handle send-to-pipe directly no point going through the intermediate pipe Signed-off-by: Al Viro --- fs/internal.h | 9 +++++++++ fs/read_write.c | 19 +++++++++++++------ fs/splice.c | 2 +- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 77c50befbfbe..cff1f30cfefb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -15,6 +15,7 @@ struct mount; struct shrink_control; struct fs_context; struct user_namespace; +struct pipe_inode_info; /* * block_dev.c @@ -193,3 +194,11 @@ int sb_init_dio_done_wq(struct super_block *sb); */ int do_statx(int dfd, const char __user *filename, unsigned flags, unsigned int mask, struct statx __user *buffer); + +/* + * fs/splice.c: + */ +long splice_file_to_pipe(struct file *in, + struct pipe_inode_info *opipe, + loff_t *offset, + size_t len, unsigned int flags); diff --git a/fs/read_write.c b/fs/read_write.c index 75f764b43418..9db7adf160d2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1188,6 +1188,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, { struct fd in, out; struct inode *in_inode, *out_inode; + struct pipe_inode_info *opipe; loff_t pos; loff_t out_pos; ssize_t retval; @@ -1228,9 +1229,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, in_inode = file_inode(in.file); out_inode = file_inode(out.file); out_pos = out.file->f_pos; - retval = 
rw_verify_area(WRITE, out.file, &out_pos, count); - if (retval < 0) - goto fput_out; if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); @@ -1253,9 +1251,18 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - file_start_write(out.file); - retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); - file_end_write(out.file); + opipe = get_pipe_info(out.file, true); + if (!opipe) { + retval = rw_verify_area(WRITE, out.file, &out_pos, count); + if (retval < 0) + goto fput_out; + file_start_write(out.file); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, + count, fl); + file_end_write(out.file); + } else { + retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); + } if (retval > 0) { add_rchar(current, retval); diff --git a/fs/splice.c b/fs/splice.c index 74f968c65a93..b06846f1e6ee 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1002,7 +1002,7 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -static long splice_file_to_pipe(struct file *in, +long splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) From 629d512d682de2259179046e2364f1f1ff4232e3 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:34:36 -0600 Subject: [PATCH 076/307] cpupower: Update msr_pstate union struct naming The msr_pstate union struct named fam17h_bits is misleading since this is the struct to use for all families >= 0x17, not just for family 0x17. Rename the bits structs to be 'pstate' (for pre family 17h CPUs) and 'pstatedef' (for CPUs since fam 17h) to align closer with PPR/BKDG (1) naming. There are no functional changes as part of this update.
1: AMD Processor Programming Reference (PPR) and BIOS and Kernel Developer's Guide (BKDG) available at: http://developer.amd.com/resources/developer-guides-manuals Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Reviewed-by: skhan@linuxfoundation.org Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 26 +++++++++++++----------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 7c4f83a8c973..34368436bbd6 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -13,7 +13,8 @@ #define MSR_AMD_PSTATE 0xc0010064 #define MSR_AMD_PSTATE_LIMIT 0xc0010061 -union msr_pstate { +union core_pstate { + /* pre fam 17h: */ struct { unsigned fid:6; unsigned did:3; @@ -26,7 +27,8 @@ union msr_pstate { unsigned idddiv:2; unsigned res3:21; unsigned en:1; - } bits; + } pstate; + /* since fam 17h: */ struct { unsigned fid:8; unsigned did:6; @@ -35,36 +37,36 @@ union msr_pstate { unsigned idddiv:2; unsigned res1:31; unsigned en:1; - } fam17h_bits; + } pstatedef; unsigned long long val; }; -static int get_did(int family, union msr_pstate pstate) +static int get_did(int family, union core_pstate pstate) { int t; if (family == 0x12) t = pstate.val & 0xf; else if (family == 0x17 || family == 0x18) - t = pstate.fam17h_bits.did; + t = pstate.pstatedef.did; else - t = pstate.bits.did; + t = pstate.pstate.did; return t; } -static int get_cof(int family, union msr_pstate pstate) +static int get_cof(int family, union core_pstate pstate) { int t; int fid, did, cof; did = get_did(family, pstate); if (family == 0x17 || family == 0x18) { - fid = pstate.fam17h_bits.fid; + fid = pstate.pstatedef.fid; cof = 200 * fid / did; } else { t = 0x10; - fid = pstate.bits.fid; + fid = pstate.pstate.fid; if (family == 0x11) t = 0x8; cof = (100 * (fid + t)) >> did; @@ -89,7 +91,7 @@ int decode_pstates(unsigned int cpu, unsigned int 
cpu_family, int boost_states, unsigned long *pstates, int *no) { int i, psmax, pscur; - union msr_pstate pstate; + union core_pstate pstate; unsigned long long val; /* Only read out frequencies from HW when CPU might be boostable @@ -119,9 +121,9 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, } if (read_msr(cpu, MSR_AMD_PSTATE + i, &pstate.val)) return -1; - if ((cpu_family == 0x17) && (!pstate.fam17h_bits.en)) + if ((cpu_family == 0x17) && (!pstate.pstatedef.en)) continue; - else if (!pstate.bits.en) + else if (!pstate.pstate.en) continue; pstates[i] = get_cof(cpu_family, pstate); From 7a136a8fcd7ef14c63d07667e81c4dcac77e0a13 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 Jan 2021 11:34:42 -0600 Subject: [PATCH 077/307] cpupower: Correct macro name for CPB caps flag The name is Core Performance Boost (CPB) for the cpuid flag. Correct cpuid caps flag to use this name (instead of CBP). Signed-off-by: Robert Richter Signed-off-by: Nathan Fontenot Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/cpuid.c | 2 +- tools/power/cpupower/utils/helpers/helpers.h | 2 +- tools/power/cpupower/utils/helpers/misc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index 73bfafc60e9b..f9a66a430b72 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -130,7 +130,7 @@ out: cpu_info->vendor == X86_VENDOR_HYGON) { if (ext_cpuid_level >= 0x80000007 && (cpuid_edx(0x80000007) & (1 << 9))) - cpu_info->caps |= CPUPOWER_CAP_AMD_CBP; + cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; if (ext_cpuid_level >= 0x80000008 && cpuid_ebx(0x80000008) & (1 << 4)) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 0642e60a6ce1..a84f85a9dbd2 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h 
@@ -64,7 +64,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_INV_TSC 0x00000001 #define CPUPOWER_CAP_APERF 0x00000002 -#define CPUPOWER_CAP_AMD_CBP 0x00000004 +#define CPUPOWER_CAP_AMD_CPB 0x00000004 #define CPUPOWER_CAP_PERF_BIAS 0x00000008 #define CPUPOWER_CAP_HAS_TURBO_RATIO 0x00000010 #define CPUPOWER_CAP_IS_SNB 0x00000020 diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index 650b9a9a6584..f9bcce9c72d5 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -26,7 +26,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, if (ret) return ret; - if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CBP) { + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) { *support = 1; /* AMD Family 0x17 does not utilize PCI D18F4 like prior From a0255a76bf3a78d322adfe4eb4e73eb83998f61a Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:34:49 -0600 Subject: [PATCH 078/307] cpupower: Add CPUPOWER_CAP_AMD_HW_PSTATE cpuid caps flag Add a check in get_cpu_info() for the ability to read frequencies from hardware and set the CPUPOWER_CAP_AMD_HW_PSTATE cpuid flag. The cpuid flag is set when CPUID_80000007_EDX[7] is set, which is all families >= 10h. The check excludes family 14h because HW pstate reporting was not implemented on family 14h. This is intended to reduce family checks in the main code paths. 
Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Reviewed-by: skhan@linuxfoundation.org Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 9 ++++----- tools/power/cpupower/utils/helpers/cpuid.c | 12 +++++++++--- tools/power/cpupower/utils/helpers/helpers.h | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 34368436bbd6..8b69c7ff639a 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -94,11 +94,10 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, union core_pstate pstate; unsigned long long val; - /* Only read out frequencies from HW when CPU might be boostable - to keep the code as short and clean as possible. - Otherwise frequencies are exported via ACPI tables. - */ - if (cpu_family < 0x10 || cpu_family == 0x14) + /* Only read out frequencies from HW if HW Pstate is supported, + * otherwise frequencies are exported via ACPI tables. 
+ */ if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_HW_PSTATE)) return -1; if (read_msr(cpu, MSR_AMD_PSTATE_LIMIT, &val)) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index f9a66a430b72..d577220a193b 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -128,9 +128,15 @@ out: /* AMD or Hygon Boost state enable/disable register */ if (cpu_info->vendor == X86_VENDOR_AMD || cpu_info->vendor == X86_VENDOR_HYGON) { - if (ext_cpuid_level >= 0x80000007 && - (cpuid_edx(0x80000007) & (1 << 9))) - cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; + if (ext_cpuid_level >= 0x80000007) { + if (cpuid_edx(0x80000007) & (1 << 9)) + cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; + + if ((cpuid_edx(0x80000007) & (1 << 7)) && + cpu_info->family != 0x14) + /* HW pstate was not implemented in family 0x14 */ + cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE; + } if (ext_cpuid_level >= 0x80000008 && cpuid_ebx(0x80000008) & (1 << 4)) diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index a84f85a9dbd2..5f61eefff5b2 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -70,6 +70,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_IS_SNB 0x00000020 #define CPUPOWER_CAP_INTEL_IDA 0x00000040 #define CPUPOWER_CAP_AMD_RDPRU 0x00000080 +#define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 #define CPUPOWER_AMD_CPBDIS 0x02000000 From 1421de7919cd082bad692626937f055f367586ba Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:35:10 -0600 Subject: [PATCH 079/307] cpupower: Remove unused pscur variable. The pscur variable is set but not used, just remove it. This may have previously been set to validate the MSR_AMD_PSTATE_STATUS MSR.
With the addition of the CPUPOWER_CAP_AMD_HW_PSTATE cap flag this is no longer needed since the cpuid bit to enable this cap flag also validates that the MSR_AMD_PSTATE_STATUS MSR is present. Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 8b69c7ff639a..fc2ac1e6bfb2 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -90,7 +90,7 @@ static int get_cof(int family, union core_pstate pstate) int decode_pstates(unsigned int cpu, unsigned int cpu_family, int boost_states, unsigned long *pstates, int *no) { - int i, psmax, pscur; + int i, psmax; union core_pstate pstate; unsigned long long val; @@ -104,13 +104,6 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, return -1; psmax = (val >> 4) & 0x7; - - if (read_msr(cpu, MSR_AMD_PSTATE_STATUS, &val)) - return -1; - - pscur = val & 0x7; - - pscur += boost_states; psmax += boost_states; for (i = 0; i <= psmax; i++) { if (i >= MAX_HW_PSTATES) { From 23765b82a808da416b70b41d711468e723531e6a Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:35:17 -0600 Subject: [PATCH 080/307] cpupower: Update family checks when decoding HW pstates The family checks in get_cof() and get_did() need to use the correct MSR format depending on the family. Add a cpupower capability for using the pstatedef (family 17h and newer) to control this instead of direct family checks. 
Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 8 ++++---- tools/power/cpupower/utils/helpers/cpuid.c | 6 +++++- tools/power/cpupower/utils/helpers/helpers.h | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index fc2ac1e6bfb2..b4731daa6820 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -45,10 +45,10 @@ static int get_did(int family, union core_pstate pstate) { int t; - if (family == 0x12) - t = pstate.val & 0xf; - else if (family == 0x17 || family == 0x18) + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) t = pstate.pstatedef.did; + else if (family == 0x12) + t = pstate.val & 0xf; else t = pstate.pstate.did; @@ -61,7 +61,7 @@ static int get_cof(int family, union core_pstate pstate) int fid, did, cof; did = get_did(family, pstate); - if (family == 0x17 || family == 0x18) { + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) { fid = pstate.pstatedef.fid; cof = 200 * fid / did; } else { diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index d577220a193b..db2e88ceb67b 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -133,9 +133,13 @@ out: cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; if ((cpuid_edx(0x80000007) & (1 << 7)) && - cpu_info->family != 0x14) + cpu_info->family != 0x14) { /* HW pstate was not implemented in family 0x14 */ cpu_info->caps |= CPUPOWER_CAP_AMD_HW_PSTATE; + + if (cpu_info->family >= 0x17) + cpu_info->caps |= CPUPOWER_CAP_AMD_PSTATEDEF; + } } if (ext_cpuid_level >= 0x80000008 && diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 5f61eefff5b2..e4dc44ced770 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ 
b/tools/power/cpupower/utils/helpers/helpers.h @@ -71,6 +71,7 @@ enum cpupower_cpu_vendor {X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_INTEL_IDA 0x00000040 #define CPUPOWER_CAP_AMD_RDPRU 0x00000080 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 +#define CPUPOWER_CAP_AMD_PSTATEDEF 0x00000200 #define CPUPOWER_AMD_CPBDIS 0x02000000 From 56a85eebebdba62ebf6c46bd957949cc6e926aa0 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:35:23 -0600 Subject: [PATCH 081/307] cpupower: Condense pstate enabled bit checks in decode_pstates() The enabled bit (bit 63) is common for all families so we can remove the multiple enabled checks based on family and have a common check for HW pstate enabled. Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index b4731daa6820..216240e2b771 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -113,9 +113,9 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, } if (read_msr(cpu, MSR_AMD_PSTATE + i, &pstate.val)) return -1; - if ((cpu_family == 0x17) && (!pstate.pstatedef.en)) - continue; - else if (!pstate.pstate.en) + + /* The enabled bit (bit 63) is common for all families */ + if (!pstate.pstatedef.en) continue; pstates[i] = get_cof(cpu_family, pstate); From d1abc4e996d7784ce4d56749e4b5ca8ff23b1e0f Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:36:01 -0600 Subject: [PATCH 082/307] cpupower: Remove family arg to decode_pstates() The decode_pstates() routine no longer uses the CPU family and the called routines (get_cof() and get_did()) can grab the family from the global cpupower_cpu_info struct. This update removes passing the family arg to all these routines.
Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/cpufreq-info.c | 3 +-- tools/power/cpupower/utils/helpers/amd.c | 19 +++++++++---------- tools/power/cpupower/utils/helpers/helpers.h | 9 ++++----- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/tools/power/cpupower/utils/cpufreq-info.c b/tools/power/cpupower/utils/cpufreq-info.c index 6efc0f6b1b11..f9895e31ff5a 100644 --- a/tools/power/cpupower/utils/cpufreq-info.c +++ b/tools/power/cpupower/utils/cpufreq-info.c @@ -186,8 +186,7 @@ static int get_boost_mode_x86(unsigned int cpu) if ((cpupower_cpu_info.vendor == X86_VENDOR_AMD && cpupower_cpu_info.family >= 0x10) || cpupower_cpu_info.vendor == X86_VENDOR_HYGON) { - ret = decode_pstates(cpu, cpupower_cpu_info.family, b_states, - pstates, &pstate_no); + ret = decode_pstates(cpu, b_states, pstates, &pstate_no); if (ret) return ret; diff --git a/tools/power/cpupower/utils/helpers/amd.c b/tools/power/cpupower/utils/helpers/amd.c index 216240e2b771..97f2c857048e 100644 --- a/tools/power/cpupower/utils/helpers/amd.c +++ b/tools/power/cpupower/utils/helpers/amd.c @@ -41,13 +41,13 @@ union core_pstate { unsigned long long val; }; -static int get_did(int family, union core_pstate pstate) +static int get_did(union core_pstate pstate) { int t; if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) t = pstate.pstatedef.did; - else if (family == 0x12) + else if (cpupower_cpu_info.family == 0x12) t = pstate.val & 0xf; else t = pstate.pstate.did; @@ -55,19 +55,19 @@ static int get_did(int family, union core_pstate pstate) return t; } -static int get_cof(int family, union core_pstate pstate) +static int get_cof(union core_pstate pstate) { int t; int fid, did, cof; - did = get_did(family, pstate); + did = get_did(pstate); if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_PSTATEDEF) { fid = pstate.pstatedef.fid; cof = 200 * fid / did; } else { t = 0x10; fid = pstate.pstate.fid; - if (family == 0x11) + if 
(cpupower_cpu_info.family == 0x11) t = 0x8; cof = (100 * (fid + t)) >> did; } @@ -76,8 +76,7 @@ static int get_cof(int family, union core_pstate pstate) /* Needs: * cpu -> the cpu that gets evaluated - * cpu_family -> The cpu's family (0x10, 0x12,...) - * boots_states -> how much boost states the machines support + * boost_states -> how much boost states the machines support * * Fills up: * pstates -> a pointer to an array of size MAX_HW_PSTATES @@ -87,8 +86,8 @@ static int get_cof(int family, union core_pstate pstate) * * returns zero on success, -1 on failure */ -int decode_pstates(unsigned int cpu, unsigned int cpu_family, - int boost_states, unsigned long *pstates, int *no) +int decode_pstates(unsigned int cpu, int boost_states, + unsigned long *pstates, int *no) { int i, psmax; union core_pstate pstate; @@ -118,7 +117,7 @@ int decode_pstates(unsigned int cpu, unsigned int cpu_family, if (!pstate.pstatedef.en) continue; - pstates[i] = get_cof(cpu_family, pstate); + pstates[i] = get_cof(pstate); } *no = i; return 0; diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index e4dc44ced770..8a0c11c6ec63 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -127,8 +127,8 @@ extern struct pci_dev *pci_slot_func_init(struct pci_access **pacc, /* AMD HW pstate decoding **************************/ -extern int decode_pstates(unsigned int cpu, unsigned int cpu_family, - int boost_states, unsigned long *pstates, int *no); +extern int decode_pstates(unsigned int cpu, int boost_states, + unsigned long *pstates, int *no); /* AMD HW pstate decoding **************************/ @@ -145,9 +145,8 @@ unsigned int cpuid_edx(unsigned int op); /* cpuid and cpuinfo helpers **************************/ /* X86 ONLY ********************************************/ #else -static inline int decode_pstates(unsigned int cpu, unsigned int cpu_family, - int boost_states, unsigned long 
*pstates, - int *no) +static inline int decode_pstates(unsigned int cpu, int boost_states, + unsigned long *pstates, int *no) { return -1; }; static inline int read_msr(int cpu, unsigned int idx, unsigned long long *val) From 3a3ecfdb605cc8d98988012a4f88c34b4d220c21 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Mon, 25 Jan 2021 11:36:23 -0600 Subject: [PATCH 083/307] cpupower: Add cpuid cap flag for MSR_AMD_HWCR support Remove the family check for accessing the MSR_AMD_HWCR MSR and replace it with a cpupower cap flag. This update also allows for the removal of the local cpupower_cpu_info variable in cpufreq_has_boost_support() since we no longer need it to check the family. Signed-off-by: Nathan Fontenot Reviewed-by: Robert Richter Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/helpers/cpuid.c | 6 +++++- tools/power/cpupower/utils/helpers/helpers.h | 1 + tools/power/cpupower/utils/helpers/misc.c | 7 +------ 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/power/cpupower/utils/helpers/cpuid.c b/tools/power/cpupower/utils/helpers/cpuid.c index db2e88ceb67b..72eb43593180 100644 --- a/tools/power/cpupower/utils/helpers/cpuid.c +++ b/tools/power/cpupower/utils/helpers/cpuid.c @@ -129,9 +129,13 @@ out: if (cpu_info->vendor == X86_VENDOR_AMD || cpu_info->vendor == X86_VENDOR_HYGON) { if (ext_cpuid_level >= 0x80000007) { - if (cpuid_edx(0x80000007) & (1 << 9)) + if (cpuid_edx(0x80000007) & (1 << 9)) { cpu_info->caps |= CPUPOWER_CAP_AMD_CPB; + if (cpu_info->family >= 0x17) + cpu_info->caps |= CPUPOWER_CAP_AMD_CPB_MSR; + } + if ((cpuid_edx(0x80000007) & (1 << 7)) && cpu_info->family != 0x14) { /* HW pstate was not implemented in family 0x14 */ diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index 8a0c11c6ec63..33ffacee7fcb 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -72,6 +72,7 @@ enum cpupower_cpu_vendor 
{X86_VENDOR_UNKNOWN = 0, X86_VENDOR_INTEL, #define CPUPOWER_CAP_AMD_RDPRU 0x00000080 #define CPUPOWER_CAP_AMD_HW_PSTATE 0x00000100 #define CPUPOWER_CAP_AMD_PSTATEDEF 0x00000200 +#define CPUPOWER_CAP_AMD_CPB_MSR 0x00000400 #define CPUPOWER_AMD_CPBDIS 0x02000000 diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index f9bcce9c72d5..fc6e34511721 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -16,16 +16,11 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, int *states) { - struct cpupower_cpu_info cpu_info; int ret; unsigned long long val; *support = *active = *states = 0; - ret = get_cpu_info(&cpu_info); - if (ret) - return ret; - if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB) { *support = 1; @@ -34,7 +29,7 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, * has Hardware determined variable increments instead. */ - if (cpu_info.family == 0x17 || cpu_info.family == 0x18) { + if (cpupower_cpu_info.caps & CPUPOWER_CAP_AMD_CPB_MSR) { if (!read_msr(cpu, MSR_AMD_HWCR, &val)) { if (!(val & CPUPOWER_AMD_CPBDIS)) *active = 1; From 10e927249c4f78b25c4941eda93548aeaad04a46 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Jan 2021 14:23:17 -0600 Subject: [PATCH 084/307] ACPI: Test for ACPI_SUCCESS rather than !ACPI_FAILURE The double negative makes it hard to read "if (!ACPI_FAILURE(status))". Replace it with "if (ACPI_SUCCESS(status))". Signed-off-by: Bjorn Helgaas Acked-by: Guenter Roeck Acked-by: Alex Deucher Acked-by: Takashi Iwai Acked-by: Mark Brown Signed-off-by: Rafael J. 
Wysocki --- drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c | 4 ++-- drivers/gpu/drm/radeon/radeon_bios.c | 4 ++-- drivers/hwmon/acpi_power_meter.c | 4 ++-- drivers/platform/x86/asus-laptop.c | 6 +++--- drivers/spi/spi.c | 2 +- sound/pci/hda/hda_intel.c | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c index 6333cada1e09..055f600eeed8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c @@ -291,7 +291,7 @@ static bool amdgpu_atrm_get_bios(struct amdgpu_device *adev) continue; status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { found = true; break; } @@ -304,7 +304,7 @@ static bool amdgpu_atrm_get_bios(struct amdgpu_device *adev) continue; status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { found = true; break; } diff --git a/drivers/gpu/drm/radeon/radeon_bios.c b/drivers/gpu/drm/radeon/radeon_bios.c index bb29cf02974d..43bbbfd6ade8 100644 --- a/drivers/gpu/drm/radeon/radeon_bios.c +++ b/drivers/gpu/drm/radeon/radeon_bios.c @@ -205,7 +205,7 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev) continue; status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { found = true; break; } @@ -218,7 +218,7 @@ static bool radeon_atrm_get_bios(struct radeon_device *rdev) continue; status = acpi_get_handle(dhandle, "ATRM", &atrm_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { found = true; break; } diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c index 848718ab7312..7d3ddcba34ce 100644 --- a/drivers/hwmon/acpi_power_meter.c +++ b/drivers/hwmon/acpi_power_meter.c @@ -161,7 +161,7 @@ static ssize_t set_avg_interval(struct device *dev, mutex_lock(&resource->lock); status = 
acpi_evaluate_integer(resource->acpi_dev->handle, "_PAI", &args, &data); - if (!ACPI_FAILURE(status)) + if (ACPI_SUCCESS(status)) resource->avg_interval = temp; mutex_unlock(&resource->lock); @@ -232,7 +232,7 @@ static ssize_t set_cap(struct device *dev, struct device_attribute *devattr, mutex_lock(&resource->lock); status = acpi_evaluate_integer(resource->acpi_dev->handle, "_SHL", &args, &data); - if (!ACPI_FAILURE(status)) + if (ACPI_SUCCESS(status)) resource->cap = temp; mutex_unlock(&resource->lock); diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c index 0edafe687fa9..bfea656e910c 100644 --- a/drivers/platform/x86/asus-laptop.c +++ b/drivers/platform/x86/asus-laptop.c @@ -861,7 +861,7 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr, * The significance of others is yet to be found. */ rv = acpi_evaluate_integer(asus->handle, "SFUN", NULL, &temp); - if (!ACPI_FAILURE(rv)) + if (ACPI_SUCCESS(rv)) len += sprintf(page + len, "SFUN value : %#x\n", (uint) temp); /* @@ -873,7 +873,7 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr, * takes several seconds to run on some systems. */ rv = acpi_evaluate_integer(asus->handle, "HWRS", NULL, &temp); - if (!ACPI_FAILURE(rv)) + if (ACPI_SUCCESS(rv)) len += sprintf(page + len, "HWRS value : %#x\n", (uint) temp); /* @@ -884,7 +884,7 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr, * silently ignored. 
*/ rv = acpi_evaluate_integer(asus->handle, "ASYM", NULL, &temp); - if (!ACPI_FAILURE(rv)) + if (ACPI_SUCCESS(rv)) len += sprintf(page + len, "ASYM value : %#x\n", (uint) temp); if (asus->dsdt_info) { diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 720ab34784c1..801d8b499788 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2210,7 +2210,7 @@ static acpi_status acpi_register_spi_device(struct spi_controller *ctlr, return AE_OK; if (!lookup.max_speed_hz && - !ACPI_FAILURE(acpi_get_parent(adev->handle, &parent_handle)) && + ACPI_SUCCESS(acpi_get_parent(adev->handle, &parent_handle)) && ACPI_HANDLE(ctlr->dev.parent) == parent_handle) { /* Apple does not use _CRS but nested devices for SPI slaves */ acpi_spi_parse_apple_properties(adev, &lookup); diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e4dd2ff5e473..db587aed2061 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -1444,7 +1444,7 @@ static bool atpx_present(void) dhandle = ACPI_HANDLE(&pdev->dev); if (dhandle) { status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { pci_dev_put(pdev); return true; } @@ -1454,7 +1454,7 @@ static bool atpx_present(void) dhandle = ACPI_HANDLE(&pdev->dev); if (dhandle) { status = acpi_get_handle(dhandle, "ATPX", &atpx_handle); - if (!ACPI_FAILURE(status)) { + if (ACPI_SUCCESS(status)) { pci_dev_put(pdev); return true; } From 84f9017c37c479c4f70456a645d24d2296ad2208 Mon Sep 17 00:00:00 2001 From: Jiaxun Yang Date: Mon, 25 Jan 2021 12:59:57 +0100 Subject: [PATCH 085/307] ACPI: platform-profile: Introduce object pointers to callbacks Add an object pointer to handler callbacks to avoid the need for drivers to have a global variable to get to their driver-data struct. 
Link: https://lore.kernel.org/linux-acpi/6a29f338-d9e4-150c-81dd-2ffb54f5bc35@redhat.com/ Link: https://lore.kernel.org/r/20210114073429.176462-3-jiaxun.yang@flygoat.com Signed-off-by: Jiaxun Yang Suggested-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/platform_profile.c | 4 ++-- include/linux/platform_profile.h | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index f65c61db7921..80e9df427eb8 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -64,7 +64,7 @@ static ssize_t platform_profile_show(struct device *dev, return -ENODEV; } - err = cur_profile->profile_get(&profile); + err = cur_profile->profile_get(cur_profile, &profile); mutex_unlock(&profile_lock); if (err) return err; @@ -104,7 +104,7 @@ static ssize_t platform_profile_store(struct device *dev, return -EOPNOTSUPP; } - err = cur_profile->profile_set(i); + err = cur_profile->profile_set(cur_profile, i); mutex_unlock(&profile_lock); if (err) return err; diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index c797fdb3d91a..a26542d53058 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -28,8 +28,10 @@ enum platform_profile_option { struct platform_profile_handler { unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; - int (*profile_get)(enum platform_profile_option *profile); - int (*profile_set)(enum platform_profile_option profile); + int (*profile_get)(struct platform_profile_handler *pprof, + enum platform_profile_option *profile); + int (*profile_set)(struct platform_profile_handler *pprof, + enum platform_profile_option profile); }; int platform_profile_register(struct platform_profile_handler *pprof); From 041142d7d25294c17d39552ae51c1d8d89434010 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 25 Jan 2021 20:09:09 +0100 Subject: [PATCH 086/307] 
ACPI: platform-profile: Fix possible deadlock in platform_profile_remove() After a rmmod thinkpad_acpi, lockdep pointed out this possible deadlock: Our _show and _store sysfs attr functions get called with the kn->active lock held for the sysfs attr and then take the profile_lock. sysfs_remove_group() also takes the kn->active lock for the sysfs attr, so if we call it with the profile_lock held, then we get an ABBA deadlock. platform_profile_remove() must only be called by drivers which have first *successfully* called platform_profile_register(). Anything else is a driver bug. So the check for cur_profile being set before calling sysfs_remove_group() is not necessary and it can be dropped. It is safe to call sysfs_remove_group() without holding the profile_lock since the attr-group group cannot be re-added until after we clear cur_profile. Change platform_profile_remove() to only hold the profile_lock while clearing the cur_profile, fixing the deadlock. Signed-off-by: Hans de Goede Signed-off-by: Rafael J. Wysocki --- drivers/acpi/platform_profile.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index 80e9df427eb8..4a59c5993bde 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -164,13 +164,9 @@ EXPORT_SYMBOL_GPL(platform_profile_register); int platform_profile_remove(void) { - mutex_lock(&profile_lock); - if (!cur_profile) { - mutex_unlock(&profile_lock); - return -ENODEV; - } - sysfs_remove_group(acpi_kobj, &platform_profile_group); + + mutex_lock(&profile_lock); cur_profile = NULL; mutex_unlock(&profile_lock); return 0; From dc20c4092049941289683f20626da95fda5a6009 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 26 Jan 2021 10:32:01 -0600 Subject: [PATCH 087/307] ACPI: APEI: Add is_generic_error() to identify GHES sources Refactor duplicated GHES identity logic into is_generic_error(). 
Signed-off-by: Yazen Ghannam Reviewed-by: Robert Richter Co-developed-by: Terry Bowman Signed-off-by: Terry Bowman Acked-by: Borislav Petkov Signed-off-by: Rafael J. Wysocki --- drivers/acpi/apei/hest.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c index 6e980fe16772..f220bb00e91b 100644 --- a/drivers/acpi/apei/hest.c +++ b/drivers/acpi/apei/hest.c @@ -49,6 +49,12 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = { [ACPI_HEST_TYPE_IA32_DEFERRED_CHECK] = -1, }; +static inline bool is_generic_error(struct acpi_hest_header *hest_hdr) +{ + return hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || + hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2; +} + static int hest_esrc_len(struct acpi_hest_header *hest_hdr) { u16 hest_type = hest_hdr->type; @@ -141,8 +147,7 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void { int *count = data; - if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR || - hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2) + if (is_generic_error(hest_hdr)) (*count)++; return 0; } @@ -153,8 +158,7 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data) struct ghes_arr *ghes_arr = data; int rc, i; - if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR && - hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2) + if (!is_generic_error(hest_hdr)) return 0; if (!((struct acpi_hest_generic *)hest_hdr)->enabled) From ccf7ce46ab91515a7146c00300e168efa9dc777e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 25 Jan 2021 12:18:28 +0800 Subject: [PATCH 088/307] PM: sleep: No need to check PF_WQ_WORKER in thaw_kernel_threads() Because PF_KTHREAD is set for all wq worker threads, it is not necessary to check PF_WQ_WORKER in addition to it in thaw_kernel_threads(), so stop doing that. Signed-off-by: Zqiang [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 45b054b7b5ec..50cc63534486 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -235,7 +235,7 @@ void thaw_kernel_threads(void) read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + if (p->flags & PF_KTHREAD) __thaw_task(p); } read_unlock(&tasklist_lock); From eb23d91af55bc2369fe3f0aa6997e72eb20e16fe Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 26 Jan 2021 15:29:40 -0600 Subject: [PATCH 089/307] PM: sleep: Use dev_printk() when possible Use dev_printk() when possible to make messages more consistent with other device-related messages. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 46793276598d..f893c3c5af07 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -16,6 +16,7 @@ */ #define pr_fmt(fmt) "PM: " fmt +#define dev_fmt pr_fmt #include #include @@ -449,8 +450,8 @@ static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { - pr_err("Device %s failed to %s%s: error %d\n", - dev_name(dev), pm_verb(state.event), info, error); + dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, + error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, @@ -1897,8 +1898,8 @@ int dpm_prepare(pm_message_t state) error = 0; continue; } - pr_info("Device %s not prepared for power transition: code %d\n", - dev_name(dev), error); + dev_info(dev, "not prepared for power transition: code %d\n", + error); put_device(dev); break; } From 309663093c8aba02cbea83b0bc8ee9a99833c482 Mon Sep 17 00:00:00 2001 
From: Bjorn Helgaas Date: Tue, 26 Jan 2021 15:26:55 -0600 Subject: [PATCH 090/307] PM: runtime: Fix typos and grammar Fix minor typos and grammatical issues. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 0553008b6279..d9c777b18f7a 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -579,7 +579,7 @@ should be used. Of course, for this purpose the device's runtime PM has to be enabled earlier by calling pm_runtime_enable(). Note, if the device may execute pm_runtime calls during the probe (such as -if it is registers with a subsystem that may call back in) then the +if it is registered with a subsystem that may call back in) then the pm_runtime_get_sync() call paired with a pm_runtime_put() call will be appropriate to ensure that the device is not put back to sleep during the probe. This can happen with systems such as the network device layer. @@ -587,11 +587,11 @@ probe. This can happen with systems such as the network device layer. It may be desirable to suspend the device once ->probe() has finished. Therefore the driver core uses the asynchronous pm_request_idle() to submit a request to execute the subsystem-level idle callback for the device at that -time. A driver that makes use of the runtime autosuspend feature, may want to +time. A driver that makes use of the runtime autosuspend feature may want to update the last busy mark before returning from ->probe(). Moreover, the driver core prevents runtime PM callbacks from racing with the bus -notifier callback in __device_release_driver(), which is necessary, because the +notifier callback in __device_release_driver(), which is necessary because the notifier is used by some subsystems to carry out operations affecting the runtime PM functionality. 
It does so by calling pm_runtime_get_sync() before driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This @@ -603,7 +603,7 @@ calling pm_runtime_suspend() from their ->remove() routines, the driver core executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER notifications in __device_release_driver(). This requires bus types and drivers to make their ->remove() callbacks avoid races with runtime PM directly, -but also it allows of more flexibility in the handling of devices during the +but it also allows more flexibility in the handling of devices during the removal of their drivers. Drivers in ->remove() callback should undo the runtime PM changes done @@ -693,7 +693,7 @@ that the device appears to be runtime-suspended and its state is fine, so it may be left in runtime suspend provided that all of its descendants are also left in runtime suspend. If that happens, the PM core will not execute any system suspend and resume callbacks for all of those devices, except for the -complete callback, which is then entirely responsible for handling the device +.complete() callback, which is then entirely responsible for handling the device as appropriate. This only applies to system suspend transitions that are not related to hibernation (see Documentation/driver-api/pm/devices.rst for more information). @@ -706,7 +706,7 @@ out the following operations: right before executing the subsystem-level .prepare() callback for it and pm_runtime_barrier() is called for every device right before executing the subsystem-level .suspend() callback for it. In addition to that the PM core - calls __pm_runtime_disable() with 'false' as the second argument for every + calls __pm_runtime_disable() with 'false' as the second argument for every device right before executing the subsystem-level .suspend_late() callback for it. 
@@ -783,7 +783,7 @@ driver/base/power/generic_ops.c: `int pm_generic_restore_noirq(struct device *dev);` - invoke the ->restore_noirq() callback provided by the device's driver -These functions are the defaults used by the PM core, if a subsystem doesn't +These functions are the defaults used by the PM core if a subsystem doesn't provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), ->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), ->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), From cca26b66efc1e92c10701087aca4895530660b85 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 27 Jan 2021 13:42:27 +0800 Subject: [PATCH 091/307] powercap/intel_rapl: add support for AlderLake Mobile Add intel_rapl support for the AlderLake Mobile platform. Signed-off-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c9e57237d778..f0799837c2dd 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1049,6 +1049,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &rapl_defaults_core), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &rapl_defaults_core), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD, &rapl_defaults_core), From 0bfa0820c274b019583b3454c6c889c99c24558d Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 25 Jan 2021 14:29:18 -0500 Subject: [PATCH 092/307] PM: clk: make PM clock layer compatible with clocks that must sleep The clock API splits its interface into sleepable and atomic contexts: - clk_prepare/clk_unprepare for stuff that might sleep - clk_enable/clk_disable for anything
that may be done in atomic context The code handling runtime PM for clocks only calls clk_disable() on suspend requests, and clk_enable() on resume requests. This means that runtime PM with clock providers that only have the prepare/unprepare methods implemented is basically useless. Many clock implementations can't accommodate atomic contexts. This is often the case when communication with the clock happens through another subsystem like I2C or SCMI. Let's make the clock PM code useful with such clocks by safely invoking clk_prepare/clk_unprepare upon resume/suspend requests. Of course, when such clocks are registered with the PM layer then pm_runtime_irq_safe() can't be used, and neither pm_runtime_suspend() nor pm_runtime_resume() may be invoked in atomic context. For clocks that do implement the enable and disable methods then everything just works as before. A note on sparse: According to https://lwn.net/Articles/109066/ there are things that sparse can't cope with. In particular, pm_clk_op_lock() and pm_clk_op_unlock() may or may not lock/unlock psd->lock depending on some runtime condition. To work around that we tell it the lock is always untaken for the purpose of static analysis. Thanks to Naresh Kamboju for reporting issues with the initial patch. Signed-off-by: Nicolas Pitre Tested-by: Naresh Kamboju Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/clock_ops.c | 223 +++++++++++++++++++++++++++------ drivers/clk/clk.c | 21 ++++ include/linux/clk.h | 24 +++- include/linux/pm.h | 2 + 4 files changed, 228 insertions(+), 42 deletions(-) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index ced6863a16a5..84d5acb6301b 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -23,6 +23,7 @@ enum pce_status { PCE_STATUS_NONE = 0, PCE_STATUS_ACQUIRED, + PCE_STATUS_PREPARED, PCE_STATUS_ENABLED, PCE_STATUS_ERROR, }; @@ -32,8 +33,112 @@ struct pm_clock_entry { char *con_id; struct clk *clk; enum pce_status status; + bool enabled_when_prepared; }; +/** + * pm_clk_list_lock - ensure exclusive access for modifying the PM clock + * entry list. + * @psd: pm_subsys_data instance corresponding to the PM clock entry list + * and clk_op_might_sleep count to be modified. + * + * Get exclusive access before modifying the PM clock entry list and the + * clock_op_might_sleep count to guard against concurrent modifications. + * This also protects against a concurrent clock_op_might_sleep and PM clock + * entry list usage in pm_clk_suspend()/pm_clk_resume() that may or may not + * happen in atomic context, hence both the mutex and the spinlock must be + * taken here. + */ +static void pm_clk_list_lock(struct pm_subsys_data *psd) + __acquires(&psd->lock) +{ + mutex_lock(&psd->clock_mutex); + spin_lock_irq(&psd->lock); +} + +/** + * pm_clk_list_unlock - counterpart to pm_clk_list_lock(). + * @psd: the same pm_subsys_data instance previously passed to + * pm_clk_list_lock(). + */ +static void pm_clk_list_unlock(struct pm_subsys_data *psd) + __releases(&psd->lock) +{ + spin_unlock_irq(&psd->lock); + mutex_unlock(&psd->clock_mutex); +} + +/** + * pm_clk_op_lock - ensure exclusive access for performing clock operations. + * @psd: pm_subsys_data instance corresponding to the PM clock entry list + * and clk_op_might_sleep count being used. 
+ * @flags: stored irq flags. + * @fn: string for the caller function's name. + * + * This is used by pm_clk_suspend() and pm_clk_resume() to guard + * against concurrent modifications to the clock entry list and the + * clock_op_might_sleep count. If clock_op_might_sleep is != 0 then + * only the mutex can be locked and those functions can only be used in + * non atomic context. If clock_op_might_sleep == 0 then these functions + * may be used in any context and only the spinlock can be locked. + * Returns -EPERM if called in atomic context when clock ops might sleep. + */ +static int pm_clk_op_lock(struct pm_subsys_data *psd, unsigned long *flags, + const char *fn) + /* sparse annotations don't work here as exit state isn't static */ +{ + bool atomic_context = in_atomic() || irqs_disabled(); + +try_again: + spin_lock_irqsave(&psd->lock, *flags); + if (!psd->clock_op_might_sleep) { + /* the __release is there to work around sparse limitations */ + __release(&psd->lock); + return 0; + } + + /* bail out if in atomic context */ + if (atomic_context) { + pr_err("%s: atomic context with clock_ops_might_sleep = %d", + fn, psd->clock_op_might_sleep); + spin_unlock_irqrestore(&psd->lock, *flags); + might_sleep(); + return -EPERM; + } + + /* we must switch to the mutex */ + spin_unlock_irqrestore(&psd->lock, *flags); + mutex_lock(&psd->clock_mutex); + + /* + * There was a possibility for psd->clock_op_might_sleep + * to become 0 above. Keep the mutex only if not the case. + */ + if (likely(psd->clock_op_might_sleep)) + return 0; + + mutex_unlock(&psd->clock_mutex); + goto try_again; +} + +/** + * pm_clk_op_unlock - counterpart to pm_clk_op_lock(). + * @psd: the same pm_subsys_data instance previously passed to + * pm_clk_op_lock(). + * @flags: irq flags provided by pm_clk_op_lock(). 
+ */ +static void pm_clk_op_unlock(struct pm_subsys_data *psd, unsigned long *flags) + /* sparse annotations don't work here as entry state isn't static */ +{ + if (psd->clock_op_might_sleep) { + mutex_unlock(&psd->clock_mutex); + } else { + /* the __acquire is there to work around sparse limitations */ + __acquire(&psd->lock); + spin_unlock_irqrestore(&psd->lock, *flags); + } +} + /** * pm_clk_enable - Enable a clock, reporting any errors * @dev: The device for the given clock @@ -43,14 +148,21 @@ static inline void __pm_clk_enable(struct device *dev, struct pm_clock_entry *ce { int ret; - if (ce->status < PCE_STATUS_ERROR) { + switch (ce->status) { + case PCE_STATUS_ACQUIRED: + ret = clk_prepare_enable(ce->clk); + break; + case PCE_STATUS_PREPARED: ret = clk_enable(ce->clk); - if (!ret) - ce->status = PCE_STATUS_ENABLED; - else - dev_err(dev, "%s: failed to enable clk %p, error %d\n", - __func__, ce->clk, ret); + break; + default: + return; } + if (!ret) + ce->status = PCE_STATUS_ENABLED; + else + dev_err(dev, "%s: failed to enable clk %p, error %d\n", + __func__, ce->clk, ret); } /** @@ -64,17 +176,20 @@ static void pm_clk_acquire(struct device *dev, struct pm_clock_entry *ce) ce->clk = clk_get(dev, ce->con_id); if (IS_ERR(ce->clk)) { ce->status = PCE_STATUS_ERROR; + return; + } else if (clk_is_enabled_when_prepared(ce->clk)) { + /* we defer preparing the clock in that case */ + ce->status = PCE_STATUS_ACQUIRED; + ce->enabled_when_prepared = true; + } else if (clk_prepare(ce->clk)) { + ce->status = PCE_STATUS_ERROR; + dev_err(dev, "clk_prepare() failed\n"); + return; } else { - if (clk_prepare(ce->clk)) { - ce->status = PCE_STATUS_ERROR; - dev_err(dev, "clk_prepare() failed\n"); - } else { - ce->status = PCE_STATUS_ACQUIRED; - dev_dbg(dev, - "Clock %pC con_id %s managed by runtime PM.\n", - ce->clk, ce->con_id); - } + ce->status = PCE_STATUS_PREPARED; } + dev_dbg(dev, "Clock %pC con_id %s managed by runtime PM.\n", + ce->clk, ce->con_id); } static int 
__pm_clk_add(struct device *dev, const char *con_id, @@ -106,9 +221,11 @@ static int __pm_clk_add(struct device *dev, const char *con_id, pm_clk_acquire(dev, ce); - spin_lock_irq(&psd->lock); + pm_clk_list_lock(psd); list_add_tail(&ce->node, &psd->clock_list); - spin_unlock_irq(&psd->lock); + if (ce->enabled_when_prepared) + psd->clock_op_might_sleep++; + pm_clk_list_unlock(psd); return 0; } @@ -239,14 +356,20 @@ static void __pm_clk_remove(struct pm_clock_entry *ce) if (!ce) return; - if (ce->status < PCE_STATUS_ERROR) { - if (ce->status == PCE_STATUS_ENABLED) - clk_disable(ce->clk); - - if (ce->status >= PCE_STATUS_ACQUIRED) { - clk_unprepare(ce->clk); + switch (ce->status) { + case PCE_STATUS_ENABLED: + clk_disable(ce->clk); + fallthrough; + case PCE_STATUS_PREPARED: + clk_unprepare(ce->clk); + fallthrough; + case PCE_STATUS_ACQUIRED: + case PCE_STATUS_ERROR: + if (!IS_ERR(ce->clk)) clk_put(ce->clk); - } + break; + default: + break; } kfree(ce->con_id); @@ -269,7 +392,7 @@ void pm_clk_remove(struct device *dev, const char *con_id) if (!psd) return; - spin_lock_irq(&psd->lock); + pm_clk_list_lock(psd); list_for_each_entry(ce, &psd->clock_list, node) { if (!con_id && !ce->con_id) @@ -280,12 +403,14 @@ void pm_clk_remove(struct device *dev, const char *con_id) goto remove; } - spin_unlock_irq(&psd->lock); + pm_clk_list_unlock(psd); return; remove: list_del(&ce->node); - spin_unlock_irq(&psd->lock); + if (ce->enabled_when_prepared) + psd->clock_op_might_sleep--; + pm_clk_list_unlock(psd); __pm_clk_remove(ce); } @@ -307,19 +432,21 @@ void pm_clk_remove_clk(struct device *dev, struct clk *clk) if (!psd || !clk) return; - spin_lock_irq(&psd->lock); + pm_clk_list_lock(psd); list_for_each_entry(ce, &psd->clock_list, node) { if (clk == ce->clk) goto remove; } - spin_unlock_irq(&psd->lock); + pm_clk_list_unlock(psd); return; remove: list_del(&ce->node); - spin_unlock_irq(&psd->lock); + if (ce->enabled_when_prepared) + psd->clock_op_might_sleep--; + pm_clk_list_unlock(psd); 
__pm_clk_remove(ce); } @@ -330,13 +457,16 @@ EXPORT_SYMBOL_GPL(pm_clk_remove_clk); * @dev: Device to initialize the list of PM clocks for. * * Initialize the lock and clock_list members of the device's pm_subsys_data - * object. + * object, set the count of clocks that might sleep to 0. */ void pm_clk_init(struct device *dev) { struct pm_subsys_data *psd = dev_to_psd(dev); - if (psd) + if (psd) { INIT_LIST_HEAD(&psd->clock_list); + mutex_init(&psd->clock_mutex); + psd->clock_op_might_sleep = 0; + } } EXPORT_SYMBOL_GPL(pm_clk_init); @@ -372,12 +502,13 @@ void pm_clk_destroy(struct device *dev) INIT_LIST_HEAD(&list); - spin_lock_irq(&psd->lock); + pm_clk_list_lock(psd); list_for_each_entry_safe_reverse(ce, c, &psd->clock_list, node) list_move(&ce->node, &list); + psd->clock_op_might_sleep = 0; - spin_unlock_irq(&psd->lock); + pm_clk_list_unlock(psd); dev_pm_put_subsys_data(dev); @@ -397,23 +528,30 @@ int pm_clk_suspend(struct device *dev) struct pm_subsys_data *psd = dev_to_psd(dev); struct pm_clock_entry *ce; unsigned long flags; + int ret; dev_dbg(dev, "%s()\n", __func__); if (!psd) return 0; - spin_lock_irqsave(&psd->lock, flags); + ret = pm_clk_op_lock(psd, &flags, __func__); + if (ret) + return ret; list_for_each_entry_reverse(ce, &psd->clock_list, node) { - if (ce->status < PCE_STATUS_ERROR) { - if (ce->status == PCE_STATUS_ENABLED) + if (ce->status == PCE_STATUS_ENABLED) { + if (ce->enabled_when_prepared) { + clk_disable_unprepare(ce->clk); + ce->status = PCE_STATUS_ACQUIRED; + } else { clk_disable(ce->clk); - ce->status = PCE_STATUS_ACQUIRED; + ce->status = PCE_STATUS_PREPARED; + } } } - spin_unlock_irqrestore(&psd->lock, flags); + pm_clk_op_unlock(psd, &flags); return 0; } @@ -428,18 +566,21 @@ int pm_clk_resume(struct device *dev) struct pm_subsys_data *psd = dev_to_psd(dev); struct pm_clock_entry *ce; unsigned long flags; + int ret; dev_dbg(dev, "%s()\n", __func__); if (!psd) return 0; - spin_lock_irqsave(&psd->lock, flags); + ret = pm_clk_op_lock(psd, 
&flags, __func__); + if (ret) + return ret; list_for_each_entry(ce, &psd->clock_list, node) __pm_clk_enable(dev, ce); - spin_unlock_irqrestore(&psd->lock, flags); + pm_clk_op_unlock(psd, &flags); return 0; } diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 8c1d04db990d..3d751ae5bc70 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1164,6 +1164,27 @@ int clk_enable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_enable); +/** + * clk_is_enabled_when_prepared - indicate if preparing a clock also enables it. + * @clk: clock source + * + * Returns true if clk_prepare() implicitly enables the clock, effectively + * making clk_enable()/clk_disable() no-ops, false otherwise. + * + * This is of interest mainly to power management code where actually + * disabling the clock also requires unpreparing it to have any material + * effect. + * + * Regardless of the value returned here, the caller must always invoke + * clk_enable() or clk_prepare_enable() and counterparts for usage counts + * to be right. + */ +bool clk_is_enabled_when_prepared(struct clk *clk) +{ + return clk && !(clk->core->ops->enable && clk->core->ops->disable); +} +EXPORT_SYMBOL_GPL(clk_is_enabled_when_prepared); + static int clk_core_prepare_enable(struct clk_core *core) { int ret; diff --git a/include/linux/clk.h b/include/linux/clk.h index 31ff1bf1b79f..a4a86aa8b11a 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -238,6 +238,7 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q) #endif +#ifdef CONFIG_HAVE_CLK_PREPARE /** * clk_prepare - prepare a clock source * @clk: clock source @@ -246,10 +247,26 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q) * * Must not be called from within atomic context. 
*/ -#ifdef CONFIG_HAVE_CLK_PREPARE int clk_prepare(struct clk *clk); int __must_check clk_bulk_prepare(int num_clks, const struct clk_bulk_data *clks); + +/** + * clk_is_enabled_when_prepared - indicate if preparing a clock also enables it. + * @clk: clock source + * + * Returns true if clk_prepare() implicitly enables the clock, effectively + * making clk_enable()/clk_disable() no-ops, false otherwise. + * + * This is of interest mainly to the power management code where actually + * disabling the clock also requires unpreparing it to have any material + * effect. + * + * Regardless of the value returned here, the caller must always invoke + * clk_enable() or clk_prepare_enable() and counterparts for usage counts + * to be right. + */ +bool clk_is_enabled_when_prepared(struct clk *clk); #else static inline int clk_prepare(struct clk *clk) { @@ -263,6 +280,11 @@ clk_bulk_prepare(int num_clks, const struct clk_bulk_data *clks) might_sleep(); return 0; } + +static inline bool clk_is_enabled_when_prepared(struct clk *clk) +{ + return false; +} #endif /** diff --git a/include/linux/pm.h b/include/linux/pm.h index 47aca6bac1d6..482313a8ccfc 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -537,6 +537,8 @@ struct pm_subsys_data { spinlock_t lock; unsigned int refcount; #ifdef CONFIG_PM_CLK + unsigned int clock_op_might_sleep; + struct mutex clock_mutex; struct list_head clock_list; #endif #ifdef CONFIG_PM_GENERIC_DOMAINS From 6dc466d34f51767ad34fb900de8d278a66a3f1ed Mon Sep 17 00:00:00 2001 From: Abaci Team Date: Wed, 27 Jan 2021 16:42:05 +0800 Subject: [PATCH 093/307] PM: domains: Simplify the calculation of variables Fix the following coccicheck warnings: ./drivers/base/power/domain.c:938:31-33: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Suggested-by: Jiapeng Zhong Signed-off-by: Abaci Team Acked-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/domain.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 50211a402fa5..aaf6c83b5cf6 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -966,8 +966,7 @@ static int genpd_runtime_resume(struct device *dev) err_stop: genpd_stop_dev(genpd, dev); err_poweroff: - if (!pm_runtime_is_irq_safe(dev) || - (pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) { + if (!pm_runtime_is_irq_safe(dev) || genpd_is_irq_safe(genpd)) { genpd_lock(genpd); genpd_power_off(genpd, true, 0); genpd_unlock(genpd); From 32715be4fe95fc98762959f8dff6f9f8a39df28f Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:13 +0300 Subject: [PATCH 094/307] opp: Fix adding OPP entries in a wrong order if rate is unavailable Fix adding OPP entries in a wrong (opposite) order if OPP rate is unavailable. The OPP comparison was erroneously skipped, thus OPPs were left unsorted. 
Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 8c905aabacc0..5793c833b86a 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1527,12 +1527,10 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, mutex_lock(&opp_table->lock); head = &opp_table->opp_list; - if (likely(!rate_not_available)) { - ret = _opp_is_duplicate(dev, new_opp, opp_table, &head); - if (ret) { - mutex_unlock(&opp_table->lock); - return ret; - } + ret = _opp_is_duplicate(dev, new_opp, opp_table, &head); + if (ret) { + mutex_unlock(&opp_table->lock); + return ret; } list_add(&new_opp->node, head); From cf65948d62c6aefd22f51c1433743f80517ee3fe Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:14 +0300 Subject: [PATCH 095/307] opp: Filter out OPPs based on availability of a required-OPP A required OPP may not be available, and thus, all OPPs which are using this required OPP should be unavailable too. 
Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 5793c833b86a..253bc87b5695 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1522,6 +1522,7 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available) { struct list_head *head; + unsigned int i; int ret; mutex_lock(&opp_table->lock); @@ -1547,6 +1548,16 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, __func__, new_opp->rate); } + for (i = 0; i < opp_table->required_opp_count; i++) { + if (new_opp->required_opps[i]->available) + continue; + + new_opp->available = false; + dev_warn(dev, "%s: OPP not supported by required OPP %pOF (%lu)\n", + __func__, new_opp->required_opps[i]->np, new_opp->rate); + break; + } + return 0; } From d7b9d9b31a3e55dcc9b5c289abfafe31efa5b5c4 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:15 +0300 Subject: [PATCH 096/307] opp: Correct debug message in _opp_add_static_v2() The debug message always prints rate=0 instead of a proper value, fix it. 
Fixes: 6c591eec67cb ("OPP: Add helpers for reading the binding properties") Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko [ Viresh: Added Fixes tag ] Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 03cb387236c4..d0c0336be39b 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -755,7 +755,6 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table, struct device *dev, struct device_node *np) { struct dev_pm_opp *new_opp; - u64 rate = 0; u32 val; int ret; bool rate_not_available = false; @@ -772,7 +771,8 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table, /* Check if the OPP supports hardware's hierarchy of versions or not */ if (!_opp_is_supported(dev, opp_table, np)) { - dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate); + dev_dbg(dev, "OPP not supported by hardware: %lu\n", + new_opp->rate); goto free_opp; } From d758eaf5f8cbdf2554e34269c75694f60c38745d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 28 Jan 2021 11:08:47 +0530 Subject: [PATCH 097/307] opp: Staticize _add_opp_table() _add_opp_table() isn't used outside of core.c, mark it static. 
Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- drivers/opp/opp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 253bc87b5695..dc7a298f3611 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1193,7 +1193,7 @@ unlock: return opp_table; } -struct opp_table *_add_opp_table(struct device *dev) +static struct opp_table *_add_opp_table(struct device *dev) { return _add_opp_table_indexed(dev, 0); } diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 4ced7ffa8158..ee2593afae0c 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -223,7 +223,6 @@ int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available); int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic); void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu); -struct opp_table *_add_opp_table(struct device *dev); struct opp_table *_add_opp_table_indexed(struct device *dev, int index); void _put_opp_list_kref(struct opp_table *opp_table); From 8dd5cada393f6f4e825833a6ff05b1f51f36a791 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:18 +0300 Subject: [PATCH 098/307] opp: Add dev_pm_opp_find_level_ceil() Add a ceil version of the dev_pm_opp_find_level(). It's handy to have if levels don't start from 0 in OPP table and zero usually means a minimal level. 
Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 49 ++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 8 +++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index dc7a298f3611..b29f31146770 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -449,6 +449,55 @@ struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_exact); +/** + * dev_pm_opp_find_level_ceil() - search for an rounded up level + * @dev: device for which we do this operation + * @level: level to search for + * + * Return: Searches for rounded up match in the opp table and returns pointer + * to the matching opp if found, else returns ERR_PTR in case of error and + * should be handled using IS_ERR. Error return values can be: + * EINVAL: for bad pointer + * ERANGE: no match found for search + * ENODEV: if device not found in list of registered devices + * + * The callers are required to call dev_pm_opp_put() for the returned OPP after + * use. 
+ */ +struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, + unsigned int *level) +{ + struct opp_table *opp_table; + struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); + + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + int r = PTR_ERR(opp_table); + + dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r); + return ERR_PTR(r); + } + + mutex_lock(&opp_table->lock); + + list_for_each_entry(temp_opp, &opp_table->opp_list, node) { + if (temp_opp->available && temp_opp->level >= *level) { + opp = temp_opp; + *level = opp->level; + + /* Increment the reference count of OPP */ + dev_pm_opp_get(opp); + break; + } + } + + mutex_unlock(&opp_table->lock); + dev_pm_opp_put_opp_table(opp_table); + + return opp; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_find_level_ceil); + static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table, unsigned long *freq) { diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 1435c054016a..2b3030cb2ed2 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -111,6 +111,8 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, bool available); struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, unsigned int level); +struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, + unsigned int *level); struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq); @@ -226,6 +228,12 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, return ERR_PTR(-ENOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, + unsigned int *level) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq) { From 597ff5431fd41afa888809f7936508a15c977cde Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:19 +0300 Subject: [PATCH 099/307] opp: Add 
dev_pm_opp_get_required_pstate() Add dev_pm_opp_get_required_pstate() which allows OPP users to retrieve required performance state of a given OPP. Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 22 ++++++++++++++++++++++ include/linux/pm_opp.h | 10 ++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index b29f31146770..64a95aaa90eb 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -145,6 +145,28 @@ unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp) } EXPORT_SYMBOL_GPL(dev_pm_opp_get_level); +/** + * dev_pm_opp_get_required_pstate() - Gets the required performance state + * corresponding to an available opp + * @opp: opp for which performance state has to be returned for + * @index: index of the required opp + * + * Return: performance state read from device tree corresponding to the + * required opp, else return 0. + */ +unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp, + unsigned int index) +{ + if (IS_ERR_OR_NULL(opp) || !opp->available || + index >= opp->opp_table->required_opp_count) { + pr_err("%s: Invalid parameters\n", __func__); + return 0; + } + + return opp->required_opps[index]->pstate; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_get_required_pstate); + /** * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not * @opp: opp for which turbo mode is being verified diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 2b3030cb2ed2..8f926815bad9 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -98,6 +98,9 @@ unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp); unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp); +unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp, + unsigned int index); + bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp); int dev_pm_opp_get_opp_count(struct device *dev); @@ -186,6 +189,13 @@ static 
inline unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp) return 0; } +static inline +unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp, + unsigned int index) +{ + return 0; +} + static inline bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp) { return false; From ce8073d83f63a2cdcfc1b86d769456726faad51d Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Jan 2021 01:26:47 +0300 Subject: [PATCH 100/307] opp: Add dev_pm_opp_sync_regulators() Extend OPP API with dev_pm_opp_sync_regulators() function, which syncs voltage state of regulators. Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko [ Viresh: Added unlikely() ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 6 ++++++ 2 files changed, 47 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 64a95aaa90eb..bf7cdab0ba64 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2584,3 +2584,44 @@ void dev_pm_opp_remove_table(struct device *dev) dev_pm_opp_put_opp_table(opp_table); } EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table); + +/** + * dev_pm_opp_sync_regulators() - Sync state of voltage regulators + * @dev: device for which we do this operation + * + * Sync voltage state of the OPP table regulators. + * + * Return: 0 on success or a negative error value. 
+ */ +int dev_pm_opp_sync_regulators(struct device *dev) +{ + struct opp_table *opp_table; + struct regulator *reg; + int i, ret = 0; + + /* Device may not have OPP table */ + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) + return 0; + + /* Regulator may not be required for the device */ + if (unlikely(!opp_table->regulators)) + goto put_table; + + /* Nothing to sync if voltage wasn't changed */ + if (!opp_table->enabled) + goto put_table; + + for (i = 0; i < opp_table->regulator_count; i++) { + reg = opp_table->regulators[i]; + ret = regulator_sync_voltage(reg); + if (ret) + break; + } +put_table: + /* Drop reference taken by _find_opp_table() */ + dev_pm_opp_put_opp_table(opp_table); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_sync_regulators); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 8f926815bad9..979b208bc4a8 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -161,6 +161,7 @@ int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cp int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); void dev_pm_opp_remove_table(struct device *dev); void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask); +int dev_pm_opp_sync_regulators(struct device *dev); #else static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) { @@ -384,6 +385,11 @@ static inline void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask { } +static inline int dev_pm_opp_sync_regulators(struct device *dev) +{ + return -ENOTSUPP; +} + #endif /* CONFIG_PM_OPP */ #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) From 406e47652161d4f0d9bc4cd6237b36c51497ec75 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 27 Jan 2021 12:45:56 +0530 Subject: [PATCH 101/307] opp: Create _of_add_table_indexed() to reduce code duplication The implementation of dev_pm_opp_of_add_table() and dev_pm_opp_of_add_table_indexed() are almost identical. 
Create _of_add_table_indexed() to reduce code redundancy. Also remove the duplication of the doc style comments by referring to dev_pm_opp_of_add_table() from dev_pm_opp_of_add_table_indexed(). Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/of.c | 111 ++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 63 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index d0c0336be39b..c6856dcf4c34 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -956,68 +956,7 @@ remove_static_opp: return ret; } -/** - * dev_pm_opp_of_add_table() - Initialize opp table from device tree - * @dev: device pointer used to lookup OPP table. - * - * Register the initial OPP table with the OPP library for given device. - * - * Return: - * 0 On success OR - * Duplicate OPPs (both freq and volt are same) and opp->available - * -EEXIST Freq are same and volt are different OR - * Duplicate OPPs (both freq and volt are same) and !opp->available - * -ENOMEM Memory allocation failure - * -ENODEV when 'operating-points' property is not found or is invalid data - * in device node. - * -ENODATA when empty 'operating-points' property is found - * -EINVAL when invalid entries are found in opp-v2 table - */ -int dev_pm_opp_of_add_table(struct device *dev) -{ - struct opp_table *opp_table; - int ret; - - opp_table = _add_opp_table_indexed(dev, 0); - if (IS_ERR(opp_table)) - return PTR_ERR(opp_table); - - /* - * OPPs have two version of bindings now. Also try the old (v1) - * bindings for backward compatibility with older dtbs. - */ - if (opp_table->np) - ret = _of_add_opp_table_v2(dev, opp_table); - else - ret = _of_add_opp_table_v1(dev, opp_table); - - if (ret) - dev_pm_opp_put_opp_table(opp_table); - - return ret; -} -EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table); - -/** - * dev_pm_opp_of_add_table_indexed() - Initialize indexed opp table from device tree - * @dev: device pointer used to lookup OPP table. - * @index: Index number. 
- * - * Register the initial OPP table with the OPP library for given device only - * using the "operating-points-v2" property. - * - * Return: - * 0 On success OR - * Duplicate OPPs (both freq and volt are same) and opp->available - * -EEXIST Freq are same and volt are different OR - * Duplicate OPPs (both freq and volt are same) and !opp->available - * -ENOMEM Memory allocation failure - * -ENODEV when 'operating-points' property is not found or is invalid data - * in device node. - * -ENODATA when empty 'operating-points' property is found - * -EINVAL when invalid entries are found in opp-v2 table - */ -int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) +static int _of_add_table_indexed(struct device *dev, int index) { struct opp_table *opp_table; int ret, count; @@ -1037,12 +976,58 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) if (IS_ERR(opp_table)) return PTR_ERR(opp_table); - ret = _of_add_opp_table_v2(dev, opp_table); + /* + * OPPs have two version of bindings now. Also try the old (v1) + * bindings for backward compatibility with older dtbs. + */ + if (opp_table->np) + ret = _of_add_opp_table_v2(dev, opp_table); + else + ret = _of_add_opp_table_v1(dev, opp_table); + if (ret) dev_pm_opp_put_opp_table(opp_table); return ret; } + +/** + * dev_pm_opp_of_add_table() - Initialize opp table from device tree + * @dev: device pointer used to lookup OPP table. + * + * Register the initial OPP table with the OPP library for given device. + * + * Return: + * 0 On success OR + * Duplicate OPPs (both freq and volt are same) and opp->available + * -EEXIST Freq are same and volt are different OR + * Duplicate OPPs (both freq and volt are same) and !opp->available + * -ENOMEM Memory allocation failure + * -ENODEV when 'operating-points' property is not found or is invalid data + * in device node. 
+ * -ENODATA when empty 'operating-points' property is found + * -EINVAL when invalid entries are found in opp-v2 table + */ +int dev_pm_opp_of_add_table(struct device *dev) +{ + return _of_add_table_indexed(dev, 0); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table); + +/** + * dev_pm_opp_of_add_table_indexed() - Initialize indexed opp table from device tree + * @dev: device pointer used to lookup OPP table. + * @index: Index number. + * + * Register the initial OPP table with the OPP library for given device only + * using the "operating-points-v2" property. + * + * Return: Refer to dev_pm_opp_of_add_table() for return values. + */ +int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) +{ + return _of_add_table_indexed(dev, index); +} EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed); /* CPU device specific helpers */ From 32439ac7535a8eddfa016c62ca66ce33b7df1573 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 28 Jan 2021 12:05:22 +0530 Subject: [PATCH 102/307] opp: Defer acquiring the clk until OPPs are added We acquire the clk at the time the OPP table is allocated, though it works fine, it is not the best place to do so. One of the main reasons being we may need to acquire it again from dev_pm_opp_set_clkname() if the platform wants another clock to be acquired instead. There is also a requirement from some of the platforms where they do not want the OPP core to manage the clock at all. This patch hence defers acquiring the clk until the time we are certain about which clk we need to acquire and if we really need to acquire one. With this commit, the clk will get acquired either from dev_pm_opp_set_clkname() or while we initialize the OPPs within the table.
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 73 ++++++++++++++++++++++++++++------------------ drivers/opp/of.c | 8 ++--- drivers/opp/opp.h | 2 +- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index bf7cdab0ba64..52f4a64926e6 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1158,21 +1158,11 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) _of_init_opp_table(opp_table, dev, index); - /* Find clk for the device */ - opp_table->clk = clk_get(dev, NULL); - if (IS_ERR(opp_table->clk)) { - ret = PTR_ERR(opp_table->clk); - if (ret == -EPROBE_DEFER) - goto remove_opp_dev; - - dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); - } - /* Find interconnect path(s) for the device */ ret = dev_pm_opp_of_find_icc_paths(dev, opp_table); if (ret) { if (ret == -EPROBE_DEFER) - goto put_clk; + goto remove_opp_dev; dev_warn(dev, "%s: Error finding interconnect paths: %d\n", __func__, ret); @@ -1184,9 +1174,6 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) return opp_table; -put_clk: - if (!IS_ERR(opp_table->clk)) - clk_put(opp_table->clk); remove_opp_dev: _remove_opp_dev(opp_dev, opp_table); err: @@ -1199,6 +1186,33 @@ void _get_opp_table_kref(struct opp_table *opp_table) kref_get(&opp_table->kref); } +static struct opp_table *_update_opp_table_clk(struct device *dev, + struct opp_table *opp_table, + bool getclk) +{ + /* + * Return early if we don't need to get clk or we have already tried it + * earlier. 
+ */ + if (!getclk || IS_ERR(opp_table) || opp_table->clk) + return opp_table; + + /* Find clk for the device */ + opp_table->clk = clk_get(dev, NULL); + if (IS_ERR(opp_table->clk)) { + int ret = PTR_ERR(opp_table->clk); + + if (ret == -EPROBE_DEFER) { + dev_pm_opp_put_opp_table(opp_table); + return ERR_PTR(ret); + } + + dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); + } + + return opp_table; +} + /* * We need to make sure that the OPP table for a device doesn't get added twice, * if this routine gets called in parallel with the same device pointer. @@ -1214,7 +1228,8 @@ void _get_opp_table_kref(struct opp_table *opp_table) * uses the opp_tables_busy flag to indicate if another creator is in the middle * of adding an OPP table and others should wait for it to finish. */ -struct opp_table *_add_opp_table_indexed(struct device *dev, int index) +struct opp_table *_add_opp_table_indexed(struct device *dev, int index, + bool getclk) { struct opp_table *opp_table; @@ -1261,12 +1276,12 @@ again: unlock: mutex_unlock(&opp_table_lock); - return opp_table; + return _update_opp_table_clk(dev, opp_table, getclk); } -static struct opp_table *_add_opp_table(struct device *dev) +static struct opp_table *_add_opp_table(struct device *dev, bool getclk) { - return _add_opp_table_indexed(dev, 0); + return _add_opp_table_indexed(dev, 0, getclk); } struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) @@ -1711,7 +1726,7 @@ struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, { struct opp_table *opp_table; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -1773,7 +1788,7 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { struct opp_table *opp_table; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -1869,7 +1884,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device 
*dev, struct regulator *reg; int ret, i; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -1980,7 +1995,7 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) struct opp_table *opp_table; int ret; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -1990,9 +2005,11 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) goto err; } - /* Already have default clk set, free it */ - if (!IS_ERR(opp_table->clk)) - clk_put(opp_table->clk); + /* clk shouldn't be initialized at this point */ + if (WARN_ON(opp_table->clk)) { + ret = -EBUSY; + goto err; + } /* Find clk for the device */ opp_table->clk = clk_get(dev, name); @@ -2051,7 +2068,7 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, if (!set_opp) return ERR_PTR(-EINVAL); - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -2138,7 +2155,7 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, int index = 0, ret = -EINVAL; const char **name = names; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, false); if (IS_ERR(opp_table)) return opp_table; @@ -2306,7 +2323,7 @@ int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) struct opp_table *opp_table; int ret; - opp_table = _add_opp_table(dev); + opp_table = _add_opp_table(dev, true); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); diff --git a/drivers/opp/of.c b/drivers/opp/of.c index c6856dcf4c34..d4b51b2e384f 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -956,7 +956,7 @@ remove_static_opp: return ret; } -static int _of_add_table_indexed(struct device *dev, int index) +static int _of_add_table_indexed(struct device *dev, int index, bool getclk) { struct opp_table *opp_table; int ret, count; @@ -972,7 +972,7 @@ static 
int _of_add_table_indexed(struct device *dev, int index) index = 0; } - opp_table = _add_opp_table_indexed(dev, index); + opp_table = _add_opp_table_indexed(dev, index, getclk); if (IS_ERR(opp_table)) return PTR_ERR(opp_table); @@ -1010,7 +1010,7 @@ static int _of_add_table_indexed(struct device *dev, int index) */ int dev_pm_opp_of_add_table(struct device *dev) { - return _of_add_table_indexed(dev, 0); + return _of_add_table_indexed(dev, 0, true); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table); @@ -1026,7 +1026,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table); */ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) { - return _of_add_table_indexed(dev, index); + return _of_add_table_indexed(dev, index, true); } EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index ee2593afae0c..6e83855ade1f 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -223,7 +223,7 @@ int _opp_compare_key(struct dev_pm_opp *opp1, struct dev_pm_opp *opp2); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available); int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic); void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu); -struct opp_table *_add_opp_table_indexed(struct device *dev, int index); +struct opp_table *_add_opp_table_indexed(struct device *dev, int index, bool getclk); void _put_opp_list_kref(struct opp_table *opp_table); #ifdef CONFIG_OF From 559fef0dfd91145b59b7c61061504f344ecf9ad8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 27 Jan 2021 14:23:45 +0530 Subject: [PATCH 103/307] opp: Add dev_pm_opp_of_add_table_noclk() A few drivers have device's clk but they don't want the OPP core to handle that. Add a new helper for them, dev_pm_opp_of_add_table_noclk(). 
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/of.c | 18 ++++++++++++++++++ include/linux/pm_opp.h | 6 ++++++ 2 files changed, 24 insertions(+) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index d4b51b2e384f..a905497c75f8 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1030,6 +1030,24 @@ int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) } EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_indexed); +/** + * dev_pm_opp_of_add_table_noclk() - Initialize indexed opp table from device + * tree without getting clk for device. + * @dev: device pointer used to lookup OPP table. + * @index: Index number. + * + * Register the initial OPP table with the OPP library for given device only + * using the "operating-points-v2" property. Do not try to get the clk for the + * device. + * + * Return: Refer to dev_pm_opp_of_add_table() for return values. + */ +int dev_pm_opp_of_add_table_noclk(struct device *dev, int index) +{ + return _of_add_table_indexed(dev, index, false); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table_noclk); + /* CPU device specific helpers */ /** diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 979b208bc4a8..158158620dde 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -395,6 +395,7 @@ static inline int dev_pm_opp_sync_regulators(struct device *dev) #if defined(CONFIG_PM_OPP) && defined(CONFIG_OF) int dev_pm_opp_of_add_table(struct device *dev); int dev_pm_opp_of_add_table_indexed(struct device *dev, int index); +int dev_pm_opp_of_add_table_noclk(struct device *dev, int index); void dev_pm_opp_of_remove_table(struct device *dev); int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask); void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask); @@ -419,6 +420,11 @@ static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int index) return -ENOTSUPP; } +static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index) +{ + return 
-ENOTSUPP; +} + static inline void dev_pm_opp_of_remove_table(struct device *dev) { } From a3c47af6942dc8e07a4328913d0263a965786895 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:20 +0300 Subject: [PATCH 104/307] opp: Add devm_pm_opp_register_set_opp_helper Add resource-managed version of dev_pm_opp_register_set_opp_helper(). Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko [ Viresh: Manually apply the patch and relocate the routines ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 8 ++++++++ 2 files changed, 42 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 52f4a64926e6..09069a564896 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2106,6 +2106,40 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) } EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper); +static void devm_pm_opp_unregister_set_opp_helper(void *data) +{ + dev_pm_opp_unregister_set_opp_helper(data); +} + +/** + * devm_pm_opp_register_set_opp_helper() - Register custom set OPP helper + * @dev: Device for which the helper is getting registered. + * @set_opp: Custom set OPP helper. + * + * This is a resource-managed version of dev_pm_opp_register_set_opp_helper(). + * + * Return: pointer to 'struct opp_table' on success and errorno otherwise. 
+ */ +struct opp_table * +devm_pm_opp_register_set_opp_helper(struct device *dev, + int (*set_opp)(struct dev_pm_set_opp_data *data)) +{ + struct opp_table *opp_table; + int err; + + opp_table = dev_pm_opp_register_set_opp_helper(dev, set_opp); + if (IS_ERR(opp_table)) + return opp_table; + + err = devm_add_action_or_reset(dev, devm_pm_opp_unregister_set_opp_helper, + opp_table); + if (err) + return ERR_PTR(err); + + return opp_table; +} +EXPORT_SYMBOL_GPL(devm_pm_opp_register_set_opp_helper); + static void _opp_detach_genpd(struct opp_table *opp_table) { int index; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 158158620dde..473daf34160d 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -152,6 +152,7 @@ struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name); void dev_pm_opp_put_clkname(struct opp_table *opp_table); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); +struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); void dev_pm_opp_detach_genpd(struct opp_table *opp_table); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); @@ -324,6 +325,13 @@ static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {} +static inline struct opp_table * +devm_pm_opp_register_set_opp_helper(struct device *dev, + int (*set_opp)(struct dev_pm_set_opp_data *data)) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { return ERR_PTR(-ENOTSUPP); 
From b4b9e223eccaeec6e05d927c292d4425fd18f243 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:21 +0300 Subject: [PATCH 105/307] opp: Add devm_pm_opp_attach_genpd Add resource-managed version of dev_pm_opp_attach_genpd(). Signed-off-by: Dmitry Osipenko [ Viresh: Manually apply the patch and relocate the routines ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 7 +++++++ 2 files changed, 43 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 09069a564896..ce0ec5bde22a 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2275,6 +2275,42 @@ void dev_pm_opp_detach_genpd(struct opp_table *opp_table) } EXPORT_SYMBOL_GPL(dev_pm_opp_detach_genpd); +static void devm_pm_opp_detach_genpd(void *data) +{ + dev_pm_opp_detach_genpd(data); +} + +/** + * devm_pm_opp_attach_genpd - Attach genpd(s) for the device and save virtual + * device pointer + * @dev: Consumer device for which the genpd is getting attached. + * @names: Null terminated array of pointers containing names of genpd to attach. + * @virt_devs: Pointer to return the array of virtual devices. + * + * This is a resource-managed version of dev_pm_opp_attach_genpd(). + * + * Return: pointer to 'struct opp_table' on success and errorno otherwise. + */ +struct opp_table * +devm_pm_opp_attach_genpd(struct device *dev, const char **names, + struct device ***virt_devs) +{ + struct opp_table *opp_table; + int err; + + opp_table = dev_pm_opp_attach_genpd(dev, names, virt_devs); + if (IS_ERR(opp_table)) + return opp_table; + + err = devm_add_action_or_reset(dev, devm_pm_opp_detach_genpd, + opp_table); + if (err) + return ERR_PTR(err); + + return opp_table; +} +EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd); + /** * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table. * @src_table: OPP table which has dst_table as one of its required OPP table. 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 473daf34160d..a2c871799603 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -155,6 +155,7 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); void dev_pm_opp_detach_genpd(struct opp_table *opp_table); +struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp); @@ -360,6 +361,12 @@ static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, cons static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {} +static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, + const char **names, struct device ***virt_devs) +{ + return ERR_PTR(-ENOTSUPP); +} + static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate) { return -ENOTSUPP; From f2f4d2b86f432fecfd76afa5f4f60f47833121b5 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:23 +0300 Subject: [PATCH 106/307] opp: Handle missing OPP table in dev_pm_opp_xlate_performance_state() NVIDIA Tegra SoCs have a power domains topology such that child domains only clamp a power rail, while parent domain controls shared performance state of the multiple child domains. In this case child's domain doesn't need to have OPP table. 
Hence we want to allow children power domains to pass performance state to the parent domain if child's domain doesn't have OPP table. The dev_pm_opp_xlate_performance_state() gets src_table=NULL if a child power domain doesn't have OPP table and in this case we should pass the performance state to the parent domain. Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index ce0ec5bde22a..0417cd34b805 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2339,7 +2339,7 @@ int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, * and so none of them have the "required-opps" property set. Return the * pstate of the src_table as it is in such cases. */ - if (!src_table->required_opp_count) + if (!src_table || !src_table->required_opp_count) return pstate; for (i = 0; i < src_table->required_opp_count; i++) { From b6ecd5d4f6941628d0140735d3f05eb61907141e Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Mon, 18 Jan 2021 03:55:24 +0300 Subject: [PATCH 107/307] opp: Print OPP level in debug message of _opp_add_static_v2() Print OPP level in debug message of _opp_add_static_v2(). This helps to chase GENPD bugs. 
Tested-by: Peter Geis Tested-by: Nicolas Chauvet Tested-by: Matt Merhar Signed-off-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/of.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/opp/of.c b/drivers/opp/of.c index a905497c75f8..20ccdaab9384 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -822,10 +822,11 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table, if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max) opp_table->clock_latency_ns_max = new_opp->clock_latency_ns; - pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n", + pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu level:%u\n", __func__, new_opp->turbo, new_opp->rate, new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min, - new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns); + new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns, + new_opp->level); /* * Notify the changes in the availability of the operable From 38bb34393804b79eff647bdf96762db5efce392c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 19 Jan 2021 11:58:58 +0530 Subject: [PATCH 108/307] opp: Prepare for ->set_opp() helper to work without regulators Until now the ->set_opp() helper (i.e. special implementation for setting the OPPs for platforms) was implemented only to take care of multiple regulators case, but going forward we would need that for other use cases as well. This patch prepares for that by allocating the regulator specific part from dev_pm_opp_set_regulators() and the opp helper part from dev_pm_opp_register_set_opp_helper(). 
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 84 +++++++++++++++++++++++++--------------------- drivers/opp/opp.h | 2 ++ 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 0417cd34b805..f482937d72eb 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1832,38 +1832,6 @@ void dev_pm_opp_put_prop_name(struct opp_table *opp_table) } EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); -static int _allocate_set_opp_data(struct opp_table *opp_table) -{ - struct dev_pm_set_opp_data *data; - int len, count = opp_table->regulator_count; - - if (WARN_ON(!opp_table->regulators)) - return -EINVAL; - - /* space for set_opp_data */ - len = sizeof(*data); - - /* space for old_opp.supplies and new_opp.supplies */ - len += 2 * sizeof(struct dev_pm_opp_supply) * count; - - data = kzalloc(len, GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->old_opp.supplies = (void *)(data + 1); - data->new_opp.supplies = data->old_opp.supplies + count; - - opp_table->set_opp_data = data; - - return 0; -} - -static void _free_set_opp_data(struct opp_table *opp_table) -{ - kfree(opp_table->set_opp_data); - opp_table->set_opp_data = NULL; -} - /** * dev_pm_opp_set_regulators() - Set regulator names for the device * @dev: Device for which regulator name is being set. 
@@ -1880,6 +1848,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count) { + struct dev_pm_opp_supply *supplies; struct opp_table *opp_table; struct regulator *reg; int ret, i; @@ -1921,10 +1890,19 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev, opp_table->regulator_count = count; - /* Allocate block only once to pass to set_opp() routines */ - ret = _allocate_set_opp_data(opp_table); - if (ret) + supplies = kmalloc_array(count * 2, sizeof(*supplies), GFP_KERNEL); + if (!supplies) { + ret = -ENOMEM; goto free_regulators; + } + + mutex_lock(&opp_table->lock); + opp_table->sod_supplies = supplies; + if (opp_table->set_opp_data) { + opp_table->set_opp_data->old_opp.supplies = supplies; + opp_table->set_opp_data->new_opp.supplies = supplies + count; + } + mutex_unlock(&opp_table->lock); return opp_table; @@ -1967,7 +1945,15 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table) for (i = opp_table->regulator_count - 1; i >= 0; i--) regulator_put(opp_table->regulators[i]); - _free_set_opp_data(opp_table); + mutex_lock(&opp_table->lock); + if (opp_table->set_opp_data) { + opp_table->set_opp_data->old_opp.supplies = NULL; + opp_table->set_opp_data->new_opp.supplies = NULL; + } + + kfree(opp_table->sod_supplies); + opp_table->sod_supplies = NULL; + mutex_unlock(&opp_table->lock); kfree(opp_table->regulators); opp_table->regulators = NULL; @@ -2063,6 +2049,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)) { + struct dev_pm_set_opp_data *data; struct opp_table *opp_table; if (!set_opp) @@ -2079,8 +2066,23 @@ struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, } /* Another CPU that shares the OPP table has set the helper ? 
*/ - if (!opp_table->set_opp) - opp_table->set_opp = set_opp; + if (opp_table->set_opp) + return opp_table; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + mutex_lock(&opp_table->lock); + opp_table->set_opp_data = data; + if (opp_table->sod_supplies) { + data->old_opp.supplies = opp_table->sod_supplies; + data->new_opp.supplies = opp_table->sod_supplies + + opp_table->regulator_count; + } + mutex_unlock(&opp_table->lock); + + opp_table->set_opp = set_opp; return opp_table; } @@ -2102,6 +2104,12 @@ void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) WARN_ON(!list_empty(&opp_table->opp_list)); opp_table->set_opp = NULL; + + mutex_lock(&opp_table->lock); + kfree(opp_table->set_opp_data); + opp_table->set_opp_data = NULL; + mutex_unlock(&opp_table->lock); + dev_pm_opp_put_opp_table(opp_table); } EXPORT_SYMBOL_GPL(dev_pm_opp_unregister_set_opp_helper); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 6e83855ade1f..64b9cb782a93 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -155,6 +155,7 @@ enum opp_table_access { * @genpd_performance_state: Device's power domain support performance state. * @is_genpd: Marks if the OPP table belongs to a genpd. * @set_opp: Platform specific set_opp callback + * @sod_supplies: Set opp data supplies * @set_opp_data: Data to be passed to set_opp callback * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. 
@@ -202,6 +203,7 @@ struct opp_table { bool is_genpd; int (*set_opp)(struct dev_pm_set_opp_data *data); + struct dev_pm_opp_supply *sod_supplies; struct dev_pm_set_opp_data *set_opp_data; #ifdef CONFIG_DEBUG_FS From 04b447df1d098dcd7d133203a310a6d415875547 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Thu, 21 Jan 2021 01:26:49 +0300 Subject: [PATCH 109/307] opp: Make _set_opp_custom() work without regulators Check whether OPP table has regulators in _set_opp_custom() and set up dev_pm_set_opp_data accordingly. Now _set_opp_custom() works properly, i.e. it doesn't crash if OPP table doesn't have assigned regulators. Signed-off-by: Dmitry Osipenko [ Viresh: Rearrange the routine a bit ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index f482937d72eb..b4528e40ad01 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -828,24 +828,31 @@ static int _set_opp_custom(const struct opp_table *opp_table, struct dev_pm_opp_supply *old_supply, struct dev_pm_opp_supply *new_supply) { - struct dev_pm_set_opp_data *data; + struct dev_pm_set_opp_data *data = opp_table->set_opp_data; int size; - data = opp_table->set_opp_data; + /* + * We support this only if dev_pm_opp_set_regulators() was called + * earlier. 
+ */ + if (opp_table->sod_supplies) { + size = sizeof(*old_supply) * opp_table->regulator_count; + if (!old_supply) + memset(data->old_opp.supplies, 0, size); + else + memcpy(data->old_opp.supplies, old_supply, size); + + memcpy(data->new_opp.supplies, new_supply, size); + data->regulator_count = opp_table->regulator_count; + } else { + data->regulator_count = 0; + } + data->regulators = opp_table->regulators; - data->regulator_count = opp_table->regulator_count; data->clk = opp_table->clk; data->dev = dev; - data->old_opp.rate = old_freq; - size = sizeof(*old_supply) * opp_table->regulator_count; - if (!old_supply) - memset(data->old_opp.supplies, 0, size); - else - memcpy(data->old_opp.supplies, old_supply, size); - data->new_opp.rate = freq; - memcpy(data->new_opp.supplies, new_supply, size); return opp_table->set_opp(data); } From 5ad58bbacf802f7d11cadd76881311d6e4b2bce0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 12:08:45 +0530 Subject: [PATCH 110/307] opp: Rename _opp_set_rate_zero() This routine has nothing to do with frequency, it just disables all the resources previously enabled. Rename it to match its purpose. 
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index b4528e40ad01..9637f2994d2e 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -950,7 +950,7 @@ int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) } EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw); -static int _opp_set_rate_zero(struct device *dev, struct opp_table *opp_table) +static int _disable_opp_table(struct device *dev, struct opp_table *opp_table) { int ret; @@ -1004,7 +1004,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } if (unlikely(!target_freq)) { - ret = _opp_set_rate_zero(dev, opp_table); + ret = _disable_opp_table(dev, opp_table); goto put_opp_table; } From 1d3c42cabbd351e9c171e906603b5cc2ea513640 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 20 Jan 2021 15:57:21 +0530 Subject: [PATCH 111/307] opp: No need to check clk for errors Clock is not optional for users who call into dev_pm_opp_set_rate(). Remove the unnecessary checks. While at it also drop the local variable for clk and use opp_table->clk instead. 
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 9637f2994d2e..8ef85cd918ce 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -994,7 +994,6 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) struct opp_table *opp_table; unsigned long freq, old_freq, temp_freq; struct dev_pm_opp *old_opp, *opp; - struct clk *clk; int ret; opp_table = _find_opp_table(dev); @@ -1008,19 +1007,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) goto put_opp_table; } - clk = opp_table->clk; - if (IS_ERR(clk)) { - dev_err(dev, "%s: No clock available for the device\n", - __func__); - ret = PTR_ERR(clk); - goto put_opp_table; - } - - freq = clk_round_rate(clk, target_freq); + freq = clk_round_rate(opp_table->clk, target_freq); if ((long)freq <= 0) freq = target_freq; - old_freq = clk_get_rate(clk); + old_freq = clk_get_rate(opp_table->clk); /* Return early if nothing to do */ if (opp_table->enabled && old_freq == freq) { @@ -1038,7 +1029,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) * equivalent to a clk_set_rate() */ if (!_get_opp_count(opp_table)) { - ret = _generic_set_opp_clk_only(dev, clk, freq); + ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq); goto put_opp_table; } @@ -1078,7 +1069,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) opp->supplies); } else { /* Only frequency scaling */ - ret = _generic_set_opp_clk_only(dev, clk, freq); + ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq); } /* Scaling down? 
Configure required OPPs after frequency */ From 81c4d8a3c41488e5491142c31cd7a821ff5d71ec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 20 Jan 2021 16:16:48 +0530 Subject: [PATCH 112/307] opp: Keep track of currently programmed OPP The dev_pm_opp_set_rate() helper needs to know the currently programmed OPP to make a few decisions and currently we try to find it on every invocation of this routine. Let's start keeping track of the current_opp programmed for the devices of the opp table, that will be quite useful going forward. If we fail to find the current OPP, we pick the first one available in the list, as the list is in ascending order of frequencies, level, or bandwidth and that's the best guess we can make anyway. Note that we used to do the frequency comparison a bit early in dev_pm_opp_set_rate() previously, and now instead we check the target opp, which shall be more accurate anyway. We need to make sure that current_opp's memory doesn't get freed while it is being used and so we keep a reference to it until the time it is used. Now that current_opp will always be set, we can drop some unnecessary checks as well.
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 84 +++++++++++++++++++++++++++++----------------- drivers/opp/opp.h | 2 ++ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 8ef85cd918ce..c77d8ae89836 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -788,8 +788,7 @@ restore_freq: __func__, old_freq); restore_voltage: /* This shouldn't harm even if the voltages weren't updated earlier */ - if (old_supply) - _set_opp_voltage(dev, reg, old_supply); + _set_opp_voltage(dev, reg, old_supply); return ret; } @@ -837,11 +836,7 @@ static int _set_opp_custom(const struct opp_table *opp_table, */ if (opp_table->sod_supplies) { size = sizeof(*old_supply) * opp_table->regulator_count; - if (!old_supply) - memset(data->old_opp.supplies, 0, size); - else - memcpy(data->old_opp.supplies, old_supply, size); - + memcpy(data->old_opp.supplies, old_supply, size); memcpy(data->new_opp.supplies, new_supply, size); data->regulator_count = opp_table->regulator_count; } else { @@ -950,6 +945,31 @@ int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) } EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw); +static void _find_current_opp(struct device *dev, struct opp_table *opp_table) +{ + struct dev_pm_opp *opp = ERR_PTR(-ENODEV); + unsigned long freq; + + if (!IS_ERR(opp_table->clk)) { + freq = clk_get_rate(opp_table->clk); + opp = _find_freq_ceil(opp_table, &freq); + } + + /* + * Unable to find the current OPP ? Pick the first from the list since + * it is in ascending order, otherwise rest of the code will need to + * make special checks to validate current_opp. 
+ */ + if (IS_ERR(opp)) { + mutex_lock(&opp_table->lock); + opp = list_first_entry(&opp_table->opp_list, struct dev_pm_opp, node); + dev_pm_opp_get(opp); + mutex_unlock(&opp_table->lock); + } + + opp_table->current_opp = opp; +} + static int _disable_opp_table(struct device *dev, struct opp_table *opp_table) { int ret; @@ -1011,16 +1031,6 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) if ((long)freq <= 0) freq = target_freq; - old_freq = clk_get_rate(opp_table->clk); - - /* Return early if nothing to do */ - if (opp_table->enabled && old_freq == freq) { - dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", - __func__, freq); - ret = 0; - goto put_opp_table; - } - /* * For IO devices which require an OPP on some platforms/SoCs * while just needing to scale the clock on some others @@ -1033,12 +1043,9 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) goto put_opp_table; } - temp_freq = old_freq; - old_opp = _find_freq_ceil(opp_table, &temp_freq); - if (IS_ERR(old_opp)) { - dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n", - __func__, old_freq, PTR_ERR(old_opp)); - } + /* Find the currently set OPP if we don't know already */ + if (unlikely(!opp_table->current_opp)) + _find_current_opp(dev, opp_table); temp_freq = freq; opp = _find_freq_ceil(opp_table, &temp_freq); @@ -1046,7 +1053,17 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) ret = PTR_ERR(opp); dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n", __func__, freq, ret); - goto put_old_opp; + goto put_opp_table; + } + + old_opp = opp_table->current_opp; + old_freq = old_opp->rate; + + /* Return early if nothing to do */ + if (opp_table->enabled && old_opp == opp) { + dev_dbg(dev, "%s: OPPs are same, nothing to do\n", __func__); + ret = 0; + goto put_opp; } dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__, @@ -1061,11 +1078,10 @@ int dev_pm_opp_set_rate(struct device *dev, 
unsigned long target_freq) if (opp_table->set_opp) { ret = _set_opp_custom(opp_table, dev, old_freq, freq, - IS_ERR(old_opp) ? NULL : old_opp->supplies, - opp->supplies); + old_opp->supplies, opp->supplies); } else if (opp_table->regulators) { ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq, - IS_ERR(old_opp) ? NULL : old_opp->supplies, + old_opp->supplies, opp->supplies); } else { /* Only frequency scaling */ @@ -1081,15 +1097,18 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) if (!ret) { ret = _set_opp_bw(opp_table, opp, dev, false); - if (!ret) + if (!ret) { opp_table->enabled = true; + dev_pm_opp_put(old_opp); + + /* Make sure current_opp doesn't get freed */ + dev_pm_opp_get(opp); + opp_table->current_opp = opp; + } } put_opp: dev_pm_opp_put(opp); -put_old_opp: - if (!IS_ERR(old_opp)) - dev_pm_opp_put(old_opp); put_opp_table: dev_pm_opp_put_opp_table(opp_table); return ret; @@ -1298,6 +1317,9 @@ static void _opp_table_kref_release(struct kref *kref) list_del(&opp_table->node); mutex_unlock(&opp_table_lock); + if (opp_table->current_opp) + dev_pm_opp_put(opp_table->current_opp); + _of_clear_opp_table(opp_table); /* Release clk */ diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 64b9cb782a93..372df68e185b 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -135,6 +135,7 @@ enum opp_table_access { * @clock_latency_ns_max: Max clock latency in nanoseconds. * @parsed_static_opps: Count of devices for which OPPs are initialized from DT. * @shared_opp: OPP is shared between multiple devices. + * @current_opp: Currently configured OPP for the table. * @suspend_opp: Pointer to OPP to be used during device suspend. * @genpd_virt_dev_lock: Mutex protecting the genpd virtual device pointers. * @genpd_virt_devs: List of virtual devices for multiple genpd support. 
@@ -183,6 +184,7 @@ struct opp_table { unsigned int parsed_static_opps; enum opp_table_access shared_opp; + struct dev_pm_opp *current_opp; struct dev_pm_opp *suspend_opp; struct mutex genpd_virt_dev_lock; From 386ba854d9f3163aed0119b167a874169410d8bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 12:12:09 +0530 Subject: [PATCH 113/307] opp: Split _set_opp() out of dev_pm_opp_set_rate() The _set_opp() helper will be used for devices which don't change their frequency (like power domains, etc.) later on, prepare for that by breaking the generic part out of dev_pm_opp_set_rate(). Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 126 +++++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 55 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index c77d8ae89836..2c8939d18783 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -998,72 +998,27 @@ static int _disable_opp_table(struct device *dev, struct opp_table *opp_table) return ret; } -/** - * dev_pm_opp_set_rate() - Configure new OPP based on frequency - * @dev: device for which we do this operation - * @target_freq: frequency to achieve - * - * This configures the power-supplies to the levels specified by the OPP - * corresponding to the target_freq, and programs the clock to a value <= - * target_freq, as rounded by clk_round_rate(). Device wanting to run at fmax - * provided by the opp, should have already rounded to the target OPP's - * frequency. 
- */ -int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) +static int _set_opp(struct device *dev, struct opp_table *opp_table, + struct dev_pm_opp *opp, unsigned long freq) { - struct opp_table *opp_table; - unsigned long freq, old_freq, temp_freq; - struct dev_pm_opp *old_opp, *opp; + struct dev_pm_opp *old_opp; + unsigned long old_freq; int ret; - opp_table = _find_opp_table(dev); - if (IS_ERR(opp_table)) { - dev_err(dev, "%s: device opp doesn't exist\n", __func__); - return PTR_ERR(opp_table); - } - - if (unlikely(!target_freq)) { - ret = _disable_opp_table(dev, opp_table); - goto put_opp_table; - } - - freq = clk_round_rate(opp_table->clk, target_freq); - if ((long)freq <= 0) - freq = target_freq; - - /* - * For IO devices which require an OPP on some platforms/SoCs - * while just needing to scale the clock on some others - * we look for empty OPP tables with just a clock handle and - * scale only the clk. This makes dev_pm_opp_set_rate() - * equivalent to a clk_set_rate() - */ - if (!_get_opp_count(opp_table)) { - ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq); - goto put_opp_table; - } + if (unlikely(!opp)) + return _disable_opp_table(dev, opp_table); /* Find the currently set OPP if we don't know already */ if (unlikely(!opp_table->current_opp)) _find_current_opp(dev, opp_table); - temp_freq = freq; - opp = _find_freq_ceil(opp_table, &temp_freq); - if (IS_ERR(opp)) { - ret = PTR_ERR(opp); - dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n", - __func__, freq, ret); - goto put_opp_table; - } - old_opp = opp_table->current_opp; old_freq = old_opp->rate; /* Return early if nothing to do */ if (opp_table->enabled && old_opp == opp) { dev_dbg(dev, "%s: OPPs are same, nothing to do\n", __func__); - ret = 0; - goto put_opp; + return 0; } dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__, @@ -1073,7 +1028,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) if (freq >= old_freq) { ret = 
_set_required_opps(dev, opp_table, opp, true); if (ret) - goto put_opp; + return ret; } if (opp_table->set_opp) { @@ -1107,8 +1062,69 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } } -put_opp: - dev_pm_opp_put(opp); + return ret; +} + +/** + * dev_pm_opp_set_rate() - Configure new OPP based on frequency + * @dev: device for which we do this operation + * @target_freq: frequency to achieve + * + * This configures the power-supplies to the levels specified by the OPP + * corresponding to the target_freq, and programs the clock to a value <= + * target_freq, as rounded by clk_round_rate(). Device wanting to run at fmax + * provided by the opp, should have already rounded to the target OPP's + * frequency. + */ +int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) +{ + struct opp_table *opp_table; + unsigned long freq = 0, temp_freq; + struct dev_pm_opp *opp = NULL; + int ret; + + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "%s: device's opp table doesn't exist\n", __func__); + return PTR_ERR(opp_table); + } + + if (target_freq) { + /* + * For IO devices which require an OPP on some platforms/SoCs + * while just needing to scale the clock on some others + * we look for empty OPP tables with just a clock handle and + * scale only the clk. This makes dev_pm_opp_set_rate() + * equivalent to a clk_set_rate() + */ + if (!_get_opp_count(opp_table)) { + ret = _generic_set_opp_clk_only(dev, opp_table->clk, target_freq); + goto put_opp_table; + } + + freq = clk_round_rate(opp_table->clk, target_freq); + if ((long)freq <= 0) + freq = target_freq; + + /* + * The clock driver may support finer resolution of the + * frequencies than the OPP table, don't update the frequency we + * pass to clk_set_rate() here. 
+ */ + temp_freq = freq; + opp = _find_freq_ceil(opp_table, &temp_freq); + if (IS_ERR(opp)) { + ret = PTR_ERR(opp); + dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n", + __func__, freq, ret); + goto put_opp_table; + } + } + + ret = _set_opp(dev, opp_table, opp, freq); + + if (target_freq) + dev_pm_opp_put(opp); put_opp_table: dev_pm_opp_put_opp_table(opp_table); return ret; From f0b88fa45595254fa51427bd8ca321732e2eb73d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 16:00:12 +0530 Subject: [PATCH 114/307] opp: Allow _set_opp() to work for non-freq devices The _set_opp() helper will be used for devices which don't change frequency (like power domains, etc.) later on, prepare for that by not relying on frequency for making decisions here. While at it, also update the debug print to contain all relevant information. Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 2c8939d18783..cce1b59d7bca 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1003,7 +1003,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, { struct dev_pm_opp *old_opp; unsigned long old_freq; - int ret; + int scaling_down, ret; if (unlikely(!opp)) return _disable_opp_table(dev, opp_table); @@ -1021,11 +1021,17 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, return 0; } - dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__, - old_freq, freq); + dev_dbg(dev, "%s: switching OPP: Freq %lu -> %lu Hz, Level %u -> %u, Bw %u -> %u\n", + __func__, old_freq, freq, old_opp->level, opp->level, + old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0, + opp->bandwidth ? opp->bandwidth[0].peak : 0); + + scaling_down = _opp_compare_key(old_opp, opp); + if (scaling_down == -1) + scaling_down = 0; /* Scaling up? 
Configure required OPPs before frequency */ - if (freq >= old_freq) { + if (!scaling_down) { ret = _set_required_opps(dev, opp_table, opp, true); if (ret) return ret; @@ -1044,7 +1050,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, } /* Scaling down? Configure required OPPs after frequency */ - if (!ret && freq < old_freq) { + if (!ret && scaling_down) { ret = _set_required_opps(dev, opp_table, opp, false); if (ret) dev_err(dev, "Failed to set required opps: %d\n", ret); From 3f62670fcca4af3fe6492100a548603831ecc61d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 12:38:13 +0530 Subject: [PATCH 115/307] opp: Allow _generic_set_opp_regulator() to work for non-freq devices The _generic_set_opp_regulator() helper will be used for devices which don't change frequency (like power domains, etc.) later on, prepare for that by not relying on frequency for making decisions here. While at it, update its parameters to pass only what is necessary. Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index cce1b59d7bca..c078c7dab6b2 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -737,12 +737,12 @@ static inline int _generic_set_opp_clk_only(struct device *dev, struct clk *clk, static int _generic_set_opp_regulator(struct opp_table *opp_table, struct device *dev, - unsigned long old_freq, + struct dev_pm_opp *opp, unsigned long freq, - struct dev_pm_opp_supply *old_supply, - struct dev_pm_opp_supply *new_supply) + int scaling_down) { struct regulator *reg = opp_table->regulators[0]; + struct dev_pm_opp *old_opp = opp_table->current_opp; int ret; /* This function only supports single regulator per device */ @@ -752,8 +752,8 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table, } /* Scaling up? 
Scale voltage before frequency */ - if (freq >= old_freq) { - ret = _set_opp_voltage(dev, reg, new_supply); + if (!scaling_down) { + ret = _set_opp_voltage(dev, reg, opp->supplies); if (ret) goto restore_voltage; } @@ -764,8 +764,8 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table, goto restore_voltage; /* Scaling down? Scale voltage after frequency */ - if (freq < old_freq) { - ret = _set_opp_voltage(dev, reg, new_supply); + if (scaling_down) { + ret = _set_opp_voltage(dev, reg, opp->supplies); if (ret) goto restore_freq; } @@ -783,12 +783,12 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table, return 0; restore_freq: - if (_generic_set_opp_clk_only(dev, opp_table->clk, old_freq)) + if (_generic_set_opp_clk_only(dev, opp_table->clk, old_opp->rate)) dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", - __func__, old_freq); + __func__, old_opp->rate); restore_voltage: /* This shouldn't harm even if the voltages weren't updated earlier */ - _set_opp_voltage(dev, reg, old_supply); + _set_opp_voltage(dev, reg, old_opp->supplies); return ret; } @@ -1041,9 +1041,8 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, ret = _set_opp_custom(opp_table, dev, old_freq, freq, old_opp->supplies, opp->supplies); } else if (opp_table->regulators) { - ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq, - old_opp->supplies, - opp->supplies); + ret = _generic_set_opp_regulator(opp_table, dev, opp, freq, + scaling_down); } else { /* Only frequency scaling */ ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq); From 35e74b2ee8ec64da6f8067c5b0744f16ff19915b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 12:38:13 +0530 Subject: [PATCH 116/307] opp: Allow _generic_set_opp_clk_only() to work for non-freq devices In order to avoid conditional statements at the caller site, this patch updates _generic_set_opp_clk_only() to work for devices that don't change frequency (like power domains, 
etc.). Return 0 if the clk pointer passed to this routine is not valid. Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index c078c7dab6b2..f21ce52a5002 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -726,6 +726,10 @@ static inline int _generic_set_opp_clk_only(struct device *dev, struct clk *clk, { int ret; + /* We may reach here for devices which don't change frequency */ + if (IS_ERR(clk)) + return 0; + ret = clk_set_rate(clk, freq); if (ret) { dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, From 509e4777ca41d30808deda5ae3c1e09e3f58a33f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 13:06:01 +0530 Subject: [PATCH 117/307] opp: Update parameters of _set_opp_custom() Drop the unnecessary parameters and follow the pattern from _generic_set_opp_regulator(). While at it, also remove the local variable old_freq. Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index f21ce52a5002..2b5584ef0350 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -826,12 +826,11 @@ static int _set_opp_bw(const struct opp_table *opp_table, } static int _set_opp_custom(const struct opp_table *opp_table, - struct device *dev, unsigned long old_freq, - unsigned long freq, - struct dev_pm_opp_supply *old_supply, - struct dev_pm_opp_supply *new_supply) + struct device *dev, struct dev_pm_opp *opp, + unsigned long freq) { struct dev_pm_set_opp_data *data = opp_table->set_opp_data; + struct dev_pm_opp *old_opp = opp_table->current_opp; int size; /* @@ -839,9 +838,9 @@ static int _set_opp_custom(const struct opp_table *opp_table, * earlier. 
*/ if (opp_table->sod_supplies) { - size = sizeof(*old_supply) * opp_table->regulator_count; - memcpy(data->old_opp.supplies, old_supply, size); - memcpy(data->new_opp.supplies, new_supply, size); + size = sizeof(*old_opp->supplies) * opp_table->regulator_count; + memcpy(data->old_opp.supplies, old_opp->supplies, size); + memcpy(data->new_opp.supplies, opp->supplies, size); data->regulator_count = opp_table->regulator_count; } else { data->regulator_count = 0; @@ -850,7 +849,7 @@ static int _set_opp_custom(const struct opp_table *opp_table, data->regulators = opp_table->regulators; data->clk = opp_table->clk; data->dev = dev; - data->old_opp.rate = old_freq; + data->old_opp.rate = old_opp->rate; data->new_opp.rate = freq; return opp_table->set_opp(data); @@ -1006,7 +1005,6 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, struct dev_pm_opp *opp, unsigned long freq) { struct dev_pm_opp *old_opp; - unsigned long old_freq; int scaling_down, ret; if (unlikely(!opp)) @@ -1017,7 +1015,6 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, _find_current_opp(dev, opp_table); old_opp = opp_table->current_opp; - old_freq = old_opp->rate; /* Return early if nothing to do */ if (opp_table->enabled && old_opp == opp) { @@ -1026,7 +1023,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, } dev_dbg(dev, "%s: switching OPP: Freq %lu -> %lu Hz, Level %u -> %u, Bw %u -> %u\n", - __func__, old_freq, freq, old_opp->level, opp->level, + __func__, old_opp->rate, freq, old_opp->level, opp->level, old_opp->bandwidth ? old_opp->bandwidth[0].peak : 0, opp->bandwidth ? 
opp->bandwidth[0].peak : 0); @@ -1042,8 +1039,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, } if (opp_table->set_opp) { - ret = _set_opp_custom(opp_table, dev, old_freq, freq, - old_opp->supplies, opp->supplies); + ret = _set_opp_custom(opp_table, dev, opp, freq); } else if (opp_table->regulators) { ret = _generic_set_opp_regulator(opp_table, dev, opp, freq, scaling_down); From abbe348340c7df9e08fd7c24491c1be31ab65370 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 12:15:36 +0530 Subject: [PATCH 118/307] opp: Implement dev_pm_opp_set_opp() The new helper dev_pm_opp_set_opp() can be used for configuring the devices for a particular OPP and can be used by different type of devices, even the ones which don't change frequency (like power domains). Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 28 ++++++++++++++++++++++++++++ include/linux/pm_opp.h | 6 ++++++ 2 files changed, 34 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 2b5584ef0350..fac84d5a1d45 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1136,6 +1136,34 @@ put_opp_table: } EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); +/** + * dev_pm_opp_set_opp() - Configure device for OPP + * @dev: device for which we do this operation + * @opp: OPP to set to + * + * This configures the device based on the properties of the OPP passed to this + * routine. + * + * Return: 0 on success, a negative error number otherwise. + */ +int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) +{ + struct opp_table *opp_table; + int ret; + + opp_table = _find_opp_table(dev); + if (IS_ERR(opp_table)) { + dev_err(dev, "%s: device opp doesn't exist\n", __func__); + return PTR_ERR(opp_table); + } + + ret = _set_opp(dev, opp_table, opp, opp ? 
opp->rate : 0); + dev_pm_opp_put_opp_table(opp_table); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_set_opp); + /* OPP-dev Helpers */ static void _remove_opp_dev(struct opp_device *opp_dev, struct opp_table *opp_table) diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index a2c871799603..7b1d47ab3fb3 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -158,6 +158,7 @@ void dev_pm_opp_detach_genpd(struct opp_table *opp_table); struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); +int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp); int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); @@ -377,6 +378,11 @@ static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_f return -ENOTSUPP; } +static inline int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) +{ + return -ENOTSUPP; +} + static inline int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) { return -EOPNOTSUPP; From 8d25157f738c413b40b82776b0d260cd23505266 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 15:27:55 +0530 Subject: [PATCH 119/307] cpufreq: qcom: Migrate to dev_pm_opp_set_opp() dev_pm_opp_set_bw() is getting removed and dev_pm_opp_set_opp() should be used instead. Migrate to the new API. 
Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/cpufreq/qcom-cpufreq-hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 9ed5341dc515..7df18903b66c 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -54,7 +54,7 @@ static int qcom_cpufreq_set_bw(struct cpufreq_policy *policy, if (IS_ERR(opp)) return PTR_ERR(opp); - ret = dev_pm_opp_set_bw(dev, opp); + ret = dev_pm_opp_set_opp(dev, opp); dev_pm_opp_put(opp); return ret; } From 920b4a678099dd7429f03cb00649c5455f21cc67 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 15:27:55 +0530 Subject: [PATCH 120/307] drm: msm: Migrate to dev_pm_opp_set_opp() dev_pm_opp_set_bw() is getting removed and dev_pm_opp_set_opp() should be used instead. Migrate to the new API. Signed-off-by: Viresh Kumar --- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index e6703ae98760..05e0ef58fe32 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -134,7 +134,7 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) if (!gmu->legacy) { a6xx_hfi_set_freq(gmu, perf_index); - dev_pm_opp_set_bw(&gpu->pdev->dev, opp); + dev_pm_opp_set_opp(&gpu->pdev->dev, opp); pm_runtime_put(gmu->dev); return; } @@ -158,7 +158,7 @@ void a6xx_gmu_set_freq(struct msm_gpu *gpu, struct dev_pm_opp *opp) if (ret) dev_err(gmu->dev, "GMU set GPU frequency error: %d\n", ret); - dev_pm_opp_set_bw(&gpu->pdev->dev, opp); + dev_pm_opp_set_opp(&gpu->pdev->dev, opp); pm_runtime_put(gmu->dev); } @@ -866,7 +866,7 @@ static void a6xx_gmu_set_initial_bw(struct msm_gpu *gpu, struct a6xx_gmu *gmu) if (IS_ERR_OR_NULL(gpu_opp)) return; - dev_pm_opp_set_bw(&gpu->pdev->dev, gpu_opp); + dev_pm_opp_set_opp(&gpu->pdev->dev, 
gpu_opp); dev_pm_opp_put(gpu_opp); } @@ -1072,7 +1072,7 @@ int a6xx_gmu_stop(struct a6xx_gpu *a6xx_gpu) a6xx_gmu_shutdown(gmu); /* Remove the bus vote */ - dev_pm_opp_set_bw(&gpu->pdev->dev, NULL); + dev_pm_opp_set_opp(&gpu->pdev->dev, NULL); /* * Make sure the GX domain is off before turning off the GMU (CX) From c7f142190d91a7e8b3df0a6ef9fabb591fb83c71 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 15:27:55 +0530 Subject: [PATCH 121/307] devfreq: tegra30: Migrate to dev_pm_opp_set_opp() dev_pm_opp_set_bw() is getting removed and dev_pm_opp_set_opp() should be used instead. Migrate to the new API. We don't want the OPP core to manage the clk for this driver, migrate to dev_pm_opp_of_add_table_noclk() to make sure dev_pm_opp_set_opp() doesn't have any side effects. Signed-off-by: Viresh Kumar Acked-by: Chanwoo Choi Tested-by: Dmitry Osipenko --- drivers/devfreq/tegra30-devfreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/devfreq/tegra30-devfreq.c b/drivers/devfreq/tegra30-devfreq.c index 117cad7968ab..ce83f883ca65 100644 --- a/drivers/devfreq/tegra30-devfreq.c +++ b/drivers/devfreq/tegra30-devfreq.c @@ -647,7 +647,7 @@ static int tegra_devfreq_target(struct device *dev, unsigned long *freq, return PTR_ERR(opp); } - ret = dev_pm_opp_set_bw(dev, opp); + ret = dev_pm_opp_set_opp(dev, opp); dev_pm_opp_put(opp); return ret; @@ -849,7 +849,7 @@ static int tegra_devfreq_probe(struct platform_device *pdev) return err; } - err = dev_pm_opp_of_add_table(&pdev->dev); + err = dev_pm_opp_of_add_table_noclk(&pdev->dev, 0); if (err) { dev_err(&pdev->dev, "Failed to add OPP table: %d\n", err); goto put_hw; From 240ae50e23061cd1fe1937daab195c17226ffd2e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 21 Jan 2021 15:27:55 +0530 Subject: [PATCH 122/307] opp: Remove dev_pm_opp_set_bw() All the users have migrated to dev_pm_opp_set_opp() now, get rid of the duplicate API, dev_pm_opp_set_bw(), which only performs a part of the 
new API. While at it, remove the unnecessary parameter to _set_opp_bw(). Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 41 +++++------------------------------------ include/linux/pm_opp.h | 6 ------ 2 files changed, 5 insertions(+), 42 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index fac84d5a1d45..6958a5cd2fd8 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -798,7 +798,7 @@ restore_voltage: } static int _set_opp_bw(const struct opp_table *opp_table, - struct dev_pm_opp *opp, struct device *dev, bool remove) + struct dev_pm_opp *opp, struct device *dev) { u32 avg, peak; int i, ret; @@ -807,7 +807,7 @@ static int _set_opp_bw(const struct opp_table *opp_table, return 0; for (i = 0; i < opp_table->path_count; i++) { - if (remove) { + if (!opp) { avg = 0; peak = 0; } else { @@ -817,7 +817,7 @@ static int _set_opp_bw(const struct opp_table *opp_table, ret = icc_set_bw(opp_table->paths[i], avg, peak); if (ret) { dev_err(dev, "Failed to %s bandwidth[%d]: %d\n", - remove ? "remove" : "set", i, ret); + opp ? "set" : "remove", i, ret); return ret; } } @@ -917,37 +917,6 @@ static int _set_required_opps(struct device *dev, return ret; } -/** - * dev_pm_opp_set_bw() - sets bandwidth levels corresponding to an opp - * @dev: device for which we do this operation - * @opp: opp based on which the bandwidth levels are to be configured - * - * This configures the bandwidth to the levels specified by the OPP. However - * if the OPP specified is NULL the bandwidth levels are cleared out. - * - * Return: 0 on success or a negative error value. 
- */ -int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) -{ - struct opp_table *opp_table; - int ret; - - opp_table = _find_opp_table(dev); - if (IS_ERR(opp_table)) { - dev_err(dev, "%s: device opp table doesn't exist\n", __func__); - return PTR_ERR(opp_table); - } - - if (opp) - ret = _set_opp_bw(opp_table, opp, dev, false); - else - ret = _set_opp_bw(opp_table, NULL, dev, true); - - dev_pm_opp_put_opp_table(opp_table); - return ret; -} -EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw); - static void _find_current_opp(struct device *dev, struct opp_table *opp_table) { struct dev_pm_opp *opp = ERR_PTR(-ENODEV); @@ -988,7 +957,7 @@ static int _disable_opp_table(struct device *dev, struct opp_table *opp_table) if (!_get_opp_count(opp_table)) return 0; - ret = _set_opp_bw(opp_table, NULL, dev, true); + ret = _set_opp_bw(opp_table, NULL, dev); if (ret) return ret; @@ -1056,7 +1025,7 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, } if (!ret) { - ret = _set_opp_bw(opp_table, opp, dev, false); + ret = _set_opp_bw(opp_table, opp, dev); if (!ret) { opp_table->enabled = true; dev_pm_opp_put(old_opp); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 7b1d47ab3fb3..25e47ab937b9 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -159,7 +159,6 @@ struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **name int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp); -int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); void dev_pm_opp_remove_table(struct device *dev); @@ -383,11 +382,6 @@ static inline int 
dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) return -ENOTSUPP; } -static inline int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) -{ - return -EOPNOTSUPP; -} - static inline int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask) { return -ENOTSUPP; From 7eba0c7641b0009818e469dbfcdd87a0155ab9d4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 25 Nov 2019 13:57:58 +0530 Subject: [PATCH 123/307] opp: Allow lazy-linking of required-opps The OPP core currently requires the required opp tables to be available before the dependent OPP table is added, as it needs to create links from the dependent OPP table to the required ones. This may not be convenient for all the platforms though, as this requires strict ordering for probing the drivers. This patch allows lazy-linking of the required-opps. The OPP tables for which the required-opp-tables aren't available at the time of their initialization, are added to a special list of OPP tables: lazy_opp_tables. Later on, whenever a new OPP table is registered with the OPP core, we check if it is required by an OPP table in the pending list; if yes, then we complete the linking then and there. An OPP table is marked unusable until the time all its required-opp tables are available. And if lazy-linking fails for an OPP table, the OPP core disables all of its OPPs to make sure no one can use them. Tested-by: Hsin-Yi Wang Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 45 +++++++++++++---- drivers/opp/of.c | 122 +++++++++++++++++++++++++++++++++++++++++++-- drivers/opp/opp.h | 10 +++- 3 files changed, 161 insertions(+), 16 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 6958a5cd2fd8..e03600547b98 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -27,6 +27,10 @@ * various states of availability. 
*/ LIST_HEAD(opp_tables); + +/* OPP tables with uninitialized required OPPs */ +LIST_HEAD(lazy_opp_tables); + /* Lock to allow exclusive modification to the device and opp lists */ DEFINE_MUTEX(opp_table_lock); /* Flag indicating that opp_tables list is being updated at the moment */ @@ -163,6 +167,10 @@ unsigned int dev_pm_opp_get_required_pstate(struct dev_pm_opp *opp, return 0; } + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(opp->opp_table)) + return 0; + return opp->required_opps[index]->pstate; } EXPORT_SYMBOL_GPL(dev_pm_opp_get_required_pstate); @@ -885,6 +893,10 @@ static int _set_required_opps(struct device *dev, if (!required_opp_tables) return 0; + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(opp_table)) + return -EBUSY; + /* Single genpd case */ if (!genpd_virt_devs) return _set_required_opp(dev, dev, opp, 0); @@ -1181,6 +1193,7 @@ static struct opp_table *_allocate_opp_table(struct device *dev, int index) mutex_init(&opp_table->lock); mutex_init(&opp_table->genpd_virt_dev_lock); INIT_LIST_HEAD(&opp_table->dev_list); + INIT_LIST_HEAD(&opp_table->lazy); /* Mark regulator count uninitialized */ opp_table->regulator_count = -1; @@ -1632,6 +1645,21 @@ static int _opp_is_duplicate(struct device *dev, struct dev_pm_opp *new_opp, return 0; } +void _required_opps_available(struct dev_pm_opp *opp, int count) +{ + int i; + + for (i = 0; i < count; i++) { + if (opp->required_opps[i]->available) + continue; + + opp->available = false; + pr_warn("%s: OPP not supported by required OPP %pOF (%lu)\n", + __func__, opp->required_opps[i]->np, opp->rate); + return; + } +} + /* * Returns: * 0: On success. And appropriate error message for duplicate OPPs. 
@@ -1646,7 +1674,6 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table, bool rate_not_available) { struct list_head *head; - unsigned int i; int ret; mutex_lock(&opp_table->lock); @@ -1672,15 +1699,11 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, __func__, new_opp->rate); } - for (i = 0; i < opp_table->required_opp_count; i++) { - if (new_opp->required_opps[i]->available) - continue; + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(opp_table)) + return 0; - new_opp->available = false; - dev_warn(dev, "%s: OPP not supported by required OPP %pOF (%lu)\n", - __func__, new_opp->required_opps[i]->np, new_opp->rate); - break; - } + _required_opps_available(new_opp, opp_table->required_opp_count); return 0; } @@ -2388,6 +2411,10 @@ int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, if (!src_table || !src_table->required_opp_count) return pstate; + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(src_table)) + return -EBUSY; + for (i = 0; i < src_table->required_opp_count; i++) { if (src_table->required_opp_tables[i]->np == dst_table->np) break; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 20ccdaab9384..f480c10e6314 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -144,7 +144,7 @@ static void _opp_table_free_required_tables(struct opp_table *opp_table) for (i = 0; i < opp_table->required_opp_count; i++) { if (IS_ERR_OR_NULL(required_opp_tables[i])) - break; + continue; dev_pm_opp_put_opp_table(required_opp_tables[i]); } @@ -153,6 +153,7 @@ static void _opp_table_free_required_tables(struct opp_table *opp_table) opp_table->required_opp_count = 0; opp_table->required_opp_tables = NULL; + list_del(&opp_table->lazy); } /* @@ -165,6 +166,7 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, { struct opp_table **required_opp_tables; struct device_node *required_np, *np; + bool lazy = false; int count, i; /* 
Traversing the first OPP node is all we need */ @@ -195,8 +197,10 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, required_opp_tables[i] = _find_table_of_opp_np(required_np); of_node_put(required_np); - if (IS_ERR(required_opp_tables[i])) - goto free_required_tables; + if (IS_ERR(required_opp_tables[i])) { + lazy = true; + continue; + } /* * We only support genpd's OPPs in the "required-opps" for now, @@ -210,6 +214,10 @@ static void _opp_table_alloc_required_tables(struct opp_table *opp_table, } } + /* Let's do the linking later on */ + if (lazy) + list_add(&opp_table->lazy, &lazy_opp_tables); + goto put_np; free_required_tables: @@ -278,14 +286,14 @@ void _of_opp_free_required_opps(struct opp_table *opp_table, for (i = 0; i < opp_table->required_opp_count; i++) { if (!required_opps[i]) - break; + continue; /* Put the reference back */ dev_pm_opp_put(required_opps[i]); } - kfree(required_opps); opp->required_opps = NULL; + kfree(required_opps); } /* Populate all required OPPs which are part of "required-opps" list */ @@ -309,6 +317,10 @@ static int _of_opp_alloc_required_opps(struct opp_table *opp_table, for (i = 0; i < count; i++) { required_table = opp_table->required_opp_tables[i]; + /* Required table not added yet, we will link later */ + if (IS_ERR_OR_NULL(required_table)) + continue; + np = of_parse_required_opp(opp->np, i); if (unlikely(!np)) { ret = -ENODEV; @@ -334,6 +346,104 @@ free_required_opps: return ret; } +/* Link required OPPs for an individual OPP */ +static int lazy_link_required_opps(struct opp_table *opp_table, + struct opp_table *new_table, int index) +{ + struct device_node *required_np; + struct dev_pm_opp *opp; + + list_for_each_entry(opp, &opp_table->opp_list, node) { + required_np = of_parse_required_opp(opp->np, index); + if (unlikely(!required_np)) + return -ENODEV; + + opp->required_opps[index] = _find_opp_of_np(new_table, required_np); + of_node_put(required_np); + + if (!opp->required_opps[index]) { + 
pr_err("%s: Unable to find required OPP node: %pOF (%d)\n", + __func__, opp->np, index); + return -ENODEV; + } + } + + return 0; +} + +/* Link required OPPs for all OPPs of the newly added OPP table */ +static void lazy_link_required_opp_table(struct opp_table *new_table) +{ + struct opp_table *opp_table, *temp, **required_opp_tables; + struct device_node *required_np, *opp_np, *required_table_np; + struct dev_pm_opp *opp; + int i, ret; + + /* + * We only support genpd's OPPs in the "required-opps" for now, + * as we don't know much about other cases. + */ + if (!new_table->is_genpd) + return; + + mutex_lock(&opp_table_lock); + + list_for_each_entry_safe(opp_table, temp, &lazy_opp_tables, lazy) { + bool lazy = false; + + /* opp_np can't be invalid here */ + opp_np = of_get_next_available_child(opp_table->np, NULL); + + for (i = 0; i < opp_table->required_opp_count; i++) { + required_opp_tables = opp_table->required_opp_tables; + + /* Required opp-table is already parsed */ + if (!IS_ERR(required_opp_tables[i])) + continue; + + /* required_np can't be invalid here */ + required_np = of_parse_required_opp(opp_np, i); + required_table_np = of_get_parent(required_np); + + of_node_put(required_table_np); + of_node_put(required_np); + + /* + * Newly added table isn't the required opp-table for + * opp_table. 
+ */ + if (required_table_np != new_table->np) { + lazy = true; + continue; + } + + required_opp_tables[i] = new_table; + _get_opp_table_kref(new_table); + + /* Link OPPs now */ + ret = lazy_link_required_opps(opp_table, new_table, i); + if (ret) { + /* The OPPs will be marked unusable */ + lazy = false; + break; + } + } + + of_node_put(opp_np); + + /* All required opp-tables found, remove from lazy list */ + if (!lazy) { + list_del(&opp_table->lazy); + INIT_LIST_HEAD(&opp_table->lazy); + + list_for_each_entry(opp, &opp_table->opp_list, node) + _required_opps_available(opp, opp_table->required_opp_count); + } + } + + mutex_unlock(&opp_table_lock); +} + static int _bandwidth_supported(struct device *dev, struct opp_table *opp_table) { struct device_node *np, *opp_np; @@ -889,6 +999,8 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table) } } + lazy_link_required_opp_table(opp_table); + return 0; remove_static_opp: diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index 372df68e185b..9b9daf83b074 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -26,7 +26,7 @@ struct regulator; /* Lock to allow exclusive modification to the device and opp lists */ extern struct mutex opp_table_lock; -extern struct list_head opp_tables; +extern struct list_head opp_tables, lazy_opp_tables; /* * Internal data structure organization with the OPP layer library is as @@ -168,7 +168,7 @@ enum opp_table_access { * meant for book keeping and private to OPP library. 
*/ struct opp_table { - struct list_head node; + struct list_head node, lazy; struct blocking_notifier_head head; struct list_head dev_list; @@ -229,6 +229,12 @@ int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long f void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, int last_cpu); struct opp_table *_add_opp_table_indexed(struct device *dev, int index, bool getclk); void _put_opp_list_kref(struct opp_table *opp_table); +void _required_opps_available(struct dev_pm_opp *opp, int count); + +static inline bool lazy_linking_pending(struct opp_table *opp_table) +{ + return unlikely(!list_empty(&opp_table->lazy)); +} #ifdef CONFIG_OF void _of_init_opp_table(struct opp_table *opp_table, struct device *dev, int index); From 870d5d963972ddefa83a09a7dbe4bef01f0b35b8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 28 Jan 2021 15:30:00 +0530 Subject: [PATCH 124/307] opp: Update bandwidth requirements based on scaling up/down The bandwidth must be scaled at a different point in the code flow based on if we are scaling up or down the frequency, otherwise this may cause undesired effects as the device will try to use more of the memory bandwidth which may be shared across several devices. Much like how regulators and required-opps are programmed. Reported-by: Dmitry Osipenko Reported-by: Akhil P Oommen Signed-off-by: Viresh Kumar Tested-by: Dmitry Osipenko --- drivers/opp/core.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index e03600547b98..a518173fd64a 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1015,8 +1015,16 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, /* Scaling up? 
Configure required OPPs before frequency */ if (!scaling_down) { ret = _set_required_opps(dev, opp_table, opp, true); - if (ret) + if (ret) { + dev_err(dev, "Failed to set required opps: %d\n", ret); return ret; + } + + ret = _set_opp_bw(opp_table, opp, dev); + if (ret) { + dev_err(dev, "Failed to set bw: %d\n", ret); + return ret; + } } if (opp_table->set_opp) { @@ -1029,25 +1037,31 @@ static int _set_opp(struct device *dev, struct opp_table *opp_table, ret = _generic_set_opp_clk_only(dev, opp_table->clk, freq); } + if (ret) + return ret; + /* Scaling down? Configure required OPPs after frequency */ - if (!ret && scaling_down) { - ret = _set_required_opps(dev, opp_table, opp, false); - if (ret) - dev_err(dev, "Failed to set required opps: %d\n", ret); - } - - if (!ret) { + if (scaling_down) { ret = _set_opp_bw(opp_table, opp, dev); - if (!ret) { - opp_table->enabled = true; - dev_pm_opp_put(old_opp); + if (ret) { + dev_err(dev, "Failed to set bw: %d\n", ret); + return ret; + } - /* Make sure current_opp doesn't get freed */ - dev_pm_opp_get(opp); - opp_table->current_opp = opp; + ret = _set_required_opps(dev, opp_table, opp, false); + if (ret) { + dev_err(dev, "Failed to set required opps: %d\n", ret); + return ret; } } + opp_table->enabled = true; + dev_pm_opp_put(old_opp); + + /* Make sure current_opp doesn't get freed */ + dev_pm_opp_get(opp); + opp_table->current_opp = opp; + return ret; } From d4a4c7a41153d701f23322ea5d39c766e9ff6eee Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 29 Jan 2021 16:12:04 +0530 Subject: [PATCH 125/307] opp: Don't ignore clk_get() errors other than -ENOENT Not all devices that need to use OPP core need to have clocks, a missing clock is fine in which case -ENOENT shall be returned by clk_get(). Anything else is an error and must be handled properly. 
Reported-by: Dmitry Osipenko Tested-by: Dmitry Osipenko Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index a518173fd64a..dc95d29e94c1 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -1252,6 +1252,8 @@ static struct opp_table *_update_opp_table_clk(struct device *dev, struct opp_table *opp_table, bool getclk) { + int ret; + /* * Return early if we don't need to get clk or we have already tried it * earlier. @@ -1261,18 +1263,20 @@ static struct opp_table *_update_opp_table_clk(struct device *dev, /* Find clk for the device */ opp_table->clk = clk_get(dev, NULL); - if (IS_ERR(opp_table->clk)) { - int ret = PTR_ERR(opp_table->clk); - if (ret == -EPROBE_DEFER) { - dev_pm_opp_put_opp_table(opp_table); - return ERR_PTR(ret); - } + ret = PTR_ERR_OR_ZERO(opp_table->clk); + if (!ret) + return opp_table; + if (ret == -ENOENT) { dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__, ret); + return opp_table; } - return opp_table; + dev_pm_opp_put_opp_table(opp_table); + dev_err_probe(dev, ret, "Couldn't find clock\n"); + + return ERR_PTR(ret); } /* From f3988bc5d58b768c5cf0dadf5f0e49f7176432df Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 1 Feb 2021 10:35:07 +0530 Subject: [PATCH 126/307] opp: Fix "foo * bar" should be "foo *bar" Fix checkpatch warning: ERROR: "foo * bar" should be "foo *bar". 
Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 25e47ab937b9..c6c7d73eb015 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -148,7 +148,7 @@ struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) void dev_pm_opp_put_prop_name(struct opp_table *opp_table); struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count); void dev_pm_opp_put_regulators(struct opp_table *opp_table); -struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name); +struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name); void dev_pm_opp_put_clkname(struct opp_table *opp_table); struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table); @@ -347,7 +347,7 @@ static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, co static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {} -static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char * name) +static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) { return ERR_PTR(-ENOTSUPP); } From 1d614920318b914f86c1fec2adec06ad2f7c3f55 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 1 Feb 2021 10:48:54 +0530 Subject: [PATCH 127/307] opp: Replace ENOTSUPP with EOPNOTSUPP Checkpatch gives the following warning for new patches, and the new patches normally follow the existing standards for such stuff. Let's fix it properly. WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP.
Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 64 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index c6c7d73eb015..ab1d15ce559d 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -167,12 +167,12 @@ int dev_pm_opp_sync_regulators(struct device *dev); #else static inline struct opp_table *dev_pm_opp_get_opp_table(struct device *dev) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct opp_table *dev_pm_opp_get_opp_table_indexed(struct device *dev, int index) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put_opp_table(struct opp_table *opp_table) {} @@ -232,37 +232,37 @@ static inline unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev) static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct dev_pm_opp *dev_pm_opp_find_level_exact(struct device *dev, unsigned int level) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, unsigned int *level) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev, unsigned long *freq) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil_by_volt(struct device *dev, unsigned long u_volt) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev, unsigned long *freq) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put(struct dev_pm_opp *opp) {} @@ -270,7 +270,7 @@ static inline void 
dev_pm_opp_put(struct dev_pm_opp *opp) {} static inline int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void dev_pm_opp_remove(struct device *dev, unsigned long freq) @@ -301,19 +301,19 @@ static inline int dev_pm_opp_disable(struct device *dev, unsigned long freq) static inline int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_unregister_notifier(struct device *dev, struct notifier_block *nb) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, unsigned int count) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {} @@ -321,7 +321,7 @@ static inline void dev_pm_opp_put_supported_hw(struct opp_table *opp_table) {} static inline struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_unregister_set_opp_helper(struct opp_table *opp_table) {} @@ -330,33 +330,33 @@ static inline struct opp_table * devm_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put_prop_name(struct opp_table *opp_table) {} static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put_regulators(struct 
opp_table *opp_table) {} static inline struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_put_clkname(struct opp_table *opp_table) {} static inline struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {} @@ -364,27 +364,27 @@ static inline void dev_pm_opp_detach_genpd(struct opp_table *opp_table) {} static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs) { - return ERR_PTR(-ENOTSUPP); + return ERR_PTR(-EOPNOTSUPP); } static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) @@ -402,7 +402,7 @@ static inline void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask static inline int dev_pm_opp_sync_regulators(struct device *dev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } #endif /* CONFIG_PM_OPP */ @@ -427,17 +427,17 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev) #else static inline int dev_pm_opp_of_add_table(struct device *dev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_of_add_table_indexed(struct device *dev, int 
index) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_of_add_table_noclk(struct device *dev, int index) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void dev_pm_opp_of_remove_table(struct device *dev) @@ -446,7 +446,7 @@ static inline void dev_pm_opp_of_remove_table(struct device *dev) static inline int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask) @@ -455,7 +455,7 @@ static inline void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpum static inline int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev) @@ -471,7 +471,7 @@ static inline struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp) static inline int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline void dev_pm_opp_of_unregister_em(struct device *dev) @@ -480,12 +480,12 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev) static inline int of_get_required_opp_performance_state(struct device_node *np, int index) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table) { - return -ENOTSUPP; + return -EOPNOTSUPP; } #endif From fc1745c0e40cfc98c0bc466b95ddedf28e5019b4 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 2 Feb 2021 14:55:24 +0800 Subject: [PATCH 128/307] PM / devfreq: rk3399_dmc: Remove unneeded semicolon Eliminate the following coccicheck warning: ./drivers/devfreq/rk3399_dmc.c:403:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Chanwoo Choi --- drivers/devfreq/rk3399_dmc.c | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index 2e912166a993..9e9d3b4c6d48 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -400,7 +400,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) default: ret = -EINVAL; goto err_edev; - }; + } no_pmu: arm_smccc_smc(ROCKCHIP_SIP_DRAM_FREQ, 0, 0, From 7d8658ef65a4f891d0cff6340fa717b378384642 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 4 Feb 2021 16:14:22 +0800 Subject: [PATCH 129/307] OPP: Add function to look up required OPP's for a given OPP Add a function that allows looking up required OPPs given a source OPP table, destination OPP table and the source OPP. Signed-off-by: Saravana Kannan Signed-off-by: Hsin-Yi Wang [ Viresh: Rearranged code, fixed return errors ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 7 ++++++ 2 files changed, 62 insertions(+) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index dc95d29e94c1..c3f3d9249cc5 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -2398,6 +2398,61 @@ devm_pm_opp_attach_genpd(struct device *dev, const char **names, } EXPORT_SYMBOL_GPL(devm_pm_opp_attach_genpd); +/** + * dev_pm_opp_xlate_required_opp() - Find required OPP for @src_table OPP. + * @src_table: OPP table which has @dst_table as one of its required OPP table. + * @dst_table: Required OPP table of the @src_table. + * @src_opp: OPP from the @src_table. + * + * This function returns the OPP (present in @dst_table) pointed out by the + * "required-opps" property of the @src_opp (present in @src_table). + * + * The callers are required to call dev_pm_opp_put() for the returned OPP after + * use. + * + * Return: pointer to 'struct dev_pm_opp' on success and errorno otherwise. 
+ */ +struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, + struct opp_table *dst_table, + struct dev_pm_opp *src_opp) +{ + struct dev_pm_opp *opp, *dest_opp = ERR_PTR(-ENODEV); + int i; + + if (!src_table || !dst_table || !src_opp || + !src_table->required_opp_tables) + return ERR_PTR(-EINVAL); + + /* required-opps not fully initialized yet */ + if (lazy_linking_pending(src_table)) + return ERR_PTR(-EBUSY); + + for (i = 0; i < src_table->required_opp_count; i++) { + if (src_table->required_opp_tables[i] == dst_table) { + mutex_lock(&src_table->lock); + + list_for_each_entry(opp, &src_table->opp_list, node) { + if (opp == src_opp) { + dest_opp = opp->required_opps[i]; + dev_pm_opp_get(dest_opp); + break; + } + } + + mutex_unlock(&src_table->lock); + break; + } + } + + if (IS_ERR(dest_opp)) { + pr_err("%s: Couldn't find matching OPP (%p: %p)\n", __func__, + src_table, dst_table); + } + + return dest_opp; +} +EXPORT_SYMBOL_GPL(dev_pm_opp_xlate_required_opp); + /** * dev_pm_opp_xlate_performance_state() - Find required OPP's pstate for src_table. * @src_table: OPP table which has dst_table as one of its required OPP table. 
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index ab1d15ce559d..c0371efa4a0f 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -156,6 +156,7 @@ struct opp_table *devm_pm_opp_register_set_opp_helper(struct device *dev, int (* struct opp_table *dev_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); void dev_pm_opp_detach_genpd(struct opp_table *opp_table); struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, const char **names, struct device ***virt_devs); +struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, struct opp_table *dst_table, struct dev_pm_opp *src_opp); int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_opp(struct device *dev, struct dev_pm_opp *opp); @@ -367,6 +368,12 @@ static inline struct opp_table *devm_pm_opp_attach_genpd(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_xlate_required_opp(struct opp_table *src_table, + struct opp_table *dst_table, struct dev_pm_opp *src_opp) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline int dev_pm_opp_xlate_performance_state(struct opp_table *src_table, struct opp_table *dst_table, unsigned int pstate) { return -EOPNOTSUPP; From 26f9c7cc42a6dc036edf871544fd0e6b3a0601c1 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 4 Feb 2021 16:14:23 +0800 Subject: [PATCH 130/307] PM / devfreq: Cache OPP table reference in devfreq The OPP table can be used often in devfreq. Trying to get it each time can be expensive, so cache it in the devfreq struct. 
Signed-off-by: Saravana Kannan Acked-by: MyungJoo Ham Acked-by: Chanwoo Choi Signed-off-by: Hsin-Yi Wang [ Viresh: Added a blank line ] Signed-off-by: Viresh Kumar --- drivers/devfreq/devfreq.c | 7 +++++++ include/linux/devfreq.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 6aa10de792b3..cefe84a10824 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -757,6 +757,9 @@ static void devfreq_dev_release(struct device *dev) if (devfreq->profile->exit) devfreq->profile->exit(devfreq->dev.parent); + if (devfreq->opp_table) + dev_pm_opp_put_opp_table(devfreq->opp_table); + mutex_destroy(&devfreq->lock); kfree(devfreq); } @@ -844,6 +847,10 @@ struct devfreq *devfreq_add_device(struct device *dev, } devfreq->suspend_freq = dev_pm_opp_get_suspend_opp_freq(dev); + devfreq->opp_table = dev_pm_opp_get_opp_table(dev); + if (IS_ERR(devfreq->opp_table)) + devfreq->opp_table = NULL; + atomic_set(&devfreq->suspend_count, 0); dev_set_name(&devfreq->dev, "%s", dev_name(dev)); diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index b6d3bae1c74d..26ea0850be9b 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -137,6 +137,7 @@ struct devfreq_stats { * using devfreq. * @profile: device-specific devfreq profile * @governor: method how to choose frequency based on the usage. + * @opp_table: Reference to OPP table of dev.parent, if one exists. * @nb: notifier block used to notify devfreq object that it should * reevaluate operable frequencies. Devfreq users may use * devfreq.nb to the corresponding register notifier call chain. 
@@ -173,6 +174,7 @@ struct devfreq { struct device dev; struct devfreq_dev_profile *profile; const struct devfreq_governor *governor; + struct opp_table *opp_table; struct notifier_block nb; struct delayed_work work; From 86ad9a24f21ea7aac7deed06fe9556392568d88a Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 4 Feb 2021 16:14:24 +0800 Subject: [PATCH 131/307] PM / devfreq: Add required OPPs support to passive governor Look at the required OPPs of the "parent" device to determine the OPP that is required from the slave device managed by the passive governor. This allows having mappings between a parent device and a slave device even when they don't have the same number of OPPs. While at it do a minor spell-fix and remove out label. Signed-off-by: Saravana Kannan Acked-by: MyungJoo Ham Acked-by: Chanwoo Choi Signed-off-by: Hsin-Yi Wang [ Viresh: Rearranged code and clean error paths ] Signed-off-by: Viresh Kumar --- drivers/devfreq/governor_passive.c | 44 +++++++++++++++++------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 63332e4a65ae..b094132bd20b 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -19,18 +19,16 @@ static int devfreq_passive_get_target_freq(struct devfreq *devfreq, = (struct devfreq_passive_data *)devfreq->data; struct devfreq *parent_devfreq = (struct devfreq *)p_data->parent; unsigned long child_freq = ULONG_MAX; - struct dev_pm_opp *opp; - int i, count, ret = 0; + struct dev_pm_opp *opp, *p_opp; + int i, count; /* * If the devfreq device with passive governor has the specific method * to determine the next frequency, should use the get_target_freq() * of struct devfreq_passive_data. 
*/ - if (p_data->get_target_freq) { - ret = p_data->get_target_freq(devfreq, freq); - goto out; - } + if (p_data->get_target_freq) + return p_data->get_target_freq(devfreq, freq); /* * If the parent and passive devfreq device uses the OPP table, @@ -56,26 +54,35 @@ static int devfreq_passive_get_target_freq(struct devfreq *devfreq, * list of parent device. Because in this case, *freq is temporary * value which is decided by ondemand governor. */ - opp = devfreq_recommended_opp(parent_devfreq->dev.parent, freq, 0); - if (IS_ERR(opp)) { - ret = PTR_ERR(opp); - goto out; + if (devfreq->opp_table && parent_devfreq->opp_table) { + p_opp = devfreq_recommended_opp(parent_devfreq->dev.parent, + freq, 0); + if (IS_ERR(p_opp)) + return PTR_ERR(p_opp); + + opp = dev_pm_opp_xlate_required_opp(parent_devfreq->opp_table, + devfreq->opp_table, p_opp); + dev_pm_opp_put(p_opp); + + if (IS_ERR(opp)) + return PTR_ERR(opp); + + *freq = dev_pm_opp_get_freq(opp); + dev_pm_opp_put(opp); + + return 0; } - dev_pm_opp_put(opp); - /* - * Get the OPP table's index of decided freqeuncy by governor + * Get the OPP table's index of decided frequency by governor * of parent device. */ for (i = 0; i < parent_devfreq->profile->max_state; i++) if (parent_devfreq->profile->freq_table[i] == *freq) break; - if (i == parent_devfreq->profile->max_state) { - ret = -EINVAL; - goto out; - } + if (i == parent_devfreq->profile->max_state) + return -EINVAL; /* Get the suitable frequency by using index of parent device. */ if (i < devfreq->profile->max_state) { @@ -88,8 +95,7 @@ static int devfreq_passive_get_target_freq(struct devfreq *devfreq, /* Return the suitable frequency for passive device. */ *freq = child_freq; -out: - return ret; + return 0; } static int devfreq_passive_notifier_call(struct notifier_block *nb, From 56ce8339d41bf63fd769f10419cd188e6272d9d6 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 20 Jan 2021 19:57:03 +0100 Subject: [PATCH 132/307] ACPI: power: Clean up printing messages Replace all of the ACPI_DEBUG_PRINT() instances in power.c with acpi_handle_debug() or pr_debug(), depending on the context, drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more, and replace the direct invocations of printk() in there with acpi_handle_info() or pr_info(), depending on the context. Signed-off-by: Rafael J. Wysocki --- drivers/acpi/power.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index 189a0d4c6d06..962aec238d9d 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -21,6 +21,8 @@ * may be shared by multiple devices. */ +#define pr_fmt(fmt) "ACPI: PM: " fmt + #include #include #include @@ -32,8 +34,6 @@ #include "sleep.h" #include "internal.h" -#define _COMPONENT ACPI_POWER_COMPONENT -ACPI_MODULE_NAME("power"); #define ACPI_POWER_CLASS "power_resource" #define ACPI_POWER_DEVICE_NAME "Power Resource" #define ACPI_POWER_RESOURCE_STATE_OFF 0x00 @@ -181,9 +181,6 @@ static int acpi_power_get_state(acpi_handle handle, int *state) { acpi_status status = AE_OK; unsigned long long sta = 0; - char node_name[5]; - struct acpi_buffer buffer = { sizeof(node_name), node_name }; - if (!handle || !state) return -EINVAL; @@ -195,11 +192,8 @@ static int acpi_power_get_state(acpi_handle handle, int *state) *state = (sta & 0x01)?ACPI_POWER_RESOURCE_STATE_ON: ACPI_POWER_RESOURCE_STATE_OFF; - acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Resource [%s] is %s\n", - node_name, - *state ? "on" : "off")); + acpi_handle_debug(handle, "Power resource is %s\n", + *state ? "on" : "off"); return 0; } @@ -229,8 +223,7 @@ static int acpi_power_get_list_state(struct list_head *list, int *state) break; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Resource list is %s\n", - cur_state ? 
"on" : "off")); + pr_debug("Power resource list is %s\n", cur_state ? "on" : "off"); *state = cur_state; return 0; @@ -357,8 +350,7 @@ static int __acpi_power_on(struct acpi_power_resource *resource) if (ACPI_FAILURE(status)) return -ENODEV; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Power resource [%s] turned on\n", - resource->name)); + pr_debug("Power resource [%s] turned on\n", resource->name); /* * If there are other dependents on this power resource we need to @@ -383,9 +375,7 @@ static int acpi_power_on_unlocked(struct acpi_power_resource *resource) int result = 0; if (resource->ref_count++) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Power resource [%s] already on\n", - resource->name)); + pr_debug("Power resource [%s] already on\n", resource->name); } else { result = __acpi_power_on(resource); if (result) @@ -413,8 +403,8 @@ static int __acpi_power_off(struct acpi_power_resource *resource) if (ACPI_FAILURE(status)) return -ENODEV; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Power resource [%s] turned off\n", - resource->name)); + pr_debug("Power resource [%s] turned off\n", resource->name); + return 0; } @@ -423,16 +413,12 @@ static int acpi_power_off_unlocked(struct acpi_power_resource *resource) int result = 0; if (!resource->ref_count) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Power resource [%s] already off\n", - resource->name)); + pr_debug("Power resource [%s] already off\n", resource->name); return 0; } if (--resource->ref_count) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Power resource [%s] still in use\n", - resource->name)); + pr_debug("Power resource [%s] still in use\n", resource->name); } else { result = __acpi_power_off(resource); if (result) @@ -672,7 +658,7 @@ int acpi_device_sleep_wake(struct acpi_device *dev, if (ACPI_SUCCESS(status)) { return 0; } else if (status != AE_NOT_FOUND) { - printk(KERN_ERR PREFIX "_DSW execution failed\n"); + acpi_handle_info(dev->handle, "_DSW execution failed\n"); dev->wakeup.flags.valid = 0; return -ENODEV; } @@ -680,7 +666,7 @@ int 
acpi_device_sleep_wake(struct acpi_device *dev, /* Execute _PSW */ status = acpi_execute_simple_method(dev->handle, "_PSW", enable); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { - printk(KERN_ERR PREFIX "_PSW execution failed\n"); + acpi_handle_info(dev->handle, "_PSW execution failed\n"); dev->wakeup.flags.valid = 0; return -ENODEV; } @@ -960,8 +946,8 @@ int acpi_add_power_resource(acpi_handle handle) if (result) goto err; - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", acpi_device_name(device), - acpi_device_bid(device), state ? "on" : "off"); + pr_info("%s [%s] (%s)\n", acpi_device_name(device), + acpi_device_bid(device), state ? "on" : "off"); device->flags.match_driver = true; result = acpi_device_add(device, acpi_release_power_resource); From c56fd5ead29b6ad6625af632a91a231129027185 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jan 2021 19:58:18 +0100 Subject: [PATCH 133/307] ACPI: PM: Clean up printing messages Replace the remaining ACPI_DEBUG_PRINT() instances in device_pm.c with dev_dbg() invocations, drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more, and drop the no longer needed ACPI_POWER_COMPONENT definition from the headers and documentation. Signed-off-by: Rafael J. 
Wysocki --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/device_pm.c | 20 ++++++++------------ drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 4 files changed, 8 insertions(+), 15 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index 1a152dd1d765..73f88a27f12b 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -59,7 +59,6 @@ shows the supported mask values, currently these:: ACPI_SBS_COMPONENT 0x00100000 ACPI_FAN_COMPONENT 0x00200000 ACPI_PCI_COMPONENT 0x00400000 - ACPI_POWER_COMPONENT 0x00800000 ACPI_CONTAINER_COMPONENT 0x01000000 ACPI_SYSTEM_COMPONENT 0x02000000 ACPI_THERMAL_COMPONENT 0x04000000 diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 3586434d0ded..096153761ebc 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -10,6 +10,8 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +#define pr_fmt(fmt) "ACPI: PM: " fmt + #include #include #include @@ -20,9 +22,6 @@ #include "internal.h" -#define _COMPONENT ACPI_POWER_COMPONENT -ACPI_MODULE_NAME("device_pm"); - /** * acpi_power_state_string - String representation of ACPI device power state. * @state: ACPI device power state to return the string representation of. @@ -130,8 +129,8 @@ int acpi_device_get_power(struct acpi_device *device, int *state) *state = result; out: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] power state is %s\n", - device->pnp.bus_id, acpi_power_state_string(*state))); + dev_dbg(&device->dev, "Device power state is %s\n", + acpi_power_state_string(*state)); return 0; } @@ -174,9 +173,8 @@ int acpi_device_set_power(struct acpi_device *device, int state) /* There is a special case for D0 addressed below. 
*/ if (state > ACPI_STATE_D0 && state == device->power.state) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] already in %s\n", - device->pnp.bus_id, - acpi_power_state_string(state))); + dev_dbg(&device->dev, "Device already in %s\n", + acpi_power_state_string(state)); return 0; } @@ -276,10 +274,8 @@ int acpi_device_set_power(struct acpi_device *device, int state) acpi_power_state_string(target_state)); } else { device->power.state = target_state; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device [%s] transitioned to %s\n", - device->pnp.bus_id, - acpi_power_state_string(target_state))); + dev_dbg(&device->dev, "Power state changed to %s\n", + acpi_power_state_string(target_state)); } return result; diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index a5cc4f3bb1e3..eeb0419d68a8 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -59,7 +59,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_SBS_COMPONENT), ACPI_DEBUG_INIT(ACPI_FAN_COMPONENT), ACPI_DEBUG_INIT(ACPI_PCI_COMPONENT), - ACPI_DEBUG_INIT(ACPI_POWER_COMPONENT), ACPI_DEBUG_INIT(ACPI_CONTAINER_COMPONENT), ACPI_DEBUG_INIT(ACPI_SYSTEM_COMPONENT), ACPI_DEBUG_INIT(ACPI_THERMAL_COMPONENT), diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index d4f39a20aa2a..14da491bad96 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -22,7 +22,6 @@ #define ACPI_SBS_COMPONENT 0x00100000 #define ACPI_FAN_COMPONENT 0x00200000 #define ACPI_PCI_COMPONENT 0x00400000 -#define ACPI_POWER_COMPONENT 0x00800000 #define ACPI_CONTAINER_COMPONENT 0x01000000 #define ACPI_SYSTEM_COMPONENT 0x02000000 #define ACPI_THERMAL_COMPONENT 0x04000000 From ee98460b2ff90fad5ece2f380c77b7ea3b3e622f Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 20 Jan 2021 19:59:05 +0100 Subject: [PATCH 134/307] ACPI: bus: Clean up printing messages Replace all of the ACPI_DEBUG_PRINT() and ACPI_EXCEPTION() instances in bus.c with pr_debug() and pr_info(), respectively, drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more and replace direct printk() invocations with the matching pr_*() calls (with a couple of exceptions where the log level is decreased). Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 60 ++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 1682f8b454a2..c4b0328a0010 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -5,6 +5,8 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh */ +#define pr_fmt(fmt) "ACPI: " fmt + #include #include #include @@ -31,9 +33,6 @@ #include "internal.h" -#define _COMPONENT ACPI_BUS_COMPONENT -ACPI_MODULE_NAME("bus"); - struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; EXPORT_SYMBOL(acpi_root_dir); @@ -47,8 +46,7 @@ static inline int set_copy_dsdt(const struct dmi_system_id *id) #else static int set_copy_dsdt(const struct dmi_system_id *id) { - printk(KERN_NOTICE "%s detected - " - "force copy of DSDT to local memory\n", id->ident); + pr_notice("%s detected - force copy of DSDT to local memory\n", id->ident); acpi_gbl_copy_dsdt_locally = 1; return 0; } @@ -116,13 +114,11 @@ int acpi_bus_get_status(struct acpi_device *device) acpi_set_device_status(device, sta); if (device->status.functional && !device->status.present) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] status [%08x]: " - "functional but not present;\n", - device->pnp.bus_id, (u32)sta)); + pr_debug("Device [%s] status [%08x]: functional but not present\n", + device->pnp.bus_id, (u32)sta); } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device [%s] status [%08x]\n", - device->pnp.bus_id, (u32)sta)); + pr_debug("Device [%s] status [%08x]\n", 
device->pnp.bus_id, (u32)sta); return 0; } EXPORT_SYMBOL(acpi_bus_get_status); @@ -915,9 +911,9 @@ static int acpi_device_probe(struct device *dev) return ret; acpi_dev->driver = acpi_drv; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Driver [%s] successfully bound to device [%s]\n", - acpi_drv->name, acpi_dev->pnp.bus_id)); + + pr_debug("Driver [%s] successfully bound to device [%s]\n", + acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { ret = acpi_device_install_notify_handler(acpi_dev); @@ -931,8 +927,9 @@ static int acpi_device_probe(struct device *dev) } } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found driver [%s] for device [%s]\n", - acpi_drv->name, acpi_dev->pnp.bus_id)); + pr_debug("Found driver [%s] for device [%s]\n", acpi_drv->name, + acpi_dev->pnp.bus_id); + get_device(dev); return 0; } @@ -995,15 +992,15 @@ static int __init acpi_bus_init_irq(void) message = "platform specific model"; break; default: - printk(KERN_WARNING PREFIX "Unknown interrupt routing model\n"); + pr_info("Unknown interrupt routing model\n"); return -ENODEV; } - printk(KERN_INFO PREFIX "Using %s for interrupt routing\n", message); + pr_info("Using %s for interrupt routing\n", message); status = acpi_execute_simple_method(NULL, "\\_PIC", acpi_irq_model); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PIC")); + pr_info("_PIC evaluation failed: %s\n", acpi_format_exception(status)); return -ENODEV; } @@ -1027,7 +1024,7 @@ void __init acpi_early_init(void) if (acpi_disabled) return; - printk(KERN_INFO PREFIX "Core revision %08x\n", ACPI_CA_VERSION); + pr_info("Core revision %08x\n", ACPI_CA_VERSION); /* enable workarounds, unless strict ACPI spec. 
compliance */ if (!acpi_strict) @@ -1048,15 +1045,13 @@ void __init acpi_early_init(void) status = acpi_reallocate_root_table(); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to reallocate ACPI tables\n"); + pr_err("Unable to reallocate ACPI tables\n"); goto error0; } status = acpi_initialize_subsystem(); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to initialize the ACPI Interpreter\n"); + pr_err("Unable to initialize the ACPI Interpreter\n"); goto error0; } @@ -1102,7 +1097,7 @@ void __init acpi_subsystem_init(void) status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX "Unable to enable ACPI\n"); + pr_err("Unable to enable ACPI\n"); disable_acpi(); } else { /* @@ -1131,8 +1126,7 @@ static int __init acpi_bus_init(void) status = acpi_load_tables(); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to load the System Description Tables\n"); + pr_err("Unable to load the System Description Tables\n"); goto error1; } @@ -1150,14 +1144,13 @@ static int __init acpi_bus_init(void) status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to start the ACPI Interpreter\n"); + pr_err("Unable to start the ACPI Interpreter\n"); goto error1; } status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX "Unable to initialize ACPI objects\n"); + pr_err("Unable to initialize ACPI objects\n"); goto error1; } @@ -1186,7 +1179,7 @@ static int __init acpi_bus_init(void) */ acpi_ec_dsdt_probe(); - printk(KERN_INFO PREFIX "Interpreter enabled\n"); + pr_info("Interpreter enabled\n"); /* Initialize sleep structures */ acpi_sleep_init(); @@ -1205,8 +1198,7 @@ static int __init acpi_bus_init(void) acpi_install_notify_handler(ACPI_ROOT_OBJECT, ACPI_SYSTEM_NOTIFY, &acpi_bus_notify, NULL); if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to register for device 
notifications\n"); + pr_err("Unable to register for system notifications\n"); goto error1; } @@ -1233,13 +1225,13 @@ static int __init acpi_init(void) int result; if (acpi_disabled) { - printk(KERN_INFO PREFIX "Interpreter disabled.\n"); + pr_info("Interpreter disabled.\n"); return -ENODEV; } acpi_kobj = kobject_create_and_add("acpi", firmware_kobj); if (!acpi_kobj) { - printk(KERN_WARNING "%s: kset create error\n", __func__); + pr_debug("%s: kset create error\n", __func__); acpi_kobj = NULL; } From e52d9d8c08644129cbc7df04f965c6505a53baeb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jan 2021 19:59:51 +0100 Subject: [PATCH 135/307] ACPI: scan: Clean up printing messages Replace all of the ACPI_DEBUG_PRINT() and ACPI_EXCEPTION() instances in scan.c with acpi_handle_debug() and acpi_handle_info(), respectively, and drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more. While at it, drop the redundant "Memory allocation error" message from acpi_add_single_object() and clean up the list of local variables in that function. Signed-off-by: Rafael J. 
Wysocki Reported-by: kernel test robot --- drivers/acpi/scan.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index bcbf0fc215c8..1d7a02ee45e0 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -19,8 +19,6 @@ #include "internal.h" -#define _COMPONENT ACPI_BUS_COMPONENT -ACPI_MODULE_NAME("scan"); extern struct acpi_device *acpi_root; #define ACPI_BUS_CLASS "system_bus" @@ -265,8 +263,7 @@ static int acpi_scan_hot_remove(struct acpi_device *device) return error; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Hot-removing device %s...\n", dev_name(&device->dev))); + acpi_handle_debug(handle, "Ejecting\n"); acpi_bus_trim(device); @@ -829,7 +826,8 @@ static int acpi_bus_extract_wakeup_device_power_package(struct acpi_device *dev) /* _PRW */ status = acpi_evaluate_object(handle, "_PRW", NULL, &buffer); if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PRW")); + acpi_handle_info(handle, "_PRW evaluation failed: %s\n", + acpi_format_exception(status)); return err; } @@ -934,7 +932,7 @@ static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) err = acpi_bus_extract_wakeup_device_power_package(device); if (err) { - dev_err(&device->dev, "_PRW evaluation error: %d\n", err); + dev_err(&device->dev, "Unable to extract wakeup power resources"); return; } @@ -1170,8 +1168,7 @@ acpi_backlight_cap_match(acpi_handle handle, u32 level, void *context, if (acpi_has_method(handle, "_BCM") && acpi_has_method(handle, "_BCL")) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found generic backlight " - "support\n")); + acpi_handle_debug(handle, "Found generic backlight support\n"); *cap |= ACPI_VIDEO_BACKLIGHT; /* We have backlight support, no need to scan further */ return AE_CTRL_TERMINATE; @@ -1662,17 +1659,15 @@ static int acpi_add_single_object(struct acpi_device **child, acpi_handle handle, int type, unsigned long long sta) { - int result; - struct acpi_device 
*device; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct acpi_device_info *info = NULL; + struct acpi_device *device; + int result; if (handle != ACPI_ROOT_OBJECT && type == ACPI_BUS_TYPE_DEVICE) acpi_get_object_info(handle, &info); device = kzalloc(sizeof(struct acpi_device), GFP_KERNEL); if (!device) { - printk(KERN_ERR PREFIX "Memory allocation error\n"); kfree(info); return -ENOMEM; } @@ -1699,11 +1694,11 @@ static int acpi_add_single_object(struct acpi_device **child, acpi_power_add_remove_device(device, true); acpi_device_add_finalize(device); - acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Added %s [%s] parent %s\n", - dev_name(&device->dev), (char *) buffer.pointer, - device->parent ? dev_name(&device->parent->dev) : "(null)")); - kfree(buffer.pointer); + + acpi_handle_debug(handle, "Added as %s, parent %s\n", + dev_name(&device->dev), device->parent ? + dev_name(&device->parent->dev) : "(null)"); + *child = device; return 0; } From fba2ae30fe8cd13fd1f6b723cdb37d51248c29fc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jan 2021 20:00:32 +0100 Subject: [PATCH 136/307] ACPI: utils: Clean up printing messages Replace all of the ACPI_DEBUG_PRINT() instances in utils.c with pr_debug() and acpi_handle_debug(), drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more and replace direct printk() invocations with pr_debug() calls (the log level in there is way excessive). Also add a special pr_fmt() definition, but this only affects the pr_debug() messages mentioned above. Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/utils.c | 56 ++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index d5411a166685..cdbc6bf9e4ef 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -6,6 +6,8 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh */ +#define pr_fmt(fmt) "ACPI: utils: " fmt + #include #include #include @@ -18,24 +20,12 @@ #include "internal.h" #include "sleep.h" -#define _COMPONENT ACPI_BUS_COMPONENT -ACPI_MODULE_NAME("utils"); - /* -------------------------------------------------------------------------- Object Evaluation Helpers -------------------------------------------------------------------------- */ -static void -acpi_util_eval_error(acpi_handle h, acpi_string p, acpi_status s) +static void acpi_util_eval_error(acpi_handle h, acpi_string p, acpi_status s) { -#ifdef ACPI_DEBUG_OUTPUT - char prefix[80] = {'\0'}; - struct acpi_buffer buffer = {sizeof(prefix), prefix}; - acpi_get_name(h, ACPI_FULL_PATHNAME, &buffer); - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Evaluate [%s.%s]: %s\n", - (char *) prefix, p, acpi_format_exception(s))); -#else - return; -#endif + acpi_handle_debug(h, "Evaluate [%s]: %s\n", p, acpi_format_exception(s)); } acpi_status @@ -53,25 +43,24 @@ acpi_extract_package(union acpi_object *package, if (!package || (package->type != ACPI_TYPE_PACKAGE) || (package->package.count < 1)) { - printk(KERN_WARNING PREFIX "Invalid package argument\n"); + pr_debug("Invalid package argument\n"); return AE_BAD_PARAMETER; } if (!format || !format->pointer || (format->length < 1)) { - printk(KERN_WARNING PREFIX "Invalid format argument\n"); + pr_debug("Invalid format argument\n"); return AE_BAD_PARAMETER; } if (!buffer) { - printk(KERN_WARNING PREFIX "Invalid buffer argument\n"); + pr_debug("Invalid buffer argument\n"); return AE_BAD_PARAMETER; } format_count = (format->length / sizeof(char)) - 1; if (format_count > package->package.count) { - 
printk(KERN_WARNING PREFIX "Format specifies more objects [%d]" - " than exist in package [%d].\n", - format_count, package->package.count); + pr_debug("Format specifies more objects [%d] than present [%d]\n", + format_count, package->package.count); return AE_BAD_DATA; } @@ -99,10 +88,8 @@ acpi_extract_package(union acpi_object *package, tail_offset += sizeof(char *); break; default: - printk(KERN_WARNING PREFIX "Invalid package element" - " [%d]: got number, expecting" - " [%c]\n", - i, format_string[i]); + pr_debug("Invalid package element [%d]: got number, expected [%c]\n", + i, format_string[i]); return AE_BAD_DATA; } break; @@ -123,10 +110,8 @@ acpi_extract_package(union acpi_object *package, tail_offset += sizeof(u8 *); break; default: - printk(KERN_WARNING PREFIX "Invalid package element" - " [%d] got string/buffer," - " expecting [%c]\n", - i, format_string[i]); + pr_debug("Invalid package element [%d] got string/buffer, expected [%c]\n", + i, format_string[i]); return AE_BAD_DATA; } break; @@ -137,19 +122,15 @@ acpi_extract_package(union acpi_object *package, tail_offset += sizeof(void *); break; default: - printk(KERN_WARNING PREFIX "Invalid package element" - " [%d] got reference," - " expecting [%c]\n", - i, format_string[i]); + pr_debug("Invalid package element [%d] got reference, expected [%c]\n", + i, format_string[i]); return AE_BAD_DATA; } break; case ACPI_TYPE_PACKAGE: default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Found unsupported element at index=%d\n", - i)); + pr_debug("Unsupported element at index=%d\n", i); /* TBD: handle nested packages... */ return AE_SUPPORT; } @@ -289,7 +270,7 @@ acpi_evaluate_integer(acpi_handle handle, *data = element.integer.value; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Return value [%llu]\n", *data)); + acpi_handle_debug(handle, "Return value [%llu]\n", *data); return AE_OK; } @@ -363,8 +344,7 @@ acpi_evaluate_reference(acpi_handle handle, /* Get the acpi_handle. 
*/ list->handles[i] = element->reference.handle; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found reference [%p]\n", - list->handles[i])); + acpi_handle_debug(list->handles[i], "Found in reference list\n"); } end: From 12bfee94c23063142e8c370c651ba33482388a51 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jan 2021 20:01:18 +0100 Subject: [PATCH 137/307] ACPI: bus: Drop ACPI_BUS_COMPONENT which is not used any more After dropping all of the code using ACPI_BUS_COMPONENT drop it too and modify the example in the documentation using it. Signed-off-by: Rafael J. Wysocki --- Documentation/firmware-guide/acpi/debug.rst | 13 ++++++------- drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index 73f88a27f12b..c7bad74c6ff7 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -52,7 +52,6 @@ shows the supported mask values, currently these:: ACPI_CA_DISASSEMBLER 0x00000800 ACPI_COMPILER 0x00001000 ACPI_TOOLS 0x00002000 - ACPI_BUS_COMPONENT 0x00010000 ACPI_AC_COMPONENT 0x00020000 ACPI_BATTERY_COMPONENT 0x00040000 ACPI_BUTTON_COMPONENT 0x00080000 @@ -117,15 +116,15 @@ currently these:: Examples ======== -For example, drivers/acpi/bus.c contains this:: +For example, drivers/acpi/acpica/evxfevnt.c contains this:: - #define _COMPONENT ACPI_BUS_COMPONENT + #define _COMPONENT ACPI_EVENTS ... - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Device insertion detected\n")); + ACPI_DEBUG_PRINT((ACPI_DB_INIT, "ACPI mode disabled\n")); -To turn on this message, set the ACPI_BUS_COMPONENT bit in acpi.debug_layer -and the ACPI_LV_INFO bit in acpi.debug_level. (The ACPI_DEBUG_PRINT -statement uses ACPI_DB_INFO, which is macro based on the ACPI_LV_INFO +To turn on this message, set the ACPI_EVENTS bit in acpi.debug_layer +and the ACPI_LV_INIT bit in acpi.debug_level. 
(The ACPI_DEBUG_PRINT +statement uses ACPI_DB_INIT, which is a macro based on the ACPI_LV_INIT definition.) Enable all AML "Debug" output (stores to the Debug object while interpreting diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index eeb0419d68a8..b065f2af8821 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -52,7 +52,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_COMPILER), ACPI_DEBUG_INIT(ACPI_TOOLS), - ACPI_DEBUG_INIT(ACPI_BUS_COMPONENT), ACPI_DEBUG_INIT(ACPI_AC_COMPONENT), ACPI_DEBUG_INIT(ACPI_BATTERY_COMPONENT), ACPI_DEBUG_INIT(ACPI_BUTTON_COMPONENT), diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 14da491bad96..4baa7a7dc83a 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -15,7 +15,6 @@ * Please update drivers/acpi/debug.c and Documentation/firmware-guide/acpi/debug.rst * if you add to this list. */ -#define ACPI_BUS_COMPONENT 0x00010000 #define ACPI_AC_COMPONENT 0x00020000 #define ACPI_BATTERY_COMPONENT 0x00040000 #define ACPI_BUTTON_COMPONENT 0x00080000 From 5ae4a4b45d4396aa7f7c008c4ae9eca981d43f8c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 2 Feb 2021 10:25:11 +0530 Subject: [PATCH 138/307] cpufreq: Remove CPUFREQ_STICKY flag During cpufreq driver's registration, if the ->init() callback for all the CPUs fails, then there is not much point in keeping the driver around, as it will only add unnecessary noise; for example, the cpufreq core will try to suspend/resume a driver which never got registered properly. The removal of such a driver is avoided if the driver carries the CPUFREQ_STICKY flag. This was added way back [1] in 2004 and perhaps no one should ever need it now. A lot of drivers do set this flag, probably because they just copied it from other drivers. This was added earlier for some platforms [2] because their cpufreq drivers were getting registered before the CPUs were registered with subsys framework.
And hence they used to fail. The same isn't true anymore though. The current code flow in the kernel is: start_kernel() -> kernel_init() -> kernel_init_freeable() -> do_basic_setup() -> driver_init() -> cpu_dev_init() -> subsys_system_register() //For CPUs -> do_initcalls() -> cpufreq_register_driver() Clearly, the CPUs will always get registered with subsys framework before any cpufreq driver can get probed. Remove the flag and update the relevant drivers. Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/include/linux/cpufreq.h?id=7cc9f0d9a1ab04cedc60d64fd8dcf7df224a3b4d # [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/arch/arm/mach-sa1100/cpu-sa1100.c?id=f59d3bbe35f6268d729f51be82af8325d62f20f5 # [2] Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/cpufreq.c | 3 +-- drivers/cpufreq/davinci-cpufreq.c | 2 +- drivers/cpufreq/loongson1-cpufreq.c | 2 +- drivers/cpufreq/mediatek-cpufreq.c | 2 +- drivers/cpufreq/omap-cpufreq.c | 2 +- drivers/cpufreq/qcom-cpufreq-hw.c | 2 +- drivers/cpufreq/s3c24xx-cpufreq.c | 2 +- drivers/cpufreq/s5pv210-cpufreq.c | 2 +- drivers/cpufreq/sa1100-cpufreq.c | 2 +- drivers/cpufreq/sa1110-cpufreq.c | 2 +- drivers/cpufreq/scmi-cpufreq.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/spear-cpufreq.c | 2 +- drivers/cpufreq/tegra186-cpufreq.c | 2 +- drivers/cpufreq/tegra194-cpufreq.c | 3 +-- drivers/cpufreq/vexpress-spc-cpufreq.c | 3 +-- include/linux/cpufreq.h | 17 +++++++---------- 18 files changed, 24 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index ad4234518ef6..b1e1bdc63b01 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -175,7 +175,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy) } static struct cpufreq_driver dt_cpufreq_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + 
.flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, .target_index = set_target, diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d0a3525ce27f..7d0ae968def7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2810,8 +2810,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) if (ret) goto err_boost_unreg; - if (!(cpufreq_driver->flags & CPUFREQ_STICKY) && - list_empty(&cpufreq_policy_list)) { + if (unlikely(list_empty(&cpufreq_policy_list))) { /* if all ->init() calls failed, unregister */ ret = -ENODEV; pr_debug("%s: No CPU initialized for driver %s\n", __func__, diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c index 91f477a6cbc4..9e97f60f8199 100644 --- a/drivers/cpufreq/davinci-cpufreq.c +++ b/drivers/cpufreq/davinci-cpufreq.c @@ -95,7 +95,7 @@ static int davinci_cpu_init(struct cpufreq_policy *policy) } static struct cpufreq_driver davinci_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = davinci_target, .get = cpufreq_generic_get, diff --git a/drivers/cpufreq/loongson1-cpufreq.c b/drivers/cpufreq/loongson1-cpufreq.c index 86f612593e49..fb72d709db56 100644 --- a/drivers/cpufreq/loongson1-cpufreq.c +++ b/drivers/cpufreq/loongson1-cpufreq.c @@ -116,7 +116,7 @@ static int ls1x_cpufreq_exit(struct cpufreq_policy *policy) static struct cpufreq_driver ls1x_cpufreq_driver = { .name = "cpufreq-ls1x", - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = ls1x_cpufreq_target, .get = cpufreq_generic_get, diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 022e3e966e71..f2e491b25b07 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ 
b/drivers/cpufreq/mediatek-cpufreq.c @@ -463,7 +463,7 @@ static int mtk_cpufreq_exit(struct cpufreq_policy *policy) } static struct cpufreq_driver mtk_cpufreq_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 3694bb030df3..e035ee216b0f 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -144,7 +144,7 @@ static int omap_cpu_exit(struct cpufreq_policy *policy) } static struct cpufreq_driver omap_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = omap_target, .get = cpufreq_generic_get, diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 9ed5341dc515..2a3b4f44488b 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -374,7 +374,7 @@ static struct freq_attr *qcom_cpufreq_hw_attr[] = { }; static struct cpufreq_driver cpufreq_qcom_hw_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, diff --git a/drivers/cpufreq/s3c24xx-cpufreq.c b/drivers/cpufreq/s3c24xx-cpufreq.c index 37efc0dc3f91..7380c32b238e 100644 --- a/drivers/cpufreq/s3c24xx-cpufreq.c +++ b/drivers/cpufreq/s3c24xx-cpufreq.c @@ -420,7 +420,7 @@ static int s3c_cpufreq_resume(struct cpufreq_policy *policy) #endif static struct cpufreq_driver s3c24xx_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .target = s3c_cpufreq_target, .get = cpufreq_generic_get, .init = s3c_cpufreq_init, diff 
--git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index bed496cf8d24..69786e5bbf05 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -574,7 +574,7 @@ static int s5pv210_cpufreq_reboot_notifier_event(struct notifier_block *this, } static struct cpufreq_driver s5pv210_driver = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = s5pv210_target, .get = cpufreq_generic_get, diff --git a/drivers/cpufreq/sa1100-cpufreq.c b/drivers/cpufreq/sa1100-cpufreq.c index 5c075ef6adc0..252b9fc26124 100644 --- a/drivers/cpufreq/sa1100-cpufreq.c +++ b/drivers/cpufreq/sa1100-cpufreq.c @@ -186,7 +186,7 @@ static int __init sa1100_cpu_init(struct cpufreq_policy *policy) } static struct cpufreq_driver sa1100_driver __refdata = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = cpufreq_generic_frequency_table_verify, .target_index = sa1100_target, diff --git a/drivers/cpufreq/sa1110-cpufreq.c b/drivers/cpufreq/sa1110-cpufreq.c index d9d04d935b3a..1a83c8678a63 100644 --- a/drivers/cpufreq/sa1110-cpufreq.c +++ b/drivers/cpufreq/sa1110-cpufreq.c @@ -310,7 +310,7 @@ static int __init sa1110_cpu_init(struct cpufreq_policy *policy) /* sa1110_driver needs __refdata because it must remain after init registers * it with cpufreq_register_driver() */ static struct cpufreq_driver sa1110_driver __refdata = { - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .verify = cpufreq_generic_frequency_table_verify, .target_index = sa1110_target, diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 491a0a24fb1e..5bd03b59887f 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c 
@@ -217,7 +217,7 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy) static struct cpufreq_driver scmi_cpufreq_driver = { .name = "scmi", - .flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | + .flags = CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index e5140ad63db8..d6a698a1b5d1 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -191,7 +191,7 @@ static int scpi_cpufreq_exit(struct cpufreq_policy *policy) static struct cpufreq_driver scpi_cpufreq_driver = { .name = "scpi-cpufreq", - .flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | + .flags = CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 73bd8dc47074..7d0d62a06bf3 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -160,7 +160,7 @@ static int spear_cpufreq_init(struct cpufreq_policy *policy) static struct cpufreq_driver spear_cpufreq_driver = { .name = "cpufreq-spear", - .flags = CPUFREQ_STICKY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = spear_cpufreq_target, .get = cpufreq_generic_get, diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index e566ea298b59..5d1943e787b0 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -117,7 +117,7 @@ static unsigned int tegra186_cpufreq_get(unsigned int cpu) static struct cpufreq_driver tegra186_cpufreq_driver = { .name = "tegra186", - .flags = CPUFREQ_STICKY | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | + .flags = CPUFREQ_HAVE_GOVERNOR_PER_POLICY | 
CPUFREQ_NEED_INITIAL_FREQ_CHECK, .get = tegra186_cpufreq_get, .verify = cpufreq_generic_frequency_table_verify, diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 6a67f36f3b80..a9620e4489ae 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -272,8 +272,7 @@ static int tegra194_cpufreq_set_target(struct cpufreq_policy *policy, static struct cpufreq_driver tegra194_cpufreq_driver = { .name = "tegra194", - .flags = CPUFREQ_STICKY | CPUFREQ_CONST_LOOPS | - CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = tegra194_cpufreq_set_target, .get = tegra194_get_speed, diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c index f711d8eaea6a..51dfa9ae6cf5 100644 --- a/drivers/cpufreq/vexpress-spc-cpufreq.c +++ b/drivers/cpufreq/vexpress-spc-cpufreq.c @@ -486,8 +486,7 @@ static void ve_spc_cpufreq_ready(struct cpufreq_policy *policy) static struct cpufreq_driver ve_spc_cpufreq_driver = { .name = "vexpress-spc", - .flags = CPUFREQ_STICKY | - CPUFREQ_HAVE_GOVERNOR_PER_POLICY | + .flags = CPUFREQ_HAVE_GOVERNOR_PER_POLICY | CPUFREQ_NEED_INITIAL_FREQ_CHECK, .verify = cpufreq_generic_frequency_table_verify, .target_index = ve_spc_cpufreq_set_target, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9c8b7437b6cd..c8e40e91fe9b 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -387,8 +387,13 @@ struct cpufreq_driver { /* flags */ -/* driver isn't removed even if all ->init() calls failed */ -#define CPUFREQ_STICKY BIT(0) +/* + * Set by drivers that need to update internale upper and lower boundaries along + * with the target frequency and so the core and governors should also invoke + * the diver if the target frequency does not change, but the policy min or max + * may have changed. 
+ */ +#define CPUFREQ_NEED_UPDATE_LIMITS BIT(0) /* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */ #define CPUFREQ_CONST_LOOPS BIT(1) @@ -432,14 +437,6 @@ struct cpufreq_driver { */ #define CPUFREQ_IS_COOLING_DEV BIT(7) -/* - * Set by drivers that need to update internale upper and lower boundaries along - * with the target frequency and so the core and governors should also invoke - * the diver if the target frequency does not change, but the policy min or max - * may have changed. - */ -#define CPUFREQ_NEED_UPDATE_LIMITS BIT(8) - int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From 2f0531869fd22182e769b10dd6cf151861ede791 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 2 Feb 2021 11:11:55 +0530 Subject: [PATCH 139/307] cpufreq: Remove unused flag CPUFREQ_PM_NO_WARN This flag is set by one of the drivers but it isn't used in the code otherwise. Remove the unused flag and update the driver. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/pmac32-cpufreq.c | 3 +-- include/linux/cpufreq.h | 13 +++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 73621bc11976..4f20c6a9108d 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -439,8 +439,7 @@ static struct cpufreq_driver pmac_cpufreq_driver = { .init = pmac_cpufreq_cpu_init, .suspend = pmac_cpufreq_suspend, .resume = pmac_cpufreq_resume, - .flags = CPUFREQ_PM_NO_WARN | - CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, + .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, .attr = cpufreq_generic_attr, .name = "powermac", }; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c8e40e91fe9b..353969c7acd3 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -398,8 +398,11 @@ struct cpufreq_driver { /* loops_per_jiffy or other kernel "constants" aren't affected by frequency transitions */ #define CPUFREQ_CONST_LOOPS BIT(1) -/* don't warn on suspend/resume speed mismatches */ -#define CPUFREQ_PM_NO_WARN BIT(2) +/* + * Set by drivers that want the core to automatically register the cpufreq + * driver as a thermal cooling device. + */ +#define CPUFREQ_IS_COOLING_DEV BIT(2) /* * This should be set by platforms having multiple clock-domains, i.e. @@ -431,12 +434,6 @@ struct cpufreq_driver { */ #define CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING BIT(6) -/* - * Set by drivers that want the core to automatically register the cpufreq - * driver as a thermal cooling device. - */ -#define CPUFREQ_IS_COOLING_DEV BIT(7) - int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From 2249ff344467b5ab4da31c1e0873c56521aa345b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 3 Feb 2021 19:43:17 +0100 Subject: [PATCH 140/307] ACPI: AC: Clean up printing messages Replace the ACPI_DEBUG_PRINT() and ACPI_EXCEPTION() instances in ac.c with acpi_handle_debug() and acpi_handle_info() calls, respectively, which among other things causes the excessive log level of the messages previously printed via ACPI_EXCEPTION() to be increased. Drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more, drop the no longer needed ACPI_AC_COMPONENT definition from the headers and update the documentation accordingly. While at it, replace the direct printk() invocation with pr_info(), add a pr_fmt() definition to ac.c and drop the unneeded PREFIX symbol definition from there. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Hans de Goede --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/ac.c | 23 +++++++++------------ drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index c7bad74c6ff7..cb61b1ab7276 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -52,7 +52,6 @@ shows the supported mask values, currently these:: ACPI_CA_DISASSEMBLER 0x00000800 ACPI_COMPILER 0x00001000 ACPI_TOOLS 0x00002000 - ACPI_AC_COMPONENT 0x00020000 ACPI_BATTERY_COMPONENT 0x00040000 ACPI_BUTTON_COMPONENT 0x00080000 ACPI_SBS_COMPONENT 0x00100000 diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c index 46a64e9fa716..b41180330cc1 100644 --- a/drivers/acpi/ac.c +++ b/drivers/acpi/ac.c @@ -6,6 +6,8 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh */ +#define pr_fmt(fmt) "ACPI: AC: " fmt + #include #include #include @@ -18,8 +20,6 @@ #include #include -#define PREFIX "ACPI: " - #define ACPI_AC_CLASS "ac_adapter" #define ACPI_AC_DEVICE_NAME "AC Adapter" #define ACPI_AC_FILE_STATE "state" @@ -28,9 
+28,6 @@ #define ACPI_AC_STATUS_ONLINE 0x01 #define ACPI_AC_STATUS_UNKNOWN 0xFF -#define _COMPONENT ACPI_AC_COMPONENT -ACPI_MODULE_NAME("ac"); - MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI AC Adapter Driver"); MODULE_LICENSE("GPL"); @@ -102,8 +99,9 @@ static int acpi_ac_get_state(struct acpi_ac *ac) status = acpi_evaluate_integer(ac->device->handle, "_PSR", NULL, &ac->state); if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Error reading AC Adapter state")); + acpi_handle_info(ac->device->handle, + "Error reading AC Adapter state: %s\n", + acpi_format_exception(status)); ac->state = ACPI_AC_STATUS_UNKNOWN; return -ENODEV; } @@ -153,8 +151,8 @@ static void acpi_ac_notify(struct acpi_device *device, u32 event) switch (event) { default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); + acpi_handle_debug(device->handle, "Unsupported event [0x%x]\n", + event); fallthrough; case ACPI_AC_NOTIFY_STATUS: case ACPI_NOTIFY_BUS_CHECK: @@ -278,9 +276,8 @@ static int acpi_ac_add(struct acpi_device *device) goto end; } - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", - acpi_device_name(device), acpi_device_bid(device), - ac->state ? "on-line" : "off-line"); + pr_info("%s [%s] (%s)\n", acpi_device_name(device), + acpi_device_bid(device), ac->state ? 
"on-line" : "off-line"); ac->battery_nb.notifier_call = acpi_ac_battery_notify; register_acpi_notifier(&ac->battery_nb); @@ -348,7 +345,7 @@ static int __init acpi_ac_init(void) for (i = 0; i < ARRAY_SIZE(acpi_ac_blacklist); i++) if (acpi_dev_present(acpi_ac_blacklist[i].hid, "1", acpi_ac_blacklist[i].hrv)) { - pr_info(PREFIX "AC: found native %s PMIC, not loading\n", + pr_info("found native %s PMIC, not loading\n", acpi_ac_blacklist[i].hid); return -ENODEV; } diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index b065f2af8821..64f7674ee498 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -52,7 +52,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_COMPILER), ACPI_DEBUG_INIT(ACPI_TOOLS), - ACPI_DEBUG_INIT(ACPI_AC_COMPONENT), ACPI_DEBUG_INIT(ACPI_BATTERY_COMPONENT), ACPI_DEBUG_INIT(ACPI_BUTTON_COMPONENT), ACPI_DEBUG_INIT(ACPI_SBS_COMPONENT), diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 4baa7a7dc83a..b0d6c4cc1a39 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -15,7 +15,6 @@ * Please update drivers/acpi/debug.c and Documentation/firmware-guide/acpi/debug.rst * if you add to this list. */ -#define ACPI_AC_COMPONENT 0x00020000 #define ACPI_BATTERY_COMPONENT 0x00040000 #define ACPI_BUTTON_COMPONENT 0x00080000 #define ACPI_SBS_COMPONENT 0x00100000 From bd8c5d1ee37ff4726367128ccdfd83300ee4e3d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Feb 2021 19:44:57 +0100 Subject: [PATCH 141/307] ACPI: battery: Clean up printing messages Replace the ACPI_DEBUG_PRINT() and ACPI_EXCEPTION() instances in battery.c with acpi_handle_debug() and acpi_handle_info() calls, respectively, which among other things causes the excessive log level of the messages previously printed via ACPI_EXCEPTION() to be increased. 
Drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more, drop the no longer needed ACPI_BATTERY_COMPONENT definition from the headers and update the documentation accordingly. While at it, update the pr_fmt() definition and drop the unneeded PREFIX symbol definition from battery.c. Also adapt the existing pr_info() calls to the new pr_fmt() definition. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Hans de Goede --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/battery.c | 33 ++++++++++----------- drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index cb61b1ab7276..60d877913da3 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -52,7 +52,6 @@ shows the supported mask values, currently these:: ACPI_CA_DISASSEMBLER 0x00000800 ACPI_COMPILER 0x00001000 ACPI_TOOLS 0x00002000 - ACPI_BATTERY_COMPONENT 0x00040000 ACPI_BUTTON_COMPONENT 0x00080000 ACPI_SBS_COMPONENT 0x00100000 ACPI_FAN_COMPONENT 0x00200000 diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 08ee1c7b12e0..b822f77afba6 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -8,7 +8,7 @@ * Copyright (C) 2001, 2002 Paul Diefenbaugh */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "ACPI: battery: " fmt #include #include @@ -29,8 +29,6 @@ #include -#define PREFIX "ACPI: " - #define ACPI_BATTERY_VALUE_UNKNOWN 0xFFFFFFFF #define ACPI_BATTERY_CAPACITY_VALID(capacity) \ ((capacity) != 0 && (capacity) != ACPI_BATTERY_VALUE_UNKNOWN) @@ -44,10 +42,6 @@ #define ACPI_BATTERY_STATE_CHARGING 0x2 #define ACPI_BATTERY_STATE_CRITICAL 0x4 -#define _COMPONENT ACPI_BATTERY_COMPONENT - -ACPI_MODULE_NAME("battery"); - MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_AUTHOR("Alexey Starikovskiy ");
MODULE_DESCRIPTION("ACPI Battery Driver"); @@ -466,7 +460,8 @@ static int extract_package(struct acpi_battery *battery, static int acpi_battery_get_status(struct acpi_battery *battery) { if (acpi_bus_get_status(battery->device)) { - ACPI_EXCEPTION((AE_INFO, AE_ERROR, "Evaluating _STA")); + acpi_handle_info(battery->device->handle, + "_STA evaluation failed\n"); return -ENODEV; } return 0; @@ -535,8 +530,10 @@ static int acpi_battery_get_info(struct acpi_battery *battery) mutex_unlock(&battery->lock); if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating %s", - use_bix ? "_BIX":"_BIF")); + acpi_handle_info(battery->device->handle, + "%s evaluation failed: %s\n", + use_bix ?"_BIX":"_BIF", + acpi_format_exception(status)); } else { result = extract_battery_info(use_bix, battery, @@ -573,7 +570,9 @@ static int acpi_battery_get_state(struct acpi_battery *battery) mutex_unlock(&battery->lock); if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _BST")); + acpi_handle_info(battery->device->handle, + "_BST evaluation failed: %s", + acpi_format_exception(status)); return -ENODEV; } @@ -590,7 +589,7 @@ static int acpi_battery_get_state(struct acpi_battery *battery) battery->rate_now != ACPI_BATTERY_VALUE_UNKNOWN && (s16)(battery->rate_now) < 0) { battery->rate_now = abs((s16)battery->rate_now); - pr_warn_once(FW_BUG "battery: (dis)charge rate invalid.\n"); + pr_warn_once(FW_BUG "(dis)charge rate invalid.\n"); } if (test_bit(ACPI_BATTERY_QUIRK_PERCENTAGE_CAPACITY, &battery->flags) @@ -625,7 +624,9 @@ static int acpi_battery_set_alarm(struct acpi_battery *battery) if (ACPI_FAILURE(status)) return -ENODEV; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Alarm set to %d\n", battery->alarm)); + acpi_handle_debug(battery->device->handle, "Alarm set to %d\n", + battery->alarm); + return 0; } @@ -1201,8 +1202,7 @@ static int acpi_battery_add(struct acpi_device *device) if (result) goto fail; - pr_info(PREFIX "%s Slot [%s] (battery %s)\n", - 
ACPI_BATTERY_DEVICE_NAME, acpi_device_bid(device), + pr_info("Slot [%s] (battery %s)\n", acpi_device_bid(device), device->status.battery_present ? "present" : "absent"); battery->pm_nb.notifier_call = battery_notify; @@ -1282,8 +1282,7 @@ static void __init acpi_battery_init_async(void *unused, async_cookie_t cookie) if (battery_check_pmic) { for (i = 0; i < ARRAY_SIZE(acpi_battery_blacklist); i++) if (acpi_dev_present(acpi_battery_blacklist[i], "1", -1)) { - pr_info(PREFIX ACPI_BATTERY_DEVICE_NAME - ": found native %s PMIC, not loading\n", + pr_info("found native %s PMIC, not loading\n", acpi_battery_blacklist[i]); return; } diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 64f7674ee498..4d3eec9dc0ee 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -52,7 +52,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_COMPILER), ACPI_DEBUG_INIT(ACPI_TOOLS), - ACPI_DEBUG_INIT(ACPI_BATTERY_COMPONENT), ACPI_DEBUG_INIT(ACPI_BUTTON_COMPONENT), ACPI_DEBUG_INIT(ACPI_SBS_COMPONENT), ACPI_DEBUG_INIT(ACPI_FAN_COMPONENT), diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index b0d6c4cc1a39..8fc70b273c34 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -15,7 +15,6 @@ * Please update drivers/acpi/debug.c and Documentation/firmware-guide/acpi/debug.rst * if you add to this list. */ -#define ACPI_BATTERY_COMPONENT 0x00040000 #define ACPI_BUTTON_COMPONENT 0x00080000 #define ACPI_SBS_COMPONENT 0x00100000 #define ACPI_FAN_COMPONENT 0x00200000 From 411e3216d4ee7e3c25c365b0d09e18f7798d705a Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 3 Feb 2021 19:46:14 +0100 Subject: [PATCH 142/307] ACPI: button: Clean up printing messages Replace the ACPI_DEBUG_PRINT() instance in button.c with an acpi_handle_debug() call, drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more, drop the no longer needed ACPI_BUTTON_COMPONENT definition from the headers and update the documentation accordingly. While at it, replace the direct printk() invocations with pr_info() (that changes the excessive log level for some of them too) and drop the unneeded PREFIX symbol definition from button.c. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Hans de Goede --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/button.c | 15 +++++---------- drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index 60d877913da3..67a5ad75a52e 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -52,7 +52,6 @@ shows the supported mask values, currently these:: ACPI_CA_DISASSEMBLER 0x00000800 ACPI_COMPILER 0x00001000 ACPI_TOOLS 0x00002000 - ACPI_BUTTON_COMPONENT 0x00080000 ACPI_SBS_COMPONENT 0x00100000 ACPI_FAN_COMPONENT 0x00200000 ACPI_PCI_COMPONENT 0x00400000 diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 0d93a5ef4d07..85e5e0328a2e 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -21,8 +21,6 @@ #include #include -#define PREFIX "ACPI: " - #define ACPI_BUTTON_CLASS "button" #define ACPI_BUTTON_FILE_STATE "state" #define ACPI_BUTTON_TYPE_UNKNOWN 0x00 @@ -54,9 +52,6 @@ static const char * const lid_init_state_str[] = { [ACPI_BUTTON_LID_INIT_DISABLED] = "disabled", }; -#define _COMPONENT ACPI_BUTTON_COMPONENT -ACPI_MODULE_NAME("button"); - MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Button Driver");
MODULE_LICENSE("GPL"); @@ -285,7 +280,7 @@ static int acpi_button_add_fs(struct acpi_device *device) return 0; if (acpi_button_dir || acpi_lid_dir) { - printk(KERN_ERR PREFIX "More than one Lid device found!\n"); + pr_info("More than one Lid device found!\n"); return -EEXIST; } @@ -434,8 +429,8 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) } break; default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); + acpi_handle_debug(device->handle, "Unsupported event [0x%x]\n", + event); break; } } @@ -523,7 +518,7 @@ static int acpi_button_add(struct acpi_device *device) ACPI_BUTTON_CLASS, ACPI_BUTTON_SUBCLASS_LID); input->open = acpi_lid_input_open; } else { - printk(KERN_ERR PREFIX "Unsupported hid [%s]\n", hid); + pr_info("Unsupported hid [%s]\n", hid); error = -ENODEV; goto err_free_input; } @@ -567,7 +562,7 @@ static int acpi_button_add(struct acpi_device *device) } device_init_wakeup(&device->dev, true); - printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); + pr_info("%s [%s]\n", name, acpi_device_bid(device)); return 0; err_remove_fs: diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 4d3eec9dc0ee..152e8eec6f13 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -52,7 +52,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_COMPILER), ACPI_DEBUG_INIT(ACPI_TOOLS), - ACPI_DEBUG_INIT(ACPI_BUTTON_COMPONENT), ACPI_DEBUG_INIT(ACPI_SBS_COMPONENT), ACPI_DEBUG_INIT(ACPI_FAN_COMPONENT), ACPI_DEBUG_INIT(ACPI_PCI_COMPONENT), diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 8fc70b273c34..25df44b2ed25 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -15,7 +15,6 @@ * Please update drivers/acpi/debug.c and Documentation/firmware-guide/acpi/debug.rst * if you add to this list. 
*/ -#define ACPI_BUTTON_COMPONENT 0x00080000 #define ACPI_SBS_COMPONENT 0x00100000 #define ACPI_FAN_COMPONENT 0x00200000 #define ACPI_PCI_COMPONENT 0x00400000 From 2924d2f837788bb0efaa79ece1e5b9e57928834b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Feb 2021 19:48:33 +0100 Subject: [PATCH 143/307] ACPI: video: Clean up printing messages Replace the ACPI_DEBUG_PRINT() instances in acpi_video.c with acpi_handle_debug() calls and the ACPI_EXCEPTION()/ACPI_ERROR()/ ACPI_WARNING() instances in there with acpi_handle_info() calls, which among other things causes the excessive log levels of those messages to be increased. Drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more from acpi_video.c, drop the no longer needed ACPI_VIDEO_COMPONENT definition from the headers and update the documentation accordingly. While at it, add a pr_fmt() definition to acpi_video.c, replace the direct printk() invocations in there with acpi_handle_info() or pr_info() (and reduce the excessive log level where applicable) and drop the PREFIX symbol definition which is not necessary any more from acpi_video.c. Also make unrelated janitorial changes to fix up white space and use ACPI_FAILURE() instead of negating ACPI_SUCCESS(). Signed-off-by: Rafael J.
Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Hans de Goede --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/acpi_video.c | 99 +++++++++++---------- drivers/acpi/sysfs.c | 1 - include/acpi/acpi_drivers.h | 1 - 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index 67a5ad75a52e..761fae76bcce 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -59,7 +59,6 @@ shows the supported mask values, currently these:: ACPI_SYSTEM_COMPONENT 0x02000000 ACPI_THERMAL_COMPONENT 0x04000000 ACPI_MEMORY_DEVICE_COMPONENT 0x08000000 - ACPI_VIDEO_COMPONENT 0x10000000 ACPI_PROCESSOR_COMPONENT 0x20000000 debug_level diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c index a322a7bd286b..2ea1781290cc 100644 --- a/drivers/acpi/acpi_video.c +++ b/drivers/acpi/acpi_video.c @@ -7,6 +7,8 @@ * Copyright (C) 2006 Thomas Tuttle */ +#define pr_fmt(fmt) "ACPI: video: " fmt + #include #include #include @@ -26,16 +28,11 @@ #include #include -#define PREFIX "ACPI: " - #define ACPI_VIDEO_BUS_NAME "Video Bus" #define ACPI_VIDEO_DEVICE_NAME "Video Device" #define MAX_NAME_LEN 20 -#define _COMPONENT ACPI_VIDEO_COMPONENT -ACPI_MODULE_NAME("video"); - MODULE_AUTHOR("Bruno Ducrot"); MODULE_DESCRIPTION("ACPI Video Driver"); MODULE_LICENSE("GPL"); @@ -326,11 +323,11 @@ acpi_video_device_lcd_query_levels(acpi_handle handle, *levels = NULL; status = acpi_evaluate_object(handle, "_BCL", NULL, &buffer); - if (!ACPI_SUCCESS(status)) + if (ACPI_FAILURE(status)) return status; obj = (union acpi_object *)buffer.pointer; if (!obj || (obj->type != ACPI_TYPE_PACKAGE)) { - printk(KERN_ERR PREFIX "Invalid _BCL data\n"); + acpi_handle_info(handle, "Invalid _BCL data\n"); status = -EFAULT; goto err; } @@ -354,7 +351,7 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) status = 
acpi_execute_simple_method(device->dev->handle, "_BCM", level); if (ACPI_FAILURE(status)) { - ACPI_ERROR((AE_INFO, "Evaluating _BCM failed")); + acpi_handle_info(device->dev->handle, "_BCM evaluation failed\n"); return -EIO; } @@ -368,7 +365,7 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) return 0; } - ACPI_ERROR((AE_INFO, "Current brightness invalid")); + acpi_handle_info(device->dev->handle, "Current brightness invalid\n"); return -EINVAL; } @@ -622,9 +619,8 @@ acpi_video_device_lcd_get_level_current(struct acpi_video_device *device, * BQC returned an invalid level. * Stop using it. */ - ACPI_WARNING((AE_INFO, - "%s returned an invalid level", - buf)); + acpi_handle_info(device->dev->handle, + "%s returned an invalid level", buf); device->cap._BQC = device->cap._BCQ = 0; } else { /* @@ -635,7 +631,8 @@ acpi_video_device_lcd_get_level_current(struct acpi_video_device *device, * ACPI video backlight still works w/ buggy _BQC. * http://bugzilla.kernel.org/show_bug.cgi?id=12233 */ - ACPI_WARNING((AE_INFO, "Evaluating %s failed", buf)); + acpi_handle_info(device->dev->handle, + "%s evaluation failed", buf); device->cap._BQC = device->cap._BCQ = 0; } } @@ -675,7 +672,7 @@ acpi_video_device_EDID(struct acpi_video_device *device, if (obj && obj->type == ACPI_TYPE_BUFFER) *edid = obj; else { - printk(KERN_ERR PREFIX "Invalid _DDC data\n"); + acpi_handle_info(device->dev->handle, "Invalid _DDC data\n"); status = -EFAULT; kfree(obj); } @@ -827,10 +824,9 @@ int acpi_video_get_levels(struct acpi_device *device, int result = 0; u32 value; - if (!ACPI_SUCCESS(acpi_video_device_lcd_query_levels(device->handle, - &obj))) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Could not query available " - "LCD brightness level\n")); + if (ACPI_FAILURE(acpi_video_device_lcd_query_levels(device->handle, &obj))) { + acpi_handle_debug(device->handle, + "Could not query available LCD brightness level\n"); result = -ENODEV; goto out; } @@ -842,7 +838,6 @@ int 
acpi_video_get_levels(struct acpi_device *device, br = kzalloc(sizeof(*br), GFP_KERNEL); if (!br) { - printk(KERN_ERR "can't allocate memory\n"); result = -ENOMEM; goto out; } @@ -863,7 +858,7 @@ int acpi_video_get_levels(struct acpi_device *device, for (i = 0; i < obj->package.count; i++) { o = (union acpi_object *)&obj->package.elements[i]; if (o->type != ACPI_TYPE_INTEGER) { - printk(KERN_ERR PREFIX "Invalid data\n"); + acpi_handle_info(device->handle, "Invalid data\n"); continue; } value = (u32) o->integer.value; @@ -900,7 +895,8 @@ int acpi_video_get_levels(struct acpi_device *device, br->levels[i] = br->levels[i - level_ac_battery]; count += level_ac_battery; } else if (level_ac_battery > ACPI_VIDEO_FIRST_LEVEL) - ACPI_ERROR((AE_INFO, "Too many duplicates in _BCL package")); + acpi_handle_info(device->handle, + "Too many duplicates in _BCL package"); /* Check if the _BCL package is in a reversed order */ if (max_level == br->levels[ACPI_VIDEO_FIRST_LEVEL]) { @@ -910,8 +906,8 @@ int acpi_video_get_levels(struct acpi_device *device, sizeof(br->levels[ACPI_VIDEO_FIRST_LEVEL]), acpi_video_cmp_level, NULL); } else if (max_level != br->levels[count - 1]) - ACPI_ERROR((AE_INFO, - "Found unordered _BCL package")); + acpi_handle_info(device->handle, + "Found unordered _BCL package"); br->count = count; *dev_br = br; @@ -989,9 +985,9 @@ set_level: if (result) goto out_free_levels; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "found %d brightness levels\n", - br->count - ACPI_VIDEO_FIRST_LEVEL)); + acpi_handle_debug(device->dev->handle, "found %d brightness levels\n", + br->count - ACPI_VIDEO_FIRST_LEVEL); + return 0; out_free_levels: @@ -1023,7 +1019,8 @@ static void acpi_video_device_find_cap(struct acpi_video_device *device) if (acpi_has_method(device->dev->handle, "_BQC")) { device->cap._BQC = 1; } else if (acpi_has_method(device->dev->handle, "_BCQ")) { - printk(KERN_WARNING FW_BUG "_BCQ is used instead of _BQC\n"); + acpi_handle_info(device->dev->handle, + "_BCQ is used 
instead of _BQC\n"); device->cap._BCQ = 1; } @@ -1083,8 +1080,7 @@ static int acpi_video_bus_check(struct acpi_video_bus *video) /* Does this device support video switching? */ if (video->cap._DOS || video->cap._DOD) { if (!video->cap._DOS) { - printk(KERN_WARNING FW_BUG - "ACPI(%s) defines _DOD but not _DOS\n", + pr_info(FW_BUG "ACPI(%s) defines _DOD but not _DOS\n", acpi_device_bid(video->device)); } video->flags.multihead = 1; @@ -1272,7 +1268,8 @@ acpi_video_device_bind(struct acpi_video_bus *video, ids = &video->attached_array[i]; if (device->device_id == (ids->value.int_val & 0xffff)) { ids->bind_info = device; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "device_bind %d\n", i)); + acpi_handle_debug(video->device->handle, "%s: %d\n", + __func__, i); } } } @@ -1324,20 +1321,22 @@ static int acpi_video_device_enumerate(struct acpi_video_bus *video) return AE_NOT_EXIST; status = acpi_evaluate_object(video->device->handle, "_DOD", NULL, &buffer); - if (!ACPI_SUCCESS(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _DOD")); + if (ACPI_FAILURE(status)) { + acpi_handle_info(video->device->handle, + "_DOD evaluation failed: %s\n", + acpi_format_exception(status)); return status; } dod = buffer.pointer; if (!dod || (dod->type != ACPI_TYPE_PACKAGE)) { - ACPI_EXCEPTION((AE_INFO, status, "Invalid _DOD data")); + acpi_handle_info(video->device->handle, "Invalid _DOD data\n"); status = -EFAULT; goto out; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d video heads in _DOD\n", - dod->package.count)); + acpi_handle_debug(video->device->handle, "Found %d video heads in _DOD\n", + dod->package.count); active_list = kcalloc(1 + dod->package.count, sizeof(struct acpi_video_enumerated_device), @@ -1352,15 +1351,18 @@ static int acpi_video_device_enumerate(struct acpi_video_bus *video) obj = &dod->package.elements[i]; if (obj->type != ACPI_TYPE_INTEGER) { - printk(KERN_ERR PREFIX - "Invalid _DOD data in element %d\n", i); + acpi_handle_info(video->device->handle, + "Invalid _DOD data 
in element %d\n", i); continue; } active_list[count].value.int_val = obj->integer.value; active_list[count].bind_info = NULL; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "dod element[%d] = %d\n", i, - (int)obj->integer.value)); + + acpi_handle_debug(video->device->handle, + "_DOD element[%d] = %d\n", i, + (int)obj->integer.value); + count++; } @@ -1451,7 +1453,8 @@ acpi_video_switch_brightness(struct work_struct *work) out: if (result) - printk(KERN_ERR PREFIX "Failed to switch the brightness\n"); + acpi_handle_info(device->dev->handle, + "Failed to switch brightness\n"); } int acpi_video_get_edid(struct acpi_device *device, int type, int device_id, @@ -1601,8 +1604,8 @@ static void acpi_video_bus_notify(struct acpi_device *device, u32 event) break; default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); + acpi_handle_debug(device->handle, "Unsupported event [0x%x]\n", + event); break; } @@ -1675,8 +1678,7 @@ static void acpi_video_device_notify(acpi_handle handle, u32 event, void *data) keycode = KEY_DISPLAY_OFF; break; default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); + acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; } @@ -1812,11 +1814,12 @@ static void acpi_video_dev_register_backlight(struct acpi_video_device *device) &device->cooling_dev->device.kobj, "thermal_cooling"); if (result) - printk(KERN_ERR PREFIX "Create sysfs link\n"); + pr_info("sysfs link creation failed\n"); + result = sysfs_create_link(&device->cooling_dev->device.kobj, &device->dev->dev.kobj, "device"); if (result) - printk(KERN_ERR PREFIX "Create sysfs link\n"); + pr_info("Reverse sysfs link creation failed\n"); } static void acpi_video_run_bcl_for_osi(struct acpi_video_bus *video) @@ -2030,7 +2033,7 @@ static int acpi_video_bus_add(struct acpi_device *device) acpi_video_bus_match, NULL, device, NULL); if (status == AE_ALREADY_EXISTS) { - printk(KERN_WARNING FW_BUG + pr_info(FW_BUG "Duplicate ACPI video bus devices 
for the" " same VGA controller, please try module " "parameter \"video.allow_duplicates=1\"" @@ -2073,7 +2076,7 @@ static int acpi_video_bus_add(struct acpi_device *device) if (error) goto err_put_video; - printk(KERN_INFO PREFIX "%s [%s] (multi-head: %s rom: %s post: %s)\n", + pr_info("%s [%s] (multi-head: %s rom: %s post: %s)\n", ACPI_VIDEO_DEVICE_NAME, acpi_device_bid(device), video->flags.multihead ? "yes" : "no", video->flags.rom ? "yes" : "no", diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 152e8eec6f13..53125f08779c 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -59,7 +59,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_SYSTEM_COMPONENT), ACPI_DEBUG_INIT(ACPI_THERMAL_COMPONENT), ACPI_DEBUG_INIT(ACPI_MEMORY_DEVICE_COMPONENT), - ACPI_DEBUG_INIT(ACPI_VIDEO_COMPONENT), ACPI_DEBUG_INIT(ACPI_PROCESSOR_COMPONENT), }; diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index 25df44b2ed25..fdf93f83ebaf 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -22,7 +22,6 @@ #define ACPI_SYSTEM_COMPONENT 0x02000000 #define ACPI_THERMAL_COMPONENT 0x04000000 #define ACPI_MEMORY_DEVICE_COMPONENT 0x08000000 -#define ACPI_VIDEO_COMPONENT 0x10000000 #define ACPI_PROCESSOR_COMPONENT 0x20000000 /* From f86b15a1e6541446a4a5f69bcc211348238db97f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 3 Feb 2021 19:49:21 +0100 Subject: [PATCH 144/307] ACPI: thermal: Clean up printing messages Replace the ACPI_DEBUG_PRINT() instances in thermal.c with acpi_handle_debug() calls and modify the ACPI_THERMAL_TRIPS_EXCEPTION() macro in there to use acpi_handle_info() internally, which among other things causes the excessive log level of the messages printed by it to be increased. 
Drop the _COMPONENT and ACPI_MODULE_NAME() definitions that are not used any more from thermal.c, drop the no longer needed ACPI_THERMAL_COMPONENT definition from the headers and update the documentation accordingly. While at it, add a pr_fmt() definition to thermal.c, drop the PREFIX definition from there and replace some pr_warn() calls with pr_info() or acpi_handle_info() to reduce the excessive log level and (in the latter case) facilitate easier identification of the message source. Signed-off-by: Rafael J. Wysocki Reviewed-by: Hanjun Guo Reviewed-by: Hans de Goede --- Documentation/firmware-guide/acpi/debug.rst | 1 - drivers/acpi/sysfs.c | 1 - drivers/acpi/thermal.c | 87 ++++++++++----------- include/acpi/acpi_drivers.h | 1 - 4 files changed, 43 insertions(+), 47 deletions(-) diff --git a/Documentation/firmware-guide/acpi/debug.rst b/Documentation/firmware-guide/acpi/debug.rst index 761fae76bcce..03cd4e25fc45 100644 --- a/Documentation/firmware-guide/acpi/debug.rst +++ b/Documentation/firmware-guide/acpi/debug.rst @@ -57,7 +57,6 @@ shows the supported mask values, currently these:: ACPI_PCI_COMPONENT 0x00400000 ACPI_CONTAINER_COMPONENT 0x01000000 ACPI_SYSTEM_COMPONENT 0x02000000 - ACPI_THERMAL_COMPONENT 0x04000000 ACPI_MEMORY_DEVICE_COMPONENT 0x08000000 ACPI_PROCESSOR_COMPONENT 0x20000000 diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 53125f08779c..8baf7644a0d0 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -57,7 +57,6 @@ static const struct acpi_dlayer acpi_debug_layers[] = { ACPI_DEBUG_INIT(ACPI_PCI_COMPONENT), ACPI_DEBUG_INIT(ACPI_CONTAINER_COMPONENT), ACPI_DEBUG_INIT(ACPI_SYSTEM_COMPONENT), - ACPI_DEBUG_INIT(ACPI_THERMAL_COMPONENT), ACPI_DEBUG_INIT(ACPI_MEMORY_DEVICE_COMPONENT), ACPI_DEBUG_INIT(ACPI_PROCESSOR_COMPONENT), }; diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 859b1de31ddc..4f906380b031 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -13,6 +13,8 @@ * concepts of 'multiple 
limiters', upper/lower limits, etc. */ +#define pr_fmt(fmt) "ACPI: thermal: " fmt + #include #include #include @@ -29,8 +31,6 @@ #include #include -#define PREFIX "ACPI: " - #define ACPI_THERMAL_CLASS "thermal_zone" #define ACPI_THERMAL_DEVICE_NAME "Thermal Zone" #define ACPI_THERMAL_NOTIFY_TEMPERATURE 0x80 @@ -43,9 +43,6 @@ #define ACPI_THERMAL_MAX_ACTIVE 10 #define ACPI_THERMAL_MAX_LIMIT_STR_LEN 65 -#define _COMPONENT ACPI_THERMAL_COMPONENT -ACPI_MODULE_NAME("thermal"); - MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Thermal Zone Driver"); MODULE_LICENSE("GPL"); @@ -197,8 +194,9 @@ static int acpi_thermal_get_temperature(struct acpi_thermal *tz) return -ENODEV; tz->temperature = tmp; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Temperature is %lu dK\n", - tz->temperature)); + + acpi_handle_debug(tz->device->handle, "Temperature is %lu dK\n", + tz->temperature); return 0; } @@ -216,8 +214,8 @@ static int acpi_thermal_get_polling_frequency(struct acpi_thermal *tz) return -ENODEV; tz->polling_frequency = tmp; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Polling frequency is %lu dS\n", - tz->polling_frequency)); + acpi_handle_debug(tz->device->handle, "Polling frequency is %lu dS\n", + tz->polling_frequency); return 0; } @@ -254,12 +252,12 @@ static int acpi_thermal_set_cooling_mode(struct acpi_thermal *tz, int mode) * 2.TODO: Devices listed in _PSL, _ALx, _TZD may change. * We need to re-bind the cooling devices of a thermal zone when this occurs. 
*/ -#define ACPI_THERMAL_TRIPS_EXCEPTION(flags, str) \ +#define ACPI_THERMAL_TRIPS_EXCEPTION(flags, tz, str) \ do { \ if (flags != ACPI_TRIPS_INIT) \ - ACPI_EXCEPTION((AE_INFO, AE_ERROR, \ + acpi_handle_info(tz->device->handle, \ "ACPI thermal trip point %s changed\n" \ - "Please send acpidump to linux-acpi@vger.kernel.org", str)); \ + "Please report to linux-acpi@vger.kernel.org\n", str); \ } while (0) static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) @@ -283,17 +281,17 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) */ if (ACPI_FAILURE(status)) { tz->trips.critical.flags.valid = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "No critical threshold\n")); + acpi_handle_debug(tz->device->handle, + "No critical threshold\n"); } else if (tmp <= 2732) { - pr_warn(FW_BUG "Invalid critical threshold (%llu)\n", + pr_info(FW_BUG "Invalid critical threshold (%llu)\n", tmp); tz->trips.critical.flags.valid = 0; } else { tz->trips.critical.flags.valid = 1; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, + acpi_handle_debug(tz->device->handle, "Found critical threshold [%lu]\n", - tz->trips.critical.temperature)); + tz->trips.critical.temperature); } if (tz->trips.critical.flags.valid == 1) { if (crt == -1) { @@ -305,8 +303,8 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) * Allow override critical threshold */ if (crt_k > tz->trips.critical.temperature) - pr_warn(PREFIX "Critical threshold %d C\n", - crt); + pr_info("Critical threshold %d C\n", crt); + tz->trips.critical.temperature = crt_k; } } @@ -318,14 +316,14 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) "_HOT", NULL, &tmp); if (ACPI_FAILURE(status)) { tz->trips.hot.flags.valid = 0; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "No hot threshold\n")); + acpi_handle_debug(tz->device->handle, + "No hot threshold\n"); } else { tz->trips.hot.temperature = tmp; tz->trips.hot.flags.valid = 1; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Found hot threshold 
[%lu]\n", - tz->trips.hot.temperature)); + acpi_handle_debug(tz->device->handle, + "Found hot threshold [%lu]\n", + tz->trips.hot.temperature); } } @@ -378,7 +376,8 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) status = acpi_evaluate_reference(tz->device->handle, "_PSL", NULL, &devices); if (ACPI_FAILURE(status)) { - pr_warn(PREFIX "Invalid passive threshold\n"); + acpi_handle_info(tz->device->handle, + "Invalid passive threshold\n"); tz->trips.passive.flags.valid = 0; } else @@ -388,12 +387,12 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) sizeof(struct acpi_handle_list))) { memcpy(&tz->trips.passive.devices, &devices, sizeof(struct acpi_handle_list)); - ACPI_THERMAL_TRIPS_EXCEPTION(flag, "device"); + ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "device"); } } if ((flag & ACPI_TRIPS_PASSIVE) || (flag & ACPI_TRIPS_DEVICES)) { if (valid != tz->trips.passive.flags.valid) - ACPI_THERMAL_TRIPS_EXCEPTION(flag, "state"); + ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "state"); } /* Active (optional) */ @@ -440,8 +439,8 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) status = acpi_evaluate_reference(tz->device->handle, name, NULL, &devices); if (ACPI_FAILURE(status)) { - pr_warn(PREFIX "Invalid active%d threshold\n", - i); + acpi_handle_info(tz->device->handle, + "Invalid active%d threshold\n", i); tz->trips.active[i].flags.valid = 0; } else @@ -451,12 +450,12 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) sizeof(struct acpi_handle_list))) { memcpy(&tz->trips.active[i].devices, &devices, sizeof(struct acpi_handle_list)); - ACPI_THERMAL_TRIPS_EXCEPTION(flag, "device"); + ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "device"); } } if ((flag & ACPI_TRIPS_ACTIVE) || (flag & ACPI_TRIPS_DEVICES)) if (valid != tz->trips.active[i].flags.valid) - ACPI_THERMAL_TRIPS_EXCEPTION(flag, "state"); + ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "state"); if (!tz->trips.active[i].flags.valid) break; @@ 
-469,7 +468,7 @@ static int acpi_thermal_trips_update(struct acpi_thermal *tz, int flag) if (ACPI_SUCCESS(status) && memcmp(&tz->devices, &devices, sizeof(devices))) { tz->devices = devices; - ACPI_THERMAL_TRIPS_EXCEPTION(flag, "device"); + ACPI_THERMAL_TRIPS_EXCEPTION(flag, tz, "device"); } } @@ -925,8 +924,8 @@ static void acpi_thermal_notify(struct acpi_device *device, u32 event) dev_name(&device->dev), event, 0); break; default: - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Unsupported event [0x%x]\n", event)); + acpi_handle_debug(device->handle, "Unsupported event [0x%x]\n", + event); break; } } @@ -1074,7 +1073,7 @@ static int acpi_thermal_add(struct acpi_device *device) mutex_init(&tz->thermal_check_lock); INIT_WORK(&tz->thermal_check_work, acpi_thermal_check_fn); - pr_info(PREFIX "%s [%s] (%ld C)\n", acpi_device_name(device), + pr_info("%s [%s] (%ld C)\n", acpi_device_name(device), acpi_device_bid(device), deci_kelvin_to_celsius(tz->temperature)); goto end; @@ -1146,24 +1145,24 @@ static int acpi_thermal_resume(struct device *dev) static int thermal_act(const struct dmi_system_id *d) { if (act == 0) { - pr_notice(PREFIX "%s detected: " - "disabling all active thermal trip points\n", d->ident); + pr_notice("%s detected: disabling all active thermal trip points\n", + d->ident); act = -1; } return 0; } static int thermal_nocrt(const struct dmi_system_id *d) { - pr_notice(PREFIX "%s detected: " - "disabling all critical thermal trip point actions.\n", d->ident); + pr_notice("%s detected: disabling all critical thermal trip point actions.\n", + d->ident); nocrt = 1; return 0; } static int thermal_tzp(const struct dmi_system_id *d) { if (tzp == 0) { - pr_notice(PREFIX "%s detected: " - "enabling thermal zone polling\n", d->ident); + pr_notice("%s detected: enabling thermal zone polling\n", + d->ident); tzp = 300; /* 300 dS = 30 Seconds */ } return 0; @@ -1171,8 +1170,8 @@ static int thermal_tzp(const struct dmi_system_id *d) { static int thermal_psv(const struct 
dmi_system_id *d) { if (psv == 0) { - pr_notice(PREFIX "%s detected: " - "disabling all passive thermal trip points\n", d->ident); + pr_notice("%s detected: disabling all passive thermal trip points\n", + d->ident); psv = -1; } return 0; @@ -1225,7 +1224,7 @@ static int __init acpi_thermal_init(void) dmi_check_system(thermal_dmi_table); if (off) { - pr_notice(PREFIX "thermal control disabled\n"); + pr_notice("thermal control disabled\n"); return -ENODEV; } diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h index fdf93f83ebaf..94d356fcc483 100644 --- a/include/acpi/acpi_drivers.h +++ b/include/acpi/acpi_drivers.h @@ -20,7 +20,6 @@ #define ACPI_PCI_COMPONENT 0x00400000 #define ACPI_CONTAINER_COMPONENT 0x01000000 #define ACPI_SYSTEM_COMPONENT 0x02000000 -#define ACPI_THERMAL_COMPONENT 0x04000000 #define ACPI_MEMORY_DEVICE_COMPONENT 0x08000000 #define ACPI_PROCESSOR_COMPONENT 0x20000000 From 4ffa84b861cbe251ac55de6f538835f6c4a342ad Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 2 Feb 2021 14:51:42 +0800 Subject: [PATCH 145/307] ACPI: APEI: ERST: remove unneeded semicolon Eliminate the following coccicheck warning: ./drivers/acpi/apei/erst.c:691:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/apei/erst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 2e0b0fcad960..b9597216d021 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -688,7 +688,7 @@ static int __erst_read_from_storage(u64 record_id, u64 offset) break; if (erst_timedout(&timeout, SPIN_UNIT)) return -EIO; - }; + } rc = apei_exec_run(&ctx, ACPI_ERST_GET_COMMAND_STATUS); if (rc) return rc; From cc4a3f885e8f2bc3c86a265972e94fef32d68f67 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 3 Feb 2021 15:41:56 +0300 Subject: [PATCH 146/307] fcntl: make F_GETOWN(EX) return 0 on dead owner task Currently there is no way to differentiate the file with alive owner from the file with dead owner but pid of the owner reused. That's why CRIU can't actually know if it needs to restore file owner or not, because if it restores owner but actual owner was dead, this can introduce unexpected signals to the "false"-owner (which reused the pid). Let's change the api, so that F_GETOWN(EX) returns 0 in case actual owner is dead already. This comports with the POSIX spec, which states that a PID of 0 indicates that no signal will be sent. Cc: Jeff Layton Cc: "J. 
Bruce Fields" Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Cyrill Gorcunov Cc: Andrei Vagin Signed-off-by: Pavel Tikhomirov Signed-off-by: Jeff Layton --- fs/fcntl.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index 05b36b28f2e8..483ef8861376 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -148,11 +148,15 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { - pid_t pid; + pid_t pid = 0; read_lock(&filp->f_owner.lock); - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) - pid = -pid; + rcu_read_lock(); + if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { + pid = pid_vnr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + } + rcu_read_unlock(); read_unlock(&filp->f_owner.lock); return pid; } @@ -200,11 +204,14 @@ static int f_setown_ex(struct file *filp, unsigned long arg) static int f_getown_ex(struct file *filp, unsigned long arg) { struct f_owner_ex __user *owner_p = (void __user *)arg; - struct f_owner_ex owner; + struct f_owner_ex owner = {}; int ret = 0; read_lock(&filp->f_owner.lock); - owner.pid = pid_vnr(filp->f_owner.pid); + rcu_read_lock(); + if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) + owner.pid = pid_vnr(filp->f_owner.pid); + rcu_read_unlock(); switch (filp->f_owner.pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; From 093e0687c5baacc29e4e8dd3ea205bac518e38bc Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sun, 7 Feb 2021 16:32:50 +0800 Subject: [PATCH 147/307] jfs: turn diLog(), dataLog() and txLog() into void functions These functions always return '0' and no callers use the return value. So make it a void function. This eliminates the following coccicheck warning: ./fs/jfs/jfs_txnmgr.c:1365:5-7: Unneeded variable: "rc". Return "0" on line 1414 ./fs/jfs/jfs_txnmgr.c:1422:5-7: Unneeded variable: "rc". 
Return "0" on line 1527 Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_txnmgr.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index dca8edd2378c..053295cd7bc6 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -148,10 +148,10 @@ static struct { /* * forward references */ -static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd); -static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck); +static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck, struct commit *cd); +static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck); static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, struct tlock * tlck); static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, @@ -159,8 +159,8 @@ static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, static void txAllocPMap(struct inode *ip, struct maplock * maplock, struct tblock * tblk); static void txForce(struct tblock * tblk); -static int txLog(struct jfs_log * log, struct tblock * tblk, - struct commit * cd); +static void txLog(struct jfs_log *log, struct tblock *tblk, + struct commit *cd); static void txUpdateMap(struct tblock * tblk); static void txRelease(struct tblock * tblk); static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, @@ -1256,8 +1256,7 @@ int txCommit(tid_t tid, /* transaction identifier */ * * txUpdateMap() resets XAD_NEW in XAD. 
*/ - if ((rc = txLog(log, tblk, &cd))) - goto TheEnd; + txLog(log, tblk, &cd); /* * Ensure that inode isn't reused before @@ -1365,9 +1364,8 @@ int txCommit(tid_t tid, /* transaction identifier */ * * RETURN : */ -static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +static void txLog(struct jfs_log *log, struct tblock *tblk, struct commit *cd) { - int rc = 0; struct inode *ip; lid_t lid; struct tlock *tlck; @@ -1414,7 +1412,7 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) } } - return rc; + return; } /* @@ -1422,10 +1420,9 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) * * function: log inode tlock and format maplock to update bmap; */ -static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck, struct commit * cd) +static void diLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck, struct commit *cd) { - int rc = 0; struct metapage *mp; pxd_t *pxd; struct pxd_lock *pxdlock; @@ -1527,7 +1524,7 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } #endif /* _JFS_WIP */ - return rc; + return; } /* @@ -1535,8 +1532,8 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * * function: log data tlock */ -static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck) +static void dataLog(struct jfs_log *log, struct tblock *tblk, struct lrd *lrd, + struct tlock *tlck) { struct metapage *mp; pxd_t *pxd; @@ -1562,7 +1559,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, metapage_homeok(mp); discard_metapage(mp); tlck->mp = NULL; - return 0; + return; } PXDaddress(pxd, mp->index); @@ -1573,7 +1570,7 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, /* mark page as homeward bound */ tlck->flag |= tlckWRITEPAGE; - return 0; + return; } /* 
From 4f4317c13a40194940acf4a71670179c4faca2b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Dec 2020 09:53:23 -0500 Subject: [PATCH 148/307] btrfs: fix error handling in commit_fs_roots While doing error injection I would sometimes get a corrupt file system. This is because I was injecting errors at btrfs_search_slot, but would only do it one time per stack. This uncovered a problem in commit_fs_roots, where if we get an error we would just break. However we're in a nested loop, the first loop being a loop to find all the dirty fs roots, and then subsequent root updates would succeed clearing the error value. This isn't likely to happen in real scenarios, however we could potentially get a random ENOMEM once and then not again, and we'd end up with a corrupted file system. Fix this by moving the error checking around a bit to the main loop, as this is the only place where something will fail, and return the error as soon as it occurs. With this patch my reproducer no longer corrupts the file system. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6af7f2bf92de..fbf93067642a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,7 +1319,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) struct btrfs_root *gang[8]; int i; int ret; - int err = 0; spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1331,6 +1330,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; + int ret2; + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1350,17 +1351,17 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) root->node); } - err = btrfs_update_root(trans, fs_info->tree_root, + ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret2) + return ret2; spin_lock(&fs_info->fs_roots_radix_lock); - if (err) - break; btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); - return err; + return 0; } /* From 3cc64e7ebfb0d7faaba2438334c43466955a96e8 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 20 Nov 2020 09:08:04 +0800 Subject: [PATCH 149/307] btrfs: clarify error returns values in __load_free_space_cache Return value in __load_free_space_cache is not properly set after (unlikely) memory allocation failures and 0 is returned instead. This is not a problem for the caller load_free_space_cache because only value 1 is considered as 'cache loaded' but for clarity it's better to set the errors accordingly. 
Fixes: a67509c30079 ("Btrfs: add a io_ctl struct and helpers for dealing with the space cache") Reported-by: Hulk Robot Signed-off-by: Zhihao Cheng Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4d8897879c9c..71d0d14bc18b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -775,8 +775,10 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, while (num_entries) { e = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); - if (!e) + if (!e) { + ret = -ENOMEM; goto free_cache; + } ret = io_ctl_read_entry(&io_ctl, e, &type); if (ret) { @@ -785,6 +787,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, } if (!e->bytes) { + ret = -1; kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; } @@ -805,6 +808,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, e->bitmap = kmem_cache_zalloc( btrfs_free_space_bitmap_cachep, GFP_NOFS); if (!e->bitmap) { + ret = -ENOMEM; kmem_cache_free( btrfs_free_space_cachep, e); goto free_cache; From 149716570be98185150860fe922bf89ed080bd3c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 10 Dec 2020 10:38:32 +0200 Subject: [PATCH 150/307] btrfs: cleanup local variables in btrfs_file_write_iter First replace all inode instances with a pointer to btrfs_inode. This removes multiple invocations of the BTRFS_I macro, subsequently remove 2 local variables as they are called only once and simply refer to them directly. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e41459b8de6..e65223e3510d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1997,9 +1997,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written = 0; const bool sync = iocb->ki_flags & IOCB_DSYNC; @@ -2008,7 +2006,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2016,7 +2014,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, return -EOPNOTSUPP; if (sync) - atomic_inc(&BTRFS_I(inode)->sync_writers); + atomic_inc(&inode->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) num_written = btrfs_direct_write(iocb, from); @@ -2028,14 +2026,14 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occurred. 
*/ - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->last_sub_trans = root->log_transid; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); if (num_written > 0) num_written = generic_write_sync(iocb, num_written); if (sync) - atomic_dec(&BTRFS_I(inode)->sync_writers); + atomic_dec(&inode->sync_writers); current->backing_dev_info = NULL; return num_written; From 453e4873869f5e967188d8b018efc34a57eed44f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:32 +0200 Subject: [PATCH 151/307] btrfs: rename btrfs_find_highest_objectid to btrfs_init_root_free_objectid This function is used to initialize the in-memory btrfs_root::highest_objectid member, which is used to get an available objectid. Rename it to better reflect its semantics. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++------- fs/btrfs/disk-io.h | 2 +- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6b35b7e88136..47646a79d3fc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1367,8 +1367,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) } mutex_lock(&root->objectid_mutex); - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); goto fail; @@ -2646,8 +2645,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user */ - ret = btrfs_find_highest_objectid(tree_root, - &tree_root->highest_objectid); + ret = btrfs_init_root_free_objectid(tree_root); if (ret < 0) { handle_error = true; continue; @@ -4745,7 +4743,7 @@ static int 
btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_init_root_free_objectid(struct btrfs_root *root) { struct btrfs_path *path; int ret; @@ -4769,10 +4767,10 @@ int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = max_t(u64, found_key.objectid, + root->highest_objectid = max_t(u64, found_key.objectid, BTRFS_FIRST_FREE_OBJECTID - 1); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index e45057c0c016..5e5bc603fbdf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,7 +134,7 @@ int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); -int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 254c2ee43aae..8ee0700a980f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -6307,8 +6307,7 @@ again: * root->objectid_mutex is not acquired as log replay * could only happen during mount. 
*/ - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); + ret = btrfs_init_root_free_objectid(root); } wc.replay_dest->log_root = NULL; From 543068a217a877bb6fa831fc448c9cc131db4feb Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:33 +0200 Subject: [PATCH 152/307] btrfs: rename btrfs_find_free_objectid to btrfs_get_free_objectid This better reflects the semantics of the function i.e no search is performed whatsoever. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/disk-io.h | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/inode.c | 12 ++++++------ fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/transaction.c | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 47646a79d3fc..98384d3e41ac 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4778,7 +4778,7 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e5bc603fbdf..9f4a2a1e3d36 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -133,7 +133,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); -int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 71d0d14bc18b..fd6ddd6b8165 100644 --- 
a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -198,7 +198,7 @@ int create_free_space_inode(struct btrfs_trans_handle *trans, int ret; u64 ino; - ret = btrfs_find_free_objectid(trans->fs_info->tree_root, &ino); + ret = btrfs_get_free_objectid(trans->fs_info->tree_root, &ino); if (ret < 0) return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8e0a6b038d3..0fe4df05006d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6371,7 +6371,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6435,7 +6435,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -6579,7 +6579,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_fail; @@ -9079,7 +9079,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, u64 objectid; u64 index; - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) return ret; @@ -9575,7 +9575,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out_unlock; @@ -9909,7 +9909,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_find_free_objectid(root, &objectid); + ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto 
out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dde49a791f3e..ec83803800c5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -613,7 +613,7 @@ static noinline int create_subvol(struct inode *dir, if (!root_item) return -ENOMEM; - ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); + ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index df63ef64c5c0..2698805ebd26 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3434,7 +3434,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, return ERR_CAST(trans); } - err = btrfs_find_free_objectid(root, &objectid); + err = btrfs_get_free_objectid(root, &objectid); if (err) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index fbf93067642a..3bcb5444536e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1526,7 +1526,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; - pending->error = btrfs_find_free_objectid(tree_root, &objectid); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto no_free_objectid; From 6b8fad576a3c8f822a888873c5acdfb31de53c4c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:35 +0200 Subject: [PATCH 153/307] btrfs: rename btrfs_root::highest_objectid to free_objectid This reflects the true purpose of the member as it's being used solely in context where a new objectid is being allocated. Future changes will also change the way it's being used to closely follow this semantics. 
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 14 +++++++------- fs/btrfs/ioctl.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4debdbdde2ab..dc77aac2476c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1104,7 +1104,7 @@ struct btrfs_root { u32 type; - u64 highest_objectid; + u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 98384d3e41ac..a5ae0bbd983e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1016,7 +1016,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->orphan_cleanup_state = 0; root->last_trans = 0; - root->highest_objectid = 0; + root->free_objectid = 0; root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; @@ -1373,7 +1373,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail; } - ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); mutex_unlock(&root->objectid_mutex); @@ -2651,7 +2651,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) continue; } - ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); ret = btrfs_read_roots(fs_info); if (ret < 0) { @@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - root->highest_objectid = max_t(u64, found_key.objectid, + root->free_objectid = max_t(u64, found_key.objectid, BTRFS_FIRST_FREE_OBJECTID - 1); } else { - root->highest_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: @@ 
-4783,7 +4783,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) { btrfs_warn(root->fs_info, "the objectid of root %llu reaches its highest value", root->root_key.objectid); @@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->highest_objectid; + *objectid = ++root->free_objectid; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ec83803800c5..2041c4b6fd0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -725,7 +725,7 @@ static noinline int create_subvol(struct inode *dir, } mutex_lock(&new_root->objectid_mutex); - new_root->highest_objectid = new_dirid; + new_root->free_objectid = new_dirid; mutex_unlock(&new_root->objectid_mutex); /* From 23125104d8485505cd19581025a3d6fc14e9945a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:36 +0200 Subject: [PATCH 154/307] btrfs: make btrfs_root::free_objectid hold the next available objectid Adjust the way free_objectid is being initialized, it now stores BTRFS_FIRST_FREE_OBJECTID rather than the, somewhat arbitrary, BTRFS_FIRST_FREE_OBJECTID - 1. This change also has the added benefit that now it becomes unnecessary to explicitly initialize free_objectid for a newly create fs root. 
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/inode.c | 8 ++++++-- fs/btrfs/ioctl.c | 4 ---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a5ae0bbd983e..5473bed6a7e8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4767,10 +4767,10 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - root->free_objectid = max_t(u64, found_key.objectid, - BTRFS_FIRST_FREE_OBJECTID - 1); + root->free_objectid = max_t(u64, found_key.objectid + 1, + BTRFS_FIRST_FREE_OBJECTID); } else { - root->free_objectid = BTRFS_FIRST_FREE_OBJECTID - 1; + root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } ret = 0; error: @@ -4791,7 +4791,7 @@ int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) goto out; } - *objectid = ++root->free_objectid; + *objectid = root->free_objectid++; ret = 0; out: mutex_unlock(&root->objectid_mutex); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fe4df05006d..356905d97656 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8598,9 +8598,13 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *inode; int err; u64 index = 0; + u64 ino; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, - new_dirid, new_dirid, + err = btrfs_get_free_objectid(new_root, &ino); + if (err < 0) + return err; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2041c4b6fd0e..d8422074da02 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -724,10 +724,6 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - mutex_lock(&new_root->objectid_mutex); - new_root->free_objectid = new_dirid; - 
mutex_unlock(&new_root->objectid_mutex); - /* * insert the directory item */ From 69948022c9261a87c3c256bfa21c132f5099c690 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 7 Dec 2020 17:32:37 +0200 Subject: [PATCH 155/307] btrfs: remove new_dirid argument from btrfs_create_subvol_root It's no longer used. While at it also remove new_dirid in create_subvol as it's used in a single place and open code it. No functional changes. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +-- fs/btrfs/inode.c | 3 +-- fs/btrfs/ioctl.c | 5 ++--- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dc77aac2476c..f5c636b29451 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3107,8 +3107,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid); + struct btrfs_root *parent_root); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 356905d97656..af5558f87243 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8592,8 +8592,7 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root, - u64 new_dirid) + struct btrfs_root *parent_root) { struct inode *inode; int err; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d8422074da02..5b9b0a390f0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -606,7 +606,6 @@ static noinline int create_subvol(struct inode *dir, int err; dev_t anon_dev = 0; u64 objectid; - u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; root_item = 
kzalloc(sizeof(*root_item), GFP_KERNEL); @@ -693,7 +692,7 @@ static noinline int create_subvol(struct inode *dir, free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(root_item, new_dirid); + btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; key.offset = 0; @@ -716,7 +715,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); + ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ From f75e2b79b5ba9dd3e0899840a329c3da02dc8937 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:43 -0500 Subject: [PATCH 156/307] btrfs: allow error injection for btrfs_search_slot and btrfs_cow_block The following patches are going to address error handling in relocation, in order to test those patches I need to be able to inject errors in btrfs_search_slot and btrfs_cow_block, as we call both of these pretty often in different cases during relocation. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cc89b63d65a4..56e132d825a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1494,6 +1494,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } +ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* * helper function for defrag to decide if two blocks pointed to by a @@ -2821,6 +2822,7 @@ done: btrfs_release_path(p); return ret; } +ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); /* * Like btrfs_search_slot, this looks for a key in the given tree. 
It uses the From 1fec12a560033ebe8fa6857dd3cbf9677371fbee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:45 -0500 Subject: [PATCH 157/307] btrfs: noinline btrfs_should_cancel_balance I was attempting to reproduce a problem that Zygo hit, but my error injection wasn't firing for a few of the common calls to btrfs_should_cancel_balance. This is because the compiler decided to inline it at these spots. Keep this from happening by explicitly marking the function as noinline so that error injection will always work. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2698805ebd26..8e51b39cbfbb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2615,7 +2615,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, /* * Allow error injection to test balance cancellation */ -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || fatal_signal_pending(current); From 0d73a11c62642a25b688d09ae04b3b1f1b58ebb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:46 -0500 Subject: [PATCH 158/307] btrfs: ref-verify: pass down tree block level when building refs I noticed that sometimes I would have the wrong level printed out with ref-verify while testing some error injection related problems. This is because we only get the level from the main extent item, but our references could go off the current leaf into another, and at that point we lose our level. 
Fix this by keeping track of the last tree block level that we found, the same way we keep track of our bytenr and num_bytes, in case we happen to wander into another leaf while still processing the references for a bytenr. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 4b9b6c52a83b..409b02566b25 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -495,14 +495,15 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, } static int process_leaf(struct btrfs_root *root, - struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes, + int *tree_block_level) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret = 0; + int i = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); @@ -515,15 +516,15 @@ static int process_leaf(struct btrfs_root *root, case BTRFS_METADATA_ITEM_KEY: *bytenr = key.objectid; ret = process_extent_item(fs_info, path, &key, i, - &tree_block_level); + tree_block_level); break; case BTRFS_TREE_BLOCK_REF_KEY: ret = add_tree_block(fs_info, key.offset, 0, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_SHARED_BLOCK_REF_KEY: ret = add_tree_block(fs_info, 0, key.offset, - key.objectid, tree_block_level); + key.objectid, *tree_block_level); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = btrfs_item_ptr(leaf, i, @@ -549,7 +550,8 @@ static int process_leaf(struct btrfs_root *root, /* Walk down to the leaf from the given level */ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, - int level, u64 *bytenr, u64 *num_bytes) + int level, u64 *bytenr, 
u64 *num_bytes, + int *tree_block_level) { struct extent_buffer *eb; int ret = 0; @@ -565,7 +567,8 @@ static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, path->slots[level-1] = 0; path->locks[level-1] = BTRFS_READ_LOCK; } else { - ret = process_leaf(root, path, bytenr, num_bytes); + ret = process_leaf(root, path, bytenr, num_bytes, + tree_block_level); if (ret) break; } @@ -974,6 +977,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) { struct btrfs_path *path; struct extent_buffer *eb; + int tree_block_level = 0; u64 bytenr = 0, num_bytes = 0; int ret, level; @@ -998,7 +1002,7 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) * different leaf from the original extent item. */ ret = walk_down_tree(fs_info->extent_root, path, level, - &bytenr, &num_bytes); + &bytenr, &num_bytes, &tree_block_level); if (ret) break; ret = walk_up_tree(path, &level); From 1478143ac81acc4094f8501a88e9e6ef9ff0e4a5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:18:47 -0500 Subject: [PATCH 159/307] btrfs: ref-verify: make sure owner is set for all refs I noticed that shared ref entries in ref-verify didn't have the proper owner set, which caused me to think there was something seriously wrong. However the problem is if we have a parent we simply weren't filling out the owner part of the reference, even though we have it. Fix this by making sure we set all the proper fields when we modify a reference, this way we'll have the proper owner if a problem happens and we don't waste time thinking we're updating the wrong level. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 409b02566b25..2b490becbe67 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -669,18 +669,18 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root; - u64 owner; - u64 offset; + u64 ref_root = 0; + u64 owner = 0; + u64 offset = 0; if (!btrfs_test_opt(fs_info, REF_VERIFY)) return 0; if (generic_ref->type == BTRFS_REF_METADATA) { - ref_root = generic_ref->tree_ref.root; + if (!parent) + ref_root = generic_ref->tree_ref.root; owner = generic_ref->tree_ref.level; - offset = 0; - } else { + } else if (!parent) { ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; @@ -696,13 +696,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, goto out; } - if (parent) { - ref->parent = parent; - } else { - ref->root_objectid = ref_root; - ref->owner = owner; - ref->offset = offset; - } + ref->parent = parent; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; memcpy(&ra->ref, ref, sizeof(struct ref_entry)); From 7056bf69e5a338811738a7932b8e707aaca9fdd0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 17 Dec 2020 15:21:16 +0200 Subject: [PATCH 160/307] btrfs: consolidate btrfs_previous_item ret val handling in btrfs_shrink_device Instead of having three 'if' statements to handle a non-zero return value, consolidate them into one 'if (ret)'. That way the code is more obvious: - Always drop delete_unused_bgs_mutex if ret is non-zero - If ret is negative -> goto done - If it's 1 -> reset ret to 0, release the path and finish the loop.
Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6c24c8ad749..a8ec8539cd8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4668,11 +4668,10 @@ again: } ret = btrfs_previous_item(root, path, 0, key.type); - if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); - if (ret < 0) - goto done; if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + if (ret < 0) + goto done; ret = 0; btrfs_release_path(path); break; From 9c4a062a94752dabd3954ef39c4dfed581c664b9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 11 Jan 2021 11:42:32 +0000 Subject: [PATCH 161/307] btrfs: send: remove stale code when checking for shared extents After commit 040ee6120cb670 ("Btrfs: send, improve clone range") we do not use anymore the data_offset field of struct backref_ctx, as after that we do all the necessary checks for the data offset of file extent items at clone_range(). Since there are no more users of data_offset from that structure, remove it. 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 78a35374d492..3bcbf2bcb869 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1191,9 +1191,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - /* data offset in the file extent item */ - u64 data_offset; - /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1401,19 +1398,6 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; - /* - * For non-compressed extents iterate_extent_inodes() gives us extent - * offsets that already take into account the data offset, but not for - * compressed extents, since the offset is logical and not relative to - * the physical extent locations. We must take this into account to - * avoid sending clone offsets that go beyond the source file's size, - * which would result in the clone ioctl failing with -EINVAL on the - * receiving end. - */ - if (compressed == BTRFS_COMPRESS_NONE) - backref_ctx->data_offset = 0; - else - backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. From 9db4dc241e87fccd8301357d5ef908f40b50f2e3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 11 Jan 2021 12:58:11 +0200 Subject: [PATCH 162/307] btrfs: make btrfs_start_delalloc_root's nr argument a long It's currently u64 which gets instantly translated either to LONG_MAX (if U64_MAX is passed) or cast to an unsigned long (which is, in fact, wrong because writeback_control::nr_to_write is a signed long type). Just convert the function's argument to be of type long, which obviates the need to manually convert a u64 value to a long. Adjust all call sites which pass U64_MAX to pass LONG_MAX.
Finally ensure that in shrink_delalloc the u64 is converted to a long without overflowing, resulting in a negative number. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/space-info.c | 3 ++- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f5c636b29451..ed6bb46a2572 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3100,7 +3100,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 324f646d6e5e..bc73f798ce3a 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -715,7 +715,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index af5558f87243..17418a75e3c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9489,11 +9489,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) return start_delalloc_inodes(root, &wbc, true, false); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool 
in_reclaim_context) { struct writeback_control wbc = { - .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr, + .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, @@ -9515,7 +9515,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr, * Reset nr_to_write here so we know that we're doing a full * flush. */ - if (nr == U64_MAX) + if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5b9b0a390f0e..7f2935ea8d3a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4946,7 +4946,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false); + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8347461c8dd..84fb94e78a8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -532,7 +532,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + long nr_pages = min_t(u64, temp, LONG_MAX); btrfs_start_delalloc_roots(fs_info, nr_pages, true); From d7830b7155ab43952ec8f2b95f326f63936ecd03 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 11 Jan 2021 12:58:12 +0200 Subject: [PATCH 163/307] btrfs: remove always true condition in btrfs_start_delalloc_roots Following the rework in e076ab2a2ca7 ("btrfs: shrink delalloc pages instead of full inodes") the nr variable is no longer passed by reference to start_delalloc_inodes hence it cannot change. Additionally we are always guaranteed for it to be positive number hence it's redundant to have it as a condition in the loop. 
Simply remove that usage. Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 17418a75e3c8..4056fc3b39cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9510,7 +9510,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice) && nr) { + while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. From 523929f1cac3e869492ea376c9d86af11ec0e5c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Dec 2020 13:59:23 +0800 Subject: [PATCH 164/307] btrfs: make btrfs_dio_private::bytes u32 btrfs_dio_private::bytes is only assigned from bio::bi_iter::bi_size, which is never larger than U32_MAX. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d9bf53d9ff90..28e202e89660 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -325,7 +325,8 @@ struct btrfs_dio_private { struct inode *inode; u64 logical_offset; u64 disk_bytenr; - u64 bytes; + /* Used for bio::bi_size */ + u32 bytes; /* * References to this structure.
There is one reference per in-flight From 58f74b2203d786da37128cbf786873996145bfdc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Dec 2020 13:59:24 +0800 Subject: [PATCH 165/307] btrfs: refactor btrfs_dec_test_* functions for ordered extents The refactoring involves the following modifications: - Return bool instead of int - Parameter update for @cached of btrfs_dec_test_first_ordered_pending() For btrfs_dec_test_first_ordered_pending(), @cached is only used to return the finished ordered extent. Rename it to @finished_ret. - Comment updates * Change one stale comment Which still refers to btrfs_dec_test_ordered_pending(), but the context is calling btrfs_dec_test_first_ordered_pending(). * Follow the common comment style for both functions Add more detailed descriptions for parameters and the return value * Move the reason why test_and_set_bit() is used into the call sites - Change how the return value is calculated The most anti-human part of the return value is: if (...) ret = 1; ... return ret == 0; This means, when we set ret to 1, the function returns 0. Change the local variable name to @finished, and directly return the value of it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +-- fs/btrfs/ordered-data.c | 104 ++++++++++++++++++++++------------------ fs/btrfs/ordered-data.h | 10 ++-- 3 files changed, 65 insertions(+), 55 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4056fc3b39cf..ef6cb7b620d0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7797,10 +7797,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, NULL); btrfs_queue_work(wq, &ordered->work); } - /* - * If btrfs_dec_test_ordered_pending does not find any ordered - * extent in the range, we can exit. 
- */ + + /* No ordered extent found in the range, exit */ if (ordered_offset == last_offset) return; /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 79d366a36223..d5d326c674b1 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -297,26 +297,33 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * this is used to account for finished IO across a given range - * of the file. The IO may span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. + * Finish IO for one ordered extent across a given range. The range can + * contain several ordered extents. * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * @found_ret: Return the finished ordered extent + * @file_offset: File offset for the finished IO + * Will also be updated to one byte past the range that is + * recordered as finished. This allows caller to walk forward. + * @io_size: Length of the finish IO range + * @uptodate: If the IO finished without problem * - * file_offset is updated to one byte past the range that is recorded as - * complete. This allows you to walk forward in the file. + * Return true if any ordered extent is finished in the range, and update + * @found_ret and @file_offset. + * Return false otherwise. + * + * NOTE: Although The range can cross multiple ordered extents, only one + * ordered extent will be updated during one call. The caller is responsible to + * iterate all ordered extents in the range. 
*/ -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - int ret; + bool finished = false; unsigned long flags; u64 dec_end; u64 dec_start; @@ -324,16 +331,12 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, *file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) { - ret = 1; + if (!offset_in_entry(entry, *file_offset)) goto out; - } dec_start = max(*file_offset, entry->file_offset); dec_end = min(*file_offset + io_size, @@ -354,39 +357,50 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { - *cached = entry; + if (finished && finished_ret && entry) { + *finished_ret = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* - * this is used to account for finished IO across a given range - * of the file. The IO should not span ordered extents. If - * a given ordered_extent is completely done, 1 is returned, otherwise - * 0. 
+ * Finish IO for one ordered extent across a given range. The range can only + * contain one ordered extent. * - * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used - * to make sure this function only returns 1 once for a given ordered extent. + * @cached: The cached ordered extent. If not NULL, we can skip the tree + * search and use the ordered extent directly. + * Will be also used to store the finished ordered extent. + * @file_offset: File offset for the finished IO + * @io_size: Length of the finish IO range + * @uptodate: If the IO finishes without problem + * + * Return true if the ordered extent is finished in the range, and update + * @cached. + * Return false otherwise. + * + * NOTE: The range can NOT cross multiple ordered extents. + * Thus caller should ensure the range doesn't cross ordered extents. */ -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - int ret; + bool finished = false; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { @@ -395,41 +409,39 @@ int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, } node = tree_search(tree, file_offset); - if (!node) { - ret = 1; + if (!node) goto out; - } entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) { - ret = 1; + if (!offset_in_entry(entry, file_offset)) goto out; - } - if (io_size > entry->bytes_left) { + if (io_size > entry->bytes_left) btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); - } + entry->bytes_left -= io_size; if (!uptodate) 
set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { - ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Ensure only one caller can set the flag and finished_ret + * accordingly + */ + finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); /* test_and_set_bit implies a barrier */ cond_wake_up_nomb(&entry->wait); - } else { - ret = 1; } out: - if (!ret && cached && entry) { + if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); - return ret == 0; + return finished; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 0bfa82b58e23..46194c2c05d4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -152,11 +152,11 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); -int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **cached, +bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size, int uptodate); +bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **finished_ret, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, From 0c64c33c603f692ceb91d9fe17cc10028cff7da8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:40 +0800 Subject: [PATCH 166/307] btrfs: rename parameter offset to disk_bytenr in submit_extent_page The parameter offset is confusing, it's supposed to be the disk bytenr of metadata/data. 
Rename it to disk_bytenr and update the comment. Also rename each offset passed to submit_extent_page() as @disk_bytenr so they're consistent. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9cee458e001..60990616b895 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3062,10 +3062,10 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting * @page: page to add to the bio + * @disk_bytenr: logical bytenr where the write will be + * @size: portion of page that we want to write to * @pg_offset: offset of the new bio or to check whether we are adding * a contiguous page to the previous one - * @size: portion of page that we want to write - * @offset: starting offset in the page * @bio_ret: must be valid pointer, newly allocated bio will be stored there * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write @@ -3074,7 +3074,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, - struct page *page, u64 offset, + struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -3086,7 +3086,7 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = offset >> 9; + sector_t sector = disk_bytenr >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); @@ -3120,7 +3120,7 @@ static int submit_extent_page(unsigned int opf, } } - bio = btrfs_bio_alloc(offset); + bio = 
btrfs_bio_alloc(disk_bytenr); bio_add_page(bio, page, io_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -3242,7 +3242,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } while (cur <= end) { bool force_bio_submit = false; - u64 offset; + u64 disk_bytenr; if (cur >= last_byte) { char *userpage; @@ -3280,9 +3280,9 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) - offset = em->block_start; + disk_bytenr = em->block_start; else - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3371,7 +3371,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, iosize, + page, disk_bytenr, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, @@ -3548,8 +3548,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, blocksize = inode->vfs_inode.i_sb->s_blocksize; while (cur <= end) { + u64 disk_bytenr; u64 em_end; - u64 offset; if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(page, cur, @@ -3569,7 +3569,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - offset = em->block_start + extent_offset; + disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); @@ -3599,7 +3599,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, offset, iosize, pg_offset, + page, disk_bytenr, 
iosize, pg_offset, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3923,7 +3923,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) { - u64 offset = eb->start; + u64 disk_bytenr = eb->start; u32 nritems; int i, num_pages; unsigned long start, end; @@ -3956,7 +3956,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, offset, PAGE_SIZE, 0, + p, disk_bytenr, PAGE_SIZE, 0, &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, false); @@ -3969,7 +3969,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, ret = -EIO; break; } - offset += PAGE_SIZE; + disk_bytenr += PAGE_SIZE; update_nr_written(wbc, 1); unlock_page(p); } From 6bc5636a67bf489d95ebc06c0449396fd487d309 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:41 +0800 Subject: [PATCH 167/307] btrfs: refactor __extent_writepage_io() to improve readability The refactoring involves the following modifications: - iosize alignment In fact we don't really need to manually do alignment at all. All extent maps should already be aligned, thus basic ASSERT() check would be enough. - redundant variables We have extra variable like blocksize/pg_offset/end. They are all unnecessary. @blocksize can be replaced by sectorsize size directly, and it's only used to verify the em start/size is aligned. @pg_offset can be easily calculated using @cur and page_offset(page). @end is just assigned from @page_end and never modified, use "start + PAGE_SIZE - 1" directly and remove @page_end. - remove some BUG_ON()s The BUG_ON()s are for extent map, which we have tree-checker to check on-disk extent data item and runtime check. ASSERT() should be enough. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60990616b895..74c0a32d04fc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3513,23 +3513,20 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, unsigned long nr_written, int *nr_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; - u64 end; + u64 end = start + PAGE_SIZE - 1; u64 cur = start; u64 extent_offset; u64 block_start; - u64 iosize; struct extent_map *em; - size_t pg_offset = 0; - size_t blocksize; int ret = 0; int nr = 0; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, page_end); + ret = btrfs_writepage_cow_fixup(page, start, end); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3544,16 +3541,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ update_nr_written(wbc, nr_written + 1); - end = page_end; - blocksize = inode->vfs_inode.i_sb->s_blocksize; - while (cur <= end) { u64 disk_bytenr; u64 em_end; + u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, - page_end, 1); + btrfs_writepage_endio_finish_ordered(page, cur, end, 1); break; } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); @@ -3565,13 +3559,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, extent_offset = cur - em->start; em_end = extent_map_end(em); - BUG_ON(em_end <= cur); - BUG_ON(end < cur); - iosize = min(em_end - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); - disk_bytenr = em->block_start + extent_offset; 
+ ASSERT(cur <= em_end); + ASSERT(cur < end); + ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize)); block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + disk_bytenr = em->block_start + extent_offset; + + /* Note that em_end from extent_map_end() is exclusive */ + iosize = min(em_end, end + 1) - cur; free_extent_map(em); em = NULL; @@ -3587,7 +3584,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_writepage_endio_finish_ordered(page, cur, cur + iosize - 1, 1); cur += iosize; - pg_offset += iosize; continue; } @@ -3599,8 +3595,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, disk_bytenr, iosize, pg_offset, - &epd->bio, + page, disk_bytenr, iosize, + cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); if (ret) { @@ -3609,8 +3605,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_page_writeback(page); } - cur = cur + iosize; - pg_offset += iosize; + cur += iosize; nr++; } *nr_ret = nr; From c0fab480955c4a943cc77be58269d97128ac3ef9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:42 +0800 Subject: [PATCH 168/307] btrfs: update comment for btrfs_dirty_pages The original comment is from the initial merge, which has several problems: - No holes check any more - No inline decision is made Update the out-of-date comment with more correct one. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e65223e3510d..d81ae1f518f2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -453,12 +453,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } /* - * after copy_from_user, pages need to be dirtied and we need to make - * sure holes are created between the current EOF and the start of - * any next extents (if required). - * - * this also makes the decision about creating an inline extent vs - * doing real data extents, marking pages dirty and delalloc as required. + * After btrfs_copy_from_user(), update the following things for delalloc: + * - Mark newly dirtied pages as DELALLOC in the io tree. + * Used to advise which range is to be written back. + * - Mark modified pages as Uptodate/Dirty and not needing COW fixup + * - Update inode size for past EOF write */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, From c0f0a9e71653b33c003433f2248cec88f6942f35 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 6 Jan 2021 09:01:45 +0800 Subject: [PATCH 169/307] btrfs: introduce helper to grab an existing extent buffer from a page This patch will extract the code to grab an extent buffer from a page into a helper, grab_extent_buffer(). This reduces one indent level, and provides the work place for later expansion for subpage support. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 50 ++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 74c0a32d04fc..7f689ad7709c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5247,6 +5247,29 @@ free_eb: } #endif +static struct extent_buffer *grab_extent_buffer(struct page *page) +{ + struct extent_buffer *exists; + + /* Page not yet attached to an extent buffer */ + if (!PagePrivate(page)) + return NULL; + + /* + * We could have already allocated an eb for this page and attached one + * so lets see if we can get a ref on the existing eb, and if we can we + * know it's good and we can just return that one, else we know we can + * just overwrite page->private. + */ + exists = (struct extent_buffer *)page->private; + if (atomic_inc_not_zero(&exists->refs)) + return exists; + + WARN_ON(PageDirty(page)); + detach_page_private(page); + return NULL; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -5292,26 +5315,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } spin_lock(&mapping->private_lock); - if (PagePrivate(p)) { - /* - * We could have already allocated an eb for this page - * and attached one so lets see if we can get a ref on - * the existing eb, and if we can we know it's good and - * we can just return that one, else we know we can just - * overwrite page->private. 
- */ - exists = (struct extent_buffer *)p->private; - if (atomic_inc_not_zero(&exists->refs)) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - goto free_eb; - } - exists = NULL; - - WARN_ON(PageDirty(p)); - detach_page_private(p); + exists = grab_extent_buffer(p); + if (exists) { + spin_unlock(&mapping->private_lock); + unlock_page(p); + put_page(p); + mark_extent_buffer_accessed(exists, p); + goto free_eb; } attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); From f7ba2d37519dd6e15af9f00e9b4bbc7d1aba267a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:22:15 -0500 Subject: [PATCH 170/307] btrfs: keep track of the root owner for relocation reads While testing the error paths in relocation, I hit the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.10.0-rc3+ #206 Not tainted ------------------------------------------------------ btrfs-balance/1571 is trying to acquire lock: ffff8cdbcc8f77d0 (&head_ref->mutex){+.+.}-{3:3}, at: btrfs_lookup_extent_info+0x156/0x3b0 but task is already holding lock: ffff8cdbc54adbf8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (btrfs-tree-00){++++}-{3:3}: down_write_nested+0x43/0x80 __btrfs_tree_lock+0x27/0x100 btrfs_search_slot+0x248/0x890 relocate_tree_blocks+0x490/0x650 relocate_block_group+0x1ba/0x5d0 kretprobe_trampoline+0x0/0x50 -> #1 (btrfs-csum-01){++++}-{3:3}: down_read_nested+0x43/0x130 __btrfs_tree_read_lock+0x27/0x100 btrfs_read_lock_root_node+0x31/0x40 btrfs_search_slot+0x5ab/0x890 btrfs_del_csums+0x10b/0x3c0 __btrfs_free_extent+0x49d/0x8e0 __btrfs_run_delayed_refs+0x283/0x11f0 btrfs_run_delayed_refs+0x86/0x220 btrfs_start_dirty_block_groups+0x2ba/0x520 kretprobe_trampoline+0x0/0x50 -> #0 (&head_ref->mutex){+.+.}-{3:3}: __lock_acquire+0x1167/0x2150 lock_acquire+0x116/0x3e0 __mutex_lock+0x7e/0x7b0 btrfs_lookup_extent_info+0x156/0x3b0 walk_down_proc+0x1c3/0x280 walk_down_tree+0x64/0xe0 btrfs_drop_subtree+0x182/0x260 do_relocation+0x52e/0x660 relocate_tree_blocks+0x2ae/0x650 relocate_block_group+0x1ba/0x5d0 kretprobe_trampoline+0x0/0x50 other info that might help us debug this: Chain exists of: &head_ref->mutex --> btrfs-csum-01 --> btrfs-tree-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-tree-00); lock(btrfs-csum-01); lock(btrfs-tree-00); lock(&head_ref->mutex); *** DEADLOCK *** 5 locks held by btrfs-balance/1571: #0: ffff8cdb89749ff8 (&fs_info->delete_unused_bgs_mutex){+.+.}-{3:3}, at: btrfs_balance+0x563/0xf40 #1: ffff8cdb89748838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x156/0x300 #2: ffff8cdbc2c16650 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x413/0x5c0 #3: ffff8cdbc135f538 (btrfs-treloc-01){+.+.}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 #4: ffff8cdbc54adbf8 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_lock+0x27/0x100 stack backtrace: CPU: 1 PID: 1571 Comm: btrfs-balance Not tainted 5.10.0-rc3+ #206 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb0 check_noncircular+0xcf/0xf0 ? 
trace_call_bpf+0x139/0x260 __lock_acquire+0x1167/0x2150 lock_acquire+0x116/0x3e0 ? btrfs_lookup_extent_info+0x156/0x3b0 __mutex_lock+0x7e/0x7b0 ? btrfs_lookup_extent_info+0x156/0x3b0 ? btrfs_lookup_extent_info+0x156/0x3b0 ? release_extent_buffer+0x124/0x170 ? _raw_spin_unlock+0x1f/0x30 ? release_extent_buffer+0x124/0x170 btrfs_lookup_extent_info+0x156/0x3b0 walk_down_proc+0x1c3/0x280 walk_down_tree+0x64/0xe0 btrfs_drop_subtree+0x182/0x260 do_relocation+0x52e/0x660 relocate_tree_blocks+0x2ae/0x650 ? add_tree_block+0x149/0x1b0 relocate_block_group+0x1ba/0x5d0 elfcorehdr_read+0x40/0x40 ? elfcorehdr_read+0x40/0x40 ? btrfs_balance+0x796/0xf40 ? __kthread_parkme+0x66/0x90 ? btrfs_balance+0xf40/0xf40 ? balance_kthread+0x37/0x50 ? kthread+0x137/0x150 ? __kthread_bind_mask+0x60/0x60 ? ret_from_fork+0x1f/0x30 As you can see this is bogus, we never take another tree's lock under the csum lock. This happens because sometimes we have to read tree blocks from disk without knowing which root they belong to during relocation. We defaulted to an owner of 0, which translates to an fs tree. This is fine as all fs trees have the same class, but obviously isn't fine if the block belongs to a COW only tree. Thankfully COW only trees only have their owners root as a reference to them, and since we already look up the extent information during relocation, go ahead and check and see if this block might belong to a COW only tree, and if so save the owner in the tree_block struct. This allows us to read_tree_block with the proper owner, which gets rid of this lockdep splat. 
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 46 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8e51b39cbfbb..9f2289bcdde6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -97,6 +97,7 @@ struct tree_block { struct rb_node rb_node; u64 bytenr; }; /* Use rb_simple_node for search/insert */ + u64 owner; struct btrfs_key key; unsigned int level:8; unsigned int key_ready:1; @@ -2393,8 +2394,8 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, 0, block->key.offset, - block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, block->owner, + block->key.offset, block->level, NULL); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2493,7 +2494,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Kick in readahead for tree blocks with missing keys */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { if (!block->key_ready) - btrfs_readahead_tree_block(fs_info, block->bytenr, 0, 0, + btrfs_readahead_tree_block(fs_info, block->bytenr, + block->owner, 0, block->level); } @@ -2801,21 +2803,58 @@ static int add_tree_block(struct reloc_control *rc, u32 item_size; int level = -1; u64 generation; + u64 owner = 0; eb = path->nodes[0]; item_size = btrfs_item_size_nr(eb, path->slots[0]); if (extent_key->type == BTRFS_METADATA_ITEM_KEY || item_size >= sizeof(*ei) + sizeof(*bi)) { + unsigned long ptr = 0, end; + ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); + end = (unsigned long)ei + item_size; if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) { bi = (struct btrfs_tree_block_info *)(ei + 1); level = btrfs_tree_block_level(eb, bi); + ptr = (unsigned long)(bi + 1); } else { level = (int)extent_key->offset; + ptr = 
(unsigned long)(ei + 1); } generation = btrfs_extent_generation(eb, ei); + + /* + * We're reading random blocks without knowing their owner ahead + * of time. This is ok most of the time, as all reloc roots and + * fs roots have the same lock type. However normal trees do + * not, and the only way to know ahead of time is to read the + * inline ref offset. We know it's an fs root if + * + * 1. There's more than one ref. + * 2. There's a SHARED_DATA_REF_KEY set. + * 3. FULL_BACKREF is set on the flags. + * + * Otherwise it's safe to assume that the ref offset == the + * owner of this block, so we can use that when calling + * read_tree_block. + */ + if (btrfs_extent_refs(eb, ei) == 1 && + !(btrfs_extent_flags(eb, ei) & + BTRFS_BLOCK_FLAG_FULL_BACKREF) && + ptr < end) { + struct btrfs_extent_inline_ref *iref; + int type; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + if (type == BTRFS_TREE_BLOCK_REF_KEY) + owner = btrfs_extent_inline_ref_offset(eb, iref); + } } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { btrfs_print_v0_err(eb->fs_info); btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); @@ -2837,6 +2876,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.offset = generation; block->level = level; block->key_ready = 0; + block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); if (rb_node) From 7e2a870a599d4699a626ec26430c7a1ab14a2a49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Dec 2020 11:22:16 -0500 Subject: [PATCH 171/307] btrfs: do not cleanup upper nodes in btrfs_backref_cleanup_node Zygo reported the following panic when testing my error handling patches for relocation: kernel BUG at fs/btrfs/backref.c:2545! 
invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 3 PID: 8472 Comm: btrfs Tainted: G W 14 Hardware name: QEMU Standard PC (i440FX + PIIX, Call Trace: btrfs_backref_error_cleanup+0x4df/0x530 build_backref_tree+0x1a5/0x700 ? _raw_spin_unlock+0x22/0x30 ? release_extent_buffer+0x225/0x280 ? free_extent_buffer.part.52+0xd7/0x140 relocate_tree_blocks+0x2a6/0xb60 ? kasan_unpoison_shadow+0x35/0x50 ? do_relocation+0xc10/0xc10 ? kasan_kmalloc+0x9/0x10 ? kmem_cache_alloc_trace+0x6a3/0xcb0 ? free_extent_buffer.part.52+0xd7/0x140 ? rb_insert_color+0x342/0x360 ? add_tree_block.isra.36+0x236/0x2b0 relocate_block_group+0x2eb/0x780 ? merge_reloc_roots+0x470/0x470 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x18f0 ? pvclock_clocksource_read+0xeb/0x190 ? btrfs_relocate_chunk+0x120/0x120 ? lock_contended+0x620/0x6e0 ? do_raw_spin_lock+0x1e0/0x1e0 ? do_raw_spin_unlock+0xa8/0x140 btrfs_ioctl_balance+0x1f9/0x460 btrfs_ioctl+0x24c8/0x4380 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? __fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This occurs because of this check if (RB_EMPTY_NODE(&upper->rb_node)) BUG_ON(!list_empty(&node->upper)); As we are dropping the backref node, if we discover that our upper node in the edge we just cleaned up isn't linked into the cache that we are now done with this node, thus the BUG_ON(). 
However this is an erroneous assumption, as we will look up all the references for a node first, and then process the pending edges. All of the 'upper' nodes in our pending edges won't be in the cache's rb_tree yet, because they haven't been processed. We could very well have many edges still left to cleanup on this node. The fact is we simply do not need this check, we can just process all of the edges only for this node, because below this check we do the following if (list_empty(&upper->lower)) { list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } If the upper node truly isn't used yet, then we add it to the cache->leaves list to be cleaned up later. If it is still used then the last child node that has it linked into its node will add it to the leaves list and then it will be cleaned up. Fix this problem by dropping this logic altogether. With this fix I no longer see the panic when testing with error injection in the backref code. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9cadacf3ec27..ef71aba5bc15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2541,13 +2541,6 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); - if (RB_EMPTY_NODE(&upper->rb_node)) { - BUG_ON(!list_empty(&node->upper)); - btrfs_backref_drop_node(cache, node); - node = upper; - node->lowest = 1; - continue; - } /* * Add the node to leaf node list if no other child block * cached. 
From fe3b7bb085a0b1fb26d622a5eccc7dbb5c4f82fb Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 21 Jan 2021 16:19:47 +0800 Subject: [PATCH 172/307] btrfs: remove redundant NULL check before kvfree Fix below warnings reported by coccicheck: ./fs/btrfs/raid56.c:237:2-8: WARNING: NULL check before some freeing functions is not needed. Reported-by: Abaci Robot Reviewed-by: Anand Jain Signed-off-by: Yang Li Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..5394641541f7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -233,8 +233,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) - kvfree(x); + kvfree(x); return 0; } From 3c198fe064491dcceaed9e15c6c997e92e71293e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 21 Jan 2021 14:13:54 +0800 Subject: [PATCH 173/307] btrfs: rework the order of btrfs_ordered_extent::flags [BUG] There is a long existing bug in the last parameter of btrfs_add_ordered_extent(), in commit 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads") back to 2008. In that ancient commit btrfs_add_ordered_extent() expects the @type parameter to be one of the following: - BTRFS_ORDERED_REGULAR - BTRFS_ORDERED_NOCOW - BTRFS_ORDERED_PREALLOC - BTRFS_ORDERED_COMPRESSED But we pass 0 in cow_file_range(), which means BTRFS_ORDERED_IO_DONE. Ironically extra check in __btrfs_add_ordered_extent() won't set the bit if we see (type == IO_DONE || type == IO_COMPLETE), and avoid any obvious bug. But this still leads to regular COW ordered extent having no bit to indicate its type in various trace events, rendering REGULAR bit useless. 
[FIX] Change the following aspects to avoid such a problem: - Reorder btrfs_ordered_extent::flags Now the type bits go first (REGULAR/NOCOW/PREALLOC/COMPRESSED), then DIRECT bit, finally extra status bits like IO_DONE/COMPLETE/IOERR. - Add extra ASSERT() for btrfs_add_ordered_extent_*() - Remove @type parameter for btrfs_add_ordered_extent_compress() As the only valid @type here is BTRFS_ORDERED_COMPRESSED. - Remove the unnecessary special check for IO_DONE/COMPLETE in __btrfs_add_ordered_extent() This is just to make the code work, with extra ASSERT(), there are limited values that can be passed in. Reviewed-by: Nikolay Borisov Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ordered-data.c | 21 +++++++++++++++----- fs/btrfs/ordered-data.h | 37 +++++++++++++++++++++++------------- include/trace/events/btrfs.h | 9 +++++---- 4 files changed, 47 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ef6cb7b620d0..ea9056cc5559 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -917,7 +917,6 @@ retry: ins.objectid, async_extent->ram_size, ins.offset, - BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (ret) { btrfs_drop_extent_cache(inode, async_extent->start, @@ -1127,7 +1126,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, free_extent_map(em); ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ram_size, cur_alloc_size, 0); + ram_size, cur_alloc_size, + BTRFS_ORDERED_REGULAR); if (ret) goto out_drop_extent_cache; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d5d326c674b1..b4e6500548a2 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,8 +199,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; - if (type != BTRFS_ORDERED_IO_DONE && type
!= BTRFS_ORDERED_COMPLETE) - set_bit(type, &entry->flags); + + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC || + type == BTRFS_ORDERED_COMPRESSED); + set_bit(type, &entry->flags); if (dio) { percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, @@ -256,6 +260,9 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); @@ -265,6 +272,9 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type) { + ASSERT(type == BTRFS_ORDERED_REGULAR || + type == BTRFS_ORDERED_NOCOW || + type == BTRFS_ORDERED_PREALLOC); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); @@ -272,11 +282,12 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type) + u64 disk_num_bytes, int compress_type) { + ASSERT(compress_type != BTRFS_COMPRESS_NONE); return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, - num_bytes, disk_num_bytes, type, 0, + num_bytes, disk_num_bytes, + BTRFS_ORDERED_COMPRESSED, 0, compress_type); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 46194c2c05d4..cca3307807e8 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -27,7 +27,7 @@ struct btrfs_ordered_sum { }; /* - * bits for the flags field: + * Bits for btrfs_ordered_extent::flags. * * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. 
* It is used to make sure metadata is inserted into the tree only once @@ -38,24 +38,36 @@ struct btrfs_ordered_sum { * IO is done and any metadata is inserted into the tree. */ enum { + /* + * Different types for direct io, one and only one of the 4 type can + * be set when creating ordered extent. + * + * REGULAR: For regular non-compressed COW write + * NOCOW: For NOCOW write into existing non-hole extent + * PREALLOC: For NOCOW write into preallocated extent + * COMPRESSED: For compressed COW write + */ + BTRFS_ORDERED_REGULAR, + BTRFS_ORDERED_NOCOW, + BTRFS_ORDERED_PREALLOC, + BTRFS_ORDERED_COMPRESSED, + + /* + * Extra bit for direct io, can only be set for + * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. + */ + BTRFS_ORDERED_DIRECT, + + /* Extra status bits for ordered extents */ + /* set when all the pages are written */ BTRFS_ORDERED_IO_DONE, /* set when removed from the tree */ BTRFS_ORDERED_COMPLETE, - /* set when we want to write in place */ - BTRFS_ORDERED_NOCOW, - /* writing a zlib compressed extent */ - BTRFS_ORDERED_COMPRESSED, - /* set when writing to preallocated extent */ - BTRFS_ORDERED_PREALLOC, - /* set when we're doing DIO with this extent */ - BTRFS_ORDERED_DIRECT, /* We had an io error when writing this out */ BTRFS_ORDERED_IOERR, /* Set when we have to truncate an extent */ BTRFS_ORDERED_TRUNCATED, - /* Regular IO for COW */ - BTRFS_ORDERED_REGULAR, /* Used during fsync to track already logged extents */ BTRFS_ORDERED_LOGGED, /* We have already logged all the csums of the ordered extent */ @@ -167,8 +179,7 @@ int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset, u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, - u64 disk_num_bytes, int type, - int compress_type); + u64 disk_num_bytes, int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct 
btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index ecd24c719de4..b9896fc06160 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -499,12 +499,13 @@ DEFINE_EVENT( #define show_ordered_flags(flags) \ __print_flags(flags, "|", \ + { (1 << BTRFS_ORDERED_REGULAR), "REGULAR" }, \ + { (1 << BTRFS_ORDERED_NOCOW), "NOCOW" }, \ + { (1 << BTRFS_ORDERED_PREALLOC), "PREALLOC" }, \ + { (1 << BTRFS_ORDERED_COMPRESSED), "COMPRESSED" }, \ + { (1 << BTRFS_ORDERED_DIRECT), "DIRECT" }, \ { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ { (1 << BTRFS_ORDERED_COMPLETE), "COMPLETE" }, \ - { (1 << BTRFS_ORDERED_NOCOW), "NOCOW" }, \ - { (1 << BTRFS_ORDERED_COMPRESSED), "COMPRESSED" }, \ - { (1 << BTRFS_ORDERED_PREALLOC), "PREALLOC" }, \ - { (1 << BTRFS_ORDERED_DIRECT), "DIRECT" }, \ { (1 << BTRFS_ORDERED_IOERR), "IOERR" }, \ { (1 << BTRFS_ORDERED_TRUNCATED), "TRUNCATED" }) From 401bd2dd1299dd384849707c6577b2089ab9f615 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:52 +0200 Subject: [PATCH 174/307] btrfs: document modified parameter of add_extent_mapping Fixes fs/btrfs/extent_map.c:399: warning: Function parameter or member 'modified' not described in 'add_extent_mapping' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index bd6229fb2b6f..8bda6c89e23e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -385,9 +385,12 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } /** - * add_extent_mapping - add new extent map to the extent tree + * Add new extent map to the extent tree + * * @tree: tree to insert new map in * @em: map to insert + * @modified: indicate whether the given @em 
should be added to the + * modified list, which indicates the extent needs to be logged * * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted From 9ad37bb3ffc51fbd9c48ba4d85414b4aa3e21c6d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:53 +0200 Subject: [PATCH 175/307] btrfs: fix parameter description of btrfs_add_extent_mapping This fixes the following compiler warnings: fs/btrfs/extent_map.c:601: warning: Function parameter or member 'fs_info' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_tree' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'em_in' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'start' not described in 'btrfs_add_extent_mapping' fs/btrfs/extent_map.c:601: warning: Function parameter or member 'len' not described in 'btrfs_add_extent_mapping' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 8bda6c89e23e..4a8e02f7b6c7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -577,12 +577,13 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, } /** - * btrfs_add_extent_mapping - add extent mapping into em_tree - * @fs_info - used for tracepoint - * @em_tree - the extent tree into which we want to insert the extent mapping - * @em_in - extent we are inserting - * @start - start of the logical range btrfs_get_extent() is requesting - * @len - length of the logical range btrfs_get_extent() is requesting + * Add extent mapping into em_tree + * + * @fs_info: the filesystem + * @em_tree: extent 
tree into which we want to insert the extent mapping + * @em_in: extent we are inserting + * @start: start of the logical range btrfs_get_extent() is requesting + * @len: length of the logical range btrfs_get_extent() is requesting * * Note that @em_in's range may be different from [start, start+len), * but they must be overlapped. From ca4207ae1385190f7d62926f107ede1edced4c1f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:54 +0200 Subject: [PATCH 176/307] btrfs: fix function description formats in file-item.c This fixes following W=1 warnings: fs/btrfs/file-item.c:27: warning: Cannot understand * @inode: the inode we want to update the disk_i_size for on line 27 - I thought it was a doc line fs/btrfs/file-item.c:65: warning: Cannot understand * @inode - the inode we're modifying on line 65 - I thought it was a doc line fs/btrfs/file-item.c:91: warning: Cannot understand * @inode - the inode we're modifying on line 91 - I thought it was a doc line Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6ccfc019ad90..47cd3a6dc635 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -24,8 +24,10 @@ PAGE_SIZE)) /** - * @inode - the inode we want to update the disk_i_size for - * @new_i_size - the i_size we want to set to, 0 if we use i_size + * Set inode's size according to filesystem options + * + * @inode: inode we want to update the disk_i_size for + * @new_i_size: i_size we want to set to, 0 if we use i_size * * With NO_HOLES set this simply sets the disk_is_size to whatever i_size_read() * returns as it is perfectly fine with a file that has holes without hole file @@ -62,9 +64,11 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz } /** - * @inode - the inode we're modifying - * @start - the start 
file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Mark range within a file as having a new extent inserted + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Call when we are inserting a new file extent where there was none before. * Does not need to call this in the case where we're replacing an existing file @@ -88,9 +92,11 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, } /** - * @inode - the inode we're modifying - * @start - the start file offset of the file extent we've inserted - * @len - the logical length of the file extent item + * Marks an inode range as not having a backing extent + * + * @inode: inode being modified + * @start: start file offset of the file extent we've inserted + * @len: logical length of the file extent item * * Called when we drop a file extent, for example when we truncate. 
Doesn't * need to be called for cases where we're replacing a file extent, like when From 696eb22b67add04e13f26cebe9f63eeb9477becd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:55 +0200 Subject: [PATCH 177/307] btrfs: fix parameter description in delayed-ref.c functions This fixes the following warnings: fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'fs_info' not described in 'btrfs_delayed_refs_rsv_release' fs/btrfs/delayed-ref.c:80: warning: Function parameter or member 'nr' not described in 'btrfs_delayed_refs_rsv_release' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'fs_info' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'src' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:128: warning: Function parameter or member 'num_bytes' not described in 'btrfs_migrate_to_delayed_refs_rsv' fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'fs_info' not described in 'btrfs_delayed_refs_rsv_refill' fs/btrfs/delayed-ref.c:174: warning: Function parameter or member 'flush' not described in 'btrfs_delayed_refs_rsv_refill' Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 353cc2994d10..88a1e27d2fc2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -69,9 +69,10 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) } /** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. 
+ * Release a ref head's reservation + * + * @fs_info: the filesystem + * @nr: number of items to drop * * This drops the delayed ref head's count from the delayed refs rsv and frees * any excess reservation we had. @@ -114,10 +115,11 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) } /** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. + * Transfer bytes to our delayed refs rsv + * + * @fs_info: the filesystem + * @src: source block rsv to transfer from + * @num_bytes: number of bytes to transfer * * This transfers up to the num_bytes amount from the src rsv to the * delayed_refs_rsv. Any extra bytes are returned to the space info. @@ -162,9 +164,10 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, } /** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. + * Refill based on our delayed refs usage + * + * @fs_info: the filesystem + * @flush: control how we can flush for this reservation. * * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. 
From f092cf3cfd0144bdaf6110176ea9d2cef1f3b4a8 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:56 +0200 Subject: [PATCH 178/307] btrfs: improve parameter description for __btrfs_write_out_cache Fixes following W=1 warnings: fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'root' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'inode' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'ctl' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'block_group' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'io_ctl' not described in '__btrfs_write_out_cache' fs/btrfs/free-space-cache.c:1317: warning: Function parameter or member 'trans' not described in '__btrfs_write_out_cache' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fd6ddd6b8165..0d6dcb5ff963 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1299,11 +1299,14 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, } /** - * __btrfs_write_out_cache - write out cached info to an inode - * @root - the root the inode belongs to - * @ctl - the free space cache we are going to write out - * @block_group - the block_group for this cache if it belongs to a block_group - * @trans - the trans handle + * Write out cached info to an inode + * + * @root: root the inode belongs to + * @inode: freespace inode we are writing out + * @ctl: free space cache we are going to write out + * @block_group: block_group for this cache if it belongs to a block_group + * @io_ctl: holds 
context for the io + * @trans: the trans handle * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successful in writing the cache out, From 92419695478b6a75ca85e9f8e06b08a4a35bfb20 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:57 +0200 Subject: [PATCH 179/307] btrfs: document now parameter of peek_discard_list Fixes fs/btrfs/discard.c:203: warning: Function parameter or member 'now' not described in 'peek_discard_list' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/discard.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 2b8383d41144..306ff20af70f 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -185,10 +185,12 @@ static struct btrfs_block_group *find_next_block_group( } /** - * peek_discard_list - wrap find_next_block_group() - * @discard_ctl: discard control + * Wrap find_next_block_group() + * + * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management + * @now: time when discard was invoked, in ns * * This wraps find_next_block_group() and sets the block_group to be in use. * discard_state's control flow is managed here. 
Variables related to From 9ee9b97990d6eff9cea64303c640dfb4b3a40253 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:58 +0200 Subject: [PATCH 180/307] btrfs: document fs_info in btrfs_rmap_block Fixes fs/btrfs/block-group.c:1570: warning: Function parameter or member 'fs_info' not described in 'btrfs_rmap_block' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ebc106a606..2d7294d81616 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1561,7 +1561,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } /** - * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * Map a physical disk address to a list of logical addresses + * + * @fs_info: the filesystem * @chunk_start: logical address of block group * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical From 2639631d34941db1ebbc74fb879855e0cd286cec Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:57:59 +0200 Subject: [PATCH 181/307] btrfs: fix description format of fs_info of btrfs_wait_on_delayed_iputs Fixes fs/btrfs/inode.c:3101: warning: Function parameter or member 'fs_info' not described in 'btrfs_wait_on_delayed_iputs' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ea9056cc5559..0dbe1aaa0b71 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3103,14 +3103,16 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) } /** - * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running - * @fs_info - the fs_info for this fs - * @return - 
EINTR if we were killed, 0 if nothing's pending + * Wait for flushing all delayed iputs + * + * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. + * + * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { From 6e353e3b3c5545524d718d528548f7c8c95536c5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:00 +0200 Subject: [PATCH 182/307] btrfs: document btrfs_check_shared parameters Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ef71aba5bc15..701124c3e0b1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1501,7 +1501,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, } /** - * btrfs_check_shared - tell us whether an extent is shared + * Check if an extent is shared or not + * + * @root: root inode belongs to + * @inum: inode number of the inode whose extent we are checking + * @bytenr: logical bytenr of the extent we are checking + * @roots: list of roots this extent is shared among + * @tmp: temporary list used for iteration * * btrfs_check_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the From b762d1d08dacdc444ffd6417fc17805408da7af4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:01 +0200 Subject: [PATCH 183/307] btrfs: fix parameter description of btrfs_inode_rsv_release/btrfs_delalloc_release_space Fixes following warnings: fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 'inode' not described in 
'btrfs_inode_rsv_release' fs/btrfs/delalloc-space.c:205: warning: Function parameter or member 'qgroup_free' not described in 'btrfs_inode_rsv_release' fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 'reserved' not described in 'btrfs_delalloc_release_space' fs/btrfs/delalloc-space.c:472: warning: Function parameter or member 'qgroup_free' not described in 'btrfs_delalloc_release_space' fs/btrfs/delalloc-space.c:472: warning: Excess function parameter 'release_bytes' description in 'btrfs_delalloc_release_space' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bacee09b7bfd..56642ca7af10 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -191,12 +191,14 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, } /** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. + * Release any excessive reservation + * + * @inode: the inode we need to release from + * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup + * meta reservation needs to know if we are freeing qgroup + * reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal + * release. * * This is the same as btrfs_block_rsv_release, except that it handles the * tracepoint for the reservation. 
@@ -361,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * Release a metadata reservation for an inode + * * @inode: the inode to release the reservation for. * @num_bytes: the number of bytes we are releasing. * @qgroup_free: free qgroup reservation or convert it to per-trans reservation @@ -455,11 +458,13 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, } /** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use + * Release data and metadata space for delalloc + * + * @inode: inode we're releasing space for + * @reserved: list of changed/reserved ranges + * @start: start position of the space already reserved + * @len: length of the space already reserved + * @qgroup_free: should qgroup reserved-space also be freed * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes From d98b188ea463281ee89663c36d8ac0a030e93b0c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:02 +0200 Subject: [PATCH 184/307] btrfs: fix parameter description in space-info.c With these fixes space-info.c is clear for W=1 warnings, namely the following ones are fixed: fs/btrfs/space-info.c:575: warning: Function parameter or member 'fs_info' not described in 'may_commit_transaction' fs/btrfs/space-info.c:575: warning: Function parameter or member 'space_info' not described in 'may_commit_transaction' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'fs_info' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'space_info' 
not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'ticket' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1231: warning: Function parameter or member 'flush' not described in 'handle_reserve_ticket' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'fs_info' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'space_info' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'orig_bytes' not described in '__reserve_bytes' fs/btrfs/space-info.c:1315: warning: Function parameter or member 'flush' not described in '__reserve_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'root' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'block_rsv' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'orig_bytes' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1427: warning: Function parameter or member 'flush' not described in 'btrfs_reserve_metadata_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'fs_info' not described in 'btrfs_reserve_data_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'bytes' not described in 'btrfs_reserve_data_bytes' fs/btrfs/space-info.c:1462: warning: Function parameter or member 'flush' not described in 'btrfs_reserve_data_bytes' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 50 +++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 84fb94e78a8f..fd8e79e3c10e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -561,10 +561,10 @@ static void shrink_delalloc(struct 
btrfs_fs_info *fs_info, } /** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit + * Possibly commit the transaction if its ok to + * + * @fs_info: the filesystem + * @space_info: space_info we are checking for commit, either data or metadata * * This will check to make sure that committing the transaction will actually * get us somewhere and then commit the transaction if it does. Otherwise it @@ -1215,11 +1215,12 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, } /** - * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket - * @fs_info - the fs - * @space_info - the space_info for the reservation - * @ticket - the ticket for the reservation - * @flush - how much we can flush + * Do the appropriate flushing and waiting for a ticket + * + * @fs_info: the filesystem + * @space_info: space info for the reservation + * @ticket: ticket for the reservation + * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. @@ -1296,11 +1297,12 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Try to reserve bytes from the block_rsv's space + * + * @fs_info: the filesystem + * @space_info: space info we want to allocate from + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. 
If there is not enough space it will make an attempt to @@ -1407,11 +1409,12 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } /** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation + * Trye to reserve metadata bytes from the block_rsv's space + * + * @root: the root we're allocating for + * @block_rsv: block_rsv we're allocating for + * @orig_bytes: number of bytes we want + * @flush: whether or not we can flush to make our reservation * * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to @@ -1449,10 +1452,11 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } /** - * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation - * @fs_info - the filesystem - * @bytes - the number of bytes we need - * @flush - how we are allowed to flush + * Try to reserve data bytes for an allocation + * + * @fs_info: the filesystem + * @bytes: number of bytes we need + * @flush: how we are allowed to flush * * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. 
From 3bed2da1b00f554e70d16f44db9357a7670d776c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:03 +0200 Subject: [PATCH 185/307] btrfs: fix parameter description for functions in extent_io.c This makes the file W=1 clean and fixes the following warnings: fs/btrfs/extent_io.c:414: warning: Function parameter or member 'tree' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'offset' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'next_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'prev_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'p_ret' not described in '__etree_search' fs/btrfs/extent_io.c:414: warning: Function parameter or member 'parent_ret' not described in '__etree_search' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'tree' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'start_ret' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'end_ret' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1607: warning: Function parameter or member 'bits' not described in 'find_contiguous_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'tree' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'start_ret' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'end_ret' not described in 
'find_first_clear_extent_bit' fs/btrfs/extent_io.c:1644: warning: Function parameter or member 'bits' not described in 'find_first_clear_extent_bit' fs/btrfs/extent_io.c:4187: warning: Function parameter or member 'epd' not described in 'extent_write_cache_pages' fs/btrfs/extent_io.c:4187: warning: Excess function parameter 'data' description in 'extent_write_cache_pages' Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 52 +++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7f689ad7709c..2fa563da65bd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -389,16 +389,16 @@ do_insert: } /** - * __etree_search - searche @tree for an entry that contains @offset. Such - * entry would have entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset. Such entry would have + * entry->start <= offset && entry->end >= offset. 
* - * @tree - the tree to search - * @offset - offset that should fall within an entry in @tree - * @next_ret - pointer to the first entry whose range ends after @offset - * @prev - pointer to the first entry whose range begins before @offset - * @p_ret - pointer where new node should be anchored (used when inserting an - * entry in the tree) - * @parent_ret - points to entry which would have been the parent of the entry, + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * @p_ret: pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * * This function returns a pointer to the entry that contains @offset byte @@ -1588,12 +1588,13 @@ out: } /** - * find_contiguous_extent_bit: find a contiguous area of bits - * @tree - io tree to check - * @start - offset to start the search from - * @start_ret - the first offset we found with the bits set - * @end_ret - the final contiguous range of the bits that were set - * @bits - bits to look for + * Find a contiguous area of bits + * + * @tree: io tree to check + * @start: offset to start the search from + * @start_ret: the first offset we found with the bits set + * @end_ret: the final contiguous range of the bits that were set + * @bits: bits to look for * * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges * to set bits appropriately, and then merge them again. During this time it @@ -1625,14 +1626,14 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, } /** - * find_first_clear_extent_bit - find the first range that has @bits not set. - * This range could start before @start. + * Find the first range that has @bits not set. 
This range could start before + * @start. * - * @tree - the tree to search - * @start - the offset at/after which the found extent should start - * @start_ret - records the beginning of the range - * @end_ret - records the end of the range (inclusive) - * @bits - the set of bits which must be unset + * @tree: the tree to search + * @start: offset at/after which the found extent should start + * @start_ret: records the beginning of the range + * @end_ret: records the end of the range (inclusive) + * @bits: the set of bits which must be unset * * Since unallocated range is also considered one which doesn't have the bits * set it's possible that @end_ret contains -1, this happens in case the range @@ -4168,10 +4169,11 @@ retry: } /** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * Walk the list of dirty pages of the given address space and write all of them. + * * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @data: data passed to __extent_writepage function + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @epd: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. This is desirable behaviour for memory-cleaning writeback, From 8c31a3dbaa356b1fce97bf55026410649e4dd0f1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Sun, 24 Jan 2021 18:03:21 +0200 Subject: [PATCH 186/307] btrfs: zoned: remove unused variable in btrfs_sb_log_location_bdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes warning: fs/btrfs/zoned.c:491:6: warning: variable ‘zone_size’ set but not used [-Wunused-but-set-variable] 491 | u64 zone_size; which got introduced in 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode"). 
We'll enable the warning by default and want clean build until the relevant zoned patches land. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c38846659019..41d27fefd306 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -488,7 +488,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, unsigned int zone_sectors; u32 sb_zone; int ret; - u64 zone_size; u8 zone_sectors_shift; sector_t nr_sectors; u32 nr_zones; @@ -503,7 +502,6 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, zone_sectors = bdev_zone_sectors(bdev); if (!is_power_of_2(zone_sectors)) return -EINVAL; - zone_size = zone_sectors << SECTOR_SHIFT; zone_sectors_shift = ilog2(zone_sectors); nr_sectors = bdev_nr_sectors(bdev); nr_zones = nr_sectors >> zone_sectors_shift; From 71c36788b9253f086d09763b98804ed473e12a3b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:04 +0200 Subject: [PATCH 187/307] lib/zstd: convert constants to defines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These constants are really used internally by zstd and including linux/zstd.h into users results in the following warnings: In file included from fs/btrfs/zstd.c:19: ./include/linux/zstd.h:798:21: warning: ‘ZSTD_skippableHeaderSize’ defined but not used [-Wunused-const-variable=] 798 | static const size_t ZSTD_skippableHeaderSize = 8; | ^~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/zstd.h:796:21: warning: ‘ZSTD_frameHeaderSize_max’ defined but not used [-Wunused-const-variable=] 796 | static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; | ^~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/zstd.h:795:21: warning: ‘ZSTD_frameHeaderSize_min’ defined but not used [-Wunused-const-variable=] 795 | static const size_t ZSTD_frameHeaderSize_min = 
ZSTD_FRAMEHEADERSIZE_MIN; | ^~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/zstd.h:794:21: warning: ‘ZSTD_frameHeaderSize_prefix’ defined but not used [-Wunused-const-variable=] 794 | static const size_t ZSTD_frameHeaderSize_prefix = 5; So fix those warnings by turning the constants into defines. Reviewed-by: Nick Terrell Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/linux/zstd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 249575e2485f..e87f78c9b19c 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -791,11 +791,11 @@ size_t ZSTD_DStreamOutSize(void); /* for static allocation */ #define ZSTD_FRAMEHEADERSIZE_MAX 18 #define ZSTD_FRAMEHEADERSIZE_MIN 6 -static const size_t ZSTD_frameHeaderSize_prefix = 5; -static const size_t ZSTD_frameHeaderSize_min = ZSTD_FRAMEHEADERSIZE_MIN; -static const size_t ZSTD_frameHeaderSize_max = ZSTD_FRAMEHEADERSIZE_MAX; +#define ZSTD_frameHeaderSize_prefix 5 +#define ZSTD_frameHeaderSize_min ZSTD_FRAMEHEADERSIZE_MIN +#define ZSTD_frameHeaderSize_max ZSTD_FRAMEHEADERSIZE_MAX /* magic number + skippable frame length */ -static const size_t ZSTD_skippableHeaderSize = 8; +#define ZSTD_skippableHeaderSize 8 /*-************************************* From e9aa7c285d20a69ce1fb940ec846686780af9e56 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 22 Jan 2021 11:58:05 +0200 Subject: [PATCH 188/307] btrfs: enable W=1 checks for btrfs Now that the btrfs' codebase is clean of almost all W=1 warnings let's enable those checks unconditionally for the entire fs/btrfs/ and its subdirectories to catch potential errors during development. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ add some comments ] Signed-off-by: David Sterba --- fs/btrfs/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9f1b1a88e317..e45957319424 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,5 +1,21 @@ # SPDX-License-Identifier: GPL-2.0 +# Subset of W=1 warnings +subdir-ccflags-y += -Wextra -Wunused -Wno-unused-parameter +subdir-ccflags-y += -Wmissing-declarations +subdir-ccflags-y += -Wmissing-format-attribute +subdir-ccflags-y += -Wmissing-prototypes +subdir-ccflags-y += -Wold-style-definition +subdir-ccflags-y += -Wmissing-include-dirs +subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) +subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) +subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) +subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +# The following turn off the warnings enabled by -Wextra +subdir-ccflags-y += -Wno-missing-field-initializers +subdir-ccflags-y += -Wno-sign-compare +subdir-ccflags-y += -Wno-type-limits + obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ From 2187374f35fe9cadbddaa9fcf0c4121365d914e8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Jan 2021 16:48:55 -0500 Subject: [PATCH 189/307] btrfs: handle space_info::total_bytes_pinned inside the delayed ref itself Currently we pass things around to figure out if we may be freeing data based on the state of the delayed refs head. This makes the accounting sort of confusing and hard to follow, as it's distinctly separate from the delayed ref heads stuff, but also depends on it entirely. Fix this by explicitly adjusting the space_info->total_bytes_pinned in the delayed refs code. We now have two places where we modify this counter, once where we create the delayed and destroy the delayed refs, and once when we pin and unpin the extents.
This means there is a slight overlap between delayed refs and the pin/unpin mechanisms, but this is simply used by the ENOSPC infrastructure to determine if we need to commit the transaction, so there's no adverse effect from this, we might simply commit thinking it will give us enough space when it might not. CC: stable@vger.kernel.org # 5.10 Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 10 ++--- fs/btrfs/delayed-ref.c | 51 +++++++++++++--------- fs/btrfs/delayed-ref.h | 16 +++++-- fs/btrfs/extent-tree.c | 97 ++++++------------------------------------ fs/btrfs/space-info.h | 17 ++++++++ 5 files changed, 74 insertions(+), 117 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 2d7294d81616..763a3671b7af 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1371,9 +1371,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -block_group->pinned, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -2898,10 +2896,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, + num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 88a1e27d2fc2..a540ace3e03a 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -651,12 +651,12 @@ inserted: */ static noinline void
update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing, - struct btrfs_delayed_ref_head *update, - int *old_ref_mod_ret) + struct btrfs_delayed_ref_head *update) { struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; + u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -704,8 +704,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * currently, for refs we just added we know we're a-ok. */ old_ref_mod = existing->total_ref_mod; - if (old_ref_mod_ret) - *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; existing->total_ref_mod += update->ref_mod; @@ -727,6 +725,22 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates += csum_leaves; } } + + /* + * This handles the following conditions: + * + * 1. We had a ref mod of 0 or more and went negative, indicating that + * we may be freeing space, so add our space to the + * total_bytes_pinned counter. + * 2. We were negative and went to 0 or positive, so no longer can say + * that the space would be pinned, decrement our counter from the + * total_bytes_pinned counter. 
+ */ + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); + else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) + btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + spin_unlock(&existing->lock); } @@ -801,8 +815,7 @@ static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, - int action, int *qrecord_inserted_ret, - int *old_ref_mod, int *new_ref_mod) + int action, int *qrecord_inserted_ret) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; @@ -824,8 +837,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { - update_existing_head_ref(trans, existing, head_ref, - old_ref_mod); + update_existing_head_ref(trans, existing, head_ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -833,14 +845,17 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - if (old_ref_mod) - *old_ref_mod = 0; + u64 flags = btrfs_ref_head_to_space_flags(head_ref); + if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } + if (head_ref->ref_mod < 0) + btrfs_mod_total_bytes_pinned(trans->fs_info, flags, + head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -848,8 +863,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; - if (new_ref_mod) - *new_ref_mod = head_ref->total_ref_mod; return head_ref; } @@ -912,8 +925,7 @@ static void 
init_delayed_ref_common(struct btrfs_fs_info *fs_info, */ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; @@ -980,8 +992,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1009,8 +1020,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, */ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod) + u64 reserved) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_data_ref *ref; @@ -1076,8 +1086,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, * the spin lock */ head_ref = add_delayed_ref_head(trans, head_ref, record, - action, &qrecord_inserted, - old_ref_mod, new_ref_mod); + action, &qrecord_inserted); ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); @@ -1120,7 +1129,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD, - NULL, NULL, NULL); + NULL); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 1c977e6d45dc..3ba140468f12 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -326,6 +326,16 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } +static inline u64 btrfs_ref_head_to_space_flags( + struct 
btrfs_delayed_ref_head *head_ref) +{ + if (head_ref->is_data) + return BTRFS_BLOCK_GROUP_DATA; + else if (head_ref->is_system) + return BTRFS_BLOCK_GROUP_SYSTEM; + return BTRFS_BLOCK_GROUP_METADATA; +} + static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) { if (refcount_dec_and_test(&head->refs)) @@ -334,12 +344,10 @@ static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *hea int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - struct btrfs_delayed_extent_op *extent_op, - int *old_ref_mod, int *new_ref_mod); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref, - u64 reserved, int *old_ref_mod, - int *new_ref_mod); + u64 reserved); int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, struct btrfs_delayed_extent_op *extent_op); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c335dae5af7..2f591036ffc1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -82,41 +82,6 @@ void btrfs_free_excluded_extents(struct btrfs_block_group *cache) EXTENT_UPTODATE); } -static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) -{ - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - return BTRFS_BLOCK_GROUP_SYSTEM; - else - return BTRFS_BLOCK_GROUP_METADATA; - } - return BTRFS_BLOCK_GROUP_DATA; -} - -static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; - u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref) -{ - struct btrfs_space_info *space_info; 
- u64 flags = generic_ref_to_space_flags(ref); - - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -1388,7 +1353,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && @@ -1397,17 +1361,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) - ret = btrfs_add_delayed_tree_ref(trans, generic_ref, - NULL, &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); else - ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0); btrfs_ref_tree_mod(fs_info, generic_ref); - if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - sub_pinned_bytes(fs_info, generic_ref); - return ret; } @@ -1796,20 +1755,9 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int nr_items = 1; /* Dropping this ref head update. 
*/ if (head->total_ref_mod < 0) { - struct btrfs_space_info *space_info; - u64 flags; + u64 flags = btrfs_ref_head_to_space_flags(head); - if (head->is_data) - flags = BTRFS_BLOCK_GROUP_DATA; - else if (head->is_system) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -head->num_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); + btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); /* * We had csum deletions accounted for in our delayed refs rsv, @@ -2572,8 +2520,7 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch(&cache->space_info->total_bytes_pinned, - num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2784,8 +2731,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - percpu_counter_add_batch(&space_info->total_bytes_pinned, - -len, BTRFS_TOTAL_BYTES_PINNED_BATCH); + __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -3318,7 +3264,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ref generic_ref = { 0 }; - int pin = 1; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3327,13 +3272,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, root->root_key.objectid); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - int old_ref_mod, new_ref_mod; - btrfs_ref_tree_mod(fs_info, &generic_ref); - 
ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL); BUG_ON(ret); /* -ENOMEM */ - pin = old_ref_mod >= 0 && new_ref_mod < 0; } if (last_ref && btrfs_header_generation(buf) == trans->transid) { @@ -3345,7 +3286,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } - pin = 0; cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -3362,9 +3302,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); } out: - if (pin) - add_pinned_bytes(fs_info, &generic_ref); - if (last_ref) { /* * Deleting the buffer, clear the corrupt flag since it doesn't @@ -3378,7 +3315,6 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) { struct btrfs_fs_info *fs_info = trans->fs_info; - int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) @@ -3394,14 +3330,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); - old_ref_mod = new_ref_mod = 0; ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { - ret = btrfs_add_delayed_tree_ref(trans, ref, NULL, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); } else { - ret = btrfs_add_delayed_data_ref(trans, ref, 0, - &old_ref_mod, &new_ref_mod); + ret = btrfs_add_delayed_data_ref(trans, ref, 0); } if (!((ref->type == BTRFS_REF_METADATA && @@ -3410,9 +3343,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); - if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref); - return ret; } @@ -4528,7 +4458,6 @@ 
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; - int ret; BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); @@ -4536,9 +4465,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ins->objectid, ins->offset, 0); btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); btrfs_ref_tree_mod(root->fs_info, &generic_ref); - ret = btrfs_add_delayed_data_ref(trans, &generic_ref, - ram_bytes, NULL, NULL); - return ret; + + return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); } /* @@ -4730,8 +4658,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, generic_ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&generic_ref, level, root_objectid); btrfs_ref_tree_mod(fs_info, &generic_ref); - ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, - extent_op, NULL, NULL); + ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) goto out_free_delayed; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 5646393b928c..74706f604bce 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -152,4 +152,21 @@ static inline void btrfs_space_info_free_bytes_may_use( int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); +static inline void __btrfs_mod_total_bytes_pinned( + struct btrfs_space_info *space_info, + s64 mod) +{ + percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, + BTRFS_TOTAL_BYTES_PINNED_BATCH); +} + +static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, + u64 flags, s64 mod) +{ + struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); + + ASSERT(space_info); + __btrfs_mod_total_bytes_pinned(space_info, mod); +} + #endif /* BTRFS_SPACE_INFO_H */ From 81e75ac74ecba929d1e922bf93f9fc467232e39f Mon Sep 17 00:00:00 2001 From: Josef Bacik 
Date: Fri, 15 Jan 2021 16:48:56 -0500 Subject: [PATCH 190/307] btrfs: account for new extents being deleted in total_bytes_pinned My recent patch set "A variety of lock contention fixes", found here https://lore.kernel.org/linux-btrfs/cover.1608319304.git.josef@toxicpanda.com/ (Tracked in https://github.com/btrfs/linux/issues/86) that reduce lock contention on the extent root by running delayed refs less often resulted in a regression in generic/371. This test fallocate()'s the fs until it's full, deletes all the files, and then tries to fallocate() until full again. Before these patches we would run all of the delayed refs during flushing, and then would commit the transaction because we had plenty of pinned space to recover in order to allocate. However my patches made it so we weren't running the delayed refs as aggressively, which meant that we appeared to have less pinned space when we were deciding to commit the transaction. We use the space_info->total_bytes_pinned to approximate how much space we have pinned. It's approximate because if we remove a reference to an extent we may free it, but there may be more references to it than we know of at that point, but we account it as pinned at the creation time, and then it's properly accounted when the delayed ref runs. The way we account for pinned space is if the delayed_ref_head->total_ref_mod is < 0, because that is clearly a freeing option. However there is another case, and that is where ->total_ref_mod == 0 && ->must_insert_reserved == 1. When we allocate a new extent, we have ->total_ref_mod == 1 and we have ->must_insert_reserved == 1. This is used to indicate that it is a brand new extent and will need to have its extent entry added before we modify any references on the delayed ref head. But if we subsequently remove that extent reference, our ->total_ref_mod will be 0, and that space will be pinned and freed. 
Accounting for this case properly allows for generic/371 to pass with my delayed refs patches applied. It's important to note that this problem exists without the referenced patches, it just was uncovered by them. CC: stable@vger.kernel.org # 5.10 Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 5 +++++ fs/btrfs/extent-tree.c | 33 +++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a540ace3e03a..63be7d01a9a3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -735,11 +735,16 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * 2. We were negative and went to 0 or positive, so no longer can say * that the space would be pinned, decrement our counter from the * total_bytes_pinned counter. + * 3. We are now at 0 and have ->must_insert_reserved set, which means + * this was a new allocation and then we dropped it, and thus must + * add our space to the total_bytes_pinned counter. */ if (existing->total_ref_mod < 0 && old_ref_mod >= 0) btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); + else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) + btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); spin_unlock(&existing->lock); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2f591036ffc1..6f0c59debc2b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1754,23 +1754,28 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, { int nr_items = 1; /* Dropping this ref head update. 
*/ - if (head->total_ref_mod < 0) { + /* + * We had csum deletions accounted for in our delayed refs rsv, we need + * to drop the csum leaves for this update from our delayed_refs_rsv. + */ + if (head->total_ref_mod < 0 && head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + } + + /* + * We were dropping refs, or had a new ref and dropped it, and thus must + * adjust down our total_bytes_pinned, the space may or may not have + * been pinned and so is accounted for properly in the pinned space by + * now. + */ + if (head->total_ref_mod < 0 || + (head->total_ref_mod == 0 && head->must_insert_reserved)) { u64 flags = btrfs_ref_head_to_space_flags(head); btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); - - /* - * We had csum deletions accounted for in our delayed refs rsv, - * we need to drop the csum leaves for this update from our - * delayed_refs_rsv. - */ - if (head->is_data) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= head->num_bytes; - spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, - head->num_bytes); - } } btrfs_delayed_refs_rsv_release(fs_info, nr_items); From 2e626e5673c2a3b4ce8200b961e28edd613ab6a9 Mon Sep 17 00:00:00 2001 From: Nigel Christian Date: Sun, 24 Jan 2021 20:41:41 -0500 Subject: [PATCH 191/307] btrfs: remove repeated word in struct member comment Comment for processed extent end of range has an unnecessary "in", remove it. 
Signed-off-by: Nigel Christian Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2fa563da65bd..edcdbd739a1e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2776,7 +2776,7 @@ struct processed_extent { struct btrfs_inode *inode; /* Start of the range in @inode */ u64 start; - /* End of the range in in @inode */ + /* End of the range in @inode */ u64 end; bool uptodate; }; From c78a10aebb275c38d0cfccae129a803fe622e305 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:42 -0500 Subject: [PATCH 192/307] btrfs: fix reloc root leak with 0 ref reloc roots on recovery When recovering a relocation, if we run into a reloc root that has 0 refs we simply add it to the reloc_control->reloc_roots list, and then clean it up later. The problem with this is __del_reloc_root() doesn't do anything if the root isn't in the radix tree, which in this case it won't be because we never call __add_reloc_root() on the reloc_root. This exit condition simply isn't correct really. During normal operation we can remove ourselves from the rb tree and then we're meant to clean up later at merge_reloc_roots() time, and this happens correctly. During recovery we're depending on free_reloc_roots() to drop our references, but we're short-circuiting. Fix this by continuing to check if we're on the list and dropping ourselves from the reloc_control root list and dropping our reference appropriately. Change the corresponding BUG_ON() to an ASSERT() that does the correct thing if we aren't in the rb tree. 
CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9f2289bcdde6..d29baf3822a7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -669,9 +669,7 @@ static void __del_reloc_root(struct btrfs_root *root) RB_CLEAR_NODE(&node->rb_node); } spin_unlock(&rc->reloc_root_tree.lock); - if (!node) - return; - BUG_ON((struct btrfs_root *)node->data != root); + ASSERT(!node || (struct btrfs_root *)node->data == root); } /* From 938fcbfb0cbcf532a1869efab58e6009446b1ced Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:43 -0500 Subject: [PATCH 193/307] btrfs: splice remaining dirty_bg's onto the transaction dirty bg list While doing error injection testing with my relocation patches I hit the following assert: assertion failed: list_empty(&block_group->dirty_list), in fs/btrfs/block-group.c:3356 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3357! invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 PID: 24351 Comm: umount Tainted: G W 5.10.0-rc3+ #193 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:assertfail.constprop.0+0x18/0x1a RSP: 0018:ffffa09b019c7e00 EFLAGS: 00010282 RAX: 0000000000000056 RBX: ffff8f6492c18000 RCX: 0000000000000000 RDX: ffff8f64fbc27c60 RSI: ffff8f64fbc19050 RDI: ffff8f64fbc19050 RBP: ffff8f6483bbdc00 R08: 0000000000000000 R09: 0000000000000000 R10: ffffa09b019c7c38 R11: ffffffff85d70928 R12: ffff8f6492c18100 R13: ffff8f6492c18148 R14: ffff8f6483bbdd70 R15: dead000000000100 FS: 00007fbfda4cdc40(0000) GS:ffff8f64fbc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbfda666fd0 CR3: 000000013cf66002 CR4: 0000000000370ef0 Call Trace: btrfs_free_block_groups.cold+0x55/0x55 close_ctree+0x2c5/0x306 ? 
fsnotify_destroy_marks+0x14/0x100 generic_shutdown_super+0x6c/0x100 kill_anon_super+0x14/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x36/0xa0 cleanup_mnt+0x12d/0x190 task_work_run+0x5c/0xa0 exit_to_user_mode_prepare+0x1b1/0x1d0 syscall_exit_to_user_mode+0x54/0x280 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happened because I injected an error in btrfs_cow_block() while running the dirty block groups. When we run the dirty block groups, we splice the list onto a local list to process. However if an error occurs, we only cleanup the transactions dirty block group list, not any pending block groups we have on our locally spliced list. In fact if we fail to allocate a path in this function we'll also fail to clean up the splice list. Fix this by splicing the list back onto the transaction dirty block group list so that the block groups are cleaned up. Then add a 'out' label and have the error conditions jump to out so that the errors are handled properly. This also has the side-effect of fixing a problem where we would clear 'ret' on error because we unconditionally ran btrfs_run_delayed_refs(). CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 763a3671b7af..dda495b2a862 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2564,8 +2564,10 @@ again: if (!path) { path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } } /* @@ -2659,16 +2661,14 @@ again: btrfs_put_block_group(cache); if (drop_reserve) btrfs_delayed_refs_rsv_release(fs_info, 1); - - if (ret) - break; - /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be * removed. 
*/ mutex_unlock(&trans->transaction->cache_write_mutex); + if (ret) + goto out; mutex_lock(&trans->transaction->cache_write_mutex); } mutex_unlock(&trans->transaction->cache_write_mutex); @@ -2692,7 +2692,12 @@ again: goto again; } spin_unlock(&cur_trans->dirty_bgs_lock); - } else if (ret < 0) { + } +out: + if (ret < 0) { + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&dirty, &cur_trans->dirty_bgs); + spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } From f78743fbdae1bb31bc9c9233c3590a5048782381 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:44 -0500 Subject: [PATCH 194/307] btrfs: do not warn if we can't find the reloc root when looking up backref The backref code is looking for a reloc_root that corresponds to the given fs root. However any number of things could have gone wrong while initializing that reloc_root, like ENOMEM while trying to allocate the root itself, or EIO while trying to write the root item. This would result in no corresponding reloc_root being in the reloc root cache, and thus would return NULL when we do the find_reloc_root() call. Because of this we do not want to WARN_ON(). This presumably was meant to catch developer errors, cases where we messed up adding the reloc root. However we can easily hit this case with error injection, and thus should not do a WARN_ON(). 
CC: stable@vger.kernel.org # 5.10+ Reported-by: Zygo Blaxell Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 701124c3e0b1..f47c1528eb9a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2623,7 +2623,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, /* Only reloc backref cache cares about a specific root */ if (cache->is_reloc) { root = find_reloc_root(cache->fs_info, cur->bytenr); - if (WARN_ON(!root)) + if (!root) return -ENOENT; cur->root = root; } else { From eddda68d97732ce05ca145f8e85e8a447f65cdad Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 14:02:45 -0500 Subject: [PATCH 195/307] btrfs: add asserts for deleting backref cache nodes A weird KASAN problem that Zygo reported could have been easily caught if we checked for basic things in our backref freeing code. We have two methods of freeing a backref node - btrfs_backref_free_node: this just is kfree() essentially. - btrfs_backref_drop_node: this actually unlinks the node and cleans up everything and then calls btrfs_backref_free_node(). We should mostly be using btrfs_backref_drop_node(), to make sure the node is properly unlinked from the backref cache, and only use btrfs_backref_free_node() when we know the node isn't actually linked to the backref cache. We made a mistake here and thus got the KASAN splat. Make this style of issue easier to find by adding some ASSERT()'s to btrfs_backref_free_node() and adjusting our deletion stuff to properly init the list so we can rely on list_empty() checks working properly. 
BUG: KASAN: use-after-free in btrfs_backref_cleanup_node+0x18a/0x420 Read of size 8 at addr ffff888112402950 by task btrfs/28836 CPU: 0 PID: 28836 Comm: btrfs Tainted: G W 5.10.0-e35f27394290-for-next+ #23 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Call Trace: dump_stack+0xbc/0xf9 ? btrfs_backref_cleanup_node+0x18a/0x420 print_address_description.constprop.8+0x21/0x210 ? record_print_text.cold.34+0x11/0x11 ? btrfs_backref_cleanup_node+0x18a/0x420 ? btrfs_backref_cleanup_node+0x18a/0x420 kasan_report.cold.10+0x20/0x37 ? btrfs_backref_cleanup_node+0x18a/0x420 __asan_load8+0x69/0x90 btrfs_backref_cleanup_node+0x18a/0x420 btrfs_backref_release_cache+0x83/0x1b0 relocate_block_group+0x394/0x780 ? merge_reloc_roots+0x4a0/0x4a0 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 ? check_flags.part.50+0x6c/0x1e0 ? btrfs_relocate_chunk+0x120/0x120 ? kmem_cache_alloc_trace+0xa06/0xcb0 ? _copy_from_user+0x83/0xc0 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 ? __kasan_check_read+0x11/0x20 ? check_chain_key+0x1f4/0x2f0 ? __asan_loadN+0xf/0x20 ? btrfs_ioctl_get_supported_features+0x30/0x30 ? kvm_sched_clock_read+0x18/0x30 ? check_chain_key+0x1f4/0x2f0 ? lock_downgrade+0x3f0/0x3f0 ? handle_mm_fault+0xad6/0x2150 ? do_vfs_ioctl+0xfc/0x9d0 ? ioctl_file_clone+0xe0/0xe0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags.part.50+0x6c/0x1e0 ? check_flags+0x26/0x30 ? lock_is_held_type+0xc3/0xf0 ? syscall_enter_from_user_mode+0x1b/0x60 ? do_syscall_64+0x13/0x80 ? rcu_read_lock_sched_held+0xa1/0xd0 ? __kasan_check_read+0x11/0x20 ? 
__fget_light+0xae/0x110 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f4c4bdfe427 RSP: 002b:00007fff33ee6df8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fff33ee6e98 RCX: 00007f4c4bdfe427 RDX: 00007fff33ee6e98 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078 R10: fffffffffffff59d R11: 0000000000000202 R12: 0000000000000001 R13: 0000000000000000 R14: 00007fff33ee8a34 R15: 0000000000000001 Allocated by task 28836: kasan_save_stack+0x21/0x50 __kasan_kmalloc.constprop.18+0xbe/0xd0 kasan_kmalloc+0x9/0x10 kmem_cache_alloc_trace+0x410/0xcb0 btrfs_backref_alloc_node+0x46/0xf0 btrfs_backref_add_tree_node+0x60d/0x11d0 build_backref_tree+0xc5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 28836: kasan_save_stack+0x21/0x50 kasan_set_track+0x20/0x30 kasan_set_free_info+0x1f/0x30 __kasan_slab_free+0xf3/0x140 kasan_slab_free+0xe/0x10 kfree+0xde/0x200 btrfs_backref_error_cleanup+0x452/0x530 build_backref_tree+0x1a5/0x700 relocate_tree_blocks+0x2be/0xb90 relocate_block_group+0x2eb/0x780 btrfs_relocate_block_group+0x26e/0x4c0 btrfs_relocate_chunk+0x52/0x120 btrfs_balance+0xe2e/0x1900 btrfs_ioctl_balance+0x3a7/0x460 btrfs_ioctl+0x24c8/0x4360 __x64_sys_ioctl+0xc3/0x100 do_syscall_64+0x37/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff888112402900 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 80 bytes inside of 128-byte region [ffff888112402900, ffff888112402980) The buggy address belongs to the page: page:0000000028b1cd08 refcount:1 mapcount:0 mapping:0000000000000000 
index:0xffff888131c810c0 pfn:0x112402 flags: 0x17ffe0000000200(slab) raw: 017ffe0000000200 ffffea000424f308 ffffea0007d572c8 ffff888100040440 raw: ffff888131c810c0 ffff888112402000 0000000100000009 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888112402800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff888112402880: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff888112402900: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888112402980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff888112402a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Link: https://lore.kernel.org/linux-btrfs/20201208194607.GI31381@hungrycats.org/ CC: stable@vger.kernel.org # 5.10+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff705cc564a9..17abde7f794c 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -296,6 +296,9 @@ static inline void btrfs_backref_free_node(struct btrfs_backref_cache *cache, struct btrfs_backref_node *node) { if (node) { + ASSERT(list_empty(&node->list)); + ASSERT(list_empty(&node->lower)); + ASSERT(node->eb == NULL); cache->nr_nodes--; btrfs_put_root(node->root); kfree(node); @@ -340,11 +343,11 @@ static inline void btrfs_backref_drop_node_buffer( static inline void btrfs_backref_drop_node(struct btrfs_backref_cache *tree, struct btrfs_backref_node *node) { - BUG_ON(!list_empty(&node->upper)); + ASSERT(list_empty(&node->upper)); btrfs_backref_drop_node_buffer(node); - list_del(&node->list); - list_del(&node->lower); + list_del_init(&node->list); + list_del_init(&node->lower); if (!RB_EMPTY_NODE(&node->rb_node)) rb_erase(&node->rb_node, &tree->rb_root); btrfs_backref_free_node(tree, node); From 867ed321f90d06aaba84e2c91de51cd3038825ef Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 14 Jan 2021 
14:02:46 -0500 Subject: [PATCH 196/307] btrfs: abort the transaction if we fail to inc ref in btrfs_copy_root While testing my error handling patches, I added an error injection site at btrfs_inc_extent_ref, to validate the error handling I added was doing the correct thing. However I hit a pretty ugly corruption while doing this check, with the following error injection stack trace: btrfs_inc_extent_ref btrfs_copy_root create_reloc_root btrfs_init_reloc_root btrfs_record_root_in_trans btrfs_start_transaction btrfs_update_inode btrfs_update_time touch_atime file_accessed btrfs_file_mmap This is because we do not catch the error from btrfs_inc_extent_ref, which in practice would be ENOMEM, which means we lose the extent references for a root that has already been allocated and inserted, which is the problem. Fix this by aborting the transaction if we fail to do the reference modification. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 56e132d825a2..95d9bae764ab 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -221,9 +221,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); return ret; + } btrfs_mark_buffer_dirty(cow); *cow_ret = cow; From ddfd08cb0484e491cae47a76ead051a168a0e644 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:19 -0500 Subject: [PATCH 197/307] btrfs: do not block on deleted bgs mutex in the cleaner While running some stress tests I started getting hung task messages. This is because the delete unused block groups code has to take the delete_unused_bgs_mutex to do its work, which is taken by balance to make sure we don't delete block groups while we're balancing. 
The problem is that balance can take a while, and so we were getting hung task warnings. We don't need to block and run these things, and the cleaner is needed to do other work, so trylock on this mutex and just bail if we can't acquire it right away. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index dda495b2a862..5fa6b3d540f4 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1262,6 +1262,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; + /* + * Long running balances can keep us blocked here for eternity, so + * simply skip deletion if we're unable to get the mutex. + */ + if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + return; + spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->unused_bgs)) { int trimming; @@ -1281,8 +1288,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); - mutex_lock(&fs_info->delete_unused_bgs_mutex); - /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); @@ -1426,11 +1431,11 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans); next: - mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); return; flip_async: From e19eb11f4f3d3b0463cd897016064a79cb6d8c6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:20 -0500 Subject: [PATCH 198/307] btrfs: only let one thread pre-flush delayed refs in commit I've been running a stress test that runs 20 workers in their own 
subvolume, which are running an fsstress instance with 4 threads per worker, which is 80 total fsstress threads. In addition to this I'm running balance in the background as well as creating and deleting snapshots. This test takes around 12 hours to run normally, going slower and slower as the test goes on. The reason for this is because fsstress is running fsync sometimes, and because we're messing with block groups we often fall through to btrfs_commit_transaction, so will often have 20-30 threads all calling btrfs_commit_transaction at the same time. These all get stuck contending on the extent tree while they try to run delayed refs during the initial part of the commit. This is suboptimal, really because the extent tree is a single point of failure we only want one thread acting on that tree at once to reduce lock contention. Fix this by making the flushing mechanism a bit operation, to make it easy to use test_and_set_bit() in order to make sure only one task does this initial flush. Once we're into the transaction commit we only have one thread doing delayed ref running, it's just this initial pre-flush that is problematic. With this patch my stress test takes around 90 minutes to run, instead of 12 hours. The memory barrier is not necessary for the flushing bit as it's ordered, unlike plain int. The transaction state accessed in btrfs_should_end_transaction could be affected by that too as it's not always used under transaction lock. Upon Nikolay's analysis in [1] it's not necessary: In should_end_transaction it's read without holding any locks. (U) It's modified in btrfs_cleanup_transaction without holding the fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set. 
set in cleanup_transaction under fs_info->trans_lock (L) set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_UNBLOCK under fs_info->trans_lock.(L) set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this point the transaction is finished and fs_info->running_trans is NULL (U but irrelevant). So by the looks of it we can have a concurrent READ race with a WRITE, due to reads not taking a lock. In this case what we want to ensure is we either see new or old state. I consulted with Will Deacon and he said that in such a case we'd want to annotate the accesses to ->state with (READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I don't think this could happen but I imagine at some point KCSAN would flag such an access as racy (which it is). [1] https://lore.kernel.org/linux-btrfs/e1fd5cc1-0f28-f670-69f4-e9958b4964e6@suse.com Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add comments regarding memory barrier ] Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.h | 12 ++++++------ fs/btrfs/transaction.c | 32 +++++++++++++++----------------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 3ba140468f12..e22fba272e4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -135,6 +135,11 @@ struct btrfs_delayed_data_ref { u64 offset; }; +enum btrfs_delayed_ref_flags { + /* Indicate that we are flushing delayed refs for the commit */ + BTRFS_DELAYED_REFS_FLUSHING, +}; + struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root_cached href_root; @@ -158,12 +163,7 @@ struct btrfs_delayed_ref_root { u64 pending_csums; - /* - * set when the tree is flushing before a transaction commit, - * used by the throttling code to decide if new updates need - * to be run right away - */ - int flushing; + unsigned long flags; u64 
run_delayed_start; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3bcb5444536e..1485f7722f47 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -909,9 +909,8 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; - smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START || - cur_trans->delayed_refs.flushing) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; return should_end_transaction(trans); @@ -2043,23 +2042,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; - /* make a pass through all the delayed refs we have so far - * any runnings procs may add more while we are here - */ - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - - cur_trans = trans->transaction; - /* - * set the flushing flag so procs in this transaction have to - * start sending their work down. + * We only want one transaction commit doing the flushing so we do not + * waste a bunch of time on lock contention on the extent root node. */ - cur_trans->delayed_refs.flushing = 1; - smp_wmb(); + if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, + &cur_trans->delayed_refs.flags)) { + /* + * Make a pass through all the delayed refs we have so far. + * Any running threads may add more while we are here. + */ + ret = btrfs_run_delayed_refs(trans, 0); + if (ret) { + btrfs_end_transaction(trans); + return ret; + } + } btrfs_create_pending_block_groups(trans); From 61a56a992fcfc694a54de77d896350b9d0588e86 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:21 -0500 Subject: [PATCH 199/307] btrfs: delayed refs pre-flushing should only run the heads we have Previously our delayed ref running used the total number of items as the items to run. 
However we changed that to number of heads to run with the delayed_refs_rsv, as generally we want to run all of the operations for one bytenr. But with btrfs_run_delayed_refs(trans, 0) we set our count to 2x the number of items that we have. This is generally fine, but if we have some operation generating loads of delayed refs while we're doing this pre-flushing in the transaction commit, we'll just spin forever doing delayed refs. Fix this to simply pick the number of delayed refs we currently have, that way we do not end up doing a lot of extra work that's being generated in other threads. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6f0c59debc2b..0943731f7edd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2113,7 +2113,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (count == 0) - count = atomic_read(&delayed_refs->num_entries) * 2; + count = delayed_refs->num_heads_ready; again: #ifdef SCRAMBLE_DELAYED_REFS From ad368f3394b796fd7faa46da8d326c98718f21d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:22 -0500 Subject: [PATCH 200/307] btrfs: only run delayed refs once before committing We try to pre-flush the delayed refs when committing, because we want to do as little work as possible in the critical section of the transaction commit. However doing this twice can lead to very long transaction commit delays as other threads are allowed to continue to generate more delayed refs, which potentially delays the commit by multiple minutes in very extreme cases. So simply stick to one pre-flush, and then continue the rest of the transaction commit. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1485f7722f47..7bb58c3ddcd1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2061,12 +2061,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_create_pending_block_groups(trans); - ret = btrfs_run_delayed_refs(trans, 0); - if (ret) { - btrfs_end_transaction(trans); - return ret; - } - if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; From 2a4d84c11a872551a335cfe3ee8b60af67ded109 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:23 -0500 Subject: [PATCH 201/307] btrfs: move delayed ref flushing for qgroup into qgroup helper The commit d67263354541 ("btrfs: qgroup: Make snapshot accounting work with new extent-oriented qgroup.") added a flush of the delayed refs during snapshot creation in order to get the qgroup accounting properly. However this code has changed and been moved to its own helper that is skipped if qgroups are turned off. Move the flushing to the helper, as we do not need it when qgroups are turned off. Also add a comment explaining why it exists, and why it doesn't actually save us. This will be helpful later when we try to fix qgroup accounting properly.
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7bb58c3ddcd1..dbbd46417534 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1432,6 +1432,23 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, */ record_root_in_trans(trans, src, 1); + /* + * btrfs_qgroup_inherit relies on a consistent view of the usage for the + * src root, so we must run the delayed refs here. + * + * However this isn't particularly fool proof, because there's no + * synchronization keeping us from changing the tree after this point + * before we do the qgroup_inherit, or even from making changes while + * we're doing the qgroup_inherit. But that's a problem for the future, + * for now flush the delayed refs to narrow the race window where the + * qgroup counters could end up wrong. + */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + /* * We are going to commit transaction, see btrfs_commit_transaction() * comment for reason locking tree_log_mutex @@ -1685,12 +1702,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. 
From b7774425e0c08d8558be3a072b0c3e0b806b95f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:24 -0500 Subject: [PATCH 202/307] btrfs: remove bogus BUG_ON in alloc_reserved_tree_block The fix 361048f586f5 ("Btrfs: fix full backref problem when inserting shared block reference") added a delayed ref flushing at subvolume creation time in order to avoid hitting this particular BUG_ON(). Before this fix, we were tripping the BUG_ON() by 1. Modify snapshot A, which creates blocks with a normal reference for snapshot A, as A is the owner of these blocks. We now have delayed refs for these blocks. 2. Create a snapshot of A named B, which pushes references for the children blocks of the root node for the new root B, thus creating more delayed refs for newly allocated blocks. 3. A is modified, and because the metadata blocks can now be shared, it must push FULL_BACKREF references to the children of any block that A COWs down its path to its target key. 4. Delayed refs are run. Because these are newly allocated blocks, we have ->must_insert_reserved set on the delayed ref head, we call into alloc_reserved_tree_block() to add the extent item, and then add our ref. At the time of this fix, we were ordering FULL_BACKREF delayed ref operations first, so we'd go to add this reference and then BUG_ON() because we didn't have the FULL_BACKREF flag set. The patch fixed this problem by making sure we ran the delayed refs before we had the chance to modify A. This meant that any *new* blocks would have had their extent items created _before_ we would ever actually COW down and generate FULL_BACKREF entries. Thus the problem went away. However this BUG_ON() is actually completely bogus. The existence of a full backref doesn't necessarily mean that FULL_BACKREF must be set on that block, it must only be set on the actual parent itself. Consider the example provided above.
If we COW down one path from A, any nodes are going to have a FULL_BACKREF ref pushed down to _all_ of their children, but not all of the children are going to have FULL_BACKREF set. It is completely valid to have an extent item with normal and full backrefs without FULL_BACKREF actually set on the block itself. As a final note, I have been testing with the patch (applied after this one) btrfs: stop running all delayed refs during snapshot which removed this flushing. My test was a torture test which did a lot of operations while snapshotting and deleting snapshots as well as relocation, and I never tripped this BUG_ON(). This is actually because at the time of 361048f586f5, we ordered SHARED keys _before_ normal references, and thus they would get run first. However currently they are ordered _after_ normal references, so we'd do the initial creation without having a shared reference, and thus not hit this BUG_ON(), which explains why I didn't start hitting this problem during my testing with my other patch applied. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0943731f7edd..5476ab84e544 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4426,7 +4426,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_SHARED_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent); From dac348e9257051e7a39224747695b53e3fc737d7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:25 -0500 Subject: [PATCH 203/307] btrfs: stop running all delayed refs during snapshot This was added in commit 361048f586f5 ("Btrfs: fix full backref problem when inserting shared block reference") to address a problem where we hit the following BUG_ON() in alloc_reserved_tree_block if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) { BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); However this BUG_ON() is bogus, and was removed by previous commit: btrfs: remove bogus BUG_ON in alloc_reserved_tree_block We no longer need to run delayed refs because of this, and can remove this flushing here. 
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dbbd46417534..02592c6ce755 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1749,12 +1749,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: From 488bc2a2d21e5faf14f9f695bb592ae9dd0e7465 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Dec 2020 14:24:26 -0500 Subject: [PATCH 204/307] btrfs: run delayed refs less often in commit_cowonly_roots We love running delayed refs in commit_cowonly_roots, but it is a bit excessive. I was seeing cases of running 3 or 4 refs a few times in a row during this time. Instead simply: - update all of the roots first - then run delayed refs - then handle the empty block groups case - and then if we have any more dirty roots do the whole thing again This allows us to be much more efficient with our delayed ref running, as we can batch a few more operations at once. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 02592c6ce755..b83e8ae38cfc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1226,10 +1226,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) btrfs_tree_unlock(eb); free_extent_buffer(eb); - if (ret) - return ret; - - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; @@ -1247,10 +1243,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) if (ret) return ret; - /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; @@ -1265,15 +1257,24 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); - if (ret) - return ret; } + /* Now flush any delayed refs generated by updating all of the roots */ + ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + if (ret) + return ret; + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; + + /* + * We're writing the dirty block groups, which could generate + * delayed refs, which could generate more dirty block groups, + * so we want to keep this flushing in this loop to make sure + * everything gets run. 
+ */ ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) return ret; From 8898038309876e5b8e535eac9d4b9fe4e3d6f5b3 Mon Sep 17 00:00:00 2001 From: Roman Anasal Date: Mon, 25 Jan 2021 20:43:25 +0100 Subject: [PATCH 205/307] btrfs: send: use struct send_ctx *sctx for btrfs_compare_trees and changed_cb btrfs_compare_trees and changed_cb use a void *ctx parameter instead of struct send_ctx *sctx but when used in changed_cb it is immediately cast to `struct send_ctx *sctx = ctx;`. changed_cb is only ever called from btrfs_compare_trees and full_send_tree: - full_send_tree already passes a struct send_ctx *sctx - btrfs_compare_trees is only called by send_subvol with a struct send_ctx *sctx - void *ctx in btrfs_compare_trees is only used to be passed to changed_cb So casting to/from void *ctx seems unnecessary and directly using struct send_ctx *sctx instead provides better type-safety. The original reason for using void *ctx in the first place seems to have been dropped with 1b51d6fce45e ("btrfs: send: remove indirect callback parameter for changed_cb"). Signed-off-by: Roman Anasal Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3bcbf2bcb869..f87878274e9f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6591,10 +6591,9 @@ static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, - void *ctx) + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || @@ -6799,7 +6798,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. 
*/ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, void *ctx) + struct btrfs_root *right_root, struct send_ctx *sctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6951,7 +6950,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; } @@ -6962,7 +6961,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; } @@ -6976,7 +6975,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, - ctx); + sctx); if (ret < 0) goto out; advance_left = ADVANCE; @@ -6984,7 +6983,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, - ctx); + sctx); if (ret < 0) goto out; advance_right = ADVANCE; @@ -6999,7 +6998,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, else result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, - &left_key, result, ctx); + &left_key, result, sctx); if (ret < 0) goto out; advance_left = ADVANCE; From 91e79a83fff663283341c8c29293faec8255099a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:18 -0400 Subject: [PATCH 206/307] btrfs: make flush_space take a enum btrfs_flush_state instead of int I got a automated message from somebody who runs clang against our kernels and it's because I used the wrong enum type for what I passed into flush_space, caught by -Wenum-conversion. Change the argument to be explicitly the enum we're expecting to make everything consistent. Maybe eventually gcc will catch errors like this. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index fd8e79e3c10e..4eab581b1b9c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -670,7 +670,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - int state) + enum btrfs_flush_state state) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -923,7 +923,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; - int flush_state; + enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; @@ -1055,7 +1055,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 last_tickets_id; - int flush_state = 0; + enum btrfs_flush_state flush_state = 0; fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); space_info = fs_info->data_sinfo; From ac1ea10e757a57fb61512ae9beb2ef67e5340e31 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:19 -0400 Subject: [PATCH 207/307] btrfs: add a trace point for reserve tickets While debugging a ENOSPC related performance problem I needed to see the time difference between start and end of a reserve ticket, so add a trace point to report when we handle a reserve ticket. I opted to spit out start_ns itself without calculating the difference because there could be a gap between enabling the tracepoint and setting start_ns. Doing it this way allows us to filter on 0 start_ns so we don't get bogus entries, and we can easily calculate the time difference with bpftrace or something else. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 12 +++++++++++- include/trace/events/btrfs.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4eab581b1b9c..d879e3fea0b6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1220,6 +1220,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation + * @start_ns: timestamp when the reservation started + * @orig_bytes: amount of bytes originally reserved * @flush: how much we can flush * * This does the work of figuring out how to flush for the ticket, waiting for @@ -1228,6 +1230,7 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, + u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; @@ -1283,6 +1286,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * space wasn't reserved at all). 
*/ ASSERT(!(ticket->bytes == 0 && ticket->error)); + trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, + start_ns, flush, ticket->error); return ret; } @@ -1317,6 +1322,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, { struct work_struct *async_work; struct reserve_ticket ticket; + u64 start_ns = 0; u64 used; int ret = 0; bool pending_tickets; @@ -1369,6 +1375,9 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); + if (trace_btrfs_reserve_ticket_enabled()) + start_ns = ktime_get_ns(); + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || flush == BTRFS_RESERVE_FLUSH_DATA) { @@ -1405,7 +1414,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, flush); + return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, + orig_bytes, flush); } /** diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b9896fc06160..b0ea2a108be3 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2026,6 +2026,35 @@ TRACE_EVENT(btrfs_convert_extent_bit, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); +TRACE_EVENT(btrfs_reserve_ticket, + TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, + u64 start_ns, int flush, int error), + + TP_ARGS(fs_info, flags, bytes, start_ns, flush, error), + + TP_STRUCT__entry_btrfs( + __field( u64, flags ) + __field( u64, bytes ) + __field( u64, start_ns ) + __field( int, flush ) + __field( int, error ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->flags = flags; + __entry->bytes = bytes; + __entry->start_ns = start_ns; + __entry->flush = flush; + __entry->error = error; + ), + + TP_printk_btrfs("flags=%s bytes=%llu start_ns=%llu flush=%s 
error=%d", + __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->bytes, __entry->start_ns, + __print_symbolic(__entry->flush, FLUSH_ACTIONS), + __entry->error) +); + DECLARE_EVENT_CLASS(btrfs_sleep_tree_lock, TP_PROTO(const struct extent_buffer *eb, u64 start_ns), From 5deb17e18e27a3502f21581ba4d086e762b86b31 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:20 -0400 Subject: [PATCH 208/307] btrfs: track ordered bytes instead of just dio ordered bytes We track dio_bytes because the shrink delalloc code needs to know if we have more DIO in flight than we have normal buffered IO. The reason for this is because we can't "flush" DIO, we have to just wait on the ordered extents to finish. However this is true of all ordered extents. If we have more ordered space outstanding than dirty pages we should be waiting on ordered extents. We already are ok on this front technically, because we always do a FLUSH_DELALLOC_WAIT loop, but I want to use the ordered counter in the preemptive flushing code as well, so change this to count all ordered bytes instead of just DIO ordered bytes. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/ordered-data.c | 13 ++++++------- fs/btrfs/space-info.c | 18 +++++++----------- 4 files changed, 18 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ed6bb46a2572..7d8660227520 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -797,7 +797,7 @@ struct btrfs_fs_info { /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; - struct percpu_counter dio_bytes; + struct percpu_counter ordered_bytes; s32 dirty_metadata_batch; s32 delalloc_batch; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5473bed6a7e8..e0d56b3d1223 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1469,7 +1469,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) { percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); - percpu_counter_destroy(&fs_info->dio_bytes); + percpu_counter_destroy(&fs_info->ordered_bytes); percpu_counter_destroy(&fs_info->dev_replace.bio_counter); btrfs_free_csum_hash(fs_info); btrfs_free_stripe_hash_table(fs_info); @@ -2802,7 +2802,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); - ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL); + ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL); if (ret) return ret; @@ -4163,9 +4163,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_sum(&fs_info->delalloc_bytes)); } - if (percpu_counter_sum(&fs_info->dio_bytes)) + if (percpu_counter_sum(&fs_info->ordered_bytes)) btrfs_info(fs_info, "at unmount dio bytes count %lld", - percpu_counter_sum(&fs_info->dio_bytes)); + 
percpu_counter_sum(&fs_info->ordered_bytes)); btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b4e6500548a2..e8dee1578d4a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -206,11 +206,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset type == BTRFS_ORDERED_COMPRESSED); set_bit(type, &entry->flags); - if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes, + fs_info->delalloc_batch); + + if (dio) set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); - } /* one ref for the tree */ refcount_set(&entry->refs, 1); @@ -503,9 +503,8 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, false); - if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, - fs_info->delalloc_batch); + percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, + fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d879e3fea0b6..711beacd75d6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -489,7 +489,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, { struct btrfs_trans_handle *trans; u64 delalloc_bytes; - u64 dio_bytes; + u64 ordered_bytes; u64 items; long time_left; int loops; @@ -513,25 +513,20 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + ordered_bytes = 
percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) return; - } /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ - if (dio_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes) wait_ordered = true; loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { + while ((delalloc_bytes || ordered_bytes) && loops < 3) { u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; long nr_pages = min_t(u64, temp, LONG_MAX); @@ -556,7 +551,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + ordered_bytes = percpu_counter_sum_positive( + &fs_info->ordered_bytes); } } From f00c42dd4cc8b856e68638e6a88b51f88b8e849e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:21 -0400 Subject: [PATCH 209/307] btrfs: introduce a FORCE_COMMIT_TRANS flush operation Solely for preemptive flushing, we want to be able to force the transaction commit without any of the ambiguity of may_commit_transaction(). This is because may_commit_transaction() checks tickets and such, and in preemptive flushing we already know it'll be helpful, so use this to keep the code nice and clean and straightforward. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add comment ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/space-info.c | 14 ++++++++++++++ include/trace/events/btrfs.h | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7d8660227520..90726954b883 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2740,6 +2740,7 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, + FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 711beacd75d6..e677b5451f82 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -140,6 +140,12 @@ * be freed, plus any delayed work we may not have gotten rid of in the case * of metadata. * + * FORCE_COMMIT_TRANS + * For use by the preemptive flusher. We use this to bypass the ticketing + * checks in may_commit_transaction, as we have more information about the + * overall state of the system and may want to commit the transaction ahead + * of actual ENOSPC conditions. 
+ * * OVERCOMMIT * * Because we hold so many reservations for metadata we will allow you to @@ -735,6 +741,14 @@ static void flush_space(struct btrfs_fs_info *fs_info, case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info); break; + case FORCE_COMMIT_TRANS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_commit_transaction(trans); + break; default: ret = -ENOSPC; break; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b0ea2a108be3..8a7c163907a2 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -99,7 +99,8 @@ struct btrfs_space_info; EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ - EMe(COMMIT_TRANS, "COMMIT_TRANS") + EM( COMMIT_TRANS, "COMMIT_TRANS") \ + EMe(FORCE_COMMIT_TRANS, "FORCE_COMMIT_TRANS") /* * First define the enums in the above macros to be exported to userspace via From 576fa34830afac6a40cd19c777f1ab49c914e87c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:22 -0400 Subject: [PATCH 210/307] btrfs: improve preemptive background space flushing Currently if we ever have to flush space because we do not have enough we allocate a ticket and attach it to the space_info, and then systematically flush things in the filesystem that hold space reservations until our space is reclaimed. However this has a latency cost, we must go to sleep and wait for the flushing to make progress before we are woken up and allowed to continue doing our work. In order to address that we used to kick off the async worker to flush space preemptively, so that we could be reclaiming space hopefully before any tasks needed to stop and wait for space to reclaim. When I introduced the ticketed ENOSPC stuff this broke slightly in the fact that we were using tickets to indicate if we were done flushing. No tickets, no more flushing. 
However this meant that we essentially never preemptively flushed. This caused a write performance regression that Nikolay noticed in an unrelated patch that removed the committing of the transaction during btrfs_end_transaction. The behavior that happened pre that patch was btrfs_end_transaction() would see that we were low on space, and it would commit the transaction. This was bad because in this particular case you could end up with thousands and thousands of transactions being committed during the 5 minute reproducer. With the patch to remove this behavior we got much more sane transaction commits, but we ended up slower because we would write for a while, flush, write for a while, flush again. To address this we need to reinstate a preemptive flushing mechanism. However it is distinctly different from our ticketing flushing in that it doesn't have tickets to base its decisions on. Instead of bolting this logic into our existing flushing work, add another worker to handle this preemptive flushing. Here we will attempt to be slightly intelligent about the things that we are flushing, attempting to balance between whichever pool is taking up the most space. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/space-info.c | 100 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 90726954b883..a9b0521d9e89 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -933,6 +933,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background.
*/ struct work_struct async_reclaim_work; struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0d56b3d1223..e0d1b328397e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4111,6 +4111,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); + cancel_work_sync(&fs_info->preempt_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e677b5451f82..8a27c193f8a8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1000,6 +1000,100 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } +/* + * This handles pre-flushing of metadata space before we get to the point that + * we need to start blocking threads on tickets. The logic here is different + * from the other flush paths because it doesn't rely on tickets to tell us how + * much we need to flush, instead it attempts to keep us below the 80% full + * watermark of space by flushing whichever reservation pool is currently the + * largest. 
+ */ +static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + struct btrfs_block_rsv *delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv; + struct btrfs_block_rsv *trans_rsv; + u64 used; + + fs_info = container_of(work, struct btrfs_fs_info, + preempt_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + delayed_block_rsv = &fs_info->delayed_block_rsv; + delayed_refs_rsv = &fs_info->delayed_refs_rsv; + global_rsv = &fs_info->global_block_rsv; + trans_rsv = &fs_info->trans_block_rsv; + + spin_lock(&space_info->lock); + used = btrfs_space_info_used(space_info, true); + while (need_do_async_reclaim(fs_info, space_info, used)) { + enum btrfs_flush_state flush; + u64 delalloc_size = 0; + u64 to_reclaim, block_rsv_size; + u64 global_rsv_size = global_rsv->reserved; + + /* + * We don't have a precise counter for the metadata being + * reserved for delalloc, so we'll approximate it by subtracting + * out the block rsv's space from the bytes_may_use. If that + * amount is higher than the individual reserves, then we can + * assume it's tied up in delalloc reservations. + */ + block_rsv_size = global_rsv_size + + delayed_block_rsv->reserved + + delayed_refs_rsv->reserved + + trans_rsv->reserved; + if (block_rsv_size < space_info->bytes_may_use) + delalloc_size = space_info->bytes_may_use - block_rsv_size; + spin_unlock(&space_info->lock); + + /* + * We don't want to include the global_rsv in our calculation, + * because that's space we can't touch. Subtract it from the + * block_rsv_size for the next checks. + */ + block_rsv_size -= global_rsv_size; + + /* + * We really want to avoid flushing delalloc too much, as it + * could result in poor allocation patterns, so only flush it if + * it's larger than the rest of the pools combined. 
+ */ + if (delalloc_size > block_rsv_size) { + to_reclaim = delalloc_size; + flush = FLUSH_DELALLOC; + } else if (space_info->bytes_pinned > + (delayed_block_rsv->reserved + + delayed_refs_rsv->reserved)) { + to_reclaim = space_info->bytes_pinned; + flush = FORCE_COMMIT_TRANS; + } else if (delayed_block_rsv->reserved > + delayed_refs_rsv->reserved) { + to_reclaim = delayed_block_rsv->reserved; + flush = FLUSH_DELAYED_ITEMS_NR; + } else { + to_reclaim = delayed_refs_rsv->reserved; + flush = FLUSH_DELAYED_REFS_NR; + } + + /* + * We don't want to reclaim everything, just a portion, so scale + * down the to_reclaim by 1/4. If it takes us down to 0, + * reclaim 1 items worth. + */ + to_reclaim >>= 2; + if (!to_reclaim) + to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); + flush_space(fs_info, space_info, to_reclaim, flush); + cond_resched(); + spin_lock(&space_info->lock); + used = btrfs_space_info_used(space_info, true); + } + spin_unlock(&space_info->lock); +} + /* * FLUSH_DELALLOC_WAIT: * Space is freed from flushing delalloc in one of two ways. 
@@ -1126,6 +1220,8 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); + INIT_WORK(&fs_info->preempt_reclaim_work, + btrfs_preempt_reclaim_metadata_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1413,11 +1509,11 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && need_do_async_reclaim(fs_info, space_info, used) && - !work_busy(&fs_info->async_reclaim_work)) { + !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + &fs_info->preempt_reclaim_work); } } spin_unlock(&space_info->lock); From ae7913ba52ec4a2883eb073c6bc99f1a8d9d636b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:23 -0400 Subject: [PATCH 211/307] btrfs: rename need_do_async_reclaim All of our normal flushing is asynchronous reclaim, so this helper is poorly named. This is more checking if we need to preemptively flush space, so rename it to need_preemptive_reclaim. Also switch it to bool and make it plain static as followup patches will move more code here. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8a27c193f8a8..effb9b73a418 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -808,18 +808,18 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; } -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) +static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; + return false; if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) - return 0; + return false; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -1028,7 +1028,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) spin_lock(&space_info->lock); used = btrfs_space_info_used(space_info, true); - while (need_do_async_reclaim(fs_info, space_info, used)) { + while (need_preemptive_reclaim(fs_info, space_info, used)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; @@ -1508,7 +1508,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. 
*/ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, used) && + need_preemptive_reclaim(fs_info, space_info, used) && !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); From f205edf77315a33eee82a7615fb57e9297957fe9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:24 -0400 Subject: [PATCH 212/307] btrfs: check reclaim_size in need_preemptive_reclaim If we're flushing space for tickets then we have space_info->reclaim_size set and we do not need to do background reclaim. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index effb9b73a418..9f30d6837eb5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -818,6 +818,13 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return false; + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) return false; From 9f42d37748264d65ca611b60c22b9c003030b0b3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:25 -0400 Subject: [PATCH 213/307] btrfs: rework btrfs_calc_reclaim_metadata_size Currently btrfs_calc_reclaim_metadata_size does two things, it returns the space currently required for flushing by the tickets, and if there are no tickets it calculates a value for the preemptive flushing. However for the normal ticketed flushing we really only care about the space required for tickets. We will accidentally come in and flush one time, but as soon as we see there are no tickets we bail out of our flushing. 
Fix this by making btrfs_calc_reclaim_metadata_size really only tell us what is required for flushing if we have people waiting on space. Then move the preemptive flushing logic into need_preemptive_reclaim(). We ignore btrfs_calc_reclaim_metadata_size() in need_preemptive_reclaim() because if we are in this path then we made our reservation and there are not pending tickets currently, so we do not need to check it, simply do the fuzzy logic to check if we're getting low on space. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 44 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9f30d6837eb5..636a42620b11 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -765,7 +765,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; - u64 expected; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -783,28 +782,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, if (space_info->total_bytes + avail < used) to_reclaim += used - (space_info->total_bytes + avail); - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); return to_reclaim; } @@ -813,6 +790,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 used) { 
u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 to_reclaim, expected; /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) @@ -825,7 +803,25 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if (space_info->reclaim_size) return false; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) + return false; + + used = btrfs_space_info_used(space_info, true); + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + if (!to_reclaim) return false; return (used >= thresh && !btrfs_fs_closing(fs_info) && From 2e294c60497f29ab8791f4b99f348b22d70dd3c3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:26 -0400 Subject: [PATCH 214/307] btrfs: simplify the logic in need_preemptive_flushing A lot of this was added all in one go with no explanation, and is a bit unwieldy and confusing. Simplify the logic to start preemptive flushing if we've reserved more than half of our available free space. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 73 ++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 636a42620b11..9befd22a2316 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -786,11 +786,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, } static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used) + struct btrfs_space_info *space_info) { + u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); - u64 to_reclaim, expected; + u64 used; /* If we're just plain full then async reclaim just slows us down. */ if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) @@ -803,26 +803,52 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, if (space_info->reclaim_size) return false; - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) - return false; + /* + * If we have over half of the free space occupied by reservations or + * pinned then we want to start flushing. + * + * We do not do the traditional thing here, which is to say + * + * if (used >= ((total_bytes + avail) / 2)) + * return 1; + * + * because this doesn't quite work how we want. If we had more than 50% + * of the space_info used by bytes_used and we had 0 available we'd just + * constantly run the background flusher. Instead we want it to kick in + * if our reclaimable space exceeds 50% of our available free space. 
+ */ + thresh = calc_available_free_space(fs_info, space_info, + BTRFS_RESERVE_FLUSH_ALL); + thresh += (space_info->total_bytes - space_info->bytes_used - + space_info->bytes_reserved - space_info->bytes_readonly); + thresh >>= 1; - used = btrfs_space_info_used(space_info, true); - if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); + used = space_info->bytes_pinned; - if (used > expected) - to_reclaim = used - expected; + /* + * If we have more ordered bytes than delalloc bytes then we're either + * doing a lot of DIO, or we simply don't have a lot of delalloc waiting + * around. Preemptive flushing is only useful in that it can free up + * space before tickets need to wait for things to finish. In the case + * of ordered extents, preemptively waiting on ordered extents gets us + * nothing, if our reservations are tied up in ordered extents we'll + * simply have to slow down writers by forcing them to wait on ordered + * extents. + * + * In the case that ordered is larger than delalloc, only include the + * block reserves that we would actually be able to directly reclaim + * from. In this case if we're heavy on metadata operations this will + * clearly be heavy enough to warrant preemptive flushing. In the case + * of heavy DIO or ordered reservations, preemptive flushing will just + * waste time and cause us to slow down. 
+ */ + ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + if (ordered >= delalloc) + used += fs_info->delayed_refs_rsv.reserved + + fs_info->delayed_block_rsv.reserved; else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - if (!to_reclaim) - return false; + used += space_info->bytes_may_use; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -1019,7 +1045,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) struct btrfs_block_rsv *delayed_refs_rsv; struct btrfs_block_rsv *global_rsv; struct btrfs_block_rsv *trans_rsv; - u64 used; fs_info = container_of(work, struct btrfs_fs_info, preempt_reclaim_work); @@ -1030,8 +1055,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); - used = btrfs_space_info_used(space_info, true); - while (need_preemptive_reclaim(fs_info, space_info, used)) { + while (need_preemptive_reclaim(fs_info, space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; @@ -1092,7 +1116,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush_space(fs_info, space_info, to_reclaim, flush); cond_resched(); spin_lock(&space_info->lock); - used = btrfs_space_info_used(space_info, true); } spin_unlock(&space_info->lock); } @@ -1511,7 +1534,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. 
*/ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_preemptive_reclaim(fs_info, space_info, used) && + need_preemptive_reclaim(fs_info, space_info) && !work_busy(&fs_info->preempt_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); From 88a777a6e5272106bdc96b1032d89b0ddc0e526f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:27 -0400 Subject: [PATCH 215/307] btrfs: implement space clamping for preemptive flushing Starting preemptive flushing at 50% of available free space is a good start, but some workloads are particularly abusive and can quickly overwhelm the preemptive flushing code and drive us into using tickets. Handle this by clamping down on our threshold for starting and continuing to run preemptive flushing. This is particularly important for our overcommit case, as we can really drive the file system into overages and then it's more difficult to pull it back as we start to actually fill up the file system. The clamping is essentially 2^CLAMP, but we start at 1 so whatever we calculate for overcommit is the baseline. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 53 +++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/space-info.h | 4 ++++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9befd22a2316..e8acf087dcee 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -212,6 +212,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->ro_bgs); INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -815,13 +816,28 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * because this doesn't quite work how we want. 
If we had more than 50% * of the space_info used by bytes_used and we had 0 available we'd just * constantly run the background flusher. Instead we want it to kick in - * if our reclaimable space exceeds 50% of our available free space. + * if our reclaimable space exceeds our clamped free space. + * + * Our clamping range is 2^1 -> 2^8. Practically speaking that means + * the following: + * + * Amount of RAM Minimum threshold Maximum threshold + * + * 256GiB 1GiB 128GiB + * 128GiB 512MiB 64GiB + * 64GiB 256MiB 32GiB + * 32GiB 128MiB 16GiB + * 16GiB 64MiB 8GiB + * + * These are the range our thresholds will fall in, corresponding to how + * much delalloc we need for the background flusher to kick in. */ + thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); thresh += (space_info->total_bytes - space_info->bytes_used - space_info->bytes_reserved - space_info->bytes_readonly); - thresh >>= 1; + thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1045,6 +1061,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) struct btrfs_block_rsv *delayed_refs_rsv; struct btrfs_block_rsv *global_rsv; struct btrfs_block_rsv *trans_rsv; + int loops = 0; fs_info = container_of(work, struct btrfs_fs_info, preempt_reclaim_work); @@ -1061,6 +1078,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) u64 to_reclaim, block_rsv_size; u64 global_rsv_size = global_rsv->reserved; + loops++; + /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1117,6 +1136,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) cond_resched(); spin_lock(&space_info->lock); } + + /* We only went through once, back off our clamping. 
*/ + if (loops == 1 && !space_info->reclaim_size) + space_info->clamp = max(1, space_info->clamp - 1); spin_unlock(&space_info->lock); } @@ -1433,6 +1456,24 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } +static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); + u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + + /* + * If we're heavy on ordered operations then clamping won't help us. We + * need to clamp specifically to keep up with dirty'ing buffered + * writers, because there's not a 1:1 correlation of writing delalloc + * and freeing space, like there is with flushing delayed refs or + * delayed nodes. If we're already more ordered than delalloc then + * we're keeping up, otherwise we aren't and should probably clamp. + */ + if (ordered < delalloc) + space_info->clamp = min(space_info->clamp + 1, 8); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1526,6 +1567,14 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } + + /* + * We were forced to add a reserve ticket, so our preemptive + * flushing is unable to keep up. Clamp down on the threshold + * for the preemptive flushing in order to keep up with the + * workload. + */ + maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 74706f604bce..e237156ce888 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -22,6 +22,10 @@ struct btrfs_space_info { the space info if we had an ENOSPC in the allocator. */ + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. 
*/ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ From 4b02b00fe5f1377f3dbfb168dfcfebf3d7a9632f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:28 -0400 Subject: [PATCH 216/307] btrfs: adjust the flush trace point to include the source Since we have normal ticketed flushing and preemptive flushing, adjust the tracepoint so that we know the source of the flushing action to make it easier to debug problems. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++++++-------- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e8acf087dcee..bb4f85c07738 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -673,7 +673,7 @@ enospc: */ static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - enum btrfs_flush_state state) + enum btrfs_flush_state state, bool for_preempt) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -756,7 +756,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, } trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); + ret, for_preempt); return; } @@ -997,7 +997,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1132,7 +1132,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); - 
flush_space(fs_info, space_info, to_reclaim, flush); + flush_space(fs_info, space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } @@ -1223,7 +1223,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1236,7 +1236,7 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) while (flush_state < ARRAY_SIZE(data_flush_states)) { flush_space(fs_info, space_info, U64_MAX, - data_flush_states[flush_state]); + data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1309,7 +1309,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, flush_state = 0; do { - flush_space(fs_info, space_info, to_reclaim, states[flush_state]); + flush_space(fs_info, space_info, to_reclaim, states[flush_state], + false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -1325,7 +1326,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket) { while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8a7c163907a2..807921de6b32 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1113,15 +1113,16 @@ TRACE_EVENT(btrfs_trigger_flush, TRACE_EVENT(btrfs_flush_space, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes, - int state, int ret), + int state, int ret, 
bool for_preempt), - TP_ARGS(fs_info, flags, num_bytes, state, ret), + TP_ARGS(fs_info, flags, num_bytes, state, ret, for_preempt), TP_STRUCT__entry_btrfs( __field( u64, flags ) __field( u64, num_bytes ) __field( int, state ) __field( int, ret ) + __field( bool, for_preempt ) ), TP_fast_assign_btrfs(fs_info, @@ -1129,15 +1130,16 @@ TRACE_EVENT(btrfs_flush_space, __entry->num_bytes = num_bytes; __entry->state = state; __entry->ret = ret; + __entry->for_preempt = for_preempt; ), - TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d", + TP_printk_btrfs("state=%d(%s) flags=%llu(%s) num_bytes=%llu ret=%d for_preempt=%d", __entry->state, __print_symbolic(__entry->state, FLUSH_STATES), __entry->flags, __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS), - __entry->num_bytes, __entry->ret) + __entry->num_bytes, __entry->ret, __entry->for_preempt) ); DECLARE_EVENT_CLASS(btrfs__reserved_extent, From e5ad49e215a07562f0a765c68161d13d7c23d8d1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Oct 2020 09:28:29 -0400 Subject: [PATCH 217/307] btrfs: add a trace class for dumping the current ENOSPC state Often when I'm debugging ENOSPC related issues I have to resort to printing the entire ENOSPC state with trace_printk() in different spots. This gets pretty annoying, so add a trace state that does this for us. Then add a trace point at the end of preemptive flushing so you can see the state of the space_info when we decide to exit preemptive flushing. This helped me figure out we weren't kicking in the preemptive flushing soon enough. 
Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 1 + include/trace/events/btrfs.h | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bb4f85c07738..bccd98141a6e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1140,6 +1140,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) /* We only went through once, back off our clamping. */ if (loops == 1 && !space_info->reclaim_size) space_info->clamp = max(1, space_info->clamp - 1); + trace_btrfs_done_preemptive_reclaim(fs_info, space_info); spin_unlock(&space_info->lock); } diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 807921de6b32..0551ea65374f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -2029,6 +2029,68 @@ TRACE_EVENT(btrfs_convert_extent_bit, __print_flags(__entry->clear_bits, "|", EXTENT_FLAGS)) ); +DECLARE_EVENT_CLASS(btrfs_dump_space_info, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo), + + TP_ARGS(fs_info, sinfo), + + TP_STRUCT__entry_btrfs( + __field( u64, flags ) + __field( u64, total_bytes ) + __field( u64, bytes_used ) + __field( u64, bytes_pinned ) + __field( u64, bytes_reserved ) + __field( u64, bytes_may_use ) + __field( u64, bytes_readonly ) + __field( u64, reclaim_size ) + __field( int, clamp ) + __field( u64, global_reserved ) + __field( u64, trans_reserved ) + __field( u64, delayed_refs_reserved ) + __field( u64, delayed_reserved ) + __field( u64, free_chunk_space ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->flags = sinfo->flags; + __entry->total_bytes = sinfo->total_bytes; + __entry->bytes_used = sinfo->bytes_used; + __entry->bytes_pinned = sinfo->bytes_pinned; + __entry->bytes_reserved = sinfo->bytes_reserved; + __entry->bytes_may_use = sinfo->bytes_may_use; + 
__entry->bytes_readonly = sinfo->bytes_readonly; + __entry->reclaim_size = sinfo->reclaim_size; + __entry->clamp = sinfo->clamp; + __entry->global_reserved = fs_info->global_block_rsv.reserved; + __entry->trans_reserved = fs_info->trans_block_rsv.reserved; + __entry->delayed_refs_reserved = fs_info->delayed_refs_rsv.reserved; + __entry->delayed_reserved = fs_info->delayed_block_rsv.reserved; + __entry->free_chunk_space = atomic64_read(&fs_info->free_chunk_space); + ), + + TP_printk_btrfs("flags=%s total_bytes=%llu bytes_used=%llu " + "bytes_pinned=%llu bytes_reserved=%llu " + "bytes_may_use=%llu bytes_readonly=%llu " + "reclaim_size=%llu clamp=%d global_reserved=%llu " + "trans_reserved=%llu delayed_refs_reserved=%llu " + "delayed_reserved=%llu chunk_free_space=%llu", + __print_flags(__entry->flags, "|", BTRFS_GROUP_FLAGS), + __entry->total_bytes, __entry->bytes_used, + __entry->bytes_pinned, __entry->bytes_reserved, + __entry->bytes_may_use, __entry->bytes_readonly, + __entry->reclaim_size, __entry->clamp, + __entry->global_reserved, __entry->trans_reserved, + __entry->delayed_refs_reserved, + __entry->delayed_reserved, __entry->free_chunk_space) +); + +DEFINE_EVENT(btrfs_dump_space_info, btrfs_done_preemptive_reclaim, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo), + TP_ARGS(fs_info, sinfo) +); + TRACE_EVENT(btrfs_reserve_ticket, TP_PROTO(const struct btrfs_fs_info *fs_info, u64 flags, u64 bytes, u64 start_ns, int flush, int error), From 2965194b7700f9405860557826520fd6e8e8b9ad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 15:05:41 +0000 Subject: [PATCH 218/307] btrfs: remove wrong comment for can_nocow_extent() The comment for can_nocow_extent() says that the function will flush ordered extents, however that never happens and was never true before the comment was added in commit e4ecaf90bc13 ("btrfs: add comments for btrfs_check_can_nocow() and can_nocow_extent()"). 
This is true only for the function btrfs_check_can_nocow(), which after that commit was renamed to check_can_nocow(). So just remove that part of the comment. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0dbe1aaa0b71..589030cefd90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7105,9 +7105,6 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * - * This function will flush ordered extents in the range to ensure proper - * nocow checks for (nowait == false) case. - * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write From a4559e6f6f3a4e84cb788ac158fb419ece473527 Mon Sep 17 00:00:00 2001 From: Abaci Team Date: Wed, 27 Jan 2021 16:11:37 +0800 Subject: [PATCH 219/307] btrfs: simplify condition in __btrfs_run_delayed_items Fix the following coccicheck warnings: ./fs/btrfs/delayed-inode.c:1157:39-41: WARNING !A || A && B is equivalent to !A || B. 
Reported-by: Abaci Robot Suggested-by: Jiapeng Zhong Reviewed-by: Josef Bacik Signed-off-by: Abaci Team Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 70c0340d839c..ec0b50b8c5d6 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1154,7 +1154,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) delayed_root = fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root); - while (curr_node && (!count || (count && nr--))) { + while (curr_node && (!count || nr--)) { ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { From 951c80f83d61bd4b21794c8aba829c3c1a45c2d0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 27 Jan 2021 14:38:48 +0800 Subject: [PATCH 220/307] btrfs: fix double accounting of ordered extent for subpage case in btrfs_invalidatepage Commit dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could span across a page") made btrfs_invalidatepage() search all ordered extents. The offending code looks like this: again: start = page_start; ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); /* Do the cleanup */ start = end + 1; if (start < page_end) goto again; } The behavior is indeed necessary for the incoming subpage support, but when it iterates through all the ordered extents, it also resets the search range @start. This means, for the following cases, we can double account the ordered extents, causing their bytes_left to underflow: Page offset 0 16K 32K |<--- OE 1 --->|<--- OE 2 ---->| As the first iteration will find ordered extent (OE) 1, which doesn't cover the full page, thus after cleanup code, we need to retry again.
But the again label will reset start to page_start, and we get OE 1 again, which causes double accounting on OE 1, and causes OE 1's bytes_left to underflow. This problem can only happen for the subpage case, as for the regular sectorsize == PAGE_SIZE case, we will always find an OE that ends at or after page end, thus no way to trigger the problem. Move the again label after start = page_start. There will be more comprehensive rework to convert the open coded loop to a proper while loop for subpage support. Fixes: dbfdb6d1b369 ("Btrfs: Search for all ordered extents that could span across a page") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 589030cefd90..680cd0eea6d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8183,8 +8183,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); -again: + start = page_start; +again: ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { found_ordered = true; From 420343131970fd29db129b308612f9364b06df0b Mon Sep 17 00:00:00 2001 From: Michal Rostecki Date: Wed, 27 Jan 2021 14:57:27 +0100 Subject: [PATCH 221/307] btrfs: let callers of btrfs_get_io_geometry pass the em Before this change, the btrfs_get_io_geometry() function was calling btrfs_get_chunk_map() to get the extent mapping, necessary for calculating the I/O geometry. It was using that extent mapping only internally and freeing the pointer after its execution. That resulted in calling btrfs_get_chunk_map() de facto twice by the __btrfs_map_block() function. It was calling btrfs_get_io_geometry() first and then calling btrfs_get_chunk_map() directly to get the extent mapping, used by the rest of the function.
Change that to passing the extent mapping to the btrfs_get_io_geometry() function as an argument. This could improve performance in some cases. For very large filesystems, i.e. several thousands of allocated chunks, not only this avoids searching two times the rbtree, saving time, it may also help reducing contention on the lock that protects the tree - thinking of writeback starting for multiple inodes, other tasks allocating or removing chunks, and anything else that requires access to the rbtree. Reviewed-by: Filipe Manana Signed-off-by: Michal Rostecki Reviewed-by: David Sterba [ add Filipe's analysis ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++++++++++++----------- fs/btrfs/volumes.c | 43 ++++++++++++++++++------------------------- fs/btrfs/volumes.h | 5 +++-- 3 files changed, 51 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 680cd0eea6d1..04cd95899ac8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2183,9 +2183,10 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + struct extent_map *em; u64 length = 0; u64 map_length; - int ret; + int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) @@ -2193,14 +2194,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, - &geom); + em = btrfs_get_chunk_map(fs_info, logical, map_length); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, + map_length, &geom); if (ret < 0) - return ret; + goto out; if (geom.len < length + size) - return 1; - return 0; + ret = 1; +out: + free_extent_map(em); + return ret; } /* @@ -7938,10 +7944,12 @@ static 
blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 submit_len; int clone_offset = 0; int clone_len; + u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; struct btrfs_dio_data *dio_data = iomap->private; + struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7970,12 +7978,18 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, submit_len = dio_bio->bi_iter.bi_size; do { - ret = btrfs_get_io_geometry(fs_info, btrfs_op(dio_bio), - start_sector << 9, submit_len, - &geom); + logical = start_sector << 9; + em = btrfs_get_chunk_map(fs_info, logical, submit_len); + if (IS_ERR(em)) { + status = errno_to_blk_status(PTR_ERR(em)); + em = NULL; + goto out_err_em; + } + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), + logical, submit_len, &geom); if (ret) { status = errno_to_blk_status(ret); - goto out_err; + goto out_err_em; } ASSERT(geom.len <= INT_MAX); @@ -8020,19 +8034,24 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio_put(bio); if (submit_len > 0) refcount_dec(&dip->refs); - goto out_err; + goto out_err_em; } dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; + + free_extent_map(em); } while (submit_len > 0); return BLK_QC_T_NONE; +out_err_em: + free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a8ec8539cd8d..3948f5b50d11 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5940,23 +5940,24 @@ static bool need_full_stripe(enum btrfs_map_op op) } /* - * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) - * tuple. This information is used to calculate how big a - * particular bio can get before it straddles a stripe. 
+ * Calculate the geometry of a particular (address, len) tuple. This + * information is used to calculate how big a particular bio can get before it + * straddles a stripe. * - * @fs_info - the filesystem - * @logical - address that we want to figure out the geometry of - * @len - the length of IO we are going to perform, starting at @logical - * @op - type of operation - write or read - * @io_geom - pointer used to return values + * @fs_info: the filesystem + * @em: mapping containing the logical extent + * @op: type of operation - write or read + * @logical: address that we want to figure out the geometry of + * @len: the length of IO we are going to perform, starting at @logical + * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom) { - struct extent_map *em; struct map_lookup *map; u64 offset; u64 stripe_offset; @@ -5964,14 +5965,9 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; - int ret = 0; ASSERT(op != BTRFS_MAP_DISCARD); - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) - return PTR_ERR(em); - map = em->map_lookup; /* Offset of this logical address in the chunk */ offset = logical - em->start; @@ -5985,8 +5981,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, btrfs_crit(fs_info, "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", stripe_offset, offset, em->start, logical, stripe_len); - ret = -EINVAL; - goto out; 
+ return -EINVAL; } /* stripe_offset is the offset of this block in its stripe */ @@ -6033,10 +6028,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, io_geom->stripe_offset = stripe_offset; io_geom->raid56_stripe_offset = raid56_full_stripe_start; -out: - /* once for us */ - free_extent_map(em); - return ret; + return 0; } static int __btrfs_map_block(struct btrfs_fs_info *fs_info, @@ -6069,12 +6061,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ASSERT(bbio_ret); ASSERT(op != BTRFS_MAP_DISCARD); - ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(!IS_ERR(em)); + + ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); if (ret < 0) return ret; - em = btrfs_get_chunk_map(fs_info, logical, *length); - ASSERT(!IS_ERR(em)); map = em->map_lookup; *length = geom.len; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c43663d9c22e..04e2b26823c2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -440,8 +440,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); -int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 len, struct btrfs_io_geometry *io_geom); +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, + enum btrfs_map_op op, u64 logical, u64 len, + struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); From ddffcf6fb5ac54ffcd7e90b10554d89dbd10b47b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:54 +0000 Subject: [PATCH 222/307] btrfs: remove unnecessary directory inode item update when deleting 
dir entry When we remove a directory entry, as part of an unlink operation, if the directory was logged before we must remove the directory index items from the log. We are also updating the inode item of the directory to update its i_size, but that is not necessary because during log replay we do not need it and we correctly adjust the i_size in the inode item of the subvolume as we process directory index items and replay deletes. This is not needed since commit d555438b6e1dad ("Btrfs: drop dir i_size when adding new names on replay"), where we explicitly ignore the i_size of directory inode items on log replay. Before that we used it but it was buggy as mentioned in that commit's change log (i_size got a larger value then it should have). So stop updating the i_size of the directory inode item in the log, as that is a waste of time, adds more log contention to the log tree and often results in COWing more extent buffers for the log tree. This code path is triggered often during dbench workloads for example. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 39 ++++----------------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8ee0700a980f..5d87afc6058a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3379,7 +3379,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_path *path; int ret; int err = 0; - int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) @@ -3406,7 +3405,6 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; @@ -3421,46 +3419,17 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); - bytes_del += name_len; if (ret) { err = ret; goto fail; } } - /* update the directory size in the log to reflect the names - * we have removed + /* + * We do not need to update the size field of the directory's inode item + * because on log replay we update the field to reflect all existing + * entries in the directory (see overwrite_item()). 
*/ - if (bytes_del) { - struct btrfs_key key; - - key.objectid = dir_ino; - key.offset = 0; - key.type = BTRFS_INODE_ITEM_KEY; - btrfs_release_path(path); - - ret = btrfs_search_slot(trans, log, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 0) { - struct btrfs_inode_item *item; - u64 i_size; - - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - i_size = btrfs_inode_size(path->nodes[0], item); - if (i_size > bytes_del) - i_size -= bytes_del; - else - i_size = 0; - btrfs_set_inode_size(path->nodes[0], item, i_size); - btrfs_mark_buffer_dirty(path->nodes[0]); - } else - ret = 0; - btrfs_release_path(path); - } fail: btrfs_free_path(path); out_unlock: From e593e54ed1f643f5007ab4656188b7c3c9a9cb11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:55 +0000 Subject: [PATCH 223/307] btrfs: stop setting nbytes when filling inode item for logging When we fill an inode item for logging we are setting its nbytes field with the value returned by inode_get_bytes() (a VFS API), however we do not need it because it is not used during log replay. In fact, for fast fsyncs, when we call inode_get_bytes() we may even get an outdated value for nbytes because the nbytes field of the inode is only updated when ordered extents complete, and a fast fsync only waits for writeback to complete, it does not wait for ordered extent completion. So just remove the setup of nbytes and add an explicit comment mentioning why we do not set it. This also avoids adding contention on the inode's i_lock (VFS) with concurrent stat() calls, since that spinlock is used by inode_get_bytes() which is also called by our stat callback (btrfs_getattr()). 
This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5d87afc6058a..be62759f0aac 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3858,7 +3858,14 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_timespec_nsec(&token, &item->ctime, inode->i_ctime.tv_nsec); - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait + * for ordered extent completion, which is where we update nbytes, it + * only waits for writeback to complete. During log replay as we find + * file extent items and replay them, we adjust the nbytes field of the + * inode item in subvolume tree as needed (see overwrite_item()). 
+ */ btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); From ab12313a9f56b939529abc80ac26bedefb3d5b62 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:56 +0000 Subject: [PATCH 224/307] btrfs: avoid logging new ancestor inodes when logging new inode When we fsync a new file, created in the current transaction, we check all its ancestor inodes and always log them if they were created in the current transaction - even if we have already logged them before, which is a waste of time. So avoid logging new ancestor inodes if they were already logged before and have no xattrs added/updated/removed since they were last logged. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index be62759f0aac..105cf316ee27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5272,6 +5272,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->vfs_inode.i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); @@ -5520,6 +5521,34 @@ out: return ret; } +/* + * Check if we need to log an inode. This is used in contexts where while + * logging an inode we need to log another inode (either that it exists or in + * full mode). This is used instead of btrfs_inode_in_log() because the later + * requires the inode to be in the log and have the log transaction committed, + * while here we do not care if the log transaction was already committed - our + * caller will commit the log later - and we want to avoid logging an inode + * multiple times when multiple tasks have joined the same log transaction. + */ +static bool need_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + /* + * If this inode does not have new/updated/deleted xattrs since the last + * time it was logged and is flagged as logged in the current transaction, + * we can skip logging it. As for new/deleted names, those are updated in + * the log by link/unlink/rename operations. + * In case the inode was logged and then evicted and reloaded, its + * logged_trans will be 0, in which case we have to fully log it since + * logged_trans is a transient field, not persisted. 
+ */ + if (inode->logged_trans == trans->transid && + !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags)) + return false; + + return true; +} + struct btrfs_dir_list { u64 ino; struct list_head list; @@ -5848,7 +5877,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid) + if (BTRFS_I(inode)->generation >= trans->transid && + need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); @@ -5902,7 +5932,8 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (root != inode->root) break; - if (inode->generation >= trans->transid) { + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) { ret = btrfs_log_inode(trans, root, inode, LOG_INODE_EXISTS, ctx); if (ret) From 3e6a86a193b08039a382807c56421622c3ff4368 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:57 +0000 Subject: [PATCH 225/307] btrfs: skip logging directories already logged when logging all parents Sometimes when we fsync an inode we need to do a full log of all its ancestors (due to unlink, link or rename operations), which can be an expensive operation, especially if the directories are large. However if we find an ancestor directory inode that is already logged in the current transaction, and has no inserted/updated/deleted xattrs since it was last logged, we can skip logging the directory again. We are safe to skip that since we know that for logged directories, any link, unlink or rename operations that implicate the directory will update the log as necessary. So use the helper need_log_inode(), introduced in a previous commit, to detect already logged directories that can be skipped.
This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 105cf316ee27..c0dce99c2c14 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5826,6 +5826,11 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } + if (!need_log_inode(trans, BTRFS_I(dir_inode))) { + btrfs_add_delayed_iput(dir_inode); + continue; + } + if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), From 0e44cb3f94284d33067fc74e30990a0ed5b3540d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:58 +0000 Subject: [PATCH 226/307] btrfs: skip logging inodes already logged when logging new entries When logging new directory entries of a directory, we log the inodes of new dentries and the inodes of dentries pointing to directories that may have been created in past transactions. For the case of directories we log in full mode, which can be particularly expensive for large directories. We do use btrfs_inode_in_log() to skip already logged inodes, however for that helper to return true, it requires that the log transaction used to log the inode to be already committed. 
This means that when we have more than one task using the same log transaction we can end up logging an inode multiple times, which is a waste of time and not necessary since the log will be committed by one of the tasks and the others will wait for the log transaction to be committed before returning to user space. So simply replace the use of btrfs_inode_in_log() with the new helper function need_log_inode(), introduced in a previous commit. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. 
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c0dce99c2c14..6dc376a16cf2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5676,7 +5676,7 @@ process_leaf: goto next_dir_inode; } - if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) { + if (!need_log_inode(trans, BTRFS_I(di_inode))) { btrfs_add_delayed_iput(di_inode); break; } From 64d6b281ba4db044c946158387c74e1149b9487e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:34:59 +0000 Subject: [PATCH 227/307] btrfs: remove unnecessary check_parent_dirs_for_sync() Whenever we fsync an inode, if it is a directory, a regular file that was created in the current transaction or has last_unlink_trans set to the generation of the current transaction, we check if any of its ancestor inodes (and the inode itself if it is a directory) can not be logged and need a fallback to a full transaction commit - if so, we return with a value of 1 in order to fallback to a transaction commit. However we often do not need to fallback to a transaction commit because: 1) The ancestor inode is not an immediate parent, and therefore there is not an explicit request to log it and it is not needed neither to guarantee the consistency of the inode originally asked to be logged (fsynced) nor its immediate parent; 2) The ancestor inode was already logged before, in which case any link, unlink or rename operation updates the log as needed. So for these two cases we can avoid an unnecessary transaction commit. Therefore remove check_parent_dirs_for_sync() and add a check at the top of btrfs_log_inode() to make us fallback immediately to a transaction commit when we are logging a directory inode that can not be logged and needs a full transaction commit. 
All we need to protect is the case where after renaming a file someone fsyncs only the old directory, which would result in losing the renamed file after a log replay. This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Performance results, after applying all patches, are mentioned in the change log of the last patch. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 121 ++++++-------------------------------------- 1 file changed, 15 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6dc376a16cf2..4c7b283ed2b2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5265,6 +5265,21 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&inode->log_mutex); } + /* + * This is for cases where logging a directory could result in losing a + * a file after replaying the log. For example, if we move a file from a + * directory A to a directory B, then fsync directory A, we have no way + * to known the file was moved from A to B, so logging just A would + * result in losing the file after a log replay. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); + err = 1; + goto out_unlock; + } + /* * a brute force approach to making sure we get the most uptodate * copies of everything.
@@ -5428,99 +5443,6 @@ out_unlock: return err; } -/* - * Check if we must fallback to a transaction commit when logging an inode. - * This must be called after logging the inode and is used only in the context - * when fsyncing an inode requires the need to log some other inode - in which - * case we can't lock the i_mutex of each other inode we need to log as that - * can lead to deadlocks with concurrent fsync against other inodes (as we can - * log inodes up or down in the hierarchy) or rename operations for example. So - * we take the log_mutex of the inode after we have logged it and then check for - * its last_unlink_trans value - this is safe because any task setting - * last_unlink_trans must take the log_mutex and it must do this before it does - * the actual unlink operation, so if we do this check before a concurrent task - * sets last_unlink_trans it means we've logged a consistent version/state of - * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task might have only updated last_unlink_trans before - * we logged the inode or it might have also done the unlink). - */ -static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode) -{ - bool ret = false; - - mutex_lock(&inode->log_mutex); - if (inode->last_unlink_trans >= trans->transid) { - /* - * Make sure any commits to the log are forced to be full - * commits. - */ - btrfs_set_log_full_commit(trans); - ret = true; - } - mutex_unlock(&inode->log_mutex); - - return ret; -} - -/* - * follow the dentry parent pointers up the chain and see if any - * of the directories in it require a full commit before they can - * be logged. Returns zero if nothing special needs to be done or 1 if - * a full commit is required. 
- */ -static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, - struct dentry *parent, - struct super_block *sb) -{ - int ret = 0; - struct dentry *old_parent = NULL; - - /* - * for regular files, if its inode is already on disk, we don't - * have to worry about the parents at all. This is because - * we can use the last_unlink_trans field to record renames - * and other fun in this file. - */ - if (S_ISREG(inode->vfs_inode.i_mode) && - inode->generation < trans->transid && - inode->last_unlink_trans < trans->transid) - goto out; - - if (!S_ISDIR(inode->vfs_inode.i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - goto out; - inode = BTRFS_I(d_inode(parent)); - } - - while (1) { - if (btrfs_must_commit_transaction(trans, inode)) { - ret = 1; - break; - } - - if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) - break; - - if (IS_ROOT(parent)) { - inode = BTRFS_I(d_inode(parent)); - if (btrfs_must_commit_transaction(trans, inode)) - ret = 1; - break; - } - - parent = dget_parent(parent); - dput(old_parent); - old_parent = parent; - inode = BTRFS_I(d_inode(parent)); - - } - dput(old_parent); -out: - return ret; -} - /* * Check if we need to log an inode. 
This is used in contexts where while * logging an inode we need to log another inode (either that it exists or in @@ -5686,9 +5608,6 @@ process_leaf: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), log_mode, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) - ret = 1; btrfs_add_delayed_iput(di_inode); if (ret) goto next_dir_inode; @@ -5835,9 +5754,6 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && - btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) - ret = 1; if (!ret && ctx && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); @@ -6053,12 +5969,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct super_block *sb; int ret = 0; bool log_dentries = false; - sb = inode->vfs_inode.i_sb; - if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; @@ -6069,10 +5982,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = check_parent_dirs_for_sync(trans, inode, parent, sb); - if (ret) - goto end_no_trans; - /* * Skip already logged inodes or inodes corresponding to tmpfiles * (since logging them is pointless, a link count of 0 means they From d0c2f4fa555e70324ec2a129b822ab58f172cc62 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 27 Jan 2021 10:35:00 +0000 Subject: [PATCH 228/307] btrfs: make concurrent fsyncs wait less when waiting for a transaction commit Often an fsync needs to fallback to a transaction commit for several reasons (to ensure consistency after a power failure, a new block group was allocated or a temporary error such as ENOMEM or ENOSPC happened). 
In that case the log is marked as needing a full commit and any concurrent tasks attempting to log inodes or commit the log will also fallback to the transaction commit. When this happens they all wait for the task that first started the transaction commit to finish the transaction commit - however they wait until the full transaction commit happens, which is not needed, as they only need to wait for the superblocks to be persisted and not for unpinning all the extents pinned during the transaction's lifetime, which even for short lived transactions can be a few thousand and take some significant amount of time to complete - for dbench workloads I have observed up to 4~5 milliseconds of time spent unpinning extents in the worst cases, and the number of pinned extents was between 2 to 3 thousand. So allow fsync tasks to skip waiting for the unpinning of extents when they call btrfs_commit_transaction() and they were not the task that started the transaction commit (that one has to do it, the alternative would be to offload the transaction commit to another task so that it could avoid waiting for the extent unpinning or offload the extent unpinning to another task). This patch is part of a patchset comprised of the following patches: btrfs: remove unnecessary directory inode item update when deleting dir entry btrfs: stop setting nbytes when filling inode item for logging btrfs: avoid logging new ancestor inodes when logging new inode btrfs: skip logging directories already logged when logging all parents btrfs: skip logging inodes already logged when logging new entries btrfs: remove unnecessary check_parent_dirs_for_sync() btrfs: make concurrent fsyncs wait less when waiting for a transaction commit After applying the entire patchset, dbench shows improvements in respect to throughput and latency. 
The script used to measure it is the following: $ cat dbench-test.sh #!/bin/bash DEV=/dev/sdk MNT=/mnt/sdk MOUNT_OPTIONS="-o ssd" MKFS_OPTIONS="-m single -d single" echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor umount $DEV &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -D $MNT -t 300 64 umount $MNT The test was run on a physical machine with 12 cores (Intel Core i7), 64G of RAM, using an NVMe device and a non-debug kernel configuration (Debian's default configuration). Before applying patchset, 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9627107 0.153 61.938 Close 7072076 0.001 3.175 Rename 407633 1.222 44.439 Unlink 1943895 0.658 44.440 Deltree 256 17.339 110.891 Mkdir 128 0.003 0.009 Qpathinfo 8725406 0.064 17.850 Qfileinfo 1529516 0.001 2.188 Qfsinfo 1599884 0.002 1.457 Sfileinfo 784200 0.005 3.562 Find 3373513 0.411 30.312 WriteX 4802132 0.053 29.054 ReadX 15089959 0.002 5.801 LockX 31344 0.002 0.425 UnlockX 31344 0.001 0.173 Flush 674724 5.952 341.830 Throughput 1008.02 MB/sec 32 clients 32 procs max_latency=341.833 ms After applying patchset, 32 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9931568 0.111 25.597 Close 7295730 0.001 2.171 Rename 420549 0.982 49.714 Unlink 2005366 0.497 39.015 Deltree 256 11.149 89.242 Mkdir 128 0.002 0.014 Qpathinfo 9001863 0.049 20.761 Qfileinfo 1577730 0.001 2.546 Qfsinfo 1650508 0.002 3.531 Sfileinfo 809031 0.005 5.846 Find 3480259 0.309 23.977 WriteX 4952505 0.043 41.283 ReadX 15568127 0.002 5.476 LockX 32338 0.002 0.978 UnlockX 32338 0.001 2.032 Flush 696017 7.485 228.835 Throughput 1049.91 MB/sec 32 clients 32 procs max_latency=228.847 ms --> +4.1% throughput, -39.6% max latency Before applying patchset, 64 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 8956748 0.342 108.312 Close 6579660
0.001 3.823 Rename 379209 2.396 81.897 Unlink 1808625 1.108 131.148 Deltree 256 25.632 172.176 Mkdir 128 0.003 0.018 Qpathinfo 8117615 0.131 55.916 Qfileinfo 1423495 0.001 2.635 Qfsinfo 1488496 0.002 5.412 Sfileinfo 729472 0.007 8.643 Find 3138598 0.855 78.321 WriteX 4470783 0.102 79.442 ReadX 14038139 0.002 7.578 LockX 29158 0.002 0.844 UnlockX 29158 0.001 0.567 Flush 627746 14.168 506.151 Throughput 924.738 MB/sec 64 clients 64 procs max_latency=506.154 ms After applying patchset, 64 clients: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9069003 0.303 43.193 Close 6662328 0.001 3.888 Rename 383976 2.194 46.418 Unlink 1831080 1.022 43.873 Deltree 256 24.037 155.763 Mkdir 128 0.002 0.005 Qpathinfo 8219173 0.137 30.233 Qfileinfo 1441203 0.001 3.204 Qfsinfo 1507092 0.002 4.055 Sfileinfo 738775 0.006 5.431 Find 3177874 0.936 38.170 WriteX 4526152 0.084 39.518 ReadX 14213562 0.002 24.760 LockX 29522 0.002 1.221 UnlockX 29522 0.001 0.694 Flush 635652 14.358 422.039 Throughput 990.13 MB/sec 64 clients 64 procs max_latency=422.043 ms --> +6.8% throughput, -18.1% max latency Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 1 + fs/btrfs/transaction.c | 39 +++++++++++++++++++++++++++++++-------- fs/btrfs/transaction.h | 2 ++ 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d81ae1f518f2..be5350f5bedf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2238,6 +2238,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = PTR_ERR(trans); goto out_release_extents; } + trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); btrfs_release_log_ctx_extents(&ctx); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index b83e8ae38cfc..00c0680dac3a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -107,6 +107,11 @@ static const unsigned int 
btrfs_blocked_trans_types[TRANS_STATE_MAX] = { __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), + [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | + __TRANS_ATTACH | + __TRANS_JOIN | + __TRANS_JOIN_NOLOCK | + __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | @@ -826,10 +831,11 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) return trans; } -/* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_transaction *commit) +/* Wait for a transaction commit to reach at least the given state. */ +static noinline void wait_for_commit(struct btrfs_transaction *commit, + const enum btrfs_trans_state min_state) { - wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); + wait_event(commit->commit_wait, commit->state >= min_state); } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) @@ -884,7 +890,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) goto out; /* nothing committing|committed */ } - wait_for_commit(cur_trans); + wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); btrfs_put_transaction(cur_trans); out: return ret; @@ -2100,11 +2106,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; + spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans); - wait_for_commit(cur_trans); + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + ret = btrfs_end_transaction(trans); + wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; @@ -2118,13 +2128,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); if (cur_trans->list.prev != &fs_info->trans_list) { + enum btrfs_trans_state 
want_state = TRANS_STATE_COMPLETED; + + if (trans->in_fsync) + want_state = TRANS_STATE_SUPER_COMMITTED; + prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); - if (prev_trans->state != TRANS_STATE_COMPLETED) { + if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); - wait_for_commit(prev_trans); + wait_for_commit(prev_trans, want_state); + ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); @@ -2343,6 +2359,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto scrub_continue; + /* + * We needn't acquire the lock here because there is no other task + * which can change it. + */ + cur_trans->state = TRANS_STATE_SUPER_COMMITTED; + wake_up(&cur_trans->commit_wait); + btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 31ca81bad822..935bd6958a8a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -16,6 +16,7 @@ enum btrfs_trans_state { TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, + TRANS_STATE_SUPER_COMMITTED, TRANS_STATE_COMPLETED, TRANS_STATE_MAX, }; @@ -133,6 +134,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool dirty; + bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct list_head new_bgs; From 6869b0a8be775e920be54ee9b69a743ca20d8332 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:45 +0800 Subject: [PATCH 229/307] btrfs: merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK to PAGE_START_WRITEBACK PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK are two defines used in __process_pages_contig(), to let the function know to clear page dirty bit and then set page writeback. 
However page writeback and dirty bits are conflicting (at least for sector size == PAGE_SIZE case), this means these two have to be always updated together. This means we can merge PAGE_CLEAR_DIRTY and PAGE_SET_WRITEBACK to PAGE_START_WRITEBACK. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 12 ++++++------ fs/btrfs/inode.c | 28 ++++++++++------------------ 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index edcdbd739a1e..7c14ccf76838 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1976,10 +1976,10 @@ static int __process_pages_contig(struct address_space *mapping, pages_processed++; continue; } - if (page_ops & PAGE_CLEAR_DIRTY) + if (page_ops & PAGE_START_WRITEBACK) { clear_page_dirty_for_io(pages[i]); - if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + } if (page_ops & PAGE_SET_ERROR) SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 19221095c635..2d8187c84812 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -35,12 +35,12 @@ enum { /* these are flags for __process_pages_contig */ #define PAGE_UNLOCK (1 << 0) -#define PAGE_CLEAR_DIRTY (1 << 1) -#define PAGE_SET_WRITEBACK (1 << 2) -#define PAGE_END_WRITEBACK (1 << 3) -#define PAGE_SET_PRIVATE2 (1 << 4) -#define PAGE_SET_ERROR (1 << 5) -#define PAGE_LOCK (1 << 6) +/* Page starts writeback, clear dirty bit and set writeback bit */ +#define PAGE_START_WRITEBACK (1 << 1) +#define PAGE_END_WRITEBACK (1 << 2) +#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ERROR (1 << 4) +#define PAGE_LOCK (1 << 5) /* * page->private values. 
Every page that is controlled by the extent diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04cd95899ac8..3337c8ee7928 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -692,8 +692,7 @@ cont: NULL, clear_flags, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); @@ -933,8 +932,7 @@ retry: async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK); + PAGE_UNLOCK | PAGE_START_WRITEBACK); if (btrfs_submit_compressed_write(inode, async_extent->start, async_extent->ram_size, ins.objectid, @@ -970,9 +968,8 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR); + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); goto again; @@ -1070,8 +1067,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK); + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1194,8 +1190,7 @@ out_reserve: out_unlock: clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | - PAGE_END_WRITEBACK; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * If we reserved an extent for our delalloc range (or a subrange) and * failed to create the respective ordered extent, then it means that @@ -1320,9 +1315,8 @@ static int cow_file_range_async(struct btrfs_inode *inode, unsigned clear_bits = EXTENT_LOCKED 
| EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; - unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | - PAGE_SET_ERROR; + unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); @@ -1519,8 +1513,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return -ENOMEM; } @@ -1842,8 +1835,7 @@ error: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | - PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | + PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; From 62c053fbb2d1816def1d353d9abed4c2f1f0abe9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:46 +0800 Subject: [PATCH 230/307] btrfs: set UNMAPPED bit early in btrfs_clone_extent_buffer() for subpage support For the incoming subpage support, UNMAPPED extent buffer will have different behavior in btrfs_release_extent_buffer(). This means we need to set UNMAPPED bit early before calling btrfs_release_extent_buffer(). Currently there is only one caller which relies on btrfs_release_extent_buffer() in its error path while setting the UNMAPPED bit late: - btrfs_clone_extent_buffer() Make it subpage compatible by setting the UNMAPPED bit early, since we're here, also move the UPTODATE bit early. There is another caller, __alloc_dummy_extent_buffer(), setting UNMAPPED bit late, but that function cleans up the allocated page manually, thus no need for any modification.
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7c14ccf76838..d3819dde8952 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5064,6 +5064,13 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) if (new == NULL) return NULL; + /* + * Set UNMAPPED before calling btrfs_release_extent_buffer(), as + * btrfs_release_extent_buffer() have different behavior for + * UNMAPPED subpage extent buffer. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + for (i = 0; i < num_pages; i++) { p = alloc_page(GFP_NOFS); if (!p) { @@ -5076,9 +5083,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); - set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); return new; } From cac06d843f259ebc4d03e4bc8af7304c17f76ee5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:47 +0800 Subject: [PATCH 231/307] btrfs: introduce the skeleton of btrfs_subpage structure For sectorsize < page size support, we need a structure to record extra status info for each sector of a page. Introduce the skeleton structure, all subpage related code would go to subpage.[ch]. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 3 ++- fs/btrfs/subpage.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 1 - 4 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 fs/btrfs/subpage.c create mode 100644 fs/btrfs/subpage.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index e45957319424..b634c42115ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -27,7 +27,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o discard.o reflink.o + block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ + subpage.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c new file mode 100644 index 000000000000..a3e5b6a13d54 --- /dev/null +++ b/fs/btrfs/subpage.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "ctree.h" +#include "subpage.h" + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type) +{ + struct btrfs_subpage *subpage; + + /* + * We have cases like a dummy extent buffer page, which is not mappped + * and doesn't need to be locked. 
+ */ + if (page->mapping) + ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ + if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + return 0; + + subpage = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!subpage) + return -ENOMEM; + + spin_lock_init(&subpage->lock); + attach_page_private(page, subpage); + return 0; +} + +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + /* Either not subpage, or already detached */ + if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + return; + + subpage = (struct btrfs_subpage *)detach_page_private(page); + ASSERT(subpage); + kfree(subpage); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h new file mode 100644 index 000000000000..676280bc7562 --- /dev/null +++ b/fs/btrfs/subpage.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUBPAGE_H +#define BTRFS_SUBPAGE_H + +#include + +/* + * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap + * is sufficient. Regular bitmap_* is not used due to size reasons. + */ +#define BTRFS_SUBPAGE_BITMAP_SIZE 16 + +/* + * Structure to trace status of each sector inside a page, attached to + * page::private for both data and metadata inodes. 
+ */ +struct btrfs_subpage { + /* Common members for both data and metadata pages */ + spinlock_t lock; +}; + +enum btrfs_subpage_type { + BTRFS_SUBPAGE_METADATA, + BTRFS_SUBPAGE_DATA, +}; + +int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, + struct page *page); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12d7d3be7cd4..919ed5c357e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,7 +48,6 @@ #include "tests/btrfs-tests.h" #include "block-group.h" #include "discard.h" - #include "qgroup.h" #define CREATE_TRACE_POINTS #include From 760f991f1428f25fd18b8638004c95f0a2a43b2f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:48 +0800 Subject: [PATCH 232/307] btrfs: make attach_extent_buffer_page() handle subpage case For subpage case, we need to allocate additional memory for each metadata page. So we need to: - Allow attach_extent_buffer_page() to return int to indicate allocation failure - Allow manually pre-allocate subpage memory for alloc_extent_buffer() As we don't want to use GFP_ATOMIC under spinlock, we introduce btrfs_alloc_subpage() and btrfs_free_subpage() functions for this purpose. (The simple wrap for btrfs_free_subpage() is for later convert to kmem_cache. Already internally tested without problem) - Preallocate btrfs_subpage structure for alloc_extent_buffer() We don't want to call memory allocation with spinlock held, so do preallocation before we acquire mapping->private_lock. - Handle subpage and regular case differently in attach_extent_buffer_page() For regular case, no change, just do the usual thing. For subpage case, allocate new memory or use the preallocated memory. For future subpage metadata, we will make use of radix tree to grab extent buffer. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 69 +++++++++++++++++++++++++++++++++++++++----- fs/btrfs/subpage.c | 30 +++++++++++++++---- fs/btrfs/subpage.h | 10 +++++++ 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d3819dde8952..e498d496560b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -24,6 +24,7 @@ #include "rcu-string.h" #include "backref.h" #include "disk-io.h" +#include "subpage.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -3141,9 +3142,13 @@ static int submit_extent_page(unsigned int opf, return ret; } -static void attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page) +static int attach_extent_buffer_page(struct extent_buffer *eb, + struct page *page, + struct btrfs_subpage *prealloc) { + struct btrfs_fs_info *fs_info = eb->fs_info; + int ret = 0; + /* * If the page is mapped to btree inode, we should hold the private * lock to prevent race. 
@@ -3153,10 +3158,28 @@ static void attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (fs_info->sectorsize == PAGE_SIZE) { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else + WARN_ON(page->private != (unsigned long)eb); + return 0; + } + + /* Already mapped, just free prealloc */ + if (PagePrivate(page)) { + btrfs_free_subpage(prealloc); + return 0; + } + + if (prealloc) + /* Has preallocated memory for subpage */ + attach_page_private(page, prealloc); else - WARN_ON(page->private != (unsigned long)eb); + /* Do new allocation to attach subpage */ + ret = btrfs_attach_subpage(fs_info, page, + BTRFS_SUBPAGE_METADATA); + return ret; } void set_page_extent_mapped(struct page *page) @@ -5072,12 +5095,19 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); for (i = 0; i < num_pages; i++) { + int ret; + p = alloc_page(GFP_NOFS); if (!p) { btrfs_release_extent_buffer(new); return NULL; } - attach_extent_buffer_page(new, p); + ret = attach_extent_buffer_page(new, p, NULL); + if (ret < 0) { + put_page(p); + btrfs_release_extent_buffer(new); + return NULL; + } WARN_ON(PageDirty(p)); SetPageUptodate(p); new->pages[i] = p; @@ -5315,12 +5345,33 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++, index++) { + struct btrfs_subpage *prealloc = NULL; + p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); goto free_eb; } + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock hold. The memory will be + * freed by attach_extent_buffer_page() or freed manually if + * we exit earlier. 
+ * + * Although we have ensured one subpage eb can only have one + * page, but it may change in the future for 16K page size + * support, so we still preallocate the memory in the loop. + */ + ret = btrfs_alloc_subpage(fs_info, &prealloc, + BTRFS_SUBPAGE_METADATA); + if (ret < 0) { + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } + spin_lock(&mapping->private_lock); exists = grab_extent_buffer(p); if (exists) { @@ -5328,10 +5379,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, unlock_page(p); put_page(p); mark_extent_buffer_accessed(exists, p); + btrfs_free_subpage(prealloc); goto free_eb; } - attach_extent_buffer_page(eb, p); + /* Should not fail, as we have preallocated the memory */ + ret = attach_extent_buffer_page(eb, p, prealloc); + ASSERT(!ret); spin_unlock(&mapping->private_lock); + WARN_ON(PageDirty(p)); eb->pages[i] = p; if (!PageUptodate(p)) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a3e5b6a13d54..61b28dfca20c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -7,7 +7,8 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { - struct btrfs_subpage *subpage; + struct btrfs_subpage *subpage = NULL; + int ret; /* * We have cases like a dummy extent buffer page, which is not mappped @@ -19,11 +20,9 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) return 0; - subpage = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); - if (!subpage) - return -ENOMEM; - - spin_lock_init(&subpage->lock); + ret = btrfs_alloc_subpage(fs_info, &subpage, type); + if (ret < 0) + return ret; attach_page_private(page, subpage); return 0; } @@ -39,5 +38,24 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, subpage = (struct btrfs_subpage *)detach_page_private(page); ASSERT(subpage); + btrfs_free_subpage(subpage); +} + +int btrfs_alloc_subpage(const struct 
btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type) +{ + if (fs_info->sectorsize == PAGE_SIZE) + return 0; + + *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); + if (!*ret) + return -ENOMEM; + spin_lock_init(&(*ret)->lock); + return 0; +} + +void btrfs_free_subpage(struct btrfs_subpage *subpage) +{ kfree(subpage); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 676280bc7562..7ba544bcc9c6 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -18,6 +18,10 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; + union { + /* Structures only used by metadata */ + /* Structures only used by data */ + }; }; enum btrfs_subpage_type { @@ -30,4 +34,10 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page); +/* Allocate additional data where page represents more than one sector */ +int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + struct btrfs_subpage **ret, + enum btrfs_subpage_type type); +void btrfs_free_subpage(struct btrfs_subpage *subpage); + #endif From 819822107d8837fc3363ceaeb172b981c8600a2b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:49 +0800 Subject: [PATCH 233/307] btrfs: make grab_extent_buffer_from_page() handle subpage case For subpage case, grab_extent_buffer() can't really get an extent buffer just from btrfs_subpage. We have radix tree lock protecting us from inserting the same eb into the tree. Thus we don't really need to do the extra hassle, just let alloc_extent_buffer() handle the existing eb in radix tree. Now if two ebs are being allocated at the same time, one will fail with -EEXIST when inserting into the radix tree. So for grab_extent_buffer(), just always return NULL for subpage case.
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e498d496560b..133ff4531472 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5284,10 +5284,19 @@ free_eb: } #endif -static struct extent_buffer *grab_extent_buffer(struct page *page) +static struct extent_buffer *grab_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page) { struct extent_buffer *exists; + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. So here we always + * return NULL and just continue. + */ + if (fs_info->sectorsize < PAGE_SIZE) + return NULL; + /* Page not yet attached to an extent buffer */ if (!PagePrivate(page)) return NULL; @@ -5373,7 +5382,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(p); + exists = grab_extent_buffer(fs_info, p); if (exists) { spin_unlock(&mapping->private_lock); unlock_page(p); From 8ff8466d29efc226648c3c5e57590428d798a6ea Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:50 +0800 Subject: [PATCH 234/307] btrfs: support subpage for extent buffer page release In btrfs_release_extent_buffer_pages(), we need to add extra handling for subpage. Introduce a helper, detach_extent_buffer_page(), to do different handling for regular and subpage cases. For subpage case, handle detaching page private. For unmapped (dummy or cloned) ebs, we can detach the page private immediately as the page can only be attached to one unmapped eb. For mapped ebs, we have to ensure there are no eb in the page range before we delete it, as page->private is shared between all ebs in the same page. 
But there is a subpage specific race, where we can race with extent buffer allocation, and clear the page private while new eb is still being utilized, like this: Extent buffer A is the new extent buffer which will be allocated, while extent buffer B is the last existing extent buffer of the page. T1 (eb A) | T2 (eb B) -------------------------------+------------------------------ alloc_extent_buffer() | btrfs_release_extent_buffer_pages() |- p = find_or_create_page() | | |- attach_extent_buffer_page() | | | | |- detach_extent_buffer_page() | | |- if (!page_range_has_eb()) | | | No new eb in the page range yet | | | As new eb A hasn't yet been | | | inserted into radix tree. | | |- btrfs_detach_subpage() | | |- detach_page_private(); |- radix_tree_insert() | Then we have a metadata eb whose page has no private bit. To avoid such race, we introduce a subpage metadata-specific member, btrfs_subpage::eb_refs. In alloc_extent_buffer() we increase eb_refs in the critical section of private_lock. Then page_range_has_eb() will return true for detach_extent_buffer_page(), and will not detach page private. The section is marked by: - btrfs_page_inc_eb_refs() - btrfs_page_dec_eb_refs() Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 94 +++++++++++++++++++++++++++++++++++++------- fs/btrfs/subpage.c | 42 ++++++++++++++++++++ fs/btrfs/subpage.h | 13 +++++- 3 files changed, 133 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 133ff4531472..1812813bdf63 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4995,25 +4995,39 @@ int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -/* - * Release all pages attached to the extent buffer. 
- */ -static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) { - int i; - int num_pages; - int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + struct btrfs_subpage *subpage; - BUG_ON(extent_buffer_under_io(eb)); + lockdep_assert_held(&page->mapping->private_lock); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + if (PagePrivate(page)) { + subpage = (struct btrfs_subpage *)page->private; + if (atomic_read(&subpage->eb_refs)) + return true; + } + return false; +} - if (!page) - continue; +static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); + + /* + * For mapped eb, we're going to change the page private, which should + * be done under the private_lock. + */ + if (mapped) + spin_lock(&page->mapping->private_lock); + + if (!PagePrivate(page)) { if (mapped) - spin_lock(&page->mapping->private_lock); + spin_unlock(&page->mapping->private_lock); + return; + } + + if (fs_info->sectorsize == PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5032,9 +5046,49 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) */ detach_page_private(page); } - if (mapped) spin_unlock(&page->mapping->private_lock); + return; + } + + /* + * For subpage, we can have dummy eb with page private. In this case, + * we can directly detach the private as such page is only attached to + * one dummy eb, no sharing. + */ + if (!mapped) { + btrfs_detach_subpage(fs_info, page); + return; + } + + btrfs_page_dec_eb_refs(fs_info, page); + + /* + * We can only detach the page private if there are no other ebs in the + * page range. 
+ */ + if (!page_range_has_eb(fs_info, page)) + btrfs_detach_subpage(fs_info, page); + + spin_unlock(&page->mapping->private_lock); +} + +/* Release all pages attached to the extent buffer */ +static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) +{ + int i; + int num_pages; + + ASSERT(!extent_buffer_under_io(eb)); + + num_pages = num_extent_pages(eb); + for (i = 0; i < num_pages; i++) { + struct page *page = eb->pages[i]; + + if (!page) + continue; + + detach_extent_buffer_page(eb, page); /* One for when we allocated the page */ put_page(page); @@ -5394,6 +5448,16 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* Should not fail, as we have preallocated the memory */ ret = attach_extent_buffer_page(eb, p, prealloc); ASSERT(!ret); + /* + * To inform we have extra eb under allocation, so that + * detach_extent_buffer_page() won't release the page private + * when the eb hasn't yet been inserted into radix tree. + * + * The ref will be decreased when the eb released the page, in + * detach_extent_buffer_page(). + * Thus needs no special handling in error path. + */ + btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 61b28dfca20c..a2a21fa0ea35 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -52,6 +52,8 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (!*ret) return -ENOMEM; spin_lock_init(&(*ret)->lock); + if (type == BTRFS_SUBPAGE_METADATA) + atomic_set(&(*ret)->eb_refs, 0); return 0; } @@ -59,3 +61,43 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) { kfree(subpage); } + +/* + * Increase the eb_refs of current subpage. + * + * This is important for eb allocation, to prevent race with last eb freeing + * of the same page. 
+ * With the eb_refs increased before the eb inserted into radix tree, + * detach_extent_buffer_page() won't detach the page private while we're still + * allocating the extent buffer. + */ +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + atomic_inc(&subpage->eb_refs); +} + +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->mapping); + lockdep_assert_held(&page->mapping->private_lock); + + subpage = (struct btrfs_subpage *)page->private; + ASSERT(atomic_read(&subpage->eb_refs)); + atomic_dec(&subpage->eb_refs); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7ba544bcc9c6..fe51cc237a66 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,7 +19,13 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; union { - /* Structures only used by metadata */ + /* + * Structures only used by metadata + * + * @eb_refs should only be operated under private_lock, as it + * manages whether the subpage can be detached. 
+ */ + atomic_t eb_refs; /* Structures only used by data */ }; }; @@ -40,4 +46,9 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); +void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); +void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, + struct page *page); + #endif From 09bc1f0fb845a6435e2c6c5d3c937f7a674e816a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:51 +0800 Subject: [PATCH 235/307] btrfs: attach private to dummy extent buffer pages There are locations where we allocate dummy extent buffers for temporary usage, like in tree_mod_log_rewind() or get_old_root(). These dummy extent buffers will be handled by the same eb accessors, and if they don't have page::private subpage eb accessors could fail. To address such problems, make __alloc_dummy_extent_buffer() attach page private for dummy extent buffers too. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1812813bdf63..b8ff05916b8f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5185,9 +5185,14 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { + int ret; + eb->pages[i] = alloc_page(GFP_NOFS); if (!eb->pages[i]) goto err; + ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + if (ret < 0) + goto err; } set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); @@ -5195,8 +5200,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (; i > 0; i--) + for (; i > 0; i--) { + detach_extent_buffer_page(eb, eb->pages[i - 1]); __free_page(eb->pages[i - 1]); + } 
__free_extent_buffer(eb); return NULL; } From a1d767c11cca0f9b6ddc56ea9561d441340d91a9 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:52 +0800 Subject: [PATCH 236/307] btrfs: introduce helpers for subpage uptodate status Introduce the following functions to handle subpage uptodate status: - btrfs_subpage_set_uptodate() - btrfs_subpage_clear_uptodate() - btrfs_subpage_test_uptodate() These helpers can only be called when the page has subpage attached and the range is ensured to be inside the page. - btrfs_page_set_uptodate() - btrfs_page_clear_uptodate() - btrfs_page_test_uptodate() These helpers can handle both regular sector size and subpage. Although caller should still ensure that the range is inside the page. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 113 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 27 +++++++++++ 2 files changed, 140 insertions(+) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a2a21fa0ea35..4e1b187b9607 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -101,3 +101,116 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } + +/* + * Convert the [start, start + len) range into a u16 bitmap + * + * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. + */ +static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; + const int nbits = len >> fs_info->sectorsize_bits; + + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + + /* + * The range check only works for mapped page, we can still have + * unmapped page like dummy extent buffer pages. 
+ */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + /* + * Here nbits can be 16, thus can go beyond u16 range. We make the + * first left shift to be calculate in unsigned long (at least u32), + * then truncate the result to u16. + */ + return (u16)(((1UL << nbits) - 1) << bit_start); +} + +void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap |= tmp; + if (subpage->uptodate_bitmap == U16_MAX) + SetPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->uptodate_bitmap &= ~tmp; + ClearPageUptodate(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +/* + * Unlike set/clear which is dependent on each page status, for test all bits + * are tested in the same way. 
+ */ +#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned long flags; \ + bool ret; \ + \ + spin_lock_irqsave(&subpage->lock, flags); \ + ret = ((subpage->name##_bitmap & tmp) == tmp); \ + spin_unlock_irqrestore(&subpage->lock, flags); \ + return ret; \ +} +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); + +/* + * Note that, in selftests (extent-io-tests), we can have empty fs_info passed + * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall + * back to regular sectorsize branch. + */ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ + test_page_func) \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, + PageUptodate); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index fe51cc237a66..98d511cd75bf 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -18,6 +18,7 @@ struct 
btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; + u16 uptodate_bitmap; union { /* * Structures only used by metadata @@ -51,4 +52,30 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page); +/* + * Template for subpage related operations. + * + * btrfs_subpage_*() are for call sites where the page has subpage attached and + * the range is ensured to be inside the page. + * + * btrfs_page_*() are for call sites where the page can either be subpage + * specific or regular page. The function will handle both cases. + * But the range still needs to be inside the page. + */ +#define DECLARE_BTRFS_SUBPAGE_OPS(name) \ +void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); + +DECLARE_BTRFS_SUBPAGE_OPS(uptodate); + #endif From 03a816b32be577fdeed2e17d95c2636b68f6860c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:53 +0800 Subject: [PATCH 237/307] btrfs: introduce helpers for subpage error status Introduce the following functions to handle subpage error status: - btrfs_subpage_set_error() - btrfs_subpage_clear_error() - btrfs_subpage_test_error() These helpers can only be called when the page has subpage attached and the range is ensured to be inside the page. 
- btrfs_page_set_error() - btrfs_page_clear_error() - btrfs_page_test_error() These helpers can handle both regular sector size and subpage without problem. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/subpage.h | 2 ++ 2 files changed, 31 insertions(+) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 4e1b187b9607..2c51ab71e000 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -160,6 +160,33 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap |= tmp; + SetPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->error_bitmap &= ~tmp; + if (subpage->error_bitmap == 0) + ClearPageError(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. 
@@ -179,6 +206,7 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ return ret; \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -214,3 +242,4 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); +IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 98d511cd75bf..f3c5def313a1 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -19,6 +19,7 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; u16 uptodate_bitmap; + u16 error_bitmap; union { /* * Structures only used by metadata @@ -77,5 +78,6 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); +DECLARE_BTRFS_SUBPAGE_OPS(error); #endif From 251f2acc719e99f00827814ea77cfd38080e1d62 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:54 +0800 Subject: [PATCH 238/307] btrfs: support subpage in set/clear_extent_buffer_uptodate() To support subpage in set_extent_buffer_uptodate and clear_extent_buffer_uptodate we only need to use the subpage-aware helpers to update the page bits. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b8ff05916b8f..969de300a95b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5672,30 +5672,33 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (page) - ClearPageUptodate(page); + btrfs_page_clear_uptodate(fs_info, page, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - int i; + struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page; int num_pages; + int i; set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); } } From 92d83e94365706fa3250b0e43bdab5995ac03046 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:55 +0800 Subject: [PATCH 239/307] btrfs: support subpage in btrfs_clone_extent_buffer For btrfs_clone_extent_buffer(), it's mostly the same code of __alloc_dummy_extent_buffer(), except it has extra page copy. So to make it subpage compatible, we only need to: - Call set_extent_buffer_uptodate() instead of SetPageUptodate() This will set correct uptodate bit for subpage and regular sector size cases. Since we're calling set_extent_buffer_uptodate() which will also set EXTENT_BUFFER_UPTODATE bit, we don't need to manually set that bit either. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 969de300a95b..6b27daf62d94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5163,11 +5163,10 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } WARN_ON(PageDirty(p)); - SetPageUptodate(p); new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } - set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); + set_extent_buffer_uptodate(new); return new; } From d1e86e3fc34f24b090d86949ad7f3db7a4c1861f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:56 +0800 Subject: [PATCH 240/307] btrfs: support subpage in try_release_extent_buffer() Unlike the original try_release_extent_buffer(), try_release_subpage_extent_buffer() will iterate through all the ebs in the page, and try to release each. We can release the full page only after there's no private attached, which means all ebs of that page have been released as well. 
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 106 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6b27daf62d94..a6102e795af5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6318,13 +6318,115 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +static struct extent_buffer *get_next_extent_buffer( + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) +{ + struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *found = NULL; + u64 page_start = page_offset(page); + int ret; + int i; + + ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); + ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); + lockdep_assert_held(&fs_info->buffer_lock); + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, + bytenr >> fs_info->sectorsize_bits, + PAGE_SIZE / fs_info->nodesize); + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + break; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + break; + } + } + return found; +} + +static int try_release_subpage_extent_buffer(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + u64 cur = page_offset(page); + const u64 end = page_offset(page) + PAGE_SIZE; + int ret; + + while (cur < end) { + struct extent_buffer *eb = NULL; + + /* + * Unlike try_release_extent_buffer() which uses page->private + * to grab buffer, for subpage case we rely on radix tree, thus + * we need to ensure radix tree consistency. + * + * We also want an atomic snapshot of the radix tree, thus go + * with spinlock rather than RCU. 
+ */ + spin_lock(&fs_info->buffer_lock); + eb = get_next_extent_buffer(fs_info, page, cur); + if (!eb) { + /* No more eb in the page range after or at cur */ + spin_unlock(&fs_info->buffer_lock); + break; + } + cur = eb->start + eb->len; + + /* + * The same as try_release_extent_buffer(), to ensure the eb + * won't disappear out from under us. + */ + spin_lock(&eb->refs_lock); + if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { + spin_unlock(&eb->refs_lock); + spin_unlock(&fs_info->buffer_lock); + break; + } + spin_unlock(&fs_info->buffer_lock); + + /* + * If tree ref isn't set then we know the ref on this eb is a + * real ref, so just return, this eb will likely be freed soon + * anyway. + */ + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) { + spin_unlock(&eb->refs_lock); + break; + } + + /* + * Here we don't care about the return value, we will always + * check the page private at the end. And + * release_extent_buffer() will release the refs_lock. + */ + release_extent_buffer(eb); + } + /* + * Finally to check if we have cleared page private, as if we have + * released all ebs in the page, the page private should be cleared now. + */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) + ret = 1; + else + ret = 0; + spin_unlock(&page->mapping->private_lock); + return ret; + +} + int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return try_release_subpage_extent_buffer(page); + /* - * We need to make sure nobody is attaching this page to an eb right - * now. + * We need to make sure nobody is changing page->private, as we rely on + * page->private as the pointer to extent buffer. 
*/ spin_lock(&page->mapping->private_lock); if (!PagePrivate(page)) { From 4012daf769cb77dbf3bc36c3adecf480ad097682 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:57 +0800 Subject: [PATCH 241/307] btrfs: introduce read_extent_buffer_subpage() Introduce a helper, read_extent_buffer_subpage(), to do the subpage extent buffer read. The differences between regular and subpage routines are: - No page locking Here we completely rely on extent locking. Page locking can reduce the concurrency greatly, as if we lock one page to read one extent buffer, all the other extent buffers in the same page will have to wait. - Extent uptodate condition Besides the existing PageUptodate() and EXTENT_BUFFER_UPTODATE checks, we also need to check btrfs_subpage::uptodate_bitmap. - No page iteration Just one page, no need to loop, this greatly simplifies the subpage routine. This patch only implements the bio submit part, no endio support yet. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 70 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a6102e795af5..2ffaa983c9dd 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5701,6 +5701,73 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } } +static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct extent_io_tree *io_tree; + struct page *page = eb->pages[0]; + struct bio *bio = NULL; + int ret = 0; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; + + if (wait == WAIT_NONE) { + ret = try_lock_extent(io_tree, eb->start, + eb->start + eb->len - 1); + if (ret <= 0) + return ret; + } else { + ret = lock_extent(io_tree, eb->start, eb->start + 
eb->len - 1); + if (ret < 0) + return ret; + } + + ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1); + return ret; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = 0; + atomic_set(&eb->io_pages, 1); + check_buffer_tree_ref(eb); + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, + eb->len, eb->start - page_offset(page), &bio, + end_bio_extent_readpage, mirror_num, 0, 0, + true); + if (ret) { + /* + * In the endio function, if we hit something wrong we will + * increase the io_pages, so here we need to decrease it for + * error path. + */ + atomic_dec(&eb->io_pages); + } + if (bio) { + int tmp; + + tmp = submit_one_bio(bio, mirror_num, 0); + if (tmp < 0) + return tmp; + } + if (ret || wait != WAIT_COMPLETE) + return ret; + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + ret = -EIO; + return ret; +} + int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) { int i; @@ -5717,6 +5784,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return read_extent_buffer_subpage(eb, wait, mirror_num); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; From 4325cb2293817cef3611c43d7a27d0937d1e6962 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:58 +0800 Subject: [PATCH 242/307] btrfs: support subpage in endio_readpage_update_page_status() To handle subpage status update, add the following: - Use btrfs_page_*() subpage-aware helpers to update page 
status Now we can handle both cases well. - No page unlock for subpage metadata Since subpage metadata doesn't utilize page locking at all, skip it. For subpage data locking, it's handled in later commits. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ffaa983c9dd..491fc0114672 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2840,15 +2840,24 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate) +static void endio_readpage_update_page_status(struct page *page, bool uptodate, + u64 start, u32 len) { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + if (uptodate) { - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, start, len); } else { - ClearPageUptodate(page); - SetPageError(page); + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); } - unlock_page(page); + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + /* Subpage locking will be handled in later patches */ } /* @@ -2985,7 +2994,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate); + endio_readpage_update_page_status(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } From 371cdc0700c778b94ae8fa2c7d99401f13070d8f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:33:59 +0800 Subject: [PATCH 243/307] btrfs: introduce subpage metadata validation check For subpage metadata validation check, there are some differences: - Read must finish in one bvec Since we're just reading 
one subpage range in one page, it should never be split into two bios nor two bvecs. - How to grab the existing eb Instead of grabbing eb using page->private, we have to go search radix tree as we don't have any direct pointer at hand. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0d1b328397e..d34c6d61928f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -591,6 +591,59 @@ out: return ret; } +static int validate_subpage_buffer(struct page *page, u64 start, u64 end, + int mirror) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct extent_buffer *eb; + bool reads_done; + int ret = 0; + + /* + * We don't allow bio merge for subpage metadata read, so we should + * only get one eb for each endio hook. + */ + ASSERT(end == start + fs_info->nodesize - 1); + ASSERT(PagePrivate(page)); + + eb = find_extent_buffer(fs_info, start); + /* + * When we are reading one tree block, eb must have been inserted into + * the radix tree. If not, something is wrong. + */ + ASSERT(eb); + + reads_done = atomic_dec_and_test(&eb->io_pages); + /* Subpage read must finish in page read */ + ASSERT(reads_done); + + eb->read_mirror = mirror; + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { + ret = -EIO; + goto err; + } + ret = validate_extent_buffer(eb); + if (ret < 0) + goto err; + + if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) + btree_readahead_hook(eb, ret); + + set_extent_buffer_uptodate(eb); + + free_extent_buffer(eb); + return ret; +err: + /* + * end_bio_extent_readpage decrements io_pages in case of error, + * make sure it has something to decrement. 
+ */ + atomic_inc(&eb->io_pages); + clear_extent_buffer_uptodate(eb); + free_extent_buffer(eb); + return ret; +} + int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, struct page *page, u64 start, u64 end, int mirror) @@ -600,6 +653,10 @@ int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, int reads_done; ASSERT(page->private); + + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return validate_subpage_buffer(page, start, end, mirror); + eb = (struct extent_buffer *)page->private; /* From 32443de3382be98c0a8b8f6f50d23da2e10c4117 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:34:00 +0800 Subject: [PATCH 244/307] btrfs: introduce btrfs_subpage for data inodes To support subpage sector size, data also needs extra info to make sure which sectors in a page are uptodate/dirty/... This patch will make pages for data inodes get btrfs_subpage structure attached, and detached when the page is freed. This patch also slightly changes the timing when set_page_extent_mapped() is called to make sure: - We have page->mapping set page->mapping->host is used to grab btrfs_fs_info, thus we can only call this function after page is mapped to an inode. One call site attaches pages to inode manually, thus we have to modify the timing of set_page_extent_mapped() a bit. - As soon as possible, before other operations Since memory allocation can fail, we have to do extra error handling. Calling set_page_extent_mapped() as soon as possible can simplify the error handling for several call sites. The idea is pretty much the same as iomap_page, but with more bitmaps for btrfs specific cases. Currently the plan is to switch to iomap if iomap can provide sector aligned write back (only write back dirty sectors, but not the full page, data balance requires this feature). So we will stick to btrfs specific bitmap for now. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 10 +++++++-- fs/btrfs/extent_io.c | 45 +++++++++++++++++++++++++++++++++---- fs/btrfs/extent_io.h | 3 ++- fs/btrfs/file.c | 24 ++++++++------------ fs/btrfs/free-space-cache.c | 15 ++++++++++--- fs/btrfs/inode.c | 15 +++++++++---- fs/btrfs/ioctl.c | 8 ++++++- fs/btrfs/reflink.c | 5 ++++- fs/btrfs/relocation.c | 11 +++++++-- 9 files changed, 103 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5ae3fa0386b7..6d203acfdeb3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -542,13 +542,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, goto next; } - end = last_offset + PAGE_SIZE - 1; /* * at this point, we have a locked page in the page cache * for these bytes in the file. But, we have to make * sure they map to this compressed extent on disk. */ - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + + end = last_offset + PAGE_SIZE - 1; lock_extent(tree, last_offset, end); read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 491fc0114672..5ff85ecc6c3f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3191,10 +3191,38 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, return ret; } -void set_page_extent_mapped(struct page *page) +int set_page_extent_mapped(struct page *page) { + struct btrfs_fs_info *fs_info; + + ASSERT(page->mapping); + + if (PagePrivate(page)) + return 0; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return 0; +} + +void clear_page_extent_mapped(struct page *page) +{ + struct btrfs_fs_info 
*fs_info; + + ASSERT(page->mapping); + if (!PagePrivate(page)) - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + return; + + fs_info = btrfs_sb(page->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return btrfs_detach_subpage(fs_info, page); + + detach_page_private(page); } static struct extent_map * @@ -3251,7 +3279,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_extent(tree, start, end); + SetPageError(page); + goto out; + } if (!PageUptodate(page)) { if (cleancache_get_page(page) == 0) { @@ -3691,7 +3724,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) { + SetPageError(page); + goto done; + } if (!epd->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2d8187c84812..047b3e66897f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -178,7 +178,8 @@ int btree_write_cache_pages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -void set_page_extent_mapped(struct page *page); +int set_page_extent_mapped(struct page *page); +void clear_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be5350f5bedf..bf52d7e85914 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1369,6 +1369,12 @@ again: goto fail; } + err = set_page_extent_mapped(pages[i]); + if (err < 0) { + faili = i; + goto fail; + } + if (i == 
0) err = prepare_uptodate_page(inode, pages[i], pos, force_uptodate); @@ -1453,23 +1459,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } /* - * It's possible the pages are dirty right now, but we don't want - * to clean them yet because copy_from_user may catch a page fault - * and we might have to fall back to one page at a time. If that - * happens, we'll unlock these pages and we'd have a window where - * reclaim could sneak in and drop the once-dirty page on the floor - * without writing it. - * - * We have the pages locked and the extent range locked, so there's - * no way someone can start IO on any dirty pages in this range. - * - * We'll call btrfs_dirty_pages() later on, and that will flip around - * delalloc bits and dirty the pages as required. + * We should be called after prepare_pages() which should have locked + * all pages in the range. */ - for (i = 0; i < num_pages; i++) { - set_page_extent_mapped(pages[i]); + for (i = 0; i < num_pages; i++) WARN_ON(!PageLocked(pages[i])); - } return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0d6dcb5ff963..6134e10a6e7f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -431,11 +431,22 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) int i; for (i = 0; i < io_ctl->num_pages; i++) { + int ret; + page = find_or_create_page(inode->i_mapping, i, mask); if (!page) { io_ctl_drop_pages(io_ctl); return -ENOMEM; } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + io_ctl_drop_pages(io_ctl); + return ret; + } + io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -455,10 +466,8 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) } } - for (i = 0; i < io_ctl->num_pages; i++) { + for (i = 0; i < io_ctl->num_pages; i++) clear_page_dirty_for_io(io_ctl->pages[i]); - 
set_page_extent_mapped(io_ctl->pages[i]); - } return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3337c8ee7928..5522e9d09c8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4720,6 +4720,9 @@ again: ret = -ENOMEM; goto out; } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); @@ -4737,7 +4740,6 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, block_start, block_end, &cached_state); - set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { @@ -8125,7 +8127,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); if (ret == 1) - detach_page_private(page); + clear_page_extent_mapped(page); return ret; } @@ -8285,7 +8287,7 @@ again: } ClearPageChecked(page); - detach_page_private(page); + clear_page_extent_mapped(page); } /* @@ -8364,7 +8366,12 @@ again: wait_on_page_writeback(page); lock_extent_bits(io_tree, page_start, page_end, &cached_state); - set_page_extent_mapped(page); + ret2 = set_page_extent_mapped(page); + if (ret2 < 0) { + ret = vmf_error(ret2); + unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + goto out_unlock; + } /* * we can't set the delalloc bits if there are pending ordered diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7f2935ea8d3a..e6a63f652235 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1314,6 +1314,13 @@ again: if (!page) break; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + break; + } + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; while (1) { @@ -1435,7 +1442,6 @@ again: for (i = 0; i < i_done; i++) { clear_page_dirty_for_io(pages[i]); ClearPageChecked(pages[i]); - set_page_extent_mapped(pages[i]); set_page_dirty(pages[i]); unlock_page(pages[i]); put_page(pages[i]); diff --git 
a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index b03e7891394e..b24396cf2f99 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -81,7 +81,10 @@ static int copy_inline_to_page(struct btrfs_inode *inode, goto out_unlock; } - set_page_extent_mapped(page); + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d29baf3822a7..473b78874844 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2679,6 +2679,15 @@ static int relocate_file_extent_cluster(struct inode *inode, goto out; } } + ret = set_page_extent_mapped(page); + if (ret < 0) { + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + unlock_page(page); + put_page(page); + goto out; + } if (PageReadahead(page)) { page_cache_async_readahead(inode->i_mapping, @@ -2706,8 +2715,6 @@ static int relocate_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - set_page_extent_mapped(page); - if (nr < cluster->nr && page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, From 92082d40976ed0a421305e2264bde53944805627 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 2 Feb 2021 10:28:36 +0800 Subject: [PATCH 245/307] btrfs: integrate page status update for data read path into begin/end_page_read In btrfs data page read path, the page status update are handled in two different locations: btrfs_do_read_page() { while (cur <= end) { /* No need to read from disk */ if (HOLE/PREALLOC/INLINE){ memset(); set_extent_uptodate(); continue; } /* Read from disk */ ret = submit_extent_page(end_bio_extent_readpage); } end_bio_extent_readpage() { endio_readpage_uptodate_page_status(); } This is fine for sectorsize == PAGE_SIZE case, as for above loop we 
should only hit one branch and then exit. But for subpage, there is more work to be done in page status update: - Page Unlock condition Unlike regular page size == sectorsize case, we can no longer just unlock a page. Only the last reader of the page can unlock the page. This means, we can unlock the page either in the while() loop, or in the endio function. - Page uptodate condition Since we have multiple sectors to read for a page, we can only mark the full page uptodate if all sectors are uptodate. To handle both subpage and regular cases, introduce a pair of functions to help handling page status update: - begin_page_read() For regular case, it does nothing. For subpage case, it updates the reader counters so that later end_page_read() can know who is the last one to unlock the page. - end_page_read() This is just endio_readpage_update_page_status() renamed. The original name is a little too long and too specific for endio. The new thing added is the condition for page unlock. Now for subpage data, we unlock the page if we're the last reader. This not only provides the basis for subpage data read, but also hides the special handling of page read from the main read loop. Also, since we're changing how the page lock is handled, there are two existing error paths where we need to manually unlock the page before calling begin_page_read().
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 42 ++++++++++++++++++++++----------- fs/btrfs/subpage.c | 55 +++++++++++++++++++++++++++++++++++--------- fs/btrfs/subpage.h | 8 +++++++ 3 files changed, 81 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5ff85ecc6c3f..40d3bca6aaa4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2840,8 +2840,17 @@ update: processed->uptodate = uptodate; } -static void endio_readpage_update_page_status(struct page *page, bool uptodate, - u64 start, u32 len) +static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) +{ + ASSERT(PageLocked(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page)); + btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); @@ -2857,7 +2866,12 @@ static void endio_readpage_update_page_status(struct page *page, bool uptodate, if (fs_info->sectorsize == PAGE_SIZE) unlock_page(page); - /* Subpage locking will be handled in later patches */ + else if (is_data_inode(page->mapping->host)) + /* + * For subpage data, unlock the page if we're the last reader. + * For subpage metadata, page lock is not utilized for read. 
+ */ + btrfs_subpage_end_reader(fs_info, page, start, len); } /* @@ -2994,7 +3008,7 @@ readpage_ok: bio_offset += len; /* Update page status and unlock */ - endio_readpage_update_page_status(page, uptodate, start, len); + end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); } @@ -3263,6 +3277,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = page_offset(page); const u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3282,7 +3297,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = set_page_extent_mapped(page); if (ret < 0) { unlock_extent(tree, start, end); - SetPageError(page); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); goto out; } @@ -3290,6 +3306,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (cleancache_get_page(page) == 0) { BUG_ON(blocksize != PAGE_SIZE); unlock_extent(tree, start, end); + unlock_page(page); goto out; } } @@ -3306,6 +3323,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, kunmap_atomic(userpage); } } + begin_page_read(fs_info, page); while (cur <= end) { bool force_bio_submit = false; u64 disk_bytenr; @@ -3323,13 +3341,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); break; } extent_offset = cur - em->start; @@ -3412,6 +3431,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 
&cached, GFP_NOFS); unlock_extent_cached(tree, cur, cur + iosize - 1, &cached); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3421,6 +3441,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, true, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3429,8 +3450,8 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * to date. Error out */ if (block_start == EXTENT_MAP_INLINE) { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); cur = cur + iosize; pg_offset += iosize; continue; @@ -3447,19 +3468,14 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, nr++; *bio_flags = this_bio_flag; } else { - SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + end_page_read(page, false, cur, iosize); goto out; } cur = cur + iosize; pg_offset += iosize; } out: - if (!nr) { - if (!PageError(page)) - SetPageUptodate(page); - unlock_page(page); - } return ret; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 2c51ab71e000..c69049e7daa9 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -54,6 +54,8 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, spin_lock_init(&(*ret)->lock); if (type == BTRFS_SUBPAGE_METADATA) atomic_set(&(*ret)->eb_refs, 0); + else + atomic_set(&(*ret)->readers, 0); return 0; } @@ -102,6 +104,47 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, atomic_dec(&subpage->eb_refs); } +static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + /* Basic checks */ + ASSERT(PagePrivate(page) && page->private); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); + /* + * The range check only works 
for mapped page, we can still have + * unmapped page like dummy extent buffer pages. + */ + if (page->mapping) + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); +} + +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ret = atomic_add_return(nbits, &subpage->readers); + ASSERT(ret == nbits); +} + +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = len >> fs_info->sectorsize_bits; + + btrfs_subpage_assert(fs_info, page, start, len); + ASSERT(atomic_read(&subpage->readers) >= nbits); + if (atomic_sub_and_test(nbits, &subpage->readers)) + unlock_page(page); +} + /* * Convert the [start, start + len) range into a u16 bitmap * @@ -113,18 +156,8 @@ static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; const int nbits = len >> fs_info->sectorsize_bits; - /* Basic checks */ - ASSERT(PagePrivate(page) && page->private); - ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); + btrfs_subpage_assert(fs_info, page, start, len); - /* - * The range check only works for mapped page, we can still have - * unmapped page like dummy extent buffer pages. - */ - if (page->mapping) - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); /* * Here nbits can be 16, thus can go beyond u16 range. 
We make the * first left shift to be calculate in unsigned long (at least u32), diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index f3c5def313a1..b86a4881475d 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -29,6 +29,9 @@ struct btrfs_subpage { */ atomic_t eb_refs; /* Structures only used by data */ + struct { + atomic_t readers; + }; }; }; @@ -53,6 +56,11 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct page *page); +void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + /* * Template for subpage related operations. * From 0bb3eb3ee8674d5d20ad3c0c0767e18787bbd761 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Jan 2021 16:34:02 +0800 Subject: [PATCH 246/307] btrfs: allow read-only mount of 4K sector size fs on 64K page system This adds the basic RO mount ability for 4K sector size on 64K page system. Currently we only plan to support 4K and 64K page system. Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 25 ++++++++++++++++++++++--- fs/btrfs/super.c | 7 +++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d34c6d61928f..71fab77873a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2483,13 +2483,21 @@ static int validate_super(struct btrfs_fs_info *fs_info, btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - /* Only PAGE SIZE is supported yet */ - if (sectorsize != PAGE_SIZE) { + + /* + * For 4K page size, we only support 4K sector size. + * For 64K page size, we support read-write for 64K sector size, and + * read-only for 4K sector size. 
+ */ + if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || + (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && + sectorsize != SZ_64K))) { btrfs_err(fs_info, - "sectorsize %llu not supported yet, only support %lu", + "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); @@ -3248,6 +3256,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } + /* For 4K sector size support, it's only read-only */ + if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { + if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + btrfs_err(fs_info, + "subpage sectorsize %u only supported read-only for page size %lu", + sectorsize, PAGE_SIZE); + err = -EINVAL; + goto fail_alloc; + } + } + ret = btrfs_init_workqueues(fs_info, fs_devices); if (ret) { err = ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 919ed5c357e9..f8435641b912 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2027,6 +2027,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write mount is not yet allowed for sectorsize %u page size %lu", + fs_info->sectorsize, PAGE_SIZE); + ret = -EINVAL; + goto restore; + } /* * NOTE: when remounting with a change that does writes, don't From 2c4d8cb737b805ca8d890e50c23f2b5eca270733 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 28 Jan 2021 19:25:08 +0800 Subject: [PATCH 247/307] btrfs: explain page locking and readahead in read_extent_buffer_pages() In read_extent_buffer_pages(), if we failed to lock the page atomically, we just exit with return value 0. This is counter-intuitive, as normally if we can't lock what we need, we would return something like EAGAIN. 
But that return hides under (wait == WAIT_NONE) branch, which only gets triggered for readahead. And for readahead, if we failed to lock the page, it means the extent buffer is either being read by other thread, or has been read and is under modification. Either way the eb will or has been cached, thus readahead has no need to wait for it. Add comment on this counter-intuitive behavior. Reported-by: Dan Carpenter Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 40d3bca6aaa4..4be117adda33 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5853,6 +5853,13 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (wait == WAIT_NONE) { + /* + * WAIT_NONE is only utilized by readahead. If we can't + * acquire the lock atomically it means either the eb + * is being read out or under modification. + * Either way the eb will be or has been cached, + * readahead can exit safely. + */ if (!trylock_page(page)) goto unlock_exit; } else { From 72c9925f87c8b74f36f8e75a4cd93d964538d3ca Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 4 Feb 2021 14:35:44 +0000 Subject: [PATCH 248/307] btrfs: fix extent buffer leak on failure to copy root At btrfs_copy_root(), if the call to btrfs_inc_ref() fails we end up returning without unlocking and releasing our reference on the extent buffer named "cow" we previously allocated with btrfs_alloc_tree_block(). So fix that by unlocking the extent buffer and dropping our reference on it before returning. 
Fixes: be20aa9dbadc8c ("Btrfs: Add mount option to turn off data cow") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 95d9bae764ab..d56730a67885 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -222,6 +222,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } From ae29333fa644679b96d88c9dd3afbef25cbac0f6 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:40 +0900 Subject: [PATCH 249/307] block: add bio_add_zone_append_page Add bio_add_zone_append_page(), a wrapper around bio_add_hw_page() which is intended to be used by file systems that directly add pages to a bio instead of using bio_iov_iter_get_pages(). Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Reviewed-by: Chaitanya Kulkarni Acked-by: Jens Axboe Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- block/bio.c | 33 +++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/block/bio.c b/block/bio.c index 1f2cc1fbe283..2f21d2958b60 100644 --- a/block/bio.c +++ b/block/bio.c @@ -851,6 +851,39 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, } EXPORT_SYMBOL(bio_add_pc_page); +/** + * bio_add_zone_append_page - attempt to add page to zone-append bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist of a bio that will be submitted + * for a zone-append request. This can fail for a number of reasons, such as the + * bio being full or the target block device is not a zoned block device or + * other limitations of the target block device. 
The target block device must + * allow bio's up to PAGE_SIZE, so it is always possible to add a single page + * to an empty bio. + * + * Returns: number of bytes added to the bio, or 0 in case of a failure. + */ +int bio_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct request_queue *q = bio->bi_disk->queue; + bool same_page = false; + + if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) + return 0; + + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return 0; + + return bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page); +} +EXPORT_SYMBOL_GPL(bio_add_zone_append_page); + /** * __bio_try_merge_page - try appending data to an existing bvec. * @bio: destination bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 1edda614f7ce..de62911473bb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -455,6 +455,8 @@ void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); +int bio_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset); bool __bio_try_merge_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off, bool *same_page); void __bio_add_page(struct bio *bio, struct page *page, From c3b0e880bbfafab6beed92b1ee6db2cdaf4bc54c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:41 +0900 Subject: [PATCH 250/307] iomap: support REQ_OP_ZONE_APPEND A ZONE_APPEND bio must follow hardware restrictions (e.g. not exceeding max_zone_append_sectors) not to be split. bio_iov_iter_get_pages builds such restricted bio using __bio_iov_append_get_pages if bio_op(bio) == REQ_OP_ZONE_APPEND. To utilize it, we need to set the bio_op before calling bio_iov_iter_get_pages(). 
This commit introduces IOMAP_F_ZONE_APPEND, so that iomap user can set the flag to indicate they want REQ_OP_ZONE_APPEND and restricted bio. Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/iomap/direct-io.c | 43 +++++++++++++++++++++++++++++++++++++------ include/linux/iomap.h | 1 + 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 933f234d5bec..2273120d8ed7 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -201,6 +201,34 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, iomap_dio_submit_bio(dio, iomap, bio, pos); } +/* + * Figure out the bio's operation flags from the dio request, the + * mapping, and whether or not we want FUA. Note that we can end up + * clearing the WRITE_FUA flag in the dio request. + */ +static inline unsigned int +iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +{ + unsigned int opflags = REQ_SYNC | REQ_IDLE; + + if (!(dio->flags & IOMAP_DIO_WRITE)) { + WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND); + return REQ_OP_READ; + } + + if (iomap->flags & IOMAP_F_ZONE_APPEND) + opflags |= REQ_OP_ZONE_APPEND; + else + opflags |= REQ_OP_WRITE; + + if (use_fua) + opflags |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; + + return opflags; +} + static loff_t iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, struct iomap_dio *dio, struct iomap *iomap) @@ -208,6 +236,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; @@ -263,6 +292,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t 
length, iomap_dio_zero(dio, iomap, pos - pad, pad); } + /* + * Set the operation flags early so that bio_iov_iter_get_pages + * can set up the page vector appropriately for a ZONE_APPEND + * operation. + */ + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); + do { size_t n; if (dio->error) { @@ -278,6 +314,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + bio->bi_opf = bio_opf; ret = bio_iov_iter_get_pages(bio, dio->submit.iter); if (unlikely(ret)) { @@ -293,14 +330,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - if (use_fua) - bio->bi_opf |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5bd3cac4df9c..8ebb1fa6f3b7 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -55,6 +55,7 @@ struct vm_fault; #define IOMAP_F_SHARED 0x04 #define IOMAP_F_MERGED 0x08 #define IOMAP_F_BUFFER_HEAD 0x10 +#define IOMAP_F_ZONE_APPEND 0x20 /* * Flags set by the core iomap code during operations: From 7365104236ade0bf22edd7724c8fd438b0342ee4 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:42 +0900 Subject: [PATCH 251/307] btrfs: zoned: defer loading zone info after opening trees This is a preparation patch to implement zone emulation on a regular device. To emulate a zoned filesystem on a regular (non-zoned) device, we need to decide an emulated zone size. Instead of making it a compile-time static value, we'll make it configurable at mkfs time. Since we have one zone == one device extent restriction, we can determine the emulated zone size from the size of a device extent. 
We can extend btrfs_get_dev_zone_info() to show a regular device filled with conventional zones once the zone size is decided. The current call site of btrfs_get_dev_zone_info() during the mount process is earlier than loading the file system trees so that we don't know the size of a device extent at this point. Thus we can't slice a regular device to conventional zones. This patch introduces btrfs_get_dev_zone_info_all_devices to load the zone info for all the devices. And, it places this function in open_ctree() after loading the trees. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 +++++++++++++ fs/btrfs/volumes.c | 4 ---- fs/btrfs/zoned.c | 25 +++++++++++++++++++++++++ fs/btrfs/zoned.h | 6 ++++++ 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 71fab77873a5..2b6a3df765cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3333,6 +3333,19 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (ret) goto fail_tree_roots; + /* + * Get zone type information of zoned block devices. This will also + * handle emulation of a zoned filesystem if a regular device has the + * zoned incompat feature flag set. 
+ */ + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, + "zoned: failed to read device zone info: %d", + ret); + goto fail_block_groups; + } + /* * If we have a uuid root and we're not being told to rescan we need to * check the generation here so we can set the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3948f5b50d11..07cd4742c123 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -669,10 +669,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; - ret = btrfs_get_dev_zone_info(device); - if (ret != 0) - goto error_free_page; - fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 41d27fefd306..0b1b1f38a196 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -143,6 +143,31 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + int ret = 0; + + /* fs_info->zone_size might not be set yet. Use the incompat flag here.
*/ + if (!btrfs_fs_incompat(fs_info, ZONED)) + return 0; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + /* We can skip reading of zone info for missing devices */ + if (!device->bdev) + continue; + + ret = btrfs_get_dev_zone_info(device); + if (ret) + break; + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + int btrfs_get_dev_zone_info(struct btrfs_device *device) { struct btrfs_zoned_device_info *zone_info = NULL; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8abe2f83272b..eb47b7ad9ab1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -25,6 +25,7 @@ struct btrfs_zoned_device_info { #ifdef CONFIG_BLK_DEV_ZONED int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone); +int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); int btrfs_get_dev_zone_info(struct btrfs_device *device); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); @@ -42,6 +43,11 @@ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, return 0; } +static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) +{ + return 0; +} + static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) { return 0; From d6639b35da2d742f9cbcdf8f49f87f2bde9fd479 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:43 +0900 Subject: [PATCH 252/307] btrfs: zoned: use regular super block location on zone emulation A zoned filesystem currently has a superblock at the beginning of the superblock logging zones if the zones are conventional. This difference in superblock position causes a chicken-and-egg problem for filesystems with emulated zones. Since the device is a regular (non-zoned) device, we cannot know if the filesystem is regular or zoned while reading the superblock. 
But, to load the superblock, we need to see if it is emulated zoned or not. Place the superblocks at the same location as they are on regular filesystem on regular devices to solve the problem. It is possible because it's ensured that all the superblock locations are at an (emulated) conventional zone on regular devices. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0b1b1f38a196..8b3868088c5e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -552,7 +552,13 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, struct btrfs_zoned_device_info *zinfo = device->zone_info; u32 zone_num; - if (!zinfo) { + /* + * For a zoned filesystem on a non-zoned block device, use the same + * super block locations as regular filesystem. Doing so, the super + * block can always be retrieved and the zoned flag of the volume + * detected from the super block information. + */ + if (!bdev_is_zoned(device->bdev)) { *bytenr_ret = btrfs_sb_offset(mirror); return 0; } From 4afd2fe835a0ff87fb88cba7a7daa881d8e14233 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:44 +0900 Subject: [PATCH 253/307] btrfs: release path before calling to btrfs_load_block_group_zone_info Since we have no write pointer in conventional zones, we cannot determine the allocation offset from it. Instead, we set the allocation offset after the highest addressed extent. This is done by reading the extent tree in btrfs_load_block_group_zone_info(). However, this function is called from btrfs_read_block_groups(), so the read lock for the tree node could be recursively taken. To avoid this unsafe locking scenario, release the path before reading the extent tree to get the allocation offset. 
Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5fa6b3d540f4..b8fbee70a897 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1810,24 +1810,8 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static void read_block_group_item(struct btrfs_block_group *cache, - struct btrfs_path *path, - const struct btrfs_key *key) -{ - struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_block_group_item bgi; - int slot = path->slots[0]; - - cache->length = key->offset; - - read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), - sizeof(bgi)); - cache->used = btrfs_stack_block_group_used(&bgi); - cache->flags = btrfs_stack_block_group_flags(&bgi); -} - static int read_one_block_group(struct btrfs_fs_info *info, - struct btrfs_path *path, + struct btrfs_block_group_item *bgi, const struct btrfs_key *key, int need_clear) { @@ -1842,7 +1826,9 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - read_block_group_item(cache, path, key); + cache->length = key->offset; + cache->used = btrfs_stack_block_group_used(bgi); + cache->flags = btrfs_stack_block_group_flags(bgi); set_free_space_tree_thresholds(cache); @@ -2001,19 +1987,29 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) need_clear = 1; while (1) { + struct btrfs_block_group_item bgi; + struct extent_buffer *leaf; + int slot; + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) goto error; - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - ret = read_one_block_group(info, path, &key, need_clear); + leaf = path->nodes[0]; + slot = path->slots[0]; + + read_extent_buffer(leaf, &bgi, 
btrfs_item_ptr_offset(leaf, slot), + sizeof(bgi)); + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_release_path(path); + ret = read_one_block_group(info, &bgi, &key, need_clear); if (ret < 0) goto error; key.objectid += key.offset; key.offset = 0; - btrfs_release_path(path); } btrfs_release_path(path); From b53429bad3a3555fdbda190192c6e9dfef8e7787 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:45 +0900 Subject: [PATCH 254/307] btrfs: zoned: do not load fs_info::zoned from incompat flag Don't set the zoned flag in fs_info as soon as we're encountering the incompat filesystem flag for a zoned filesystem on mount. The zoned flag in fs_info is in a union together with the zone_size, so setting it too early will result in setting an incorrect zone_size as well. Once the correct zone_size is read from the device, we can rely on the zoned flag in fs_info as well to determine if the filesystem is zoned. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/zoned.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b6a3df765cd..8551b0fc1b22 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3201,8 +3201,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); - fs_info->zoned = (features & BTRFS_FEATURE_INCOMPAT_ZONED); - /* * flag our filesystem as having big metadata blocks if * they are bigger than the page size diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 8b3868088c5e..c0840412ccb6 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -432,6 +432,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + /* 
+ * Check mount options here, because we might change fs_info->zoned + * from fs_info->zone_size. + */ + ret = btrfs_check_mountopts_zoned(fs_info); + if (ret) + goto out; + btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); out: return ret; From 1cb3dc3f79153c2d7f9a4438381e1385dff09656 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:46 +0900 Subject: [PATCH 255/307] btrfs: zoned: disallow fitrim on zoned filesystems The implementation of fitrim depends on space cache, which is not used and disabled for zoned extent allocator. So the current code does not work with zoned filesystem. In the future, we can implement fitrim for zoned filesystems by enabling space cache (but, only for fitrim) or scanning the extent tree at fitrim time. For now, disallow fitrim on zoned filesystems. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e6a63f652235..a8c60d46d19c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -527,6 +527,14 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * btrfs_trim_block_group() depends on space cache, which is not + * available in zoned filesystem. So, disallow fitrim on a zoned + * filesystem for now. + */ + if (btrfs_is_zoned(fs_info)) + return -EOPNOTSUPP; + /* * If the fs is mounted with nologreplay, which requires it to be * mounted in RO mode as well, we can not allow discard on free space From 3c9daa09ccd43f68104634020b364d834c01738c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:21:47 +0900 Subject: [PATCH 256/307] btrfs: zoned: allow zoned filesystems on non-zoned block devices Run a zoned filesystem on non-zoned devices. 
This is done by "slicing up" the block device into static sized chunks and fake a conventional zone on each of them. The emulated zone size is determined from the size of device extent. This is mainly aimed at testing of zoned filesystems, i.e. the zoned chunk allocator, on regular block devices. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 150 +++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/zoned.h | 14 +++-- 2 files changed, 148 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c0840412ccb6..6699f626a86e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -119,6 +119,36 @@ static inline u32 sb_zone_number(int shift, int mirror) return 0; } +/* + * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block + * device into static sized chunks and fake a conventional zone on each of + * them. 
+ */ +static int emulate_report_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int nr_zones) +{ + const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; + sector_t bdev_size = bdev_nr_sectors(device->bdev); + unsigned int i; + + pos >>= SECTOR_SHIFT; + for (i = 0; i < nr_zones; i++) { + zones[i].start = i * zone_sectors + pos; + zones[i].len = zone_sectors; + zones[i].capacity = zone_sectors; + zones[i].wp = zones[i].start + zone_sectors; + zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; + zones[i].cond = BLK_ZONE_COND_NOT_WP; + + if (zones[i].wp >= bdev_size) { + i++; + break; + } + } + + return i; +} + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, struct blk_zone *zones, unsigned int *nr_zones) { @@ -127,6 +157,12 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, if (!*nr_zones) return 0; + if (!bdev_is_zoned(device->bdev)) { + ret = emulate_report_zones(device, pos, zones, *nr_zones); + *nr_zones = ret; + return 0; + } + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, copy_zone_info_cb, zones); if (ret < 0) { @@ -143,6 +179,50 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, return 0; } +/* The emulated zone size is determined from the size of device extent */ +static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_dev_extent *dext; + int ret = 0; + + key.objectid = 1; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + /* No dev extents at all? 
Not good */ + if (ret > 0) { + ret = -EUCLEAN; + goto out; + } + } + + leaf = path->nodes[0]; + dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); + ret = 0; + +out: + btrfs_free_path(path); + + return ret; +} + int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -170,6 +250,7 @@ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) int btrfs_get_dev_zone_info(struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; struct request_queue *queue = bdev_get_queue(bdev); @@ -178,9 +259,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct blk_zone *zones = NULL; unsigned int i, nreported = 0, nr_zones; unsigned int zone_sectors; + char *model, *emulated; int ret; - if (!bdev_is_zoned(bdev)) + /* + * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not + * yet be set. 
+ */ + if (!btrfs_fs_incompat(fs_info, ZONED)) return 0; if (device->zone_info) @@ -190,8 +276,20 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!zone_info) return -ENOMEM; + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); + if (ret) + goto out; + } + + ASSERT(fs_info->zone_size); + zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; + } else { + zone_sectors = bdev_zone_sectors(bdev); + } + nr_sectors = bdev_nr_sectors(bdev); - zone_sectors = bdev_zone_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; @@ -297,20 +395,42 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) device->zone_info = zone_info; - /* device->fs_info is not safe to use for printing messages */ - btrfs_info_in_rcu(NULL, - "host-%s zoned block device %s, %u zones of %llu bytes", - bdev_zoned_model(bdev) == BLK_ZONED_HM ? 
"managed" : "aware", - rcu_str_deref(device->name), zone_info->nr_zones, - zone_info->zone_size); + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; + emulated = ""; + break; + case BLK_ZONED_HA: + model = "host-aware zoned"; + emulated = ""; + break; + case BLK_ZONED_NONE: + model = "regular"; + emulated = "emulated "; + break; + default: + /* Just in case */ + btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", + bdev_zoned_model(bdev), + rcu_str_deref(device->name)); + ret = -EOPNOTSUPP; + goto out_free_zone_info; + } + + btrfs_info_in_rcu(fs_info, + "%s block device %s, %u %szones of %llu bytes", + model, rcu_str_deref(device->name), zone_info->nr_zones, + emulated, zone_info->zone_size); return 0; out: kfree(zones); +out_free_zone_info: bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); + device->zone_info = NULL; return ret; } @@ -349,7 +469,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 nr_devices = 0; u64 zone_size = 0; u64 max_zone_append_size = 0; - const bool incompat_zoned = btrfs_is_zoned(fs_info); + const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; /* Count zoned devices */ @@ -360,9 +480,17 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) continue; model = bdev_zoned_model(device->bdev); + /* + * A Host-Managed zoned device must be used as a zoned device. + * A Host-Aware zoned device and a non-zoned devices can be + * treated as a zoned device, if ZONED flag is enabled in the + * superblock. 
+ */ if (model == BLK_ZONED_HM || - (model == BLK_ZONED_HA && incompat_zoned)) { - struct btrfs_zoned_device_info *zone_info; + (model == BLK_ZONED_HA && incompat_zoned) || + (model == BLK_ZONED_NONE && incompat_zoned)) { + struct btrfs_zoned_device_info *zone_info = + device->zone_info; zone_info = device->zone_info; zoned_devices++; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index eb47b7ad9ab1..5e78786bb723 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -142,12 +142,16 @@ static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 p static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info, struct block_device *bdev) { - u64 zone_size; - if (btrfs_is_zoned(fs_info)) { - zone_size = bdev_zone_sectors(bdev) << SECTOR_SHIFT; - /* Do not allow non-zoned device */ - return bdev_is_zoned(bdev) && fs_info->zone_size == zone_size; + /* + * We can allow a regular device on a zoned filesystem, because + * we will emulate the zoned capabilities. + */ + if (!bdev_is_zoned(bdev)) + return true; + + return fs_info->zone_size == + (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } /* Do not allow Host Manged zoned device */ From 1cd6121f2a382a840f01f506694b54bf403fddc9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:48 +0900 Subject: [PATCH 257/307] btrfs: zoned: implement zoned chunk allocator Implement a zoned chunk and device extent allocator. One device zone becomes a device extent so that a zone reset affects only this device extent and does not change the state of blocks in the neighbor device extents. To implement the allocator, we need to extend the following functions for a zoned filesystem. - init_alloc_chunk_ctl - dev_extent_search_start - dev_extent_hole_check - decide_stripe_size init_alloc_chunk_ctl_zoned() is mostly the same as regular one. It always set the stripe_size to the zone size and aligns the parameters to the zone size. 
dev_extent_search_start() only aligns the start offset to zone boundaries. We don't care about the first 1MB like in regular filesystem because we anyway reserve the first two zones for superblock logging. dev_extent_hole_check_zoned() checks if zones in given hole are either conventional or empty sequential zones. Also, it skips zones reserved for superblock logging. With the change to the hole, the new hole may now contain pending extents. So, in this case, loop again to check that. Finally, decide_stripe_size_zoned() should shrink the number of devices instead of stripe size because we need to honor stripe_size == zone_size. Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 171 ++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/volumes.h | 1 + fs/btrfs/zoned.c | 141 +++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 25 +++++++ 4 files changed, 321 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 07cd4742c123..ae2aeadad5a0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1414,11 +1414,62 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * make sure to start at an offset of at least 1MB. */ return max_t(u64, start, SZ_1M); + case BTRFS_CHUNK_ALLOC_ZONED: + /* + * We don't care about the starting region like regular + * allocator, because we anyway use/reserve the first two zones + * for superblock logging. 
+ */ + return ALIGN(start, device->zone_info->zone_size); default: BUG(); } } +static bool dev_extent_hole_check_zoned(struct btrfs_device *device, + u64 *hole_start, u64 *hole_size, + u64 num_bytes) +{ + u64 zone_size = device->zone_info->zone_size; + u64 pos; + int ret; + bool changed = false; + + ASSERT(IS_ALIGNED(*hole_start, zone_size)); + + while (*hole_size > 0) { + pos = btrfs_find_allocatable_zones(device, *hole_start, + *hole_start + *hole_size, + num_bytes); + if (pos != *hole_start) { + *hole_size = *hole_start + *hole_size - pos; + *hole_start = pos; + changed = true; + if (*hole_size < num_bytes) + break; + } + + ret = btrfs_ensure_empty_zones(device, pos, num_bytes); + + /* Range is ensured to be empty */ + if (!ret) + return changed; + + /* Given hole range was invalid (outside of device) */ + if (ret == -ERANGE) { + *hole_start += *hole_size; + *hole_size = 0; + return 1; + } + + *hole_start += zone_size; + *hole_size -= zone_size; + changed = true; + } + + return changed; +} + /** * dev_extent_hole_check - check if specified hole is suitable for allocation * @device: the device which we have the hole @@ -1426,7 +1477,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) * @hole_size: the size of the hole * @num_bytes: the size of the free space that we need * - * This function may modify @hole_start and @hole_end to reflect the suitable + * This function may modify @hole_start and @hole_size to reflect the suitable * position for allocation. Returns 1 if hole position is updated, 0 otherwise. */ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, @@ -1435,24 +1486,39 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, bool changed = false; u64 hole_end = *hole_start + *hole_size; - /* - * Check before we set max_hole_start, otherwise we could end up - * sending back this offset anyway. 
- */ - if (contains_pending_extent(device, hole_start, *hole_size)) { - if (hole_end >= *hole_start) - *hole_size = hole_end - *hole_start; - else - *hole_size = 0; - changed = true; - } + for (;;) { + /* + * Check before we set max_hole_start, otherwise we could end up + * sending back this offset anyway. + */ + if (contains_pending_extent(device, hole_start, *hole_size)) { + if (hole_end >= *hole_start) + *hole_size = hole_end - *hole_start; + else + *hole_size = 0; + changed = true; + } + + switch (device->fs_devices->chunk_alloc_policy) { + case BTRFS_CHUNK_ALLOC_REGULAR: + /* No extra check */ + break; + case BTRFS_CHUNK_ALLOC_ZONED: + if (dev_extent_hole_check_zoned(device, hole_start, + hole_size, num_bytes)) { + changed = true; + /* + * The changed hole can contain pending extent. + * Loop again to check that. + */ + continue; + } + break; + default: + BUG(); + } - switch (device->fs_devices->chunk_alloc_policy) { - case BTRFS_CHUNK_ALLOC_REGULAR: - /* No extra check */ break; - default: - BUG(); } return changed; @@ -1505,6 +1571,9 @@ static int find_free_dev_extent_start(struct btrfs_device *device, search_start = dev_extent_search_start(device, search_start); + WARN_ON(device->zone_info && + !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4899,6 +4968,37 @@ static void init_alloc_chunk_ctl_policy_regular( ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; } +static void init_alloc_chunk_ctl_policy_zoned( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) +{ + u64 zone_size = fs_devices->fs_info->zone_size; + u64 limit; + int min_num_stripes = ctl->devs_min * ctl->dev_stripes; + int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; + u64 min_chunk_size = min_data_stripes * zone_size; + u64 type = ctl->type; + + ctl->max_stripe_size = zone_size; + if (type & BTRFS_BLOCK_GROUP_DATA) { + ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, + 
zone_size); + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + ctl->max_chunk_size = ctl->max_stripe_size; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + ctl->max_chunk_size = 2 * ctl->max_stripe_size; + ctl->devs_max = min_t(int, ctl->devs_max, + BTRFS_MAX_DEVS_SYS_CHUNK); + } + + /* We don't want a chunk larger than 10% of writable space */ + limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), + zone_size), + min_chunk_size); + ctl->max_chunk_size = min(limit, ctl->max_chunk_size); + ctl->dev_extent_min = zone_size * ctl->dev_stripes; +} + static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { @@ -4919,6 +5019,9 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; + case BTRFS_CHUNK_ALLOC_ZONED: + init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); + break; default: BUG(); } @@ -5045,6 +5148,38 @@ static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, return 0; } +static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + u64 zone_size = devices_info[0].dev->zone_info->zone_size; + /* Number of stripes that count for block group size */ + int data_stripes; + + /* + * It should hold because: + * dev_extent_min == dev_extent_want == zone_size * dev_stripes + */ + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + + ctl->stripe_size = zone_size; + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + + /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead.
*/ + if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { + ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, + ctl->stripe_size) + ctl->nparity, + ctl->dev_stripes); + ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; + data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + } + + ctl->chunk_size = ctl->stripe_size * data_stripes; + + return 0; +} + static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) @@ -5072,6 +5207,8 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, switch (fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); + case BTRFS_CHUNK_ALLOC_ZONED: + return decide_stripe_size_zoned(ctl, devices_info); default: BUG(); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 04e2b26823c2..598ac225176d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -214,6 +214,7 @@ BTRFS_DEVICE_GETSET_FUNCS(bytes_used); enum btrfs_chunk_allocation_policy { BTRFS_CHUNK_ALLOC_REGULAR, + BTRFS_CHUNK_ALLOC_ZONED, }; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 6699f626a86e..69fd0d078b9b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" +#include "disk-io.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -559,6 +561,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) fs_info->zone_size = zone_size; fs_info->max_zone_append_size = max_zone_append_size; + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; /* * Check mount options here, because we might change fs_info->zoned @@ -779,3 +782,141 @@ int btrfs_reset_sb_log_zones(struct 
block_device *bdev, int mirror) sb_zone << zone_sectors_shift, zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } + +/** + * btrfs_find_allocatable_zones - find allocatable zones within a given region + * + * @device: the device to allocate a region on + * @hole_start: the position of the hole to allocate the region + * @num_bytes: size of wanted region + * @hole_end: the end of the hole + * @return: position of allocatable zones + * + * Allocatable region should not contain any superblock locations. + */ +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + u64 nzones = num_bytes >> shift; + u64 pos = hole_start; + u64 begin, end; + bool have_sb; + int i; + + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + + while (pos < hole_end) { + begin = pos >> shift; + end = begin + nzones; + + if (end > zinfo->nr_zones) + return hole_end; + + /* Check if zones in the region are all empty */ + if (btrfs_dev_is_sequential(device, pos) && + find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { + pos += zinfo->zone_size; + continue; + } + + have_sb = false; + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + u32 sb_zone; + u64 sb_pos; + + sb_zone = sb_zone_number(shift, i); + if (!(end <= sb_zone || + sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { + have_sb = true; + pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + break; + } + + /* We also need to exclude regular superblock positions */ + sb_pos = btrfs_sb_offset(i); + if (!(pos + num_bytes <= sb_pos || + sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { + have_sb = true; + pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, + zinfo->zone_size); + break; + } + } + if (!have_sb) + break; + } + + return pos; +} + +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes) +{ + 
int ret; + + *bytes = 0; + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, + GFP_NOFS); + if (ret) + return ret; + + *bytes = length; + while (length) { + btrfs_dev_set_zone_empty(device, physical); + physical += device->zone_info->zone_size; + length -= device->zone_info->zone_size; + } + + return 0; +} + +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) +{ + struct btrfs_zoned_device_info *zinfo = device->zone_info; + const u8 shift = zinfo->zone_size_shift; + unsigned long begin = start >> shift; + unsigned long end = (start + size) >> shift; + u64 pos; + int ret; + + ASSERT(IS_ALIGNED(start, zinfo->zone_size)); + ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + + if (end > zinfo->nr_zones) + return -ERANGE; + + /* All the zones are conventional */ + if (find_next_bit(zinfo->seq_zones, begin, end) == end) + return 0; + + /* All the zones are sequential and empty */ + if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && + find_next_zero_bit(zinfo->empty_zones, begin, end) == end) + return 0; + + for (pos = start; pos < start + size; pos += zinfo->zone_size) { + u64 reset_bytes; + + if (!btrfs_dev_is_sequential(device, pos) || + btrfs_dev_is_empty_zone(device, pos)) + continue; + + /* Free regions should be empty */ + btrfs_warn_in_rcu( + device->fs_info, + "zoned: resetting device %s (devid %llu) zone %llu for allocation", + rcu_str_deref(device->name), device->devid, pos >> shift); + WARN_ON_ONCE(1); + + ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, + &reset_bytes); + if (ret) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5e78786bb723..6c8f83c48c2e 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -36,6 +36,11 @@ int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int 
btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); +u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, + u64 hole_end, u64 num_bytes); +int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, + u64 length, u64 *bytes); +int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -91,6 +96,26 @@ static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror return 0; } +static inline u64 btrfs_find_allocatable_zones(struct btrfs_device *device, + u64 hole_start, u64 hole_end, + u64 num_bytes) +{ + return hole_start; +} + +static inline int btrfs_reset_device_zone(struct btrfs_device *device, + u64 physical, u64 length, u64 *bytes) +{ + *bytes = 0; + return 0; +} + +static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, + u64 start, u64 size) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 381a696eb5f99189a2c8d0d99aae766767f9cb1e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:49 +0900 Subject: [PATCH 258/307] btrfs: zoned: verify device extent is aligned to zone Add a check in verify_one_dev_extent() to ensure that a device extent on a zoned block device is aligned to the respective zone boundary. If it isn't, mark the filesystem as unclean. 
Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ae2aeadad5a0..10401def16ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7769,6 +7769,20 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; goto out; } + + if (dev->zone_info) { + u64 zone_size = dev->zone_info->zone_size; + + if (!IS_ALIGNED(physical_offset, zone_size) || + !IS_ALIGNED(physical_len, zone_size)) { + btrfs_err(fs_info, +"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", + devid, physical_offset, physical_len); + ret = -EUCLEAN; + goto out; + } + } + out: free_extent_map(em); return ret; From 08e11a3db098f4ba0cfee46d7ab449cba43dea1b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:50 +0900 Subject: [PATCH 259/307] btrfs: zoned: load zone's allocation offset A zoned filesystem must allocate blocks at the zones' write pointer. The device's write pointer position can be mapped to a logical address within a block group. To facilitate this, add an "alloc_offset" to the block-group to track the logical addresses of the write pointer. This logical address is populated in btrfs_load_block_group_zone_info() from the write pointers of corresponding zones. For now, zoned filesystems only support the single profile. Supporting non-single profiles with zone append writing is not trivial. For example, in the DUP profile, we send a zone append writing IO to two zones on a device. The device replies with the written LBAs for the IOs. If the offsets of the returned addresses from the beginning of the zone are different, then it results in different logical addresses. We need a fine-grained logical to physical mapping to handle such diverging physical addresses.
Since it should require additional metadata type, disable non-single profiles for now. This commit supports the case all the zones in a block group are sequential. The next patch will handle the case having a conventional zone. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 15 ++++ fs/btrfs/block-group.h | 6 ++ fs/btrfs/zoned.c | 151 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 ++ 4 files changed, 179 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b8fbee70a897..e6bf728496eb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -15,6 +15,7 @@ #include "delalloc-space.h" #include "discard.h" #include "raid56.h" +#include "zoned.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -1855,6 +1856,13 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } + ret = btrfs_load_block_group_zone_info(cache); + if (ret) { + btrfs_err(info, "zoned: failed to load zone info of bg %llu", + cache->start); + goto error; + } + /* * We need to exclude the super stripes now so that the space info has * super bytes accounted for, otherwise we'll think we have more space @@ -2141,6 +2149,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, cache->cached = BTRFS_CACHE_FINISHED; if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; + + ret = btrfs_load_block_group_zone_info(cache); + if (ret) { + btrfs_put_block_group(cache); + return ret; + } + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8f74a96074f7..224946fa9bed 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -183,6 +183,12 @@ struct btrfs_block_group { /* Record locked full stripes for RAID5/6 block 
group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; + + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. + */ + u64 alloc_offset; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 69fd0d078b9b..0a7cd00f405f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -3,14 +3,20 @@ #include #include #include +#include #include "ctree.h" #include "volumes.h" #include "zoned.h" #include "rcu-string.h" #include "disk-io.h" +#include "block-group.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 +/* Invalid allocation pointer value for missing devices */ +#define WP_MISSING_DEV ((u64)-1) +/* Pseudo write pointer value for conventional zone */ +#define WP_CONVENTIONAL ((u64)-2) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 @@ -920,3 +926,148 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return 0; } + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *device; + u64 logical = cache->start; + u64 length = cache->length; + u64 physical = 0; + int ret; + int i; + unsigned int nofs_flag; + u64 *alloc_offsets = NULL; + u32 num_sequential = 0, num_conventional = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + /* Sanity check */ + if (!IS_ALIGNED(length, fs_info->zone_size)) { + btrfs_err(fs_info, + "zoned: block group %llu len %llu unaligned to zone size %llu", + logical, length, fs_info->zone_size); + return -EIO; + } + + /* Get the chunk mapping */ + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) + 
return -EINVAL; + + map = em->map_lookup; + + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); + if (!alloc_offsets) { + free_extent_map(em); + return -ENOMEM; + } + + for (i = 0; i < map->num_stripes; i++) { + bool is_sequential; + struct blk_zone zone; + + device = map->stripes[i].dev; + physical = map->stripes[i].physical; + + if (device->bdev == NULL) { + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } + + is_sequential = btrfs_dev_is_sequential(device, physical); + if (is_sequential) + num_sequential++; + else + num_conventional++; + + if (!is_sequential) { + alloc_offsets[i] = WP_CONVENTIONAL; + continue; + } + + /* + * This zone will be used for allocation, so mark this zone + * non-empty. + */ + btrfs_dev_clear_zone_empty(device, physical); + + /* + * The group is mapped to a sequential zone. Get the zone write + * pointer to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret == -EIO || ret == -EOPNOTSUPP) { + ret = 0; + alloc_offsets[i] = WP_MISSING_DEV; + continue; + } else if (ret) { + goto out; + } + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + physical >> device->zone_info->zone_size_shift, + rcu_str_deref(device->name), device->devid); + alloc_offsets[i] = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + alloc_offsets[i] = 0; + break; + case BLK_ZONE_COND_FULL: + alloc_offsets[i] = fs_info->zone_size; + break; + default: + /* Partially used zone */ + alloc_offsets[i] = + ((zone.wp - zone.start) << SECTOR_SHIFT); + break; + } + } + + if (num_conventional > 0) { + /* + * Since conventional zones do not have a write pointer, we + * cannot determine alloc_offset from the pointer + */ + ret = -EINVAL; + 
goto out; + } + + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case 0: /* single */ + cache->alloc_offset = alloc_offsets[0]; + break; + case BTRFS_BLOCK_GROUP_DUP: + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID0: + case BTRFS_BLOCK_GROUP_RAID10: + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + /* non-single profiles are not supported yet */ + default: + btrfs_err(fs_info, "zoned: profile %s not yet supported", + btrfs_bg_type_to_raid_name(map->type)); + ret = -EINVAL; + goto out; + } + +out: + kfree(alloc_offsets); + free_extent_map(em); + + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6c8f83c48c2e..4f3152d7b98f 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -41,6 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -116,6 +117,12 @@ static inline int btrfs_ensure_empty_zones(struct btrfs_device *device, return 0; } +static inline int btrfs_load_block_group_zone_info( + struct btrfs_block_group *cache) +{ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From a94794d50d788d4735fd8f656ac8c0510117457d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:51 +0900 Subject: [PATCH 260/307] btrfs: zoned: calculate allocation offset for conventional zones Conventional zones do not have a write pointer, so we cannot use it to determine the allocation offset for sequential allocation if a block group contains a conventional zone. 
But instead, we can consider the end of the highest addressed extent in the block group for the allocation offset. For new block group, we cannot calculate the allocation offset by consulting the extent tree, because it can cause deadlock by taking extent buffer lock after chunk mutex, which is already taken in btrfs_make_block_group(). Since it is a new block group anyways, we can simply set the allocation offset to 0. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +- fs/btrfs/zoned.c | 99 +++++++++++++++++++++++++++++++++++++++--- fs/btrfs/zoned.h | 4 +- 3 files changed, 98 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e6bf728496eb..6d10874189df 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1856,7 +1856,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_load_block_group_zone_info(cache); + ret = btrfs_load_block_group_zone_info(cache, false); if (ret) { btrfs_err(info, "zoned: failed to load zone info of bg %llu", cache->start); @@ -2150,7 +2150,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; - ret = btrfs_load_block_group_zone_info(cache); + ret = btrfs_load_block_group_zone_info(cache, true); if (ret) { btrfs_put_block_group(cache); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0a7cd00f405f..b892566a1c93 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -927,7 +927,68 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) return 0; } -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) +/* + * Calculate an allocation pointer from the extent allocation information + * for a block group consist of conventional zones. 
It is pointed to the + * end of the highest addressed extent in the block group as an allocation + * offset. + */ +static int calculate_alloc_pointer(struct btrfs_block_group *cache, + u64 *offset_ret) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + u64 length; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = cache->start + cache->length; + key.type = 0; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + /* We should not find the exact match */ + if (!ret) + ret = -EUCLEAN; + if (ret < 0) + goto out; + + ret = btrfs_previous_extent_item(root, path, cache->start); + if (ret) { + if (ret == 1) { + ret = 0; + *offset_ret = 0; + } + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + + if (found_key.type == BTRFS_EXTENT_ITEM_KEY) + length = found_key.offset; + else + length = fs_info->nodesize; + + if (!(found_key.objectid >= cache->start && + found_key.objectid + length <= cache->start + cache->length)) { + ret = -EUCLEAN; + goto out; + } + *offset_ret = found_key.objectid + length - cache->start; + ret = 0; + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct extent_map_tree *em_tree = &fs_info->mapping_tree; @@ -941,6 +1002,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; + u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; if (!btrfs_is_zoned(fs_info)) @@ -1040,11 +1102,30 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) if (num_conventional > 0) { /* - * Since conventional zones do not have a write pointer, we - * cannot determine alloc_offset from the pointer + * Avoid 
calling calculate_alloc_pointer() for new BG. It + * is no use for new BG. It must be always 0. + * + * Also, we have a lock chain of extent buffer lock -> + * chunk mutex. For new BG, this function is called from + * btrfs_make_block_group() which is already taking the + * chunk mutex. Thus, we cannot call + * calculate_alloc_pointer() which takes extent buffer + * locks to avoid deadlock. */ - ret = -EINVAL; - goto out; + if (new) { + cache->alloc_offset = 0; + goto out; + } + ret = calculate_alloc_pointer(cache, &last_alloc); + if (ret || map->num_stripes == num_conventional) { + if (!ret) + cache->alloc_offset = last_alloc; + else + btrfs_err(fs_info, + "zoned: failed to determine allocation offset of bg %llu", + cache->start); + goto out; + } } switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { @@ -1066,6 +1147,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache) } out: + /* An extent is allocated after the write pointer */ + if (!ret && num_conventional && last_alloc > cache->alloc_offset) { + btrfs_err(fs_info, + "zoned: got wrong write pointer in BG %llu: %llu > %llu", + logical, last_alloc, cache->alloc_offset); + ret = -EIO; + } + kfree(alloc_offsets); free_extent_map(em); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4f3152d7b98f..d27db3993e51 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -41,7 +41,7 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); -int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache); +int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -118,7 +118,7 @@ static inline int btrfs_ensure_empty_zones(struct 
btrfs_device *device, } static inline int btrfs_load_block_group_zone_info( - struct btrfs_block_group *cache) + struct btrfs_block_group *cache, bool new) { return 0; } From 169e0da91a21a571093feb8ff84c7e9229e64c08 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:52 +0900 Subject: [PATCH 261/307] btrfs: zoned: track unusable bytes for zones In a zoned filesystem a once written then freed region is not usable until the underlying zone has been reset. So we need to distinguish such unusable space from usable free space. Therefore we need to introduce the "zone_unusable" field to the block group structure, and "bytes_zone_unusable" to the space_info structure to track the unusable space. Pinned bytes are always reclaimed to the unusable space. But, when an allocated region is returned before being used (e.g., because the block group becomes read-only between allocation time and reservation time), we can safely return the region to the block group. For this situation, this commit introduces "btrfs_add_free_space_unused". This behaves the same as btrfs_add_free_space() on a regular filesystem. On zoned filesystems, it rewinds the allocation offset. Because the read-only bytes track free but unusable bytes when the block group is read-only, we need to migrate the zone_unusable bytes to read-only bytes when a block group is marked read-only.
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 51 +++++++++++++++++++++------- fs/btrfs/block-group.h | 1 + fs/btrfs/extent-tree.c | 5 +++ fs/btrfs/free-space-cache.c | 67 +++++++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/space-info.c | 13 ++++--- fs/btrfs/space-info.h | 4 ++- fs/btrfs/sysfs.c | 2 ++ fs/btrfs/zoned.c | 21 ++++++++++++ fs/btrfs/zoned.h | 3 ++ 10 files changed, 151 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6d10874189df..e4444d4dd4b5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1009,12 +1009,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(block_group->space_info->total_bytes < block_group->length); WARN_ON(block_group->space_info->bytes_readonly - < block_group->length); + < block_group->length - block_group->zone_unusable); + WARN_ON(block_group->space_info->bytes_zone_unusable + < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); } block_group->space_info->total_bytes -= block_group->length; - block_group->space_info->bytes_readonly -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= + block_group->zone_unusable; block_group->space_info->disk_total -= block_group->length * factor; spin_unlock(&block_group->space_info->lock); @@ -1158,7 +1163,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) } num_bytes = cache->length - cache->reserved - cache->pinned - - cache->bytes_super - cache->used; + cache->bytes_super - cache->zone_unusable - cache->used; /* * Data never overcommits, even in mixed mode, so do just the straight @@ -1189,6 +1194,12 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) if (!ret) { sinfo->bytes_readonly 
+= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes to readonly */ + sinfo->bytes_readonly += cache->zone_unusable; + sinfo->bytes_zone_unusable -= cache->zone_unusable; + cache->zone_unusable = 0; + } cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); } @@ -1876,12 +1887,20 @@ static int read_one_block_group(struct btrfs_fs_info *info, } /* - * Check for two cases, either we are full, and therefore don't need - * to bother with the caching work since we won't find any space, or we - * are empty, and we can just add all the space in and be done with it. - * This saves us _a_lot_ of time, particularly in the full case. + * For zoned filesystem, space after the allocation offset is the only + * free space for a block group. So, we don't need any caching work. + * btrfs_calc_zone_unusable() will set the amount of free space and + * zone_unusable space. + * + * For regular filesystem, check for two cases, either we are full, and + * therefore don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all the space + * in and be done with it. This saves us _a_lot_ of time, particularly + * in the full case. 
*/ - if (cache->length == cache->used) { + if (btrfs_is_zoned(info)) { + btrfs_calc_zone_unusable(cache); + } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; btrfs_free_excluded_extents(cache); @@ -1900,7 +1919,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, } trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, - cache->used, cache->bytes_super, &space_info); + cache->used, cache->bytes_super, + cache->zone_unusable, &space_info); cache->space_info = space_info; @@ -1956,7 +1976,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) break; } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, &space_info); + 0, 0, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -2197,7 +2217,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, &cache->space_info); + cache->bytes_super, 0, &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2305,8 +2325,15 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) spin_lock(&cache->lock); if (!--cache->ro) { num_bytes = cache->length - cache->reserved - - cache->pinned - cache->bytes_super - cache->used; + cache->pinned - cache->bytes_super - + cache->zone_unusable - cache->used; sinfo->bytes_readonly -= num_bytes; + if (btrfs_is_zoned(cache->fs_info)) { + /* Migrate zone_unusable bytes back */ + cache->zone_unusable = cache->alloc_offset - cache->used; + sinfo->bytes_zone_unusable += cache->zone_unusable; + sinfo->bytes_readonly -= cache->zone_unusable; + } list_del_init(&cache->ro_list); } spin_unlock(&cache->lock); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 224946fa9bed..0fd66febe115 100644 --- a/fs/btrfs/block-group.h +++ 
b/fs/btrfs/block-group.h @@ -189,6 +189,7 @@ struct btrfs_block_group { * allocation. This is used only on a zoned filesystem. */ u64 alloc_offset; + u64 zone_unusable; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5476ab84e544..5c61c3f136f7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -34,6 +34,7 @@ #include "block-group.h" #include "discard.h" #include "rcu-string.h" +#include "zoned.h" #undef SCRAMBLE_DELAYED_REFS @@ -2740,6 +2741,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache->ro) { space_info->bytes_readonly += len; readonly = true; + } else if (btrfs_is_zoned(fs_info)) { + /* Need reset before reusing in a zoned block group */ + space_info->bytes_zone_unusable += len; + readonly = true; } spin_unlock(&cache->lock); if (!readonly && return_free_space && diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6134e10a6e7f..b93ac31eca69 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2477,6 +2477,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, int ret = 0; u64 filter_bytes = bytes; + ASSERT(!btrfs_is_zoned(fs_info)); + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) return -ENOMEM; @@ -2534,11 +2536,49 @@ out: return ret; } +static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (offset >= block_group->alloc_offset) + to_free = size; + else if (offset + size <= block_group->alloc_offset) + to_free = 0; + else + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + + ctl->free_space += to_free; + 
block_group->zone_unusable += to_unusable; + spin_unlock(&ctl->tree_lock); + if (!used) { + spin_lock(&block_group->lock); + block_group->alloc_offset -= size; + spin_unlock(&block_group->lock); + } + + /* All the region is now unusable. Mark it as unused and reclaim */ + if (block_group->zone_unusable == block_group->length) + btrfs_mark_bg_unused(block_group); + + return 0; +} + int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2547,6 +2587,16 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group, bytenr, size, trim_state); } +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + false); + + return btrfs_add_free_space(block_group, bytenr, size); +} + /* * This is a subtle distinction because when adding free space back in general, * we want it to be added as untrimmed for async. 
But in the case where we add @@ -2557,6 +2607,10 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, { enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (btrfs_is_zoned(block_group->fs_info)) + return __btrfs_add_free_space_zoned(block_group, bytenr, size, + true); + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) trim_state = BTRFS_TRIM_STATE_TRIMMED; @@ -2574,6 +2628,9 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; + if (btrfs_is_zoned(block_group->fs_info)) + return 0; + spin_lock(&ctl->tree_lock); again: @@ -2668,6 +2725,16 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, struct rb_node *n; int count = 0; + /* + * Zoned btrfs does not use free space tree and cluster. Just print + * out the free space after the allocation offset. + */ + if (btrfs_is_zoned(fs_info)) { + btrfs_info(fs_info, "free space %llu", + block_group->length - block_group->alloc_offset); + return; + } + spin_lock(&ctl->tree_lock); for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { info = rb_entry(n, struct btrfs_free_space, offset_index); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ecb09a02d544..1f23088d43f9 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -107,6 +107,8 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_unused(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bccd98141a6e..2da6177f4b0b 100644 --- a/fs/btrfs/space-info.c 
+++ b/fs/btrfs/space-info.c @@ -169,6 +169,7 @@ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, ASSERT(s_info); return s_info->bytes_used + s_info->bytes_reserved + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + (may_use_included ? s_info->bytes_may_use : 0); } @@ -264,7 +265,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -280,6 +281,7 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->bytes_readonly += bytes_readonly; + found->bytes_zone_unusable += bytes_zone_unusable; if (total_bytes > 0) found->full = 0; btrfs_try_granting_tickets(info, found); @@ -429,10 +431,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, info->total_bytes - btrfs_space_info_used(info, true), info->full ? "" : "not "); btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); + info->bytes_readonly, info->bytes_zone_unusable); DUMP_BLOCK_RSV(fs_info, global_block_rsv); DUMP_BLOCK_RSV(fs_info, trans_block_rsv); @@ -461,9 +463,10 @@ again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->ro ? 
"[readonly]" : ""); + cache->reserved, cache->zone_unusable, + cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index e237156ce888..b1a8ffb03b3e 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -17,6 +17,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the @@ -123,7 +125,7 @@ DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, + u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 19b9fffa2c9c..6eb1c50fa98c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -666,6 +666,7 @@ SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); +SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(space_info, total_bytes_pinned, @@ -679,6 +680,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_reserved), BTRFS_ATTR_PTR(space_info, bytes_may_use), BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, total_bytes_pinned), diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 
b892566a1c93..c5f9f4c6f20b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1160,3 +1160,24 @@ out: return ret; } + +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) +{ + u64 unusable, free; + + if (!btrfs_is_zoned(cache->fs_info)) + return; + + WARN_ON(cache->bytes_super != 0); + unusable = cache->alloc_offset - cache->used; + free = cache->length - cache->alloc_offset; + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->last_byte_to_unpin = (u64)-1; + cache->cached = BTRFS_CACHE_FINISHED; + cache->free_space_ctl->free_space = free; + cache->zone_unusable = unusable; + + /* Should not have any excluded extents. Just in case, though */ + btrfs_free_excluded_extents(cache); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index d27db3993e51..37304d1675e6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -42,6 +42,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes); int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); +void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -123,6 +124,8 @@ static inline int btrfs_load_block_group_zone_info( return 0; } +static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 2eda57089ea31942e067d6ac37923c3154ef8a25 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:53 +0900 Subject: [PATCH 262/307] btrfs: zoned: implement sequential extent allocation Implement a sequential extent allocator for zoned filesystems. 
This allocator only needs to check if there is enough space in the block group after the allocation pointer to satisfy the extent allocation request. Therefore the allocator never manages bitmaps or clusters. Also, add assertions to the corresponding functions. As zone append writing is used, it would be unnecessary to track the allocation offset, as the allocator only needs to check available space. But by tracking and returning the offset as an allocated region, we can skip modification of ordered extents and checksum information when there is no IO reordering. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 ++ fs/btrfs/extent-tree.c | 90 ++++++++++++++++++++++++++++++++++--- fs/btrfs/free-space-cache.c | 6 +++ 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e4444d4dd4b5..63093cfb807e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -725,6 +725,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only struct btrfs_caching_control *caching_ctl = NULL; int ret = 0; + /* Allocator for zoned filesystems does not use the cache at all */ + if (btrfs_is_zoned(fs_info)) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); if (!caching_ctl) return -ENOMEM; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c61c3f136f7..3f83ca503051 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3429,6 +3429,7 @@ btrfs_release_block_group(struct btrfs_block_group *cache, enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, + BTRFS_EXTENT_ALLOC_ZONED, }; /* @@ -3681,6 +3682,65 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Simple allocator for sequential-only block group. It only allows sequential + * allocation. 
No need to play with trees. This function also reserves the + * bytes as in btrfs_add_reserved_bytes. + */ +static int do_allocation_zoned(struct btrfs_block_group *block_group, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_block_group **bg_ret) +{ + struct btrfs_space_info *space_info = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 start = block_group->start; + u64 num_bytes = ffe_ctl->num_bytes; + u64 avail; + int ret = 0; + + ASSERT(btrfs_is_zoned(block_group->fs_info)); + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + if (block_group->ro) { + ret = 1; + goto out; + } + + avail = block_group->length - block_group->alloc_offset; + if (avail < num_bytes) { + if (ffe_ctl->max_extent_size < avail) { + /* + * With sequential allocator, free space is always + * contiguous + */ + ffe_ctl->max_extent_size = avail; + ffe_ctl->total_free_space = avail; + } + ret = 1; + goto out; + } + + ffe_ctl->found_offset = start + block_group->alloc_offset; + block_group->alloc_offset += num_bytes; + spin_lock(&ctl->tree_lock); + ctl->free_space -= num_bytes; + spin_unlock(&ctl->tree_lock); + + /* + * We do not check if found_offset is aligned to stripesize. The + * address is anyway rewritten when using zone append writing. 
+ */ + + ffe_ctl->search_start = ffe_ctl->found_offset; + +out: + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + return ret; +} + static int do_allocation(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) @@ -3688,6 +3748,8 @@ static int do_allocation(struct btrfs_block_group *block_group, switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: return do_allocation_clustered(block_group, ffe_ctl, bg_ret); + case BTRFS_EXTENT_ALLOC_ZONED: + return do_allocation_zoned(block_group, ffe_ctl, bg_ret); default: BUG(); } @@ -3702,6 +3764,9 @@ static void release_block_group(struct btrfs_block_group *block_group, ffe_ctl->retry_clustered = false; ffe_ctl->retry_unclustered = false; break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3730,6 +3795,9 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, case BTRFS_EXTENT_ALLOC_CLUSTERED: found_extent_clustered(ffe_ctl, ins); break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + break; default: BUG(); } @@ -3745,6 +3813,9 @@ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) */ ffe_ctl->loop = LOOP_NO_EMPTY_SIZE; return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Give up here */ + return -ENOSPC; default: BUG(); } @@ -3913,6 +3984,9 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, case BTRFS_EXTENT_ALLOC_CLUSTERED: return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ + return 0; default: BUG(); } @@ -3976,6 +4050,9 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.last_ptr = NULL; ffe_ctl.use_cluster = true; + if (btrfs_is_zoned(fs_info)) + ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -4118,20 +4195,21 @@ have_block_group: /* move on to the next group */ if 
(ffe_ctl.search_start + num_bytes > block_group->start + block_group->length) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } if (ffe_ctl.found_offset < ffe_ctl.search_start) - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, + ffe_ctl.search_start - ffe_ctl.found_offset); ret = btrfs_add_reserved_bytes(block_group, ram_bytes, num_bytes, delalloc); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, ffe_ctl.found_offset, - num_bytes); + btrfs_add_free_space_unused(block_group, + ffe_ctl.found_offset, num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b93ac31eca69..d2a43186cc7f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2928,6 +2928,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 align_gap_len = 0; enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, block_group->full_stripe_len, max_extent_size); @@ -3059,6 +3061,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, struct rb_node *node; u64 ret = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -3835,6 +3839,8 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, int ret; u64 rem = 0; + ASSERT(!btrfs_is_zoned(block_group->fs_info)); + *trimmed = 0; spin_lock(&block_group->lock); From d3575156f6623eecf086a20bcf99a63f1598109c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:54 +0900 Subject: [PATCH 263/307] btrfs: zoned: redirty 
released extent buffers Tree manipulating operations like merging nodes often release once-allocated tree nodes. Such nodes are cleaned so that pages in the node are not uselessly written out. On zoned volumes, however, this optimization blocks subsequent IOs, as cancelling the write out of the freed blocks breaks the sequential write sequence expected by the device. Introduce a list of clean and unwritten extent buffers that have been released in a transaction. Redirty the buffers so that btree_write_cache_pages() can send proper bios to the devices. In addition, clear the entire content of the extent buffer so as not to confuse raw block scanners, e.g. 'btrfs check'. Because the content is cleared, csum_dirty_buffer() would complain about a bytenr mismatch, so skip the check and the checksum calculation using the newly introduced buffer flag EXTENT_BUFFER_NO_CHECK. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++++ fs/btrfs/extent-tree.c | 12 +++++++++++- fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/extent_io.h | 2 ++ fs/btrfs/transaction.c | 10 ++++++++++ fs/btrfs/transaction.h | 3 +++ fs/btrfs/tree-log.c | 6 ++++++ fs/btrfs/zoned.c | 37 +++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 +++++++ 9 files changed, 88 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8551b0fc1b22..eb1afd7d89f7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -459,6 +459,12 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec return 0; found_start = btrfs_header_bytenr(eb); + + if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) { + WARN_ON(found_start != 0); + return 0; + } + /* * Please do not consolidate these warnings into a single if. * It is useful to know what went wrong.
@@ -4774,6 +4780,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_redirty_list(cur_trans); + cur_trans->state =TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3f83ca503051..dddcb8513c77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3292,8 +3292,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, buf->start); - if (!ret) + if (!ret) { + btrfs_redirty_list_add(trans->transaction, buf); goto out; + } } cache = btrfs_lookup_block_group(fs_info, buf->start); @@ -3304,6 +3306,13 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, goto out; } + if (btrfs_is_zoned(fs_info)) { + btrfs_redirty_list_add(trans->transaction, buf); + pin_down_extent(trans, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); + goto out; + } + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); @@ -4635,6 +4644,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); + clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); set_extent_buffer_uptodate(buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4be117adda33..eedcfb40c356 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -25,6 +25,7 @@ #include "backref.h" #include "disk-io.h" #include "subpage.h" +#include "zoned.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -5182,6 +5183,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); + 
INIT_LIST_HEAD(&eb->release_list); spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); @@ -6111,6 +6113,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); + if (check_eb_range(eb, start, len)) return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 047b3e66897f..824640cb0ace 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -31,6 +31,7 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, + EXTENT_BUFFER_NO_CHECK, }; /* these are flags for __process_pages_contig */ @@ -93,6 +94,7 @@ struct extent_buffer { struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + struct list_head release_list; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 00c0680dac3a..acff6bb49a97 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -21,6 +21,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -380,6 +381,8 @@ loop: spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); + INIT_LIST_HEAD(&cur_trans->releasing_ebs); + spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -2350,6 +2353,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } + /* + * At this point, we should have written all the tree blocks allocated + * in this transaction. So it's now safe to free the redirtyied extent + * buffers. 
+ */ + btrfs_free_redirty_list(cur_trans); + ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 935bd6958a8a..6335716e513f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -93,6 +93,9 @@ struct btrfs_transaction { */ atomic_t pending_ordered; wait_queue_head_t pending_wait; + + spinlock_t releasing_ebs_lock; + struct list_head releasing_ebs; }; #define __TRANS_FREEZABLE (1U << 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4c7b283ed2b2..c02eeeac439c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -2752,6 +2753,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); return ret; } + btrfs_redirty_list_add( + trans->transaction, next); } else { if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) clear_extent_buffer_dirty(next); @@ -3296,6 +3299,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans, clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); extent_io_tree_release(&log->log_csum_range); + + if (trans && log->node) + btrfs_redirty_list_add(trans->transaction, log->node); btrfs_put_root(log); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c5f9f4c6f20b..1de67d789b83 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -10,6 +10,7 @@ #include "rcu-string.h" #include "disk-io.h" #include "block-group.h" +#include "transaction.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1181,3 +1182,39 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) /* Should not have any excluded extents. 
Just in case, though */ btrfs_free_excluded_extents(cache); } + +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (!btrfs_is_zoned(fs_info) || + btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || + !list_empty(&eb->release_list)) + return; + + set_extent_buffer_dirty(eb); + set_extent_bits_nowait(&trans->dirty_pages, eb->start, + eb->start + eb->len - 1, EXTENT_DIRTY); + memzero_extent_buffer(eb, 0, eb->len); + set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); + + spin_lock(&trans->releasing_ebs_lock); + list_add_tail(&eb->release_list, &trans->releasing_ebs); + spin_unlock(&trans->releasing_ebs_lock); + atomic_inc(&eb->refs); +} + +void btrfs_free_redirty_list(struct btrfs_transaction *trans) +{ + spin_lock(&trans->releasing_ebs_lock); + while (!list_empty(&trans->releasing_ebs)) { + struct extent_buffer *eb; + + eb = list_first_entry(&trans->releasing_ebs, + struct extent_buffer, release_list); + list_del_init(&eb->release_list); + free_extent_buffer(eb); + } + spin_unlock(&trans->releasing_ebs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 37304d1675e6..b250a578e38c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -43,6 +43,9 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); +void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb); +void btrfs_free_redirty_list(struct btrfs_transaction *trans); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -126,6 +129,10 @@ static inline int btrfs_load_block_group_zone_info( static inline void btrfs_calc_zone_unusable(struct btrfs_block_group 
*cache) { } +static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, + struct extent_buffer *eb) { } +static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 011b41bffa3dd086de3f2c393b35cde6133a7140 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:55 +0900 Subject: [PATCH 264/307] btrfs: zoned: advance allocation pointer after tree log node Since the allocation info of a tree log node is not recorded in the extent tree, calculate_alloc_pointer() cannot detect this node, so the pointer can be over a tree node. Replaying the log calls btrfs_remove_free_space() for each node in the log tree. So, advance the pointer after the node to not allocate over it. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d2a43186cc7f..5400294bd271 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2628,8 +2628,22 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, int ret; bool re_search = false; - if (btrfs_is_zoned(block_group->fs_info)) + if (btrfs_is_zoned(block_group->fs_info)) { + /* + * This can happen with conventional zones when replaying log. + * Since the allocation info of tree-log nodes are not recorded + * to the extent-tree, calculate_alloc_pointer() failed to + * advance the allocation pointer after last allocated tree log + * node blocks. + * + * This function is called from + * btrfs_pin_extent_for_log_replay() when replaying the log. + * Advance the pointer not to overwrite the tree-log nodes. 
+ */ + if (block_group->alloc_offset < offset + bytes) + block_group->alloc_offset = offset + bytes; return 0; + } spin_lock(&ctl->tree_lock); From dcba6e48b518e5e48522e9ea2b73b60827c93146 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:56 +0900 Subject: [PATCH 265/307] btrfs: zoned: reset zones of unused block groups We must reset the zones of a deleted unused block group to rewind the zones' write pointers to the zones' start. To do this, we can use the DISCARD_SYNC code to do the reset when the filesystem is running on zoned devices. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++++++-- fs/btrfs/extent-tree.c | 17 ++++++++++++----- fs/btrfs/zoned.h | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 63093cfb807e..70a0c0f8f99f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1408,8 +1408,12 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) goto flip_async; - /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); + /* + * DISCARD can flip during remount. On zoned filesystems, we + * need to reset sequential-required zones. + */ + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_is_zoned(fs_info); /* Implicit trim during transaction commit. 
*/ if (trimming) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dddcb8513c77..d976c56b3a56 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1298,6 +1298,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { + struct btrfs_device *dev = stripe->dev; + u64 physical = stripe->physical; + u64 length = stripe->length; u64 bytes; struct request_queue *req_q; @@ -1305,14 +1308,18 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + req_q = bdev_get_queue(stripe->dev->bdev); - if (!blk_queue_discard(req_q)) + /* Zone reset on zoned filesystems */ + if (btrfs_can_zone_reset(dev, physical, length)) + ret = btrfs_reset_device_zone(dev, physical, + length, &bytes); + else if (blk_queue_discard(req_q)) + ret = btrfs_issue_discard(dev->bdev, physical, + length, &bytes); + else continue; - ret = btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - stripe->length, - &bytes); if (!ret) { discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b250a578e38c..c105641a6ad3 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -209,4 +209,19 @@ static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 p return device->zone_info == NULL || !btrfs_dev_is_sequential(device, pos); } +static inline bool btrfs_can_zone_reset(struct btrfs_device *device, + u64 physical, u64 length) +{ + u64 zone_size; + + if (!btrfs_dev_is_sequential(device, physical)) + return false; + + zone_size = device->zone_info->zone_size; + if (!IS_ALIGNED(physical, zone_size) || !IS_ALIGNED(length, zone_size)) + return false; + + return true; +} + #endif From 953651eb308fb56cd1a2d916e3d3c8b242240651 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:57 +0900 Subject: [PATCH 266/307] btrfs: factor out 
helper adding a page to bio Factor out adding a page to a bio from submit_extent_page(). The page is added only when bio_flags are the same, contiguous and the added page fits in the same stripe as pages in the bio. Condition checks are reordered to allow early return to avoid possibly heavy btrfs_bio_fits_in_stripe() calling. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 60 +++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eedcfb40c356..3b33e7afd8e4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3084,6 +3084,48 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } +/** + * Attempt to add a page to bio + * + * @bio: destination bio + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @pg_offset: starting offset in the page + * @size: portion of page that we want to write + * @prev_bio_flags: flags of previous bio to see if we can merge the current one + * @bio_flags: flags of the current bio to see if we can merge them + * @return: true if page was added, false otherwise + * + * Attempt to add a page to bio considering stripe alignment etc. + * + * Return true if successfully page added. Otherwise, return false. 
+ */ +static bool btrfs_bio_add_page(struct bio *bio, struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; + bool contig; + + if (prev_bio_flags != bio_flags) + return false; + + if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + contig = bio->bi_iter.bi_sector == sector; + else + contig = bio_end_sector(bio) == sector; + if (!contig) + return false; + + if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + return false; + + return bio_add_page(bio, page, size, pg_offset) == size; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3112,27 +3154,15 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - sector_t sector = disk_bytenr >> 9; struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; ASSERT(bio_ret); if (*bio_ret) { - bool contig; - bool can_merge = true; - bio = *bio_ret; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) - contig = bio->bi_iter.bi_sector == sector; - else - contig = bio_end_sector(bio) == sector; - - if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags)) - can_merge = false; - - if (prev_bio_flags != bio_flags || !contig || !can_merge || - force_bio_submit || - bio_add_page(bio, page, io_size, pg_offset) < io_size) { + if (force_bio_submit || + !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, + pg_offset, prev_bio_flags, bio_flags)) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; From e1326f0339fe0a3beecb0da4d1b8793443798e09 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:58 +0900 Subject: [PATCH 267/307] btrfs: zoned: use bio_add_zone_append_page A zoned device has its own hardware restrictions e.g. max_zone_append_size when using REQ_OP_ZONE_APPEND. 
To follow these restrictions, use bio_add_zone_append_page() instead of bio_add_page(). We need the target device to use bio_add_zone_append_page(), so this commit reads the chunk information to cache the target device in btrfs_io_bio(bio)->device. Caching only the target device is sufficient here as zoned filesystems only support the single profile at the moment. Once more profiles are supported, btrfs_io_bio can hold an extent_map to be able to check for the restrictions of all devices the btrfs_bio will be mapped to. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3b33e7afd8e4..6b26777efb92 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3109,6 +3109,7 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, { const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; + int ret; if (prev_bio_flags != bio_flags) return false; @@ -3123,7 +3124,12 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) return false; - return bio_add_page(bio, page, size, pg_offset) == size; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = bio_add_zone_append_page(bio, page, size, pg_offset); + else + ret = bio_add_page(bio, page, size, pg_offset); + + return ret == size; } /* @@ -3154,7 +3160,9 @@ static int submit_extent_page(unsigned int opf, int ret = 0; struct bio *bio; size_t io_size = min_t(size_t, size, PAGE_SIZE); - struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(bio_ret); @@ -3185,11 +3193,26 @@ static int submit_extent_page(unsigned int opf, if
(wbc) { struct block_device *bdev; - bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); wbc_account_cgroup_owner(wbc, page, io_size); } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + btrfs_io_bio(bio)->device = map->stripes[0].dev; + + free_extent_map(em); + } *bio_ret = bio; From cfe94440d17404478771179150e6e4554f092dd5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:21:59 +0900 Subject: [PATCH 268/307] btrfs: zoned: handle REQ_OP_ZONE_APPEND as writing Zoned filesystems use REQ_OP_ZONE_APPEND bios for writing to actual devices. Let btrfs_end_bio() and btrfs_op be aware of it, by mapping REQ_OP_ZONE_APPEND to BTRFS_MAP_WRITE and using btrfs_op() instead of bio_op(). 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/volumes.c | 8 ++++---- fs/btrfs/volumes.h | 1 + 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1afd7d89f7..70621184a731 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -709,7 +709,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) @@ -885,7 +885,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5522e9d09c8a..d7a9c770dc3b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2250,7 +2250,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (bio_op(bio) != REQ_OP_WRITE) { + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) goto out; @@ -7681,7 +7681,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (bio_op(dip->dio_bio) == REQ_OP_WRITE) { + if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), dip->logical_offset, dip->bytes, @@ -7847,7 +7847,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio 
*bio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = bio_op(bio) == REQ_OP_WRITE; + bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; /* Check btrfs_submit_bio_hook() for rules about async submit. */ @@ -7897,7 +7897,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); size_t dip_size; struct btrfs_dio_private *dip; @@ -7927,7 +7927,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { - const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); + const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 10401def16ef..400375aaa197 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6448,7 +6448,7 @@ static void btrfs_end_bio(struct bio *bio) struct btrfs_device *dev = btrfs_io_bio(bio)->device; ASSERT(dev->bdev); - if (bio_op(bio) == REQ_OP_WRITE) + if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) @@ -6561,10 +6561,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { + ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a 
single stripe; not the whole write */ - if (bio_op(bio) == REQ_OP_WRITE) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { ret = raid56_parity_write(fs_info, bio, bbio, map_length); } else { @@ -6587,7 +6587,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (bio_op(first_bio) == REQ_OP_WRITE && + (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { bbio_error(bbio, first_bio, logical); continue; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 598ac225176d..d3bbdb4175df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -424,6 +424,7 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) case REQ_OP_DISCARD: return BTRFS_MAP_DISCARD; case REQ_OP_WRITE: + case REQ_OP_ZONE_APPEND: return BTRFS_MAP_WRITE; default: WARN_ON_ONCE(1); From d22002fd37bd970480c59754dfa448866a1f38bd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:00 +0900 Subject: [PATCH 269/307] btrfs: zoned: split ordered extent when bio is sent For a zone append write, the device decides the location the data is being written to. Therefore we cannot ensure that two bios are written consecutively on the device. In order to ensure that an ordered extent maps to a contiguous region on disk, we need to maintain a "one bio == one ordered extent" rule. Implement splitting of an ordered extent and extent map on bio submission to adhere to the rule. extract_ordered_extent() hooks into btrfs_submit_data_bio() and splits the corresponding ordered extent so that the ordered extent's region fits into one bio and the corresponding device limits. Several sanity checks need to be done in extract_ordered_extent() e.g. 
- We cannot split once end_bio'd ordered extent because we cannot divide ordered->bytes_left for the split ones - We do not expect a compressed ordered extent - We should not have checksum list because we omit the list splitting. Since the function is called before btrfs_wq_submit_bio() or btrfs_csum_one_bio(), this should be always ensured. We also need to split an extent map by creating a new one. If not, unpin_extent_cache() complains about the difference between the start of the extent map and the file's logical offset. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 95 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.c | 78 +++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.h | 2 + 3 files changed, 175 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d7a9c770dc3b..750482a06d67 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2215,6 +2215,92 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, + struct bio *bio, loff_t file_offset) +{ + struct btrfs_ordered_extent *ordered; + struct extent_map *em = NULL, *em_new = NULL; + struct extent_map_tree *em_tree = &inode->extent_tree; + u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bio->bi_iter.bi_size; + u64 end = start + len; + u64 ordered_end; + u64 pre, post; + int ret = 0; + + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (WARN_ON_ONCE(!ordered)) + return BLK_STS_IOERR; + + /* No need to split */ + if (ordered->disk_num_bytes == len) + goto out; + + /* We cannot split once end_bio'd ordered extent */ + if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { + ret = -EINVAL; + goto out; + } + + /* We cannot split a compressed ordered extent */ + if (WARN_ON_ONCE(ordered->disk_num_bytes != 
ordered->num_bytes)) { + ret = -EINVAL; + goto out; + } + + ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; + /* bio must be in one ordered extent */ + if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { + ret = -EINVAL; + goto out; + } + + /* Checksum list should be empty */ + if (WARN_ON_ONCE(!list_empty(&ordered->list))) { + ret = -EINVAL; + goto out; + } + + pre = start - ordered->disk_bytenr; + post = ordered_end - end; + + ret = btrfs_split_ordered_extent(ordered, pre, post); + if (ret) + goto out; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, ordered->file_offset, len); + if (!em) { + read_unlock(&em_tree->lock); + ret = -EIO; + goto out; + } + read_unlock(&em_tree->lock); + + ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + /* + * We cannot reuse em_new here but have to create a new one, as + * unpin_extent_cache() expects the start of the extent map to be the + * logical offset of the file, which does not hold true anymore after + * splitting. + */ + em_new = create_io_em(inode, em->start + pre, len, + em->start + pre, em->block_start + pre, len, + len, len, BTRFS_COMPRESS_NONE, + BTRFS_ORDERED_REGULAR); + if (IS_ERR(em_new)) { + ret = PTR_ERR(em_new); + goto out; + } + free_extent_map(em_new); + +out: + free_extent_map(em); + btrfs_put_ordered_extent(ordered); + + return errno_to_blk_status(ret); +} + /* * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read. 
@@ -2250,6 +2336,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *page = bio_first_bvec_all(bio)->bv_page; + loff_t file_offset = page_offset(page); + + ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + if (ret) + goto out; + } + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); if (ret) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e8dee1578d4a..2dc707f02f00 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -920,6 +920,84 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, } } +static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, + u64 len) +{ + struct inode *inode = ordered->inode; + u64 file_offset = ordered->file_offset + pos; + u64 disk_bytenr = ordered->disk_bytenr + pos; + u64 num_bytes = len; + u64 disk_num_bytes = len; + int type; + unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT); + int compress_type = ordered->compress_type; + unsigned long weight; + int ret; + + weight = hweight_long(flags_masked); + WARN_ON_ONCE(weight > 1); + if (!weight) + type = 0; + else + type = __ffs(flags_masked); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { + WARN_ON_ONCE(1); + ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), + file_offset, disk_bytenr, num_bytes, + disk_num_bytes, compress_type); + } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } else { + ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, + disk_bytenr, num_bytes, disk_num_bytes, type); + } + + return ret; +} + +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post) 
+{ + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret = 0; + + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; + rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); + if (tree->last == node) + tree->last = NULL; + + ordered->file_offset += pre; + ordered->disk_bytenr += pre; + ordered->num_bytes -= (pre + post); + ordered->disk_num_bytes -= (pre + post); + ordered->bytes_left -= (pre + post); + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + if (node) + btrfs_panic(fs_info, -EEXIST, + "zoned: inconsistency in ordered tree at offset %llu", + ordered->file_offset); + + spin_unlock_irq(&tree->lock); + + if (pre) + ret = clone_ordered_extent(ordered, 0, pre); + if (post) + ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, + post); + + return ret; +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index cca3307807e8..c400be75a3f1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -201,6 +201,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); +int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + u64 post); int __init ordered_data_init(void); void __cold ordered_data_exit(void); From cacb2cea46382aacf0365dbe231bd1ac3349478e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:01 +0900 Subject: [PATCH 270/307] btrfs: zoned: check if bio spans across an ordered extent To ensure that an ordered extent maps to a contiguous region on disk, we need to maintain a "one bio == one 
ordered extent" rule. Ensure that a bio under construction does not span more than one ordered extent. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent_io.c | 9 +++++++-- fs/btrfs/inode.c | 27 +++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9b0521d9e89..10da47ab093a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3120,6 +3120,8 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size); void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6b26777efb92..f64e2be5749e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3124,10 +3124,15 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) return false; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct page *first_page = bio_first_bvec_all(bio)->bv_page; + + if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) + return false; ret = bio_add_zone_append_page(bio, page, size, pg_offset); - else + } else { ret = bio_add_page(bio, page, size, pg_offset); + } return ret == size; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 750482a06d67..31545e503b9e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2215,6 +2215,33 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return 
btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } +bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, + unsigned int size) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_ordered_extent *ordered; + u64 len = bio->bi_iter.bi_size + size; + bool ret = true; + + ASSERT(btrfs_is_zoned(fs_info)); + ASSERT(fs_info->max_zone_append_size > 0); + ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); + + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); + if (!ordered) + return ret; + + if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > + ordered->disk_bytenr + ordered->disk_num_bytes) + ret = false; + + btrfs_put_ordered_extent(ordered); + + return ret; +} + static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { From 138082f36610698e3fd00318f275d7f2159b8d26 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:02 +0900 Subject: [PATCH 271/307] btrfs: extend btrfs_rmap_block for specifying a device btrfs_rmap_block currently reverse-maps the physical addresses on all devices to the corresponding logical addresses. Extend the function to match to a specified device. The old functionality of querying all devices is left intact by specifying NULL as target device. A block_device instead of a btrfs_device is passed into btrfs_rmap_block, as this function is intended to reverse-map the result of a bio, which only has a block_device. Also export the function for later use. 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 +++++++++++----- fs/btrfs/block-group.h | 8 +++----- fs/btrfs/tests/extent-map-tests.c | 2 +- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 70a0c0f8f99f..f5e9f560ce6d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1588,6 +1588,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group + * @bdev: physical device to resolve, can be NULL to indicate any device * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical @@ -1597,9 +1598,9 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * Used primarily to exclude those portions of a block group that contain super * block copies. 
*/ -EXPORT_FOR_TESTS int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len) { struct extent_map *em; struct map_lookup *map; @@ -1617,6 +1618,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, map = em->map_lookup; data_stripe_length = em->orig_block_len; io_stripe_size = map->stripe_len; + chunk_start = em->start; /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -1631,14 +1633,18 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, for (i = 0; i < map->num_stripes; i++) { bool already_inserted = false; u64 stripe_nr; + u64 offset; int j; if (!in_range(physical, map->stripes[i].physical, data_stripe_length)) continue; + if (bdev && map->stripes[i].dev->bdev != bdev) + continue; + stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; @@ -1652,7 +1658,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, * instead of map->stripe_len */ - bytenr = chunk_start + stripe_nr * io_stripe_size; + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; /* Ensure we don't add duplicate addresses */ for (j = 0; j < nr; j++) { @@ -1694,7 +1700,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(fs_info, cache->start, + ret = btrfs_rmap_block(fs_info, cache->start, NULL, bytenr, &logical, &nr, &stripe_len); if (ret) return ret; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 0fd66febe115..d14ac03bb93d 100644 --- a/fs/btrfs/block-group.h +++ 
b/fs/btrfs/block-group.h @@ -277,6 +277,9 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache, struct btrfs_caching_control *caching_ctl); +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + struct block_device *bdev, u64 physical, u64 **logical, + int *naddrs, int *stripe_len); static inline u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info) { @@ -303,9 +306,4 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) void btrfs_freeze_block_group(struct btrfs_block_group *cache); void btrfs_unfreeze_block_group(struct btrfs_block_group *cache); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); -#endif - #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 57379e96ccc9..c0aefe6dee0b 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -507,7 +507,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, em->start, NULL, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", From 08f455593fff701e103876d4db5d3f4f6d0ff871 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:03 +0900 Subject: [PATCH 272/307] btrfs: zoned: cache if block group is on a sequential zone On a zoned filesystem, cache if a block group is on a sequential write only zone. 
On sequential write only zones, we can use REQ_OP_ZONE_APPEND for writing data, therefore provide btrfs_use_zone_append() to figure out if IO is targeting a sequential write only zone and we can use REQ_OP_ZONE_APPEND for data writing. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 3 +++ fs/btrfs/zoned.c | 29 +++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 6 ++++++ 3 files changed, 38 insertions(+) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index d14ac03bb93d..31c7c5872b92 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -181,6 +181,9 @@ struct btrfs_block_group { */ int needs_free_space; + /* Flag indicating this block group is placed on a sequential zone */ + bool seq_zone; + /* Record locked full stripes for RAID5/6 block group */ struct btrfs_full_stripe_locks_tree full_stripe_locks_root; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1de67d789b83..f6c68704c840 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1101,6 +1101,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } } + if (num_sequential > 0) + cache->seq_zone = true; + if (num_conventional > 0) { /* * Avoid calling calculate_alloc_pointer() for new BG. 
It @@ -1218,3 +1221,29 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) } spin_unlock(&trans->releasing_ebs_lock); } + +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + + if (!fs_info->max_zone_append_size) + return false; + + if (!is_data_inode(&inode->vfs_inode)) + return false; + + cache = btrfs_lookup_block_group(fs_info, em->block_start); + ASSERT(cache); + if (!cache) + return false; + + ret = cache->seq_zone; + btrfs_put_block_group(cache); + + return ret; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index c105641a6ad3..14d578328cbe 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -46,6 +46,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); +bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -133,6 +134,11 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, + struct extent_map *em) +{ + return false; +} #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 24533f6a9ad633d6ff0332844fadafb9ecf4a917 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 4 Feb 2021 19:22:04 +0900 Subject: [PATCH 273/307] btrfs: save irq flags when looking up an ordered extent A following patch will add another caller of btrfs_lookup_ordered_extent(), but from a bio's endio context. 
btrfs_lookup_ordered_extent() uses spin_lock_irq() which unconditionally disables interrupts. Change this to spin_lock_irqsave() so interrupts aren't disabled and re-enabled unconditionally. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2dc707f02f00..fe235ab935d3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -767,9 +767,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irqsave(&tree->lock, flags); node = tree_search(tree, file_offset); if (!node) goto out; @@ -780,7 +781,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino if (entry) refcount_inc(&entry->refs); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irqrestore(&tree->lock, flags); return entry; } From d8e3fb106f393858b90b3befc4f6092a76c86d1c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:05 +0900 Subject: [PATCH 274/307] btrfs: zoned: use ZONE_APPEND write for zoned mode Enable zone append writing for zoned mode. When using zone append, a bio is issued to the start of a target zone and the device decides where to place it inside the zone. Upon completion the device reports the actual written position back to the host. Three parts are necessary to enable zone append mode. First, modify the bio to use REQ_OP_ZONE_APPEND in btrfs_submit_bio_hook() and adjust the bi_sector to point to the beginning of the zone. Second, record the returned physical address (and disk/partno) in the ordered extent in end_bio_extent_writepage() after the bio has been completed. 
We cannot resolve the physical address to the logical address because we can neither take locks nor allocate a buffer in this end_bio context. So, we need to record the physical address to resolve it later in btrfs_finish_ordered_io(). And finally, rewrite the logical addresses of the extent mapping and checksum data according to the physical address using btrfs_rmap_block. If the returned address matches the originally allocated address, we can skip this rewriting process. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 +++++++-- fs/btrfs/file.c | 6 +++- fs/btrfs/inode.c | 4 +++ fs/btrfs/ordered-data.c | 3 ++ fs/btrfs/ordered-data.h | 8 +++++ fs/btrfs/volumes.c | 14 +++++++++ fs/btrfs/zoned.c | 70 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 12 +++++++ 8 files changed, 129 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f64e2be5749e..14a68f3589cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2735,6 +2735,7 @@ static void end_bio_extent_writepage(struct bio *bio) u64 start; u64 end; struct bvec_iter_all iter_all; + bool first_bvec = true; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -2761,6 +2762,11 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; + if (first_bvec) { + btrfs_record_physical_zoned(inode, start, bio); + first_bvec = false; + } + end_extent_writepage(page, error, start, end); end_page_writeback(page); } @@ -3664,6 +3670,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct extent_map *em; int ret = 0; int nr = 0; + u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; @@ -3710,6 +3717,10 @@ static noinline_for_stack int 
__extent_writepage_io(struct btrfs_inode *inode, /* Note that em_end from extent_map_end() is exclusive */ iosize = min(em_end, end + 1) - cur; + + if (btrfs_use_zone_append(inode, em)) + opf = REQ_OP_ZONE_APPEND; + free_extent_map(em); em = NULL; @@ -3735,8 +3746,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - page, disk_bytenr, iosize, + ret = submit_extent_page(opf | write_flags, wbc, page, + disk_bytenr, iosize, cur - page_offset(page), &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bf52d7e85914..01a72f53fb5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2168,8 +2168,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. + * + * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the + * logical address recorded in the ordered extent may change. We need + * to wait for the IO to stabilize the logical address. 
*/ - if (full_sync) { + if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); } else { /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 31545e503b9e..6dbab9293425 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" +#include "zoned.h" struct btrfs_iget_args { u64 ino; @@ -2874,6 +2875,9 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + if (ordered_extent->disk) + btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_free_io_failure_record(inode, start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fe235ab935d3..985a21558437 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -199,6 +199,9 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = ret; + entry->physical = (u64)-1; + entry->disk = NULL; + entry->partno = (u8)-1; ASSERT(type == BTRFS_ORDERED_REGULAR || type == BTRFS_ORDERED_NOCOW || diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c400be75a3f1..99e0853e4d3b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -139,6 +139,14 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; + + /* + * Used to reverse-map physical address returned from ZONE_APPEND write + * command in a workqueue context + */ + u64 physical; + struct gendisk *disk; + u8 partno; }; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 400375aaa197..a4d47c6050f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6500,6 +6500,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfs_io_bio(bio)->device = dev; bio->bi_end_io = 
btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } btrfs_debug_in_rcu(fs_info, "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f6c68704c840..f4e226bda9b0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1247,3 +1247,73 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) return ret; } + +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio) +{ + struct btrfs_ordered_extent *ordered; + const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (bio_op(bio) != REQ_OP_ZONE_APPEND) + return; + + ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); + if (WARN_ON(!ordered)) + return; + + ordered->physical = physical; + ordered->disk = bio->bi_disk; + ordered->partno = bio->bi_partno; + + btrfs_put_ordered_extent(ordered); +} + +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) +{ + struct btrfs_inode *inode = BTRFS_I(ordered->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_ordered_sum *sum; + struct block_device *bdev; + u64 orig_logical = ordered->disk_bytenr; + u64 *logical = NULL; + int nr, stripe_len; + + /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ + ASSERT(ordered->partno == 0); + bdev = bdgrab(ordered->disk->part0); + if (WARN_ON(!bdev)) + return; + + if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, bdev, + ordered->physical, &logical, &nr, + &stripe_len))) + goto out; + + WARN_ON(nr != 1); + + if (orig_logical == *logical) + goto out; + + ordered->disk_bytenr = *logical; + + em_tree = &inode->extent_tree; + write_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); + em->block_start = *logical; + free_extent_map(em); + write_unlock(&em_tree->lock); + + list_for_each_entry(sum, &ordered->list, list) { + if (*logical < orig_logical) + sum->bytenr -= orig_logical - *logical; + else + sum->bytenr += *logical - orig_logical; + } + +out: + kfree(logical); + bdput(bdev); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 14d578328cbe..04f7b21652b6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -47,6 +47,9 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, + struct bio *bio); +void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -139,6 +142,15 @@ static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, { return false; } + +static inline void btrfs_record_physical_zoned(struct inode *inode, + u64 file_offset, struct bio *bio) +{ +} + +static inline void btrfs_rewrite_logical_zoned( + struct btrfs_ordered_extent *ordered) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 544d24f9de73642a65d50389b789a957b14ae3f6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: 
Thu, 4 Feb 2021 19:22:06 +0900 Subject: [PATCH 275/307] btrfs: zoned: enable zone append writing for direct IO As with buffered IO, enable zone append writing for direct IO when it is used on a zoned block device. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dbab9293425..dd6fe8afd0e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7738,6 +7738,9 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; + if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + iomap->flags |= IOMAP_F_ZONE_APPEND; + free_extent_map(em); return 0; @@ -7964,6 +7967,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) dip->dio_bio->bi_status = err; + btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + bio_put(bio); btrfs_dio_private_put(dip); } @@ -8124,6 +8129,19 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; + WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && + fs_info->max_zone_append_size && + bio_op(bio) != REQ_OP_ZONE_APPEND); + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + status = extract_ordered_extent(BTRFS_I(inode), bio, + file_offset); + if (status) { + bio_put(bio); + goto out_err; + } + } + ASSERT(submit_len >= clone_len); submit_len -= clone_len; From 42c011000963442ce533d92a492c4a057b2f5a46 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:07 +0900 Subject: [PATCH 276/307] btrfs: zoned: introduce dedicated data write path for zoned filesystems If more than one IO is issued for one file extent, these IO can be written to separate regions on a device.
Since we cannot map one file extent to such a separate area on a zoned filesystem, we need to follow the "one IO == one ordered extent" rule. The normal buffered, uncompressed and not pre-allocated write path (used by cow_file_range()) sometimes does not follow this rule. It can write a part of an ordered extent when given a region to write, e.g., when it is called from fdatasync(). Introduce a dedicated (uncompressed buffered) data write path for zoned filesystems, that will COW the region and write it at once. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd6fe8afd0e0..c4779cde83c6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1394,6 +1394,29 @@ static int cow_file_range_async(struct btrfs_inode *inode, return 0; } +static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0); + if (ret) + return ret; + + if (*page_started) + return 0; + + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + *page_started = 1; + + return 0; +} + static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { @@ -1871,17 +1894,24 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page { int ret; int force_cow = need_force_cow(inode, start, end); + const bool zoned = btrfs_is_zoned(inode->root->fs_info); if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); } else if
(inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { + ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + if (zoned) + ret = run_delalloc_zoned(inode, locked_page, start, end, + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, From 0bc09ca12980db3ef1e55bfad25b1803d57628c9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:08 +0900 Subject: [PATCH 277/307] btrfs: zoned: serialize metadata IO We cannot use zone append for writing metadata, because the B-tree nodes have references to each other using logical address. Without knowing the address in advance, we cannot construct the tree in the first place. So we need to serialize write IOs for metadata. We cannot add a mutex around allocation and submission because metadata blocks are allocated in an earlier stage to build up B-trees. Add a zoned_meta_io_lock and hold it during metadata IO submission in btree_write_cache_pages() to serialize IOs. Furthermore, this adds a per-block group metadata IO submission pointer "meta_write_pointer" to ensure sequential writing, which can break when attempting to write back blocks in an unfinished transaction. If the writing out failed because of a hole and the write out is for data integrity (WB_SYNC_ALL), it returns EAGAIN. A caller like fsync() code should handle this properly e.g. by falling back to a full transaction commit. 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/extent_io.c | 25 ++++++++++++++++++++- fs/btrfs/zoned.c | 50 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 32 +++++++++++++++++++++++++++ 6 files changed, 109 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 31c7c5872b92..a07108d65c44 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -193,6 +193,7 @@ struct btrfs_block_group { */ u64 alloc_offset; u64 zone_unusable; + u64 meta_write_pointer; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 10da47ab093a..1bb4f767966a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -975,6 +975,7 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70621184a731..458bb27e0327 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2769,6 +2769,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->zoned_meta_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14a68f3589cc..dfa6c6106b94 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -26,6 +26,7 @@ #include "disk-io.h" #include "subpage.h" #include "zoned.h" +#include "block-group.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -4161,6 +4162,7 @@ static int submit_eb_page(struct page *page, struct writeback_control 
*wbc, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; + struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -4193,13 +4195,31 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; + if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { + /* + * If for_sync, this hole will be filled with the + * transaction commit. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; + free_extent_buffer(eb); + return ret; + } + *eb_context = eb; ret = lock_extent_buffer_for_io(eb, epd); if (ret <= 0) { + btrfs_revert_meta_write_pointer(cache, eb); + if (cache) + btrfs_put_block_group(cache); free_extent_buffer(eb); return ret; } + if (cache) + btrfs_put_block_group(cache); ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4245,6 +4265,7 @@ int btree_write_cache_pages(struct address_space *mapping, tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); @@ -4285,7 +4306,7 @@ retry: } if (ret < 0) { end_write_bio(&epd, ret); - return ret; + goto out; } /* * If something went wrong, don't allow any metadata write bio to be @@ -4320,6 +4341,8 @@ retry: ret = -EROFS; end_write_bio(&epd, ret); } +out: + btrfs_zoned_meta_io_unlock(fs_info); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f4e226bda9b0..b2a6553b2db0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1159,6 +1159,9 @@ out: ret = -EIO; } + if (!ret) + cache->meta_write_pointer = cache->alloc_offset + cache->start; + kfree(alloc_offsets); free_extent_map(em); @@ -1317,3 +1320,50 @@ out: kfree(logical); bdput(bdev); } + +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + struct btrfs_block_group *cache; +
bool ret = true; + + if (!btrfs_is_zoned(fs_info)) + return true; + + cache = *cache_ret; + + if (cache && (eb->start < cache->start || + cache->start + cache->length <= eb->start)) { + btrfs_put_block_group(cache); + cache = NULL; + *cache_ret = NULL; + } + + if (!cache) + cache = btrfs_lookup_block_group(fs_info, eb->start); + + if (cache) { + if (cache->meta_write_pointer != eb->start) { + btrfs_put_block_group(cache); + cache = NULL; + ret = false; + } else { + cache->meta_write_pointer = eb->start + eb->len; + } + + *cache_ret = cache; + } + + return ret; +} + +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ + if (!btrfs_is_zoned(eb->fs_info) || !cache) + return; + + ASSERT(cache->meta_write_pointer == eb->start + eb->len); + cache->meta_write_pointer = eb->start; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 04f7b21652b6..0755a25d0f4c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -50,6 +50,11 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); +bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret); +void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, + struct extent_buffer *eb); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -151,6 +156,19 @@ static inline void btrfs_record_physical_zoned(struct inode *inode, static inline void btrfs_rewrite_logical_zoned( struct btrfs_ordered_extent *ordered) { } +static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + struct btrfs_block_group **cache_ret) +{ + return true; +} + +static inline void 
btrfs_revert_meta_write_pointer( + struct btrfs_block_group *cache, + struct extent_buffer *eb) +{ +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -242,4 +260,18 @@ static inline bool btrfs_can_zone_reset(struct btrfs_device *device, return true; } +static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_lock(&fs_info->zoned_meta_io_lock); +} + +static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_is_zoned(fs_info)) + return; + mutex_unlock(&fs_info->zoned_meta_io_lock); +} + #endif From 24c0a7227fdfa598badcfc0f735d16745d39e0c4 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:09 +0900 Subject: [PATCH 278/307] btrfs: zoned: wait for existing extents before truncating When truncating a file, file buffers which have already been allocated but not yet written may be truncated. Truncating these buffers could cause breakage of a sequential write pattern in a block group if the truncated blocks are for example followed by blocks allocated to another file. To avoid this problem, always wait for write out of all unwritten buffers before proceeding with the truncate execution. 
Signed-off-by: Naohiro Aota Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4779cde83c6..535abf898225 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5169,6 +5169,15 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_wait_ordered_range(inode, + ALIGN(newsize, fs_info->sectorsize), + (u64)-1); + if (ret) + return ret; + } /* * We're truncating a file that used to have good data down to From 4eef29ef6360d9c3e4be111392e20b70e19171cc Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:10 +0900 Subject: [PATCH 279/307] btrfs: zoned: do not use async metadata checksum on zoned filesystems On zoned filesystems, btrfs uses per-fs zoned_meta_io_lock to serialize the metadata write IOs. Even with this serialization, write bios sent from btree_write_cache_pages can be reordered by async checksum workers as these workers are per CPU and not per zone. To preserve write bio ordering, we disable async metadata checksum on a zoned filesystem. This does not result in lower performance with HDDs as a single CPU core is fast enough to do checksum for a single zone write stream with the maximum possible bandwidth of the device. If multiple zones are being written simultaneously, HDD seek overhead lowers the achievable maximum bandwidth, resulting again in a per zone checksum serialization not affecting the performance. 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 458bb27e0327..6e16f556ed75 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -871,6 +871,8 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, static int check_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { + if (btrfs_is_zoned(fs_info)) + return 0; if (atomic_read(&bi->sync_writers)) return 0; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) From 78ce9fc269af6e69c1399ab910ba6bc81c934f67 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:11 +0900 Subject: [PATCH 280/307] btrfs: zoned: mark block groups to copy for device-replace This is the 1/4 patch to support device-replace on zoned filesystems. We have two types of IOs during the device replace process. One is an IO to "copy" (by the scrub functions) all the device extents from the source device to the destination device. The other one is an IO to "clone" (by handle_ops_on_dev_replace()) new incoming write IOs from users to the source device into the target device. Cloning incoming IOs can break the sequential write rule on the target device. When a write is mapped in the middle of a block group, the IO is directed to the middle of a target device zone, which breaks the sequential write requirement. However, the cloning function cannot be disabled since incoming IOs targeting already copied device extents must be cloned so that the IO is executed on the target device. We cannot use dev_replace->cursor_{left,right} to determine whether a bio is going to a not yet copied region. Since we have a time gap between finishing btrfs_scrub_dev() and rewriting the mapping tree in btrfs_dev_replace_finishing(), we can have a newly allocated device extent which is never cloned nor copied.
So the point is to copy only already existing device extents. This patch introduces mark_block_group_to_copy() to mark existing block groups as a target of copying. Then, handle_ops_on_dev_replace() and dev-replace can check the flag to do their job. Also, btrfs_finish_block_group_to_copy() will check if the copied stripe is the last stripe in the block group. With the last stripe copied, the to_copy flag is finally disabled. Afterwards we can safely clone incoming IOs on this block group. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/dev-replace.c | 184 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/dev-replace.h | 3 + fs/btrfs/scrub.c | 16 ++++ 4 files changed, 204 insertions(+) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index a07108d65c44..d37ee576ac6e 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -95,6 +95,7 @@ struct btrfs_block_group { unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; + unsigned int to_copy:1; int disk_cache_state; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bc73f798ce3a..3a9c1e046ebe 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -22,6 +22,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" +#include "block-group.h" /* * Device replace overview @@ -459,6 +460,185 @@ static char* btrfs_dev_name(struct btrfs_device *device) return rcu_str_deref(device->name); } +static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, + struct btrfs_device *src_dev) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_block_group *cache; + struct btrfs_trans_handle *trans; + int ret = 0; + u64 chunk_offset; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return 
0; + + mutex_lock(&fs_info->chunk_mutex); + + /* Ensure we don't have pending new block group */ + spin_lock(&fs_info->trans_lock); + while (fs_info->running_transaction && + !list_empty(&fs_info->running_transaction->dev_update_list)) { + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->chunk_mutex); + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret == -ENOENT) { + spin_lock(&fs_info->trans_lock); + continue; + } else { + goto unlock; + } + } + + ret = btrfs_commit_transaction(trans); + mutex_lock(&fs_info->chunk_mutex); + if (ret) + goto unlock; + + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto unlock; + } + + path->reada = READA_FORWARD; + path->search_commit_root = 1; + path->skip_locking = 1; + + key.objectid = src_dev->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto free_path; + if (ret > 0) { + ret = 0; + goto free_path; + } + } else { + ret = 0; + } + } + + while (1) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != src_dev->devid) + break; + + if (found_key.type != BTRFS_DEV_EXTENT_KEY) + break; + + if (found_key.offset < key.offset) + break; + + dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + + chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!cache) + goto skip; + + spin_lock(&cache->lock); + cache->to_copy = 1; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + +skip: + ret = btrfs_next_item(root, 
path); + if (ret != 0) { + if (ret > 0) + ret = 0; + break; + } + } + +free_path: + btrfs_free_path(path); +unlock: + mutex_unlock(&fs_info->chunk_mutex); + + return ret; +} + +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct extent_map *em; + struct map_lookup *map; + u64 chunk_offset = cache->start; + int num_extents, cur_extent; + int i; + + /* Do not use "to_copy" on non zoned filesystem for now */ + if (!btrfs_is_zoned(fs_info)) + return true; + + spin_lock(&cache->lock); + if (cache->removed) { + spin_unlock(&cache->lock); + return true; + } + spin_unlock(&cache->lock); + + em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + ASSERT(!IS_ERR(em)); + map = em->map_lookup; + + num_extents = cur_extent = 0; + for (i = 0; i < map->num_stripes; i++) { + /* We have more device extent to copy */ + if (srcdev != map->stripes[i].dev) + continue; + + num_extents++; + if (physical == map->stripes[i].physical) + cur_extent = i; + } + + free_extent_map(em); + + if (num_extents > 1 && cur_extent < num_extents - 1) { + /* + * Has more stripes on this device. Keep this block group + * readonly until we finish all the stripes. 
+ */ + return false; + } + + /* Last stripe on this device */ + spin_lock(&cache->lock); + cache->to_copy = 0; + spin_unlock(&cache->lock); + + return true; +} + static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, int read_src) @@ -500,6 +680,10 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, if (ret) return ret; + ret = mark_block_group_to_copy(fs_info, src_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 60b70dacc299..3911049a5f23 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -18,5 +18,8 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, + struct btrfs_block_group *cache, + u64 physical); #endif diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..da4f9c24e42d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3561,6 +3561,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache) goto skip; + if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { + spin_lock(&cache->lock); + if (!cache->to_copy) { + spin_unlock(&cache->lock); + ro_set = 0; + goto done; + } + spin_unlock(&cache->lock); + } + /* * Make sure that while we are scrubbing the corresponding block * group doesn't get its logical address and its device extents @@ -3692,6 +3702,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) + ro_set = 0; 
+ +done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; From 6143c23ccced762d21a87ef5fa421ba876231131 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:12 +0900 Subject: [PATCH 281/307] btrfs: zoned: implement cloning for zoned device-replace This is 2/4 patch to implement device replace for zoned filesystems. In zoned mode, a block group must be either copied (from the source device to the target device) or cloned (to both devices). Implement the cloning part. If a block group targeted by an IO is marked to copy, we should not clone the IO to the destination device, because the block group is eventually copied by the replace process. This commit also handles cloning of device reset. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++----------- fs/btrfs/volumes.c | 31 +++++++++++++++++++++-- fs/btrfs/zoned.c | 9 +++++++ 3 files changed, 80 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d976c56b3a56..cd308bf3a220 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "discard.h" #include "rcu-string.h" #include "zoned.h" +#include "dev-replace.h" #undef SCRAMBLE_DELAYED_REFS @@ -1265,6 +1266,46 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } +static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +{ + struct btrfs_device *dev = stripe->dev; + struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 phys = stripe->physical; + u64 len = stripe->length; + u64 discarded = 0; + int ret = 0; + + /* Zone reset on a zoned filesystem */ + if (btrfs_can_zone_reset(dev, phys, len)) { + u64 src_disc; + + ret = btrfs_reset_device_zone(dev, phys, len, &discarded); + if (ret) + 
goto out; + + if (!btrfs_dev_replace_is_ongoing(dev_replace) || + dev != dev_replace->srcdev) + goto out; + + src_disc = discarded; + + /* Send to replace target as well */ + ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, + &discarded); + discarded += src_disc; + } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); + } else { + ret = 0; + *bytes = 0; + } + +out: + *bytes = discarded; + return ret; +} + int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes) { @@ -1298,28 +1339,14 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { - struct btrfs_device *dev = stripe->dev; - u64 physical = stripe->physical; - u64 length = stripe->length; u64 bytes; - struct request_queue *req_q; if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - req_q = bdev_get_queue(stripe->dev->bdev); - /* Zone reset on zoned filesystems */ - if (btrfs_can_zone_reset(dev, physical, length)) - ret = btrfs_reset_device_zone(dev, physical, - length, &bytes); - else if (blk_queue_discard(req_q)) - ret = btrfs_issue_discard(dev->bdev, physical, - length, &bytes); - else - continue; - + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; } else if (ret != -EOPNOTSUPP) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a4d47c6050f7..52ec6721ada2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5973,9 +5973,29 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, return ret; } +static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + bool ret; + + /* Non-ZONED mode does not use "to_copy" flag */ + if (!btrfs_is_zoned(fs_info)) + return false; + + cache = btrfs_lookup_block_group(fs_info, logical); + + 
spin_lock(&cache->lock); + ret = cache->to_copy; + spin_unlock(&cache->lock); + + btrfs_put_block_group(cache); + return ret; +} + static void handle_ops_on_dev_replace(enum btrfs_map_op op, struct btrfs_bio **bbio_ret, struct btrfs_dev_replace *dev_replace, + u64 logical, int *num_stripes_ret, int *max_errors_ret) { struct btrfs_bio *bbio = *bbio_ret; @@ -5988,6 +6008,13 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, if (op == BTRFS_MAP_WRITE) { int index_where_to_add; + /* + * A block group which has "to_copy" set will eventually + * be copied by the dev-replace process. We can avoid cloning IO here. + */ + if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) + return; + /* * duplicate the write operations while the dev replace * procedure is running. Since the copying of the old disk to @@ -6376,8 +6403,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, - &max_errors); + handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + &num_stripes, &max_errors); } *bbio_ret = bbio; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b2a6553b2db0..15288f766842 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -11,6 +11,7 @@ #include "disk-io.h" #include "block-group.h" #include "transaction.h" +#include "dev-replace.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1036,6 +1037,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) for (i = 0; i < map->num_stripes; i++) { bool is_sequential; struct blk_zone zone; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; device = map->stripes[i].dev; physical = map->stripes[i].physical; @@ -1062,6 +1065,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) */
btrfs_dev_clear_zone_empty(device, physical); + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical); + up_read(&dev_replace->rwsem); + /* * The group is mapped to a sequential zone. Get the zone write * pointer to determine the allocation offset within the zone. From de17addce7a20db311c020fa91497a7341782d2d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:13 +0900 Subject: [PATCH 282/307] btrfs: zoned: implement copying for zoned device-replace This is 3/4 patch to implement device-replace on zoned filesystems. This commit implements copying. To do this, it tracks the write pointer during the device replace process. As device-replace's copy process is smart enough to only copy used extents on the source device, we have to fill the gap to honor the sequential write requirement in the target device. The device-replace process on zoned filesystems must copy or clone all the extents in the source device exactly once. So, we need to ensure allocations started just before the dev-replace process to have their corresponding extent information in the B-trees. finish_extent_writes_for_zoned() implements that functionality, which basically is the removed code in the commit 042528f8d840 ("Btrfs: fix block group remaining RO forever after error during device replace"). 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 2 +- fs/btrfs/zoned.c | 9 +++++ fs/btrfs/zoned.h | 7 ++++ 4 files changed, 101 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index da4f9c24e42d..92904902d160 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -166,6 +166,7 @@ struct scrub_ctx { int pages_per_rd_bio; int is_dev_replace; + u64 write_pointer; struct scrub_bio *wr_curr_bio; struct mutex wr_lock; @@ -1619,6 +1620,25 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, return scrub_add_page_to_wr_bio(sblock->sctx, spage); } +static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +{ + int ret = 0; + u64 length; + + if (!btrfs_is_zoned(sctx->fs_info)) + return 0; + + if (sctx->write_pointer < physical) { + length = physical - sctx->write_pointer; + + ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, + sctx->write_pointer, length); + if (!ret) + sctx->write_pointer = physical; + } + return ret; +} + static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { @@ -1641,6 +1661,13 @@ again: if (sbio->page_count == 0) { struct bio *bio; + ret = fill_writer_pointer_gap(sctx, + spage->physical_for_dev_replace); + if (ret) { + mutex_unlock(&sctx->wr_lock); + return ret; + } + sbio->physical = spage->physical_for_dev_replace; sbio->logical = spage->logical; sbio->dev = sctx->wr_tgtdev; @@ -1702,6 +1729,9 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * doubled the write performance on spinning disks when measured * with Linux 3.5 */ btrfsic_submit_bio(sbio->bio); + + if (btrfs_is_zoned(sctx->fs_info)) + sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -3025,6 +3055,20 @@ out: return ret < 0 ? 
ret : 0; } +static void sync_replace_for_zoned(struct scrub_ctx *sctx) +{ + if (!btrfs_is_zoned(sctx->fs_info)) + return; + + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + + wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3165,6 +3209,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ blk_start_plug(&plug); + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); + sctx->flush_all_writes = true; + } + /* * now find all extents for each stripe and scrub them */ @@ -3353,6 +3405,9 @@ again: if (ret) goto out; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + if (extent_logical + extent_len < key.objectid + bytes) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -3475,6 +3530,25 @@ out: return ret; } +static int finish_extent_writes_for_zoned(struct btrfs_root *root, + struct btrfs_block_group *cache) +{ + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_trans_handle *trans; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + return btrfs_commit_transaction(trans); +} + static noinline_for_stack int scrub_enumerate_chunks(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, u64 start, u64 end) @@ -3629,6 +3703,16 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * group is not RO. 
*/ ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); + if (!ret && sctx->is_dev_replace) { + ret = finish_extent_writes_for_zoned(root, cache); + if (ret) { + btrfs_dec_block_group_ro(cache); + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + if (ret == 0) { ro_set = 1; } else if (ret == -ENOSPC && !sctx->is_dev_replace) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 52ec6721ada2..1312b17a6b49 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5978,7 +5978,7 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) struct btrfs_block_group *cache; bool ret; - /* Non-ZONED mode does not use "to_copy" flag */ + /* Non zoned filesystem does not use "to_copy" flag */ if (!btrfs_is_zoned(fs_info)) return false; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 15288f766842..a4707bab6073 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1376,3 +1376,12 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, ASSERT(cache->meta_write_pointer == eb->start + eb->len); cache->meta_write_pointer = eb->start; } + +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) +{ + if (!btrfs_dev_is_sequential(device, physical)) + return -EOPNOTSUPP; + + return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, + length >> SECTOR_SHIFT, GFP_NOFS, 0); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 0755a25d0f4c..5ed1ea2009ea 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -55,6 +55,7 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, struct btrfs_block_group **cache_ret); void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); +int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -169,6 +170,12 
@@ static inline void btrfs_revert_meta_write_pointer( { } +static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, + u64 physical, u64 length) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 7db1c5d14dcd521bef1780b79dcc68b3968447a9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:14 +0900 Subject: [PATCH 283/307] btrfs: zoned: support dev-replace in zoned filesystems This is 4/4 patch to implement device-replace on zoned filesystems. Even after the copying is done, the write pointers of the source device and the destination device may not be synchronized. For example, when the last allocated extent is freed before device-replace process, the extent is not copied, leaving a hole there. Synchronize the write pointers by writing zeroes to the destination device. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 40 ++++++++++++++++++++++++++ fs/btrfs/zoned.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 9 ++++++ 3 files changed, 123 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92904902d160..e0c3ec01e324 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1628,6 +1628,9 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) if (!btrfs_is_zoned(sctx->fs_info)) return 0; + if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + if (sctx->write_pointer < physical) { length = physical - sctx->write_pointer; @@ -3069,6 +3072,32 @@ static void sync_replace_for_zoned(struct scrub_ctx *sctx) wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); } +static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + u64 physical, u64 physical_end) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + wait_event(sctx->list_wait, 
atomic_read(&sctx->bios_in_flight) == 0); + + mutex_lock(&sctx->wr_lock); + if (sctx->write_pointer < physical_end) { + ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, + physical, + sctx->write_pointer); + if (ret) + btrfs_err(fs_info, + "zoned: failed to recover write pointer"); + } + mutex_unlock(&sctx->wr_lock); + btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, @@ -3475,6 +3504,17 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); btrfs_free_path(ppath); + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; + + ret2 = sync_write_pointer_for_zoned(sctx, base + offset, + map->stripes[num].physical, + physical_end); + if (ret2) + ret = ret2; + } + return ret < 0 ? ret : 0; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a4707bab6073..9a5cf153da89 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -12,6 +12,7 @@ #include "block-group.h" #include "transaction.h" #include "dev-replace.h" +#include "space-info.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 @@ -1385,3 +1386,76 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, GFP_NOFS, 0); } + +static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, + struct blk_zone *zone) +{ + struct btrfs_bio *bbio = NULL; + u64 mapped_length = PAGE_SIZE; + unsigned int nofs_flag; + int nmirrors; + int i, ret; + + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &mapped_length, &bbio); + if (ret || !bbio || mapped_length < PAGE_SIZE) { + btrfs_put_bbio(bbio); + return -EIO; + } + + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + return -EINVAL; + + nofs_flag = memalloc_nofs_save(); + nmirrors = (int)bbio->num_stripes; + for (i 
= 0; i < nmirrors; i++) { + u64 physical = bbio->stripes[i].physical; + struct btrfs_device *dev = bbio->stripes[i].dev; + + /* Missing device */ + if (!dev->bdev) + continue; + + ret = btrfs_get_dev_zone(dev, physical, zone); + /* Failing device */ + if (ret == -EIO || ret == -EOPNOTSUPP) + continue; + break; + } + memalloc_nofs_restore(nofs_flag); + + return ret; +} + +/* + * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by + * filling zeros between @physical_pos to a write pointer of dev-replace + * source device. + */ +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos) +{ + struct btrfs_fs_info *fs_info = tgt_dev->fs_info; + struct blk_zone zone; + u64 length; + u64 wp; + int ret; + + if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) + return 0; + + ret = read_zone_info(fs_info, logical, &zone); + if (ret) + return ret; + + wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); + + if (physical_pos == wp) + return 0; + + if (physical_pos > wp) + return -EUCLEAN; + + length = wp - physical_pos; + return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5ed1ea2009ea..932ad9bc0de6 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -56,6 +56,8 @@ bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, struct extent_buffer *eb); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); +int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, + u64 physical_start, u64 physical_pos); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -176,6 +178,13 @@ static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, return -EOPNOTSUPP; } +static inline int 
btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, + u64 logical, u64 physical_start, + u64 physical_pos) +{ + return -EOPNOTSUPP; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 32430c614844169a5e5554dcbb307735ddd1f780 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:15 +0900 Subject: [PATCH 284/307] btrfs: zoned: enable relocation on a zoned filesystem Currently fallocate() is disabled on a zoned filesystem. Since current relocation process relies on preallocation to move file data extents, it must be handled differently. On a zoned filesystem, we just truncate the inode to the size that we wanted to pre-allocate. Then, we flush dirty pages on the file before finishing the relocation process. run_delalloc_zoned() will handle all the allocations and submit IOs to the underlying layers. Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 473b78874844..232d5da7b7be 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2553,6 +2553,31 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; + /* + * On a zoned filesystem, we cannot preallocate the file region. + * Instead, we dirty and fiemap_write the region. 
+ */ + if (btrfs_is_zoned(inode->root->fs_info)) { + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + + end = cluster->end - offset + 1; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + i_size_write(&inode->vfs_inode, end); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } + + return btrfs_end_transaction(trans); + } + inode_lock(&inode->vfs_inode); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2756,6 +2781,8 @@ static int relocate_file_extent_cluster(struct inode *inode, } } WARN_ON(nr != cluster->nr); + if (btrfs_is_zoned(fs_info) && !ret) + ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); out: kfree(ra); return ret; @@ -3434,8 +3461,12 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; + u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; + if (btrfs_is_zoned(trans->fs_info)) + flags &= ~BTRFS_INODE_PREALLOC; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3450,8 +3481,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | - BTRFS_INODE_PREALLOC); + btrfs_set_inode_flags(leaf, item, flags); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); From f7ef5287a63d644e62a52893af8c6cfcb5043213 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:16 +0900 Subject: [PATCH 285/307] btrfs: zoned: relocate block group to repair IO failure in zoned filesystems When a bad checksum is found and if the filesystem has a mirror of the damaged data, we 
read the correct data from the mirror and write it to the damaged blocks. This, however, violates the sequential write constraints of a zoned block device. We can consider three methods to repair an IO failure in zoned filesystems: (1) Reset and rewrite the damaged zone (2) Allocate a new device extent and replace the damaged device extent with the new extent (3) Relocate the corresponding block group Method (1) is most similar to the behavior on regular devices. However, it also wipes non-damaged data in the same device extent, and so it unnecessarily degrades non-damaged data. Method (2) is much like device replacing but done in the same device. It is safe because it keeps the device extent until the replacing finishes. However, extending device replacing is non-trivial. It assumes "src_dev->physical == dst_dev->physical". Also, the extent mapping replacing function should be extended to support replacing device extent position in one device. Method (3) invokes relocation of the damaged block group and is straightforward to implement. It relocates all the mirrored device extents, so it potentially is a more costly operation than method (1) or (2). But it relocates only used extents, which reduces the total IO size. Let's apply method (3) for now. In the future, we can extend device-replace and apply method (2). To protect a block group from being relocated multiple times due to multiple IO errors, this commit introduces a "relocating_repair" bit to show that it is now being relocated to repair IO failures. Also, it uses a new kthread "btrfs-relocating-repair", so as not to block the IO path with the relocating process. This commit also supports repairing in the scrub process. 
Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.h | 1 + fs/btrfs/extent_io.c | 3 ++ fs/btrfs/scrub.c | 3 ++ fs/btrfs/volumes.c | 72 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 1 + 5 files changed, 80 insertions(+) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index d37ee576ac6e..29678426247d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -96,6 +96,7 @@ struct btrfs_block_group { unsigned int has_caching_ctl:1; unsigned int removed:1; unsigned int to_copy:1; + unsigned int relocating_repair:1; int disk_cache_state; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dfa6c6106b94..4dfb3ead1175 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2260,6 +2260,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + if (btrfs_is_zoned(fs_info)) + return btrfs_repair_one_zone(fs_info, logical); + bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e0c3ec01e324..310fce00fcda 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -857,6 +857,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + if (btrfs_is_zoned(fs_info) && !sctx->is_dev_replace) + return btrfs_repair_one_zone(fs_info, logical); + /* * We must use GFP_NOFS because the scrub task might be waiting for a * worker task executing this function and in turn a transaction commit diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1312b17a6b49..b8fab44394f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7980,3 +7980,75 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) spin_unlock(&fs_info->swapfile_pins_lock); return node != NULL; } + +static int 
relocating_repair_kthread(void *data) +{ + struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_fs_info *fs_info = cache->fs_info; + u64 target; + int ret = 0; + + target = cache->start; + btrfs_put_block_group(cache); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + btrfs_info(fs_info, + "zoned: skip relocating block group %llu to repair: EBUSY", + target); + return -EBUSY; + } + + mutex_lock(&fs_info->delete_unused_bgs_mutex); + + /* Ensure block group still exists */ + cache = btrfs_lookup_block_group(fs_info, target); + if (!cache) + goto out; + + if (!cache->relocating_repair) + goto out; + + ret = btrfs_may_alloc_data_chunk(fs_info, target); + if (ret < 0) + goto out; + + btrfs_info(fs_info, + "zoned: relocating block group %llu to repair IO failure", + target); + ret = btrfs_relocate_chunk(fs_info, target); + +out: + if (cache) + btrfs_put_block_group(cache); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_exclop_finish(fs_info); + + return ret; +} + +int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) +{ + struct btrfs_block_group *cache; + + /* Do not attempt to repair in degraded state */ + if (btrfs_test_opt(fs_info, DEGRADED)) + return 0; + + cache = btrfs_lookup_block_group(fs_info, logical); + if (!cache) + return 0; + + spin_lock(&cache->lock); + if (cache->relocating_repair) { + spin_unlock(&cache->lock); + btrfs_put_block_group(cache); + return 0; + } + cache->relocating_repair = 1; + spin_unlock(&cache->lock); + + kthread_run(relocating_repair_kthread, cache, + "btrfs-relocating-repair"); + + return 0; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3bbdb4175df..d4c3e0dd32b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -599,5 +599,6 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); 
+int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); #endif From 6ab6ebb76042d3d94a7c6c447f770a28a412c68c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:17 +0900 Subject: [PATCH 286/307] btrfs: split alloc_log_tree() This is a preparation patch for the next patch. Split alloc_log_tree() into two parts. The first one allocating the tree structure, remains in alloc_log_tree() and the second part allocating the tree node, which is moved into btrfs_alloc_log_tree_node(). Also export the latter part is to be used in the next patch. Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++++++------ fs/btrfs/disk-io.h | 2 ++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6e16f556ed75..d2fa92526b3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1254,7 +1254,6 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct extent_buffer *leaf; root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS); if (!root) @@ -1264,6 +1263,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + return root; +} + +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct extent_buffer *leaf; + /* * DON'T set SHAREABLE bit for log trees. 
* @@ -1276,26 +1283,33 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); - if (IS_ERR(leaf)) { - btrfs_put_root(root); - return ERR_CAST(leaf); - } + if (IS_ERR(leaf)) + return PTR_ERR(leaf); root->node = leaf; btrfs_mark_buffer_dirty(root->node); btrfs_tree_unlock(root->node); - return root; + + return 0; } int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + WARN_ON(fs_info->log_root_tree); fs_info->log_root_tree = log_root; return 0; @@ -1307,11 +1321,18 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log_root; struct btrfs_inode_item *inode_item; + int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); + ret = btrfs_alloc_log_tree_node(trans, log_root); + if (ret) { + btrfs_put_root(log_root); + return ret; + } + log_root->last_trans = trans->transid; log_root->root_key.offset = root->root_key.objectid; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9f4a2a1e3d36..0e7e9526b6a8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,6 +120,8 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); +int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, From 
40ab3be102f0a61dbb93093f330b432324a793f1 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:18 +0900 Subject: [PATCH 287/307] btrfs: zoned: extend zoned allocator to use dedicated tree-log block group This is the 1/3 patch to enable tree log on zoned filesystems. The tree-log feature does not work on a zoned filesystem as is. Blocks for a tree-log tree are allocated mixed with other metadata blocks and btrfs writes and syncs the tree-log blocks to devices at the time of fsync(), which has a different timing than a global transaction commit. As a result, both writing tree-log blocks and writing other metadata blocks become non-sequential writes that zoned filesystems must avoid. Introduce a dedicated block group for tree-log blocks, so that tree-log blocks and other metadata blocks can be separate write streams. As a result, each write stream can now be written to devices separately. "fs_info->treelog_bg" tracks the dedicated block group and assigns "treelog_bg" on-demand on tree-log block allocation time. This commit extends the zoned block allocator to use the block group. 
Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 75 +++++++++++++++++++++++++++++++++++++++--- fs/btrfs/zoned.h | 14 ++++++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index f5e9f560ce6d..5064be59dac5 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -901,6 +901,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); + btrfs_clear_treelog_bg(block_group); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1bb4f767966a..6f4b493625ef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -976,6 +976,8 @@ struct btrfs_fs_info { /* Max size to emit ZONE_APPEND write command */ u64 max_zone_append_size; struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d2fa92526b3b..84c6650d5ef7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2787,6 +2787,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->delete_unused_bgs_mutex); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd308bf3a220..78ad31a59e59 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3497,6 +3497,9 @@ struct find_free_extent_ctl { bool have_caching_bg; bool orig_have_caching_bg; + /* Allocation is called for tree-log */ 
+ bool for_treelog; + /* RAID index, converted from flags */ int index; @@ -3725,6 +3728,22 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, return find_free_extent_unclustered(block_group, ffe_ctl); } +/* + * Tree-log block group locking + * ============================ + * + * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which + * indicates the starting address of a block group, which is reserved only + * for tree-log metadata. + * + * Lock nesting + * ============ + * + * space_info::lock + * block_group::lock + * fs_info::treelog_bg_lock + */ + /* * Simple allocator for sequential-only block group. It only allows sequential * allocation. No need to play with trees. This function also reserves the @@ -3734,23 +3753,54 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, struct btrfs_block_group **bg_ret) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 start = block_group->start; u64 num_bytes = ffe_ctl->num_bytes; u64 avail; + u64 bytenr = block_group->start; + u64 log_bytenr; int ret = 0; + bool skip; ASSERT(btrfs_is_zoned(block_group->fs_info)); + /* + * Do not allow non-tree-log blocks in the dedicated tree-log block + * group, and vice versa. 
+ */ + spin_lock(&fs_info->treelog_bg_lock); + log_bytenr = fs_info->treelog_bg; + skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + spin_unlock(&fs_info->treelog_bg_lock); + if (skip) + return 1; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); if (block_group->ro) { ret = 1; goto out; } + /* + * Do not allow currently using block group to be tree-log dedicated + * block group. + */ + if (ffe_ctl->for_treelog && !fs_info->treelog_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + avail = block_group->length - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { @@ -3765,6 +3815,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; } + if (ffe_ctl->for_treelog && !fs_info->treelog_bg) + fs_info->treelog_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3779,6 +3832,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, ffe_ctl->search_start = ffe_ctl->found_offset; out: + if (ret && ffe_ctl->for_treelog) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); return ret; @@ -4028,7 +4084,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, return prepare_allocation_clustered(fs_info, ffe_ctl, space_info, ins); case BTRFS_EXTENT_ALLOC_ZONED: - /* Nothing to do */ + if (ffe_ctl->for_treelog) { + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg) + ffe_ctl->hint_byte = fs_info->treelog_bg; + spin_unlock(&fs_info->treelog_bg_lock); + } return 0; default: BUG(); @@ -4072,6 +4133,7 @@ static noinline 
int find_free_extent(struct btrfs_root *root, struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); WARN_ON(num_bytes < fs_info->sectorsize); @@ -4085,6 +4147,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.orig_have_caching_bg = false; ffe_ctl.found_offset = 0; ffe_ctl.hint_byte = hint_byte_orig; + ffe_ctl.for_treelog = for_treelog; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; /* For clustered allocation */ @@ -4159,8 +4222,11 @@ search: struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ - if (unlikely(block_group->ro)) + if (unlikely(block_group->ro)) { + if (for_treelog) + btrfs_clear_treelog_bg(block_group); continue; + } btrfs_grab_block_group(block_group, delalloc); ffe_ctl.search_start = block_group->start; @@ -4346,6 +4412,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; + bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); flags = get_alloc_profile_by_root(root, is_data); again: @@ -4369,8 +4436,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu", - flags, num_bytes); + "allocation failed flags %llu, wanted %llu tree-log %d", + flags, num_bytes, for_treelog); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 932ad9bc0de6..61e969652fe1 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -7,6 +7,7 @@ #include #include "volumes.h" #include "disk-io.h" +#include "block-group.h" struct btrfs_zoned_device_info { /* @@ -290,4 +291,17 @@ static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->zoned_meta_io_lock); } +static inline void btrfs_clear_treelog_bg(struct 
btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if (!btrfs_is_zoned(fs_info)) + return; + + spin_lock(&fs_info->treelog_bg_lock); + if (fs_info->treelog_bg == bg->start) + fs_info->treelog_bg = 0; + spin_unlock(&fs_info->treelog_bg_lock); +} + #endif From fa1a0f42a0356846fb1acd1d53061d53413a4c45 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:19 +0900 Subject: [PATCH 288/307] btrfs: zoned: serialize log transaction on zoned filesystems This is the 2/3 patch to enable tree-log on zoned filesystems. Since we can start more than one log transactions per subvolume simultaneously, nodes from multiple transactions can be allocated interleaved. Such mixed allocation results in non-sequential writes at the time of a log transaction commit. The nodes of the global log root tree (fs_info->log_root_tree), also have the same problem with mixed allocation. Serializes log transactions by waiting for a committing transaction when someone tries to start a new transaction, to avoid the mixed allocation problem. We must also wait for running log transactions from another subvolume, but there is no easy way to detect which subvolume root is running a log transaction. So, this patch forbids starting a new log transaction when other subvolumes already allocated the global log root tree. 
Reviewed-by: Josef Bacik Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c02eeeac439c..4e72794342c0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -105,6 +105,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dirid, int del_all); +static void wait_log_commit(struct btrfs_root *root, int transid); /* * tree logging is a special write ahead log used to make sure that @@ -140,7 +141,9 @@ static int start_log_trans(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; + const bool zoned = btrfs_is_zoned(fs_info); int ret = 0; + bool created = false; /* * First check if the log root tree was already created. If not, create @@ -150,8 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&tree_root->log_mutex); if (!fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, fs_info); - if (!ret) + if (!ret) { set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state); + created = true; + } } mutex_unlock(&tree_root->log_mutex); if (ret) @@ -160,12 +165,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + if (btrfs_need_log_full_commit(trans)) { ret = -EAGAIN; goto out; } + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } + if (!root->log_start_pid) { clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); root->log_start_pid = current->pid; @@ -173,6 +186,17 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { + 
/* + * This means fs_info->log_root_tree was already created + * for some other FS trees. Do the full commit not to mix + * nodes from multiple log transactions to do sequential + * writing. + */ + if (zoned && !created) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_add_log_tree(trans, root); if (ret) goto out; @@ -201,14 +225,22 @@ out: */ static int join_running_log_trans(struct btrfs_root *root) { + const bool zoned = btrfs_is_zoned(root->fs_info); int ret = -ENOENT; if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) return ret; mutex_lock(&root->log_mutex); +again: if (root->log_root) { + int index = (root->log_transid + 1) % 2; + ret = 0; + if (zoned && atomic_read(&root->log_commit[index])) { + wait_log_commit(root, root->log_transid - 1); + goto again; + } atomic_inc(&root->log_writers); } mutex_unlock(&root->log_mutex); From 3ddebf27fcd3a910989c85a3bfc9085225038c5b Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:20 +0900 Subject: [PATCH 289/307] btrfs: zoned: reorder log node allocation on zoned filesystem This is the 3/3 patch to enable tree-log on zoned filesystems. The allocation order of nodes of "fs_info->log_root_tree" and nodes of "root->log_root" is not the same as the writing order of them. So, the writing causes unaligned write errors. Reorder the allocation of them by delaying allocation of the root node of "fs_info->log_root_tree," so that the node buffers can go out sequentially to devices. 
Cc: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++++----- fs/btrfs/tree-log.c | 27 +++++++++++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 84c6650d5ef7..c2576c5fe62e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1298,16 +1298,18 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *log_root; - int ret; log_root = alloc_log_tree(trans, fs_info); if (IS_ERR(log_root)) return PTR_ERR(log_root); - ret = btrfs_alloc_log_tree_node(trans, log_root); - if (ret) { - btrfs_put_root(log_root); - return ret; + if (!btrfs_is_zoned(fs_info)) { + int ret = btrfs_alloc_log_tree_node(trans, log_root); + + if (ret) { + btrfs_put_root(log_root); + return ret; + } } WARN_ON(fs_info->log_root_tree); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4e72794342c0..fc04625cbbd1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3162,6 +3162,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; + if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); + if (!log_root_tree->node) { + ret = btrfs_alloc_log_tree_node(trans, log_root_tree); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&log_root_tree->log_mutex); + goto out; + } + } + mutex_unlock(&fs_info->tree_root->log_mutex); + } + /* * Now we are safe to update the log_root_tree because we're under the * log_mutex, and we're a current writer so we're holding the commit @@ -3320,12 +3333,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, .process_func = process_one_buffer }; - ret = walk_log_tree(trans, log, &wc); - if (ret) { - if (trans) - 
btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + if (log->node) { + ret = walk_log_tree(trans, log, &wc); + if (ret) { + if (trans) + btrfs_abort_transaction(trans, ret); + else + btrfs_handle_fs_error(log->fs_info, ret, NULL); + } } clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, From b528f467132713a03984b0f9592073d75677c501 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 5 Feb 2021 23:58:36 +0900 Subject: [PATCH 290/307] btrfs: zoned: deal with holes writing out tree-log pages Since the zoned filesystem requires sequential write out of metadata, we cannot proceed with a hole in tree-log pages. When such a hole exists, btree_write_cache_pages() will return -EAGAIN. This happens when someone, e.g., a concurrent transaction commit, writes a dirty extent in this tree-log commit. If we are not going to wait for the extents, we can hope the concurrent writing fills the hole for us. So, we can ignore the error in this case and hope the next write will succeed. If we want to wait for them and got the error, we cannot wait for them because it will cause a deadlock. So, let's bail out to a full commit in this case. Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc04625cbbd1..d90695c1ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3120,6 +3120,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); + /* + * -EAGAIN happens when someone, e.g., a concurrent transaction + * commit, writes a dirty extent in this tree-log commit. This + * concurrent write will create a hole writing out the extents, + * and we cannot proceed on a zoned filesystem, requiring + * sequential writing. 
While we can bail out to a full commit + * here, but we can continue hoping the concurrent writing fills + * the hole. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) + ret = 0; if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); @@ -3242,7 +3253,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, &log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); - if (ret) { + /* + * As described above, -EAGAIN indicates a hole in the extents. We + * cannot wait for these write outs since the waiting cause a + * deadlock. Bail out to the full commit instead. + */ + if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) { + btrfs_set_log_full_commit(trans); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } else if (ret) { btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); From 9d294a685fbcb256ce8c5f7fd88a7596d0f52a8a Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 4 Feb 2021 19:22:21 +0900 Subject: [PATCH 291/307] btrfs: zoned: enable to mount ZONED incompat flag This final patch adds the ZONED incompat flag to the supported flags and enables to mount ZONED flagged file system. 
Reviewed-by: Anand Jain Reviewed-by: Josef Bacik Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f4b493625ef..3bc00aed13b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -298,7 +298,8 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34) + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) From 505ca2f7770b49d6b27d97de7dc7ff6af109f8fa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 9 Feb 2021 15:59:04 +0100 Subject: [PATCH 292/307] ACPI: OSL: Rework acpi_check_resource_conflict() Rearrange the code in acpi_check_resource_conflict() so as to drop redundant checks and unneeded local variables from there and modify the messages printed by that function to be more concise and hopefully easier to understand. While at it, replace direct printk() usage with pr_*(). Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/osl.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 0418febc5cf2..326ce7f1b0d9 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1458,38 +1458,28 @@ __setup("acpi_enforce_resources=", acpi_enforce_resources_setup); int acpi_check_resource_conflict(const struct resource *res) { acpi_adr_space_type space_id; - acpi_size length; - u8 warn = 0; - int clash = 0; if (acpi_enforce_resources == ENFORCE_RESOURCES_NO) return 0; - if (!(res->flags & IORESOURCE_IO) && !(res->flags & IORESOURCE_MEM)) - return 0; if (res->flags & IORESOURCE_IO) space_id = ACPI_ADR_SPACE_SYSTEM_IO; - else + else if (res->flags & IORESOURCE_MEM) space_id = ACPI_ADR_SPACE_SYSTEM_MEMORY; + else + return 0; - length = resource_size(res); - if (acpi_enforce_resources != ENFORCE_RESOURCES_NO) - warn = 1; - clash = acpi_check_address_range(space_id, res->start, length, warn); + if (!acpi_check_address_range(space_id, res->start, resource_size(res), 1)) + return 0; + + pr_info("Resource conflict; ACPI support missing from driver?\n"); + + if (acpi_enforce_resources == ENFORCE_RESOURCES_STRICT) + return -EBUSY; + + if (acpi_enforce_resources == ENFORCE_RESOURCES_LAX) + pr_notice("Resource conflict: System may be unstable or behave erratically\n"); - if (clash) { - if (acpi_enforce_resources != ENFORCE_RESOURCES_NO) { - if (acpi_enforce_resources == ENFORCE_RESOURCES_LAX) - printk(KERN_NOTICE "ACPI: This conflict may" - " cause random problems and system" - " instability\n"); - printk(KERN_INFO "ACPI: If an ACPI driver is available" - " for this device, you should use it instead of" - " the native driver\n"); - } - if (acpi_enforce_resources == ENFORCE_RESOURCES_STRICT) - return -EBUSY; - } return 0; } EXPORT_SYMBOL(acpi_check_resource_conflict); From 70779b897395b330ba5a47bed84f94178da599f9 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Wed, 20 Jan 2021 00:51:13 -0800 
Subject: [PATCH 293/307] fs/affs: release old buffer head on error path The reference count of the old buffer head should be decremented on the path that fails to get the new buffer head. Fixes: 6b4657667ba0 ("fs/affs: add rename exchange") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Pan Bian Signed-off-by: David Sterba --- fs/affs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 41c5749f4db7..5400a876d73f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -460,8 +460,10 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry, return -EIO; bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino); - if (!bh_new) + if (!bh_new) { + affs_brelse(bh_old); return -EIO; + } /* Remove old header from its parent directory. */ affs_lock_dir(old_dir); From 8aef273ee88e3e94d5d1bfc0728065b8564d3463 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 10 Feb 2021 19:09:43 +0100 Subject: [PATCH 294/307] ACPI: OSL: Clean up printing messages Replace the ACPI_DEBUG_PRINT() instance in osl.c unrelated to the ACPICA debug with acpi_handle_debug(), add a pr_fmt() definition to osl.c and replace direct printk() usage in that file with the suitable pr_*() calls. While at it, add a physical address value to the message in acpi_os_map_iomem() and reword a couple of messages to avoid using function names in them. Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/osl.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 326ce7f1b0d9..327e1b4eb6b0 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -9,6 +9,8 @@ * Author: Matthew Wilcox */ +#define pr_fmt(fmt) "ACPI: OSL: " fmt + #include #include #include @@ -37,6 +39,7 @@ #include "acpica/acnamesp.h" #include "internal.h" +/* Definitions for ACPI_DEBUG_PRINT() */ #define _COMPONENT ACPI_OS_SERVICES ACPI_MODULE_NAME("osl"); @@ -327,7 +330,7 @@ void __iomem __ref acpi_size pg_sz; if (phys > ULONG_MAX) { - printk(KERN_ERR PREFIX "Cannot map memory that high\n"); + pr_err("Cannot map memory that high: 0x%llx\n", phys); return NULL; } @@ -528,13 +531,12 @@ acpi_os_predefined_override(const struct acpi_predefined_names *init_val, *new_val = NULL; if (!memcmp(init_val->name, "_OS_", 4) && strlen(acpi_os_name)) { - printk(KERN_INFO PREFIX "Overriding _OS definition to '%s'\n", - acpi_os_name); + pr_info("Overriding _OS definition to '%s'\n", acpi_os_name); *new_val = acpi_os_name; } if (!memcmp(init_val->name, "_REV", 4) && acpi_rev_override) { - printk(KERN_INFO PREFIX "Overriding _REV return value to 5\n"); + pr_info("Overriding _REV return value to 5\n"); *new_val = (char *)5; } @@ -575,15 +577,14 @@ acpi_os_install_interrupt_handler(u32 gsi, acpi_osd_handler handler, return AE_ALREADY_ACQUIRED; if (acpi_gsi_to_irq(gsi, &irq) < 0) { - printk(KERN_ERR PREFIX "SCI (ACPI GSI %d) not registered\n", - gsi); + pr_err("SCI (ACPI GSI %d) not registered\n", gsi); return AE_OK; } acpi_irq_handler = handler; acpi_irq_context = context; if (request_irq(irq, acpi_irq, IRQF_SHARED, "acpi", acpi_irq)) { - printk(KERN_ERR PREFIX "SCI (IRQ%d) allocation failed\n", irq); + pr_err("SCI (IRQ%d) allocation failed\n", irq); acpi_irq_handler = NULL; return AE_NOT_ACQUIRED; } @@ -1071,7 +1072,7 @@ acpi_status acpi_os_execute(acpi_execute_type type, if (type == 
OSL_DEBUGGER_MAIN_THREAD) { ret = acpi_debugger_create_thread(function, context); if (ret) { - pr_err("Call to kthread_create() failed.\n"); + pr_err("Kernel thread creation failed\n"); status = AE_ERROR; } goto out_thread; @@ -1121,8 +1122,7 @@ acpi_status acpi_os_execute(acpi_execute_type type, */ ret = queue_work_on(0, queue, &dpc->work); if (!ret) { - printk(KERN_ERR PREFIX - "Call to queue_work() failed.\n"); + pr_err("Unable to queue work\n"); status = AE_ERROR; } err_workqueue: @@ -1165,9 +1165,9 @@ acpi_status acpi_hotplug_schedule(struct acpi_device *adev, u32 src) { struct acpi_hp_work *hpw; - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Scheduling hotplug event (%p, %u) for deferred execution.\n", - adev, src)); + acpi_handle_debug(adev->handle, + "Scheduling hotplug event %u for deferred handling\n", + src); hpw = kmalloc(sizeof(*hpw), GFP_KERNEL); if (!hpw) @@ -1355,7 +1355,7 @@ acpi_status acpi_os_signal(u32 function, void *info) { switch (function) { case ACPI_SIGNAL_FATAL: - printk(KERN_ERR PREFIX "Fatal opcode executed\n"); + pr_err("Fatal opcode executed\n"); break; case ACPI_SIGNAL_BREAKPOINT: /* @@ -1407,7 +1407,7 @@ __setup("acpi_os_name=", acpi_os_name_setup); static int __init acpi_no_auto_serialize_setup(char *str) { acpi_gbl_auto_serialize_methods = FALSE; - pr_info("ACPI: auto-serialization disabled\n"); + pr_info("Auto-serialization disabled\n"); return 1; } @@ -1712,7 +1712,7 @@ acpi_status acpi_os_release_object(acpi_cache_t * cache, void *object) static int __init acpi_no_static_ssdt_setup(char *s) { acpi_gbl_disable_ssdt_table_install = TRUE; - pr_info("ACPI: static SSDT installation disabled\n"); + pr_info("Static SSDT installation disabled\n"); return 0; } @@ -1721,8 +1721,7 @@ early_param("acpi_no_static_ssdt", acpi_no_static_ssdt_setup); static int __init acpi_disable_return_repair(char *s) { - printk(KERN_NOTICE PREFIX - "ACPI: Predefined validation mechanism disabled\n"); + pr_notice("Predefined validation mechanism disabled\n"); 
acpi_gbl_disable_auto_repair = TRUE; return 1; @@ -1748,7 +1747,7 @@ acpi_status __init acpi_os_initialize(void) void *rv; rv = acpi_os_map_generic_address(&acpi_gbl_FADT.reset_register); - pr_debug(PREFIX "%s: map reset_reg %s\n", __func__, + pr_debug("%s: Reset register mapping %s\n", __func__, rv ? "successful" : "failed"); } acpi_os_initialized = true; From 4208c398aae4c2290864ba15c3dab7111f32bec1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Feb 2021 13:01:08 +0000 Subject: [PATCH 295/307] fs/jfs: fix potential integer overflow on shift of a int The left shift of the 32-bit int constant 1 is evaluated using 32-bit arithmetic and then assigned to a signed 64-bit integer. In the case where l2nb is 32 or more this can lead to an overflow. Avoid this by shifting the value 1LL instead. Addresses-Coverity: ("Unintentional integer overflow") Fixes: b40c2e665cd5 ("fs/jfs: TRIM support for JFS Filesystem") Signed-off-by: Colin Ian King Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_dmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 94b7c1cb5ceb..7aee15608619 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -1656,7 +1656,7 @@ s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) } else if (rc == -ENOSPC) { /* search for next smaller log2 block */ l2nb = BLKSTOL2(nblocks) - 1; - nblocks = 1 << l2nb; + nblocks = 1LL << l2nb; } else { /* Trim any already allocated blocks */ jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n"); From e1e6bd2995ac0e1ad0c2a2d906a06f59ce2ed293 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 11 Feb 2021 19:30:01 +0100 Subject: [PATCH 296/307] ACPI: property: Fix fwnode string properties matching Property matching does not work for ACPI fwnodes if the value of the given property is not represented as a package in the _DSD package containing it.
For example, the "compatible" property in the _DSD below Name (_DSD, Package () { ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), Package () { Package () {"compatible", "ethernet-phy-ieee802.3-c45"} } }) will not be found by fwnode_property_match_string(), because the ACPI code handling device properties does not regard the single value as a "list" in that case. Namely, fwnode_property_match_string() invoked to match a given string property value first calls fwnode_property_read_string_array() with the last two arguments equal to NULL and 0, respectively, in order to count the items in the value of the given property, with the assumption that this value may be an array. For ACPI fwnodes, that operation is carried out by acpi_node_prop_read() which calls acpi_data_prop_read() for this purpose. However, when the return (val) pointer is NULL, that function only looks for a property whose value is a package without checking the single-value case at all. To fix that, make acpi_data_prop_read() check the single-value case if its return pointer argument is NULL and modify acpi_data_prop_read_single() handling that case to attempt to read the value of the property if the return pointer is NULL and return 1 if that succeeds. Fixes: 3708184afc77 ("device property: Move FW type specific functionality to FW specific files") Reported-by: Calvin Johnson Cc: 4.13+ # 4.13+ Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Sakari Ailus Reviewed-by: Mika Westerberg Reviewed-by: Andy Shevchenko --- drivers/acpi/property.c | 44 ++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 24e87b630573..16b28084c1ca 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -787,9 +787,6 @@ static int acpi_data_prop_read_single(const struct acpi_device_data *data, const union acpi_object *obj; int ret; - if (!val) - return -EINVAL; - if (proptype >= DEV_PROP_U8 && proptype <= DEV_PROP_U64) { ret = acpi_data_get_property(data, propname, ACPI_TYPE_INTEGER, &obj); if (ret) @@ -799,28 +796,43 @@ static int acpi_data_prop_read_single(const struct acpi_device_data *data, case DEV_PROP_U8: if (obj->integer.value > U8_MAX) return -EOVERFLOW; - *(u8 *)val = obj->integer.value; + + if (val) + *(u8 *)val = obj->integer.value; + break; case DEV_PROP_U16: if (obj->integer.value > U16_MAX) return -EOVERFLOW; - *(u16 *)val = obj->integer.value; + + if (val) + *(u16 *)val = obj->integer.value; + break; case DEV_PROP_U32: if (obj->integer.value > U32_MAX) return -EOVERFLOW; - *(u32 *)val = obj->integer.value; + + if (val) + *(u32 *)val = obj->integer.value; + break; default: - *(u64 *)val = obj->integer.value; + if (val) + *(u64 *)val = obj->integer.value; + break; } + + if (!val) + return 1; } else if (proptype == DEV_PROP_STRING) { ret = acpi_data_get_property(data, propname, ACPI_TYPE_STRING, &obj); if (ret) return ret; - *(char **)val = obj->string.pointer; + if (val) + *(char **)val = obj->string.pointer; return 1; } else { @@ -834,7 +846,7 @@ int acpi_dev_prop_read_single(struct acpi_device *adev, const char *propname, { int ret; - if (!adev) + if (!adev || !val) return -EINVAL; ret = acpi_data_prop_read_single(&adev->data, propname, proptype, val); @@ -928,10 +940,20 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, const union acpi_object *items; 
int ret; - if (val && nval == 1) { + if (nval == 1 || !val) { ret = acpi_data_prop_read_single(data, propname, proptype, val); - if (ret >= 0) + /* + * The overflow error means that the property is there and it is + * single-value, but its type does not match, so return. + */ + if (ret >= 0 || ret == -EOVERFLOW) return ret; + + /* + * Reading this property as a single-value one failed, but its + * value may still be represented as one-element array, so + * continue. + */ } ret = acpi_data_get_property_array(data, propname, ACPI_TYPE_ANY, &obj); From 38f3885edbef8a77b25c4d13f3de06a7b93d02de Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Feb 2021 16:11:17 +0200 Subject: [PATCH 297/307] ACPI: property: Remove dead code After commit 3a7a2ab839ad, a couple of functions became dead code. Moreover, for all these years nobody used them. Remove them. Fixes: 3a7a2ab839ad ("ACPI / property: Extend fwnode_property_* to data-only subnodes") Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/property.c | 20 -------------------- include/linux/acpi.h | 21 --------------------- 2 files changed, 41 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 16b28084c1ca..22ccab4e7c6d 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -841,20 +841,6 @@ static int acpi_data_prop_read_single(const struct acpi_device_data *data, return ret; } -int acpi_dev_prop_read_single(struct acpi_device *adev, const char *propname, - enum dev_prop_type proptype, void *val) -{ - int ret; - - if (!adev || !val) - return -EINVAL; - - ret = acpi_data_prop_read_single(&adev->data, propname, proptype, val); - if (ret < 0 || proptype != ACPI_TYPE_STRING) - return ret; - return 0; -} - static int acpi_copy_property_array_u8(const union acpi_object *items, u8 *val, size_t nval) { @@ -995,12 +981,6 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, return ret; } -int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, - enum dev_prop_type proptype, void *val, size_t nval) -{ - return adev ? acpi_data_prop_read(&adev->data, propname, proptype, val, nval) : -EINVAL; -} - /** * acpi_node_prop_read - retrieve the value of an ACPI property with given name. * @fwnode: Firmware node to get the property from. 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 053bf05fb1f7..a47eaf448131 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1114,14 +1114,9 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -int acpi_dev_prop_read_single(struct acpi_device *adev, - const char *propname, enum dev_prop_type proptype, - void *val); int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, void *val, size_t nval); -int acpi_dev_prop_read(const struct acpi_device *adev, const char *propname, - enum dev_prop_type proptype, void *val, size_t nval); struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); @@ -1223,14 +1218,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline int acpi_dev_prop_read_single(const struct acpi_device *adev, - const char *propname, - enum dev_prop_type proptype, - void *val) -{ - return -ENXIO; -} - static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode, const char *propname, enum dev_prop_type proptype, @@ -1239,14 +1226,6 @@ static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline int acpi_dev_prop_read(const struct acpi_device *adev, - const char *propname, - enum dev_prop_type proptype, - void *val, size_t nval) -{ - return -ENXIO; -} - static inline struct fwnode_handle * acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) From 325aa816143228a0b3472074ffb50d55ac3f04fe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Feb 2021 16:11:18 +0200 Subject: [PATCH 298/307] ACPI: property: Make acpi_node_prop_read() static There are no users outside of property.c. No need to export acpi_node_prop_read(), hence make it static.
Fixes: 3708184afc77 ("device property: Move FW type specific functionality to FW specific files") Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 6 +++--- include/linux/acpi.h | 11 ----------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 22ccab4e7c6d..2b65ad9b4c0d 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -993,9 +993,9 @@ static int acpi_data_prop_read(const struct acpi_device_data *data, * of the property. Otherwise, read at most @nval values to the array at the * location pointed to by @val. */ -int acpi_node_prop_read(const struct fwnode_handle *fwnode, - const char *propname, enum dev_prop_type proptype, - void *val, size_t nval) +static int acpi_node_prop_read(const struct fwnode_handle *fwnode, + const char *propname, enum dev_prop_type proptype, + void *val, size_t nval) { return acpi_data_prop_read(acpi_device_data_of_node(fwnode), propname, proptype, val, nval); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index a47eaf448131..dc6e1f39dc6f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1114,9 +1114,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -int acpi_node_prop_read(const struct fwnode_handle *fwnode, - const char *propname, enum dev_prop_type proptype, - void *val, size_t nval); struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); @@ -1218,14 +1215,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline int acpi_node_prop_read(const struct fwnode_handle *fwnode, - const char *propname, - enum dev_prop_type proptype, - void *val, size_t nval) -{ - return -ENXIO; -} - static inline struct fwnode_handle * acpi_get_next_subnode(const struct 
fwnode_handle *fwnode, struct fwnode_handle *child) From c82ff99eaab83df6b962ce83521c456ba9cf44c2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Feb 2021 16:11:19 +0200 Subject: [PATCH 299/307] ACPI: property: Satisfy kernel doc validator (part 1) CHECK drivers/acpi/property.c warning: Function parameter or member 'data' not described in 'acpi_data_get_property_array' warning: Excess function parameter 'adev' description in 'acpi_data_get_property_array' Fixes: 3a7a2ab839ad ("ACPI / property: Extend fwnode_property_* to data-only subnodes") Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 2b65ad9b4c0d..ab4d7c734b0d 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -564,7 +564,7 @@ int acpi_node_prop_get(const struct fwnode_handle *fwnode, /** * acpi_data_get_property_array - return an ACPI array property with given name - * @adev: ACPI data object to get the property from + * @data: ACPI data object to get the property from * @name: Name of the property * @type: Expected type of array elements * @obj: Location to store a pointer to the property value (if not NULL) From 1de359d82576e57963f0d8b2d89cbdb2c9f4f2aa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 12 Feb 2021 16:11:20 +0200 Subject: [PATCH 300/307] ACPI: property: Satisfy kernel doc validator (part 2) CHECK drivers/acpi/property.c warning: Function parameter or member '__fwnode' not described in 'acpi_graph_get_remote_endpoint' warning: Excess function parameter 'fwnode' description in 'acpi_graph_get_remote_endpoint' warning: Excess function parameter 'endpoint' description in 'acpi_graph_get_remote_endpoint' Fixes: 0ef7478639c5 ("ACPI: property: Make the ACPI graph API private") Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/acpi/property.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index ab4d7c734b0d..e312ebaed8db 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1212,8 +1212,7 @@ static struct fwnode_handle *acpi_graph_get_child_prop_value( /** * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint - * @fwnode: Endpoint firmware node pointing to a remote device - * @endpoint: Firmware node of remote endpoint is filled here if not %NULL + * @__fwnode: Endpoint firmware node pointing to a remote device * * Returns the remote endpoint corresponding to @__fwnode. NULL on error. */ From 3af2f0aa2ed04f07975ba1242002b66cd53e6290 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Feb 2021 10:54:34 +0000 Subject: [PATCH 301/307] PM: EM: update Kconfig description and drop "default n" option The Energy Model now supports other devices such as GPUs and DSPs, not only CPUs. Thus, update the description in the config option. Also remove the unneeded "default n". If the "default" line is removed, it defaults to 'n'. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a7320f07689d..56dbc2616d5c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -322,15 +322,14 @@ config CPU_PM bool config ENERGY_MODEL - bool "Energy Model for CPUs" + bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" depends on SMP depends on CPU_FREQ - default n help Several subsystems (thermal and/or the task scheduler for example) - can leverage information about the energy consumed by CPUs to make - smarter decisions. This config option enables the framework from - which subsystems can access the energy models. + can leverage information about the energy consumed by devices to + make smarter decisions. 
This config option enables the framework + from which subsystems can access the energy models. The exact usage of the energy model is subsystem-dependent. From c4cc3141b6f8e0097a03f6885cafac957421df9e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Feb 2021 10:54:35 +0000 Subject: [PATCH 302/307] PM: Kconfig: remove unneeded "default n" options Remove "default n" options. If the "default" line is removed, it defaults to 'n'. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 56dbc2616d5c..6bfe3ead10ad 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -139,7 +139,6 @@ config PM_SLEEP_SMP_NONZERO_CPU config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP - default n help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -147,7 +146,6 @@ config PM_AUTOSLEEP config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP - default n help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -293,7 +291,6 @@ config PM_GENERIC_DOMAINS config WQ_POWER_EFFICIENT_DEFAULT bool "Enable workqueue power-efficient mode by default" depends on PM - default n help Per-cpu workqueues are generally preferred because they show better performance thanks to cache locality; unfortunately, From 1556057413a304b3020180240d798ec135d90844 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 1 Feb 2021 23:57:35 +0100 Subject: [PATCH 303/307] PM: sleep: Constify static struct attribute_group The only usage of suspend_attr_group is to put its address in an array of pointers to const attribute_group structs. Make it const to allow the compiler to put it into read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Rafael J. 
Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/main.c b/kernel/power/main.c index 0aefd6f57e0a..12c7e1bb442f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -387,7 +387,7 @@ static struct attribute *suspend_attrs[] = { NULL, }; -static struct attribute_group suspend_attr_group = { +static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, }; From 88ffce95764603e13eda4be003ec919e124ec365 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 23 Jan 2021 05:06:07 -0500 Subject: [PATCH 304/307] powercap: intel_rapl: Use topology interface in rapl_add_package() It's not a good idea to access phys_proc_id and cpu_die_id directly. Use topology_physical_package_id(cpu) and topology_die_id(cpu) instead. Signed-off-by: Yunfeng Ye [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index f0799837c2dd..b6c7bedf517e 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1310,7 +1310,6 @@ struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) { int id = topology_logical_die_id(cpu); struct rapl_package *rp; - struct cpuinfo_x86 *c = &cpu_data(cpu); int ret; if (!rapl_defaults) @@ -1327,10 +1326,11 @@ struct rapl_package *rapl_add_package(int cpu, struct rapl_if_priv *priv) if (topology_max_die_per_package() > 1) snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, - "package-%d-die-%d", c->phys_proc_id, c->cpu_die_id); + "package-%d-die-%d", + topology_physical_package_id(cpu), topology_die_id(cpu)); else snprintf(rp->name, PACKAGE_DOMAIN_NAME_LENGTH, "package-%d", - c->phys_proc_id); + topology_physical_package_id(cpu)); /* check if the package contains valid domains */ if (rapl_detect_domains(rp, cpu) || 
rapl_defaults->check_unit(rp, cpu)) { From 65348ba259e27ad4b69459ef477facd4c702bbf6 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 23 Jan 2021 05:06:08 -0500 Subject: [PATCH 305/307] powercap: intel_rapl: Use topology interface in rapl_init_domains() It's not a good idea to access the phys_proc_id of cpuinfo directly. Use topology_physical_package_id(cpu) instead. Signed-off-by: Yunfeng Ye [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index b6c7bedf517e..fdda2a737186 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -547,7 +547,7 @@ static void rapl_init_domains(struct rapl_package *rp) if (i == RAPL_DOMAIN_PLATFORM && rp->id > 0) { snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "psys-%d", - cpu_data(rp->lead_cpu).phys_proc_id); + topology_physical_package_id(rp->lead_cpu)); } else snprintf(rd->name, RAPL_DOMAIN_NAME_LENGTH, "%s", rapl_domain_names[i]); From e1d3209f95a19df16080b069265e172738189807 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 10 Feb 2021 18:22:08 +0100 Subject: [PATCH 306/307] MAINTAINERS: cpuidle: exynos: include header in file pattern Include the platform data header in Exynos cpuidle maintainer entry. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Rafael J. 
Wysocki --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 00836f6452f0..ff3a64e31d2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4621,6 +4621,7 @@ L: linux-samsung-soc@vger.kernel.org S: Supported F: arch/arm/mach-exynos/pm.c F: drivers/cpuidle/cpuidle-exynos.c +F: include/linux/platform_data/cpuidle-exynos.h CPUIDLE DRIVER - ARM PSCI M: Lorenzo Pieralisi From eacd9aa8cedeb412842c7b339adbaa0477fdd5ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Feb 2021 12:03:23 -0500 Subject: [PATCH 307/307] fix handling of nd->depth on LOOKUP_CACHED failures in try_to_unlazy* After switching to non-RCU mode, we want nd->depth to match the number of entries in nd->stack[] that need eventual path_put(). legitimize_links() takes care of that on failures; unfortunately, failure exits added for LOOKUP_CACHED do not. We could add the logic for that into those failure exits, both in try_to_unlazy() and in try_to_unlazy_next(), but since both checks are immediately followed by legitimize_links() and there are no calls of legitimize_links() other than those two... It's easier to move the check (and required handling of nd->depth on failure) into legitimize_links() itself. [caught by Jens: ... 
and since we are zeroing ->depth here, we need to do drop_links() first] Fixes: 6c6ec2b0a3e0 "fs: add support for LOOKUP_CACHED" Tested-by: Jens Axboe Signed-off-by: Al Viro --- fs/namei.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 4cae88733a5c..de74ad2bc6e2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -630,6 +630,11 @@ static inline bool legitimize_path(struct nameidata *nd, static bool legitimize_links(struct nameidata *nd) { int i; + if (unlikely(nd->flags & LOOKUP_CACHED)) { + drop_links(nd); + nd->depth = 0; + return false; + } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { @@ -686,8 +691,6 @@ static bool try_to_unlazy(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - if (nd->flags & LOOKUP_CACHED) - goto out1; if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) @@ -724,8 +727,6 @@ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry, unsi BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - if (nd->flags & LOOKUP_CACHED) - goto out2; if (unlikely(!legitimize_links(nd))) goto out2; if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))