From 93b33bb6a5e5e8d29376f91199f30900c8d58b71 Mon Sep 17 00:00:00 2001
From: Jason Wang <wangborong@cdjrlc.com>
Date: Sun, 28 Nov 2021 11:48:23 +0800
Subject: [PATCH 001/229] microblaze: fix typo in a comment

The double `was' is repeated in a comment. Consequently, remove one
`was' from the comment.

Signed-off-by: Jason Wang <wangborong@cdjrlc.com>
Link: https://lore.kernel.org/r/20211128034823.6930-1-wangborong@cdjrlc.com
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
---
 arch/microblaze/kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 23e8a9336a29..873e05c94021 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -11,7 +11,7 @@
  *
  * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
  *
- * This file was was derived from the sh version, arch/sh/kernel/signal.c
+ * This file was derived from the sh version, arch/sh/kernel/signal.c
  *
  * This file is subject to the terms and conditions of the GNU General
  * Public License. See the file COPYING in the main directory of this

From 19d448a04e00388bdb039cdd46a23b028f604eb4 Mon Sep 17 00:00:00 2001
From: Xiang wangx <wangxiang@cdjrlc.com>
Date: Sun, 12 Dec 2021 22:12:48 +0800
Subject: [PATCH 002/229] microblaze: add const to of_device_id

struct of_device_id should normally be const.

Signed-off-by: Xiang wangx <wangxiang@cdjrlc.com>
Link: https://lore.kernel.org/r/20211212141248.7134-1-wangxiang@cdjrlc.com
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
---
 arch/microblaze/pci/xilinx_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/microblaze/pci/xilinx_pci.c b/arch/microblaze/pci/xilinx_pci.c
index b800909ddccf..f4cb86fffcee 100644
--- a/arch/microblaze/pci/xilinx_pci.c
+++ b/arch/microblaze/pci/xilinx_pci.c
@@ -27,7 +27,7 @@
 #define PCI_HOST_ENABLE_CMD (PCI_COMMAND_SERR | PCI_COMMAND_PARITY | \
 				PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY)
 
-static struct of_device_id xilinx_pci_match[] = {
+static const struct of_device_id xilinx_pci_match[] = {
 	{ .compatible = "xlnx,plbv46-pci-1.03.a", },
 	{}
 };

From fcc619621df50f63b641b18536f908b05d6782a4 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Fri, 21 Jan 2022 14:51:50 -0600
Subject: [PATCH 003/229] microblaze/PCI: Remove pci_phys_mem_access_prot()
 dead code

pci_phys_mem_access_prot() is defined but never used.  Remove it.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Link: https://lore.kernel.org/r/20220121205150.1151607-1-helgaas@kernel.org
Signed-off-by: Michal Simek <michal.simek@xilinx.com>
---
 arch/microblaze/include/asm/pci.h |  4 ---
 arch/microblaze/pci/pci-common.c  | 49 -------------------------------
 2 files changed, 53 deletions(-)

diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 7c4dc5d85f53..d90528064604 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -61,10 +61,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
 extern void pcibios_resource_survey(void);
 
 struct file;
-extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
-					 unsigned long pfn,
-					 unsigned long size,
-					 pgprot_t prot);
 
 /* This part of code was originally in xilinx-pci.h */
 #ifdef CONFIG_PCI_XILINX
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 622a4867f9e9..33bab7eec731 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -165,55 +165,6 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma)
 	return 0;
 }
 
-/*
- * This one is used by /dev/mem and fbdev who have no clue about the
- * PCI device, it tries to find the PCI device first and calls the
- * above routine
- */
-pgprot_t pci_phys_mem_access_prot(struct file *file,
-				  unsigned long pfn,
-				  unsigned long size,
-				  pgprot_t prot)
-{
-	struct pci_dev *pdev = NULL;
-	struct resource *found = NULL;
-	resource_size_t offset = ((resource_size_t)pfn) << PAGE_SHIFT;
-	int i;
-
-	if (page_is_ram(pfn))
-		return prot;
-
-	prot = pgprot_noncached(prot);
-	for_each_pci_dev(pdev) {
-		for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
-			struct resource *rp = &pdev->resource[i];
-			int flags = rp->flags;
-
-			/* Active and same type? */
-			if ((flags & IORESOURCE_MEM) == 0)
-				continue;
-			/* In the range of this resource? */
-			if (offset < (rp->start & PAGE_MASK) ||
-			    offset > rp->end)
-				continue;
-			found = rp;
-			break;
-		}
-		if (found)
-			break;
-	}
-	if (found) {
-		if (found->flags & IORESOURCE_PREFETCH)
-			prot = pgprot_noncached_wc(prot);
-		pci_dev_put(pdev);
-	}
-
-	pr_debug("PCI: Non-PCI map for %llx, prot: %lx\n",
-		 (unsigned long long)offset, pgprot_val(prot));
-
-	return prot;
-}
-
 /* This provides legacy IO read access on a bus */
 int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t size)
 {

From dfcf2e017f5bb928094952d5d56d3566d3d07ba7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 25 Jan 2022 16:20:01 +0300
Subject: [PATCH 004/229] swiotlb: do not zero buffer in set_memory_decrypted()

For larger TDX VM, memset() after set_memory_decrypted() in
swiotlb_update_mem_attributes() takes substantial portion of boot time.

Zeroing doesn't serve any functional purpose. Malicious VMM can mess
with decrypted/shared buffer at any point.

Remove the memset().

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..9390b38d2897 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -207,8 +207,6 @@ void __init swiotlb_update_mem_attributes(void)
 	mem->vaddr = swiotlb_mem_remap(mem, bytes);
 	if (!mem->vaddr)
 		mem->vaddr = vaddr;
-
-	memset(mem->vaddr, 0, bytes);
 }
 
 static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,

From 35265899acef135225e946b883fb07acba1d31a2 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:17 +0000
Subject: [PATCH 005/229] swiotlb: simplify debugfs setup

Debugfs functions are already stubbed out for !CONFIG_DEBUG_FS, so we
can remove most of the #ifdefs, just keeping one to manually optimise
away the initcall when it would do nothing. We can also simplify the
code itself by factoring out the directory creation and realising that
the global io_tlb_default_mem now makes debugfs_dir redundant.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9390b38d2897..f829259262fd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -36,9 +36,7 @@
 #include <linux/scatterlist.h>
 #include <linux/cc_platform.h>
 #include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
-#endif
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 #include <linux/io.h>
 #include <linux/of.h>
@@ -756,47 +754,29 @@ bool is_swiotlb_active(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+					 const char *dirname)
 {
+	mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+	if (!mem->nslabs)
+		return;
+
 	debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
 	debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
 }
 
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
 {
-	struct io_tlb_mem *mem = &io_tlb_default_mem;
-
-	debugfs_dir = debugfs_create_dir("swiotlb", NULL);
-	if (mem->nslabs) {
-		mem->debugfs = debugfs_dir;
-		swiotlb_create_debugfs_files(mem);
-	}
+	swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
 	return 0;
 }
 
+#ifdef CONFIG_DEBUG_FS
 late_initcall(swiotlb_create_default_debugfs);
-
 #endif
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
 
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-	struct io_tlb_mem *mem = rmem->priv;
-
-	mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
-	swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
 struct page *swiotlb_alloc(struct device *dev, size_t size)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -858,7 +838,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 
 		rmem->priv = mem;
 
-		rmem_swiotlb_debugfs_init(rmem);
+		swiotlb_create_debugfs_files(mem, rmem->name);
 	}
 
 	dev->dma_io_tlb_mem = mem;

From c0a4191c27a12d3175283fa33f16db20e91008fd Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:18 +0000
Subject: [PATCH 006/229] swiotlb: tidy up includes

SWIOTLB's includes have become a great big mess. Restore some order by
consolidating the random different blocks, sorting alphabetically, and
purging some clearly unnecessary entries - linux/io.h is now included
unconditionally, so need not be duplicated in the restricted DMA pool
case; similarly, linux/io.h subsumes asm/io.h; and by now it's a
mystery why asm/dma.h was ever here at all.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f829259262fd..f3ff0af49f81 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,38 +21,33 @@
 #define pr_fmt(fmt) "software IO TLB: " fmt
 
 #include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
 #include <linux/dma-direct.h>
 #include <linux/dma-map-ops.h>
-#include <linux/mm.h>
 #include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/swiotlb.h>
-#include <linux/pfn.h>
 #include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#include <linux/debugfs.h>
 #ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/of_reserved_mem.h>
 #include <linux/slab.h>
 #endif
 
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
 #define CREATE_TRACE_POINTS
 #include <trace/events/swiotlb.h>
 

From 404f9373c4e5c943ed8a5e71c8dcfef9eddd54ab Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Mon, 24 Jan 2022 16:40:19 +0000
Subject: [PATCH 007/229] swiotlb: simplify array allocation

Prefer kcalloc() to kzalloc(array_size()) for allocating an array.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/swiotlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f3ff0af49f81..908eac2527cb 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -818,8 +818,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
 		if (!mem)
 			return -ENOMEM;
 
-		mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
-				     GFP_KERNEL);
+		mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
 		if (!mem->slots) {
 			kfree(mem);
 			return -ENOMEM;

From b48cd0d12f8e3b5fc928ac84734e563eda8e430f Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Sun, 23 Jan 2022 10:38:47 -0800
Subject: [PATCH 008/229] cpufreq: replace cpumask_weight with cpumask_empty
 where appropriate

drivers/cpufreq calls cpumask_weight() to check if any bit of a given
cpumask is set. We can do it more efficiently with cpumask_empty() because
cpumask_empty() stops traversing the cpumask as soon as it finds first set
bit, while cpumask_weight() counts all bits unconditionally.

Signed-off-by: Yury Norov <yury.norov@gmail.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com> (for SCMI cpufreq driver)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 2 +-
 drivers/cpufreq/scmi-cpufreq.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index effbb680b453..c2cda28682a5 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -482,7 +482,7 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 	}
 
 	qcom_get_related_cpus(index, policy->cpus);
-	if (!cpumask_weight(policy->cpus)) {
+	if (cpumask_empty(policy->cpus)) {
 		dev_err(dev, "Domain-%d failed to get related CPUs\n", index);
 		ret = -ENOENT;
 		goto error;
diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c
index 1e0cd4d165f0..919fa6e3f462 100644
--- a/drivers/cpufreq/scmi-cpufreq.c
+++ b/drivers/cpufreq/scmi-cpufreq.c
@@ -154,7 +154,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy)
 	 * table and opp-shared.
 	 */
 	ret = dev_pm_opp_of_get_sharing_cpus(cpu_dev, priv->opp_shared_cpus);
-	if (ret || !cpumask_weight(priv->opp_shared_cpus)) {
+	if (ret || cpumask_empty(priv->opp_shared_cpus)) {
 		/*
 		 * Either opp-table is not set or no opp-shared was found.
 		 * Use the CPU mask from SCMI to designate CPUs sharing an OPP

From 4a8a77abf0e2b6468ba0281e33384cbec5fb476a Mon Sep 17 00:00:00 2001
From: Luca Weiss <luca@z3ntu.xyz>
Date: Sun, 30 Jan 2022 12:45:35 +0100
Subject: [PATCH 009/229] cpufreq: qcom-cpufreq-nvmem: fix reading of PVS Valid
 fuse

The fuse consists of 64 bits, with this statement we're supposed to get
the upper 32 bits but it actually read out of bounds and got 0 instead
of the desired value which lead to the "PVS bin not set." codepath being
run resetting our pvs value.

Fixes: a8811ec764f9 ("cpufreq: qcom: Add support for krait based socs")
Signed-off-by: Luca Weiss <luca@z3ntu.xyz>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-nvmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index d1744b5d9619..6dfa86971a75 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -130,7 +130,7 @@ static void get_krait_bin_format_b(struct device *cpu_dev,
 	}
 
 	/* Check PVS_BLOW_STATUS */
-	pte_efuse = *(((u32 *)buf) + 4);
+	pte_efuse = *(((u32 *)buf) + 1);
 	pte_efuse &= BIT(21);
 	if (pte_efuse) {
 		dev_dbg(cpu_dev, "PVS bin: %d\n", *pvs);

From 4ce9f72e008b50a9401df78d807a58bbed102898 Mon Sep 17 00:00:00 2001
From: Denys Drozdov <denys.drozdov@toradex.com>
Date: Mon, 24 Jan 2022 09:28:02 +0100
Subject: [PATCH 010/229] ARM: dts: imx7s: Define operating points table for
 cpufreq

Processor operating points for imx7s.dtsi should be properly defined to
perform correct imx-cpufreq-dt probe and registration and provide an
access to the temperature sensors using the i.MX thermal driver.

Signed-off-by: Denys Drozdov <denys.drozdov@toradex.com>
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm/boot/dts/imx7s.dtsi | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi
index 52a9aeecdbb2..5af6d58666f4 100644
--- a/arch/arm/boot/dts/imx7s.dtsi
+++ b/arch/arm/boot/dts/imx7s.dtsi
@@ -76,6 +76,22 @@
 			clock-latency = <61036>; /* two CLK32 periods */
 			clocks = <&clks IMX7D_CLK_ARM>;
 			cpu-idle-states = <&cpu_sleep_wait>;
+			operating-points-v2 = <&cpu0_opp_table>;
+			#cooling-cells = <2>;
+			nvmem-cells = <&fuse_grade>;
+			nvmem-cell-names = "speed_grade";
+		};
+	};
+
+	cpu0_opp_table: opp-table {
+		compatible = "operating-points-v2";
+		opp-shared;
+
+		opp-792000000 {
+			opp-hz = /bits/ 64 <792000000>;
+			opp-microvolt = <1000000>;
+			clock-latency-ns = <150000>;
+			opp-supported-hw = <0xf>, <0xf>;
 		};
 	};
 

From bc8b0c271bbf1e1a303087eb4b393cc0b791fc9d Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan.agner@toradex.com>
Date: Mon, 24 Jan 2022 09:28:03 +0100
Subject: [PATCH 011/229] cpufreq: Add i.MX7S to cpufreq-dt-platdev blocklist

The i.MX 7Solo currently does not have multiple operating points,
however, in order for the i.MX Thermal driver to successfully probe
a cpufreq device is required. Add it to the cpufreq-dt-platdev
driver's blocklist to allow using imx-cpufreq-dt.

Signed-off-by: Stefan Agner <stefan.agner@toradex.com>
Cc: Stefan Agner <stefan@agner.ch>
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Reviewed-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index ca1d103ec449..6b808f805eab 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -110,6 +110,7 @@ static const struct of_device_id blocklist[] __initconst = {
 
 	{ .compatible = "fsl,imx7ulp", },
 	{ .compatible = "fsl,imx7d", },
+	{ .compatible = "fsl,imx7s", },
 	{ .compatible = "fsl,imx8mq", },
 	{ .compatible = "fsl,imx8mm", },
 	{ .compatible = "fsl,imx8mn", },

From ec1c7ad47664f964c1101fe555b6fde0cb124b38 Mon Sep 17 00:00:00 2001
From: Pierre Gondois <Pierre.Gondois@arm.com>
Date: Tue, 8 Feb 2022 09:01:09 +0100
Subject: [PATCH 012/229] cpufreq: CPPC: Fix performance/frequency conversion

CPUfreq governors request CPU frequencies using information
on current CPU usage. The CPPC driver converts them to
performance requests. Frequency targets are computed as:
	target_freq = (util / cpu_capacity) * max_freq
target_freq is then clamped between [policy->min, policy->max].

The CPPC driver converts performance values to frequencies
(and vice-versa) using cppc_cpufreq_perf_to_khz() and
cppc_cpufreq_khz_to_perf(). These functions both use two different
factors depending on the range of the input value. For
cppc_cpufreq_khz_to_perf():
- (NOMINAL_PERF / NOMINAL_FREQ) or
- (LOWEST_PERF / LOWEST_FREQ)
and for cppc_cpufreq_perf_to_khz():
- (NOMINAL_FREQ / NOMINAL_PERF) or
- ((NOMINAL_PERF - LOWEST_FREQ) / (NOMINAL_PERF - LOWEST_PERF))

This means:
1- the functions are not inverse for some values:
   (perf_to_khz(khz_to_perf(x)) != x)
2- cppc_cpufreq_perf_to_khz(LOWEST_PERF) can sometimes give
   a different value from LOWEST_FREQ due to integer approximation
3- it is implied that performance and frequency are proportional
   (NOMINAL_FREQ / NOMINAL_PERF) == (LOWEST_PERF / LOWEST_FREQ)

This patch changes the conversion functions to an affine function.
This fixes the 3 points above.

Suggested-by: Lukasz Luba <lukasz.luba@arm.com>
Suggested-by: Morten Rasmussen <morten.rasmussen@arm.com>
Signed-off-by: Pierre Gondois <Pierre.Gondois@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cppc_cpufreq.c | 43 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index db17196266e4..82d370ae6a4a 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -303,52 +303,48 @@ static u64 cppc_get_dmi_max_khz(void)
 
 /*
  * If CPPC lowest_freq and nominal_freq registers are exposed then we can
- * use them to convert perf to freq and vice versa
- *
- * If the perf/freq point lies between Nominal and Lowest, we can treat
- * (Low perf, Low freq) and (Nom Perf, Nom freq) as 2D co-ordinates of a line
- * and extrapolate the rest
- * For perf/freq > Nominal, we use the ratio perf:freq at Nominal for conversion
+ * use them to convert perf to freq and vice versa. The conversion is
+ * extrapolated as an affine function passing by the 2 points:
+ *  - (Low perf, Low freq)
+ *  - (Nominal perf, Nominal perf)
  */
 static unsigned int cppc_cpufreq_perf_to_khz(struct cppc_cpudata *cpu_data,
 					     unsigned int perf)
 {
 	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
+	s64 retval, offset = 0;
 	static u64 max_khz;
 	u64 mul, div;
 
 	if (caps->lowest_freq && caps->nominal_freq) {
-		if (perf >= caps->nominal_perf) {
-			mul = caps->nominal_freq;
-			div = caps->nominal_perf;
-		} else {
-			mul = caps->nominal_freq - caps->lowest_freq;
-			div = caps->nominal_perf - caps->lowest_perf;
-		}
+		mul = caps->nominal_freq - caps->lowest_freq;
+		div = caps->nominal_perf - caps->lowest_perf;
+		offset = caps->nominal_freq - div64_u64(caps->nominal_perf * mul, div);
 	} else {
 		if (!max_khz)
 			max_khz = cppc_get_dmi_max_khz();
 		mul = max_khz;
 		div = caps->highest_perf;
 	}
-	return (u64)perf * mul / div;
+
+	retval = offset + div64_u64(perf * mul, div);
+	if (retval >= 0)
+		return retval;
+	return 0;
 }
 
 static unsigned int cppc_cpufreq_khz_to_perf(struct cppc_cpudata *cpu_data,
 					     unsigned int freq)
 {
 	struct cppc_perf_caps *caps = &cpu_data->perf_caps;
+	s64 retval, offset = 0;
 	static u64 max_khz;
 	u64  mul, div;
 
 	if (caps->lowest_freq && caps->nominal_freq) {
-		if (freq >= caps->nominal_freq) {
-			mul = caps->nominal_perf;
-			div = caps->nominal_freq;
-		} else {
-			mul = caps->lowest_perf;
-			div = caps->lowest_freq;
-		}
+		mul = caps->nominal_perf - caps->lowest_perf;
+		div = caps->nominal_freq - caps->lowest_freq;
+		offset = caps->nominal_perf - div64_u64(caps->nominal_freq * mul, div);
 	} else {
 		if (!max_khz)
 			max_khz = cppc_get_dmi_max_khz();
@@ -356,7 +352,10 @@ static unsigned int cppc_cpufreq_khz_to_perf(struct cppc_cpudata *cpu_data,
 		div = max_khz;
 	}
 
-	return (u64)freq * mul / div;
+	retval = offset + div64_u64(freq * mul, div);
+	if (retval >= 0)
+		return retval;
+	return 0;
 }
 
 static int cppc_cpufreq_set_target(struct cpufreq_policy *policy,

From e62c17f0455a74b182ce6373e2777817256afaa1 Mon Sep 17 00:00:00 2001
From: Xiang Chen <chenxiang66@hisilicon.com>
Date: Tue, 8 Feb 2022 15:04:51 +0800
Subject: [PATCH 013/229] MAINTAINERS: update maintainer list of DMA MAPPING
 BENCHMARK

Barry Song will not focus on this area, and Xiang Chen will continue his
work to maintain this module.

Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
Acked-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ea3e6c914384..48335022b0e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5765,7 +5765,7 @@ F:	include/linux/dma-map-ops.h
 F:	kernel/dma/
 
 DMA MAPPING BENCHMARK
-M:	Barry Song <song.bao.hua@hisilicon.com>
+M:	Xiang Chen <chenxiang66@hisilicon.com>
 L:	iommu@lists.linux-foundation.org
 F:	kernel/dma/map_benchmark.c
 F:	tools/testing/selftests/dma/

From 021dbecabc93b1610b5db989d52a94e0c6671136 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 10 Feb 2022 14:37:53 +0530
Subject: [PATCH 014/229] opp: Expose of-node's name in debugfs

It is difficult to find which OPPs are active at the moment, specially
if there are multiple OPPs with same frequency available in the device
tree (controlled by supported hardware feature).

Expose name of the DT node to find out the exact OPP.

While at it, also expose level field.

Reported-by: Leo Yan <leo.yan@linaro.org>
Tested-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/debugfs.c | 5 +++++
 drivers/opp/opp.h     | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
index 596c185b5dda..b5f2f9f39392 100644
--- a/drivers/opp/debugfs.c
+++ b/drivers/opp/debugfs.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/of.h>
 #include <linux/init.h>
 #include <linux/limits.h>
 #include <linux/slab.h>
@@ -131,9 +132,13 @@ void opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend);
 	debugfs_create_u32("performance_state", S_IRUGO, d, &opp->pstate);
 	debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate);
+	debugfs_create_u32("level", S_IRUGO, d, &opp->level);
 	debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
 			     &opp->clock_latency_ns);
 
+	opp->of_name = of_node_full_name(opp->np);
+	debugfs_create_str("of_name", S_IRUGO, d, (char **)&opp->of_name);
+
 	opp_debug_create_supplies(opp, opp_table, d);
 	opp_debug_create_bw(opp, opp_table, d);
 
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 407c3bfe51d9..45e3a55239a1 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -96,6 +96,7 @@ struct dev_pm_opp {
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
+	const char *of_name;
 #endif
 };
 

From 0cc525901fe9062dbd3c7a42dba254244a229072 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:25:13 +0000
Subject: [PATCH 015/229] dt-bindings: arm: qcom: Add msm8996 and apq8096
 compatibles

Add compatibles for MSM8996 and APQ8096 and all supported devices
that have them.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 Documentation/devicetree/bindings/arm/qcom.yaml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index 370aab274cd1..fd8410d26c99 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -173,7 +173,21 @@ properties:
           - const: qcom,apq8094
 
       - items:
-          - const: qcom,msm8996-mtp
+          - enum:
+              - arrow,apq8096-db820c
+              - inforce,ifc6640
+          - const: qcom,apq8096-sbc
+          - const: qcom,apq8096
+
+      - items:
+          - enum:
+              - qcom,msm8996-mtp
+              - sony,dora-row
+              - sony,kagura-row
+              - sony,keyaki-row
+              - xiaomi,gemini
+              - xiaomi,scorpio
+          - const: qcom,msm8996
 
       - items:
           - enum:

From 8188eaf4de56f640b5e72e229de50161d9711c9f Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:25:32 +0000
Subject: [PATCH 016/229] arm64: dts: qcom: msm8996-mtp: Add msm8996 compatible

Add qcom,msm8996 compatible to match DT schema.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm64/boot/dts/qcom/msm8996-mtp.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/qcom/msm8996-mtp.dts b/arch/arm64/boot/dts/qcom/msm8996-mtp.dts
index 7d9fc35bc7a0..6a1699a96c99 100644
--- a/arch/arm64/boot/dts/qcom/msm8996-mtp.dts
+++ b/arch/arm64/boot/dts/qcom/msm8996-mtp.dts
@@ -9,7 +9,7 @@
 
 / {
 	model = "Qualcomm Technologies, Inc. MSM 8996 MTP";
-	compatible = "qcom,msm8996-mtp";
+	compatible = "qcom,msm8996-mtp", "qcom,msm8996";
 
 	aliases {
 		serial0 = &blsp2_uart2;

From 784adeb3a37c2cfbcb24a5707ef751d164cac454 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:25:49 +0000
Subject: [PATCH 017/229] dt-bindings: opp: qcom-opp: Convert to DT schema

Convert qcom-opp.txt to DT schema format.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../bindings/opp/opp-v2-qcom-level.yaml       | 60 +++++++++++++++++++
 .../devicetree/bindings/opp/qcom-opp.txt      | 19 ------
 2 files changed, 60 insertions(+), 19 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml
 delete mode 100644 Documentation/devicetree/bindings/opp/qcom-opp.txt

diff --git a/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml b/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml
new file mode 100644
index 000000000000..14a7a689ad6d
--- /dev/null
+++ b/Documentation/devicetree/bindings/opp/opp-v2-qcom-level.yaml
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/opp/opp-v2-qcom-level.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm OPP bindings to describe OPP nodes.
+
+maintainers:
+  - Niklas Cassel <nks@flawful.org>
+
+allOf:
+  - $ref: opp-v2-base.yaml#
+
+properties:
+  compatible:
+    const: operating-points-v2-qcom-level
+
+patternProperties:
+  '^opp-?[0-9]+$':
+    type: object
+
+    properties:
+      opp-level: true
+
+      qcom,opp-fuse-level:
+        description: |
+          A positive value representing the fuse corner/level associated with
+          this OPP node. Sometimes several corners/levels shares a certain fuse
+          corner/level. A fuse corner/level contains e.g. ref uV, min uV,
+          and max uV.
+        $ref: /schemas/types.yaml#/definitions/uint32
+
+    required:
+      - opp-level
+      - qcom,opp-fuse-level
+
+required:
+  - compatible
+
+additionalProperties: false
+
+examples:
+  - |
+    cpr_opp_table: opp-table-cpr {
+        compatible = "operating-points-v2-qcom-level";
+
+        cpr_opp1: opp1 {
+            opp-level = <1>;
+            qcom,opp-fuse-level = <1>;
+        };
+        cpr_opp2: opp2 {
+            opp-level = <2>;
+            qcom,opp-fuse-level = <2>;
+        };
+        cpr_opp3: opp3 {
+            opp-level = <3>;
+            qcom,opp-fuse-level = <3>;
+        };
+    };
diff --git a/Documentation/devicetree/bindings/opp/qcom-opp.txt b/Documentation/devicetree/bindings/opp/qcom-opp.txt
deleted file mode 100644
index 41d3e4ff2dc3..000000000000
--- a/Documentation/devicetree/bindings/opp/qcom-opp.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Qualcomm OPP bindings to describe OPP nodes
-
-The bindings are based on top of the operating-points-v2 bindings
-described in Documentation/devicetree/bindings/opp/opp-v2-base.yaml
-Additional properties are described below.
-
-* OPP Table Node
-
-Required properties:
-- compatible: Allow OPPs to express their compatibility. It should be:
-  "operating-points-v2-qcom-level"
-
-* OPP Node
-
-Required properties:
-- qcom,opp-fuse-level: A positive value representing the fuse corner/level
-  associated with this OPP node. Sometimes several corners/levels shares
-  a certain fuse corner/level. A fuse corner/level contains e.g. ref uV,
-  min uV, and max uV.

From ec24d1d55469b66f5d4393dc8cebc5ac8f4fc683 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:26:08 +0000
Subject: [PATCH 018/229] dt-bindings: opp: Convert qcom-nvmem-cpufreq to DT
 schema

Convert qcom-nvmem-cpufreq to DT schema format, splitting it into
an OPP schema and a CPUFreq schema in the process.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../bindings/cpufreq/qcom-cpufreq-nvmem.yaml  | 166 ++++
 .../bindings/opp/opp-v2-kryo-cpu.yaml         | 257 ++++++
 .../bindings/opp/qcom-nvmem-cpufreq.txt       | 796 ------------------
 MAINTAINERS                                   |   3 +-
 4 files changed, 425 insertions(+), 797 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
 create mode 100644 Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml
 delete mode 100644 Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt

diff --git a/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
new file mode 100644
index 000000000000..a9a776da5505
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
@@ -0,0 +1,166 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpufreq/qcom-cpufreq-nvmem.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. NVMEM CPUFreq bindings
+
+maintainers:
+  - Ilia Lin <ilia.lin@kernel.org>
+
+description: |
+  In certain Qualcomm Technologies, Inc. SoCs such as QCS404, The CPU supply
+  voltage is dynamically configured by Core Power Reduction (CPR) depending on
+  current CPU frequency and efuse values.
+  CPR provides a power domain with multiple levels that are selected depending
+  on the CPU OPP in use. The CPUFreq driver sets the CPR power domain level
+  according to the required OPPs defined in the CPU OPP tables.
+
+select:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - qcom,qcs404
+  required:
+    - compatible
+
+properties:
+  cpus:
+    type: object
+
+    patternProperties:
+      'cpu@[0-9a-f]+':
+        type: object
+
+        properties:
+          power-domains:
+            maxItems: 1
+
+          power-domain-names:
+            items:
+              - const: cpr
+
+        required:
+          - power-domains
+          - power-domain-names
+
+patternProperties:
+  '^opp-table(-[a-z0-9]+)?$':
+    if:
+      properties:
+        compatible:
+          const: operating-points-v2-kryo-cpu
+    then:
+      patternProperties:
+        '^opp-?[0-9]+$':
+          required:
+            - required-opps
+
+additionalProperties: true
+
+examples:
+  - |
+    / {
+        model = "Qualcomm Technologies, Inc. QCS404";
+        compatible = "qcom,qcs404";
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        cpus {
+            #address-cells = <1>;
+            #size-cells = <0>;
+
+            CPU0: cpu@100 {
+                device_type = "cpu";
+                compatible = "arm,cortex-a53";
+                reg = <0x100>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                next-level-cache = <&L2_0>;
+                #cooling-cells = <2>;
+                clocks = <&apcs_glb>;
+                operating-points-v2 = <&cpu_opp_table>;
+                power-domains = <&cpr>;
+                power-domain-names = "cpr";
+            };
+
+            CPU1: cpu@101 {
+                device_type = "cpu";
+                compatible = "arm,cortex-a53";
+                reg = <0x101>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                next-level-cache = <&L2_0>;
+                #cooling-cells = <2>;
+                clocks = <&apcs_glb>;
+                operating-points-v2 = <&cpu_opp_table>;
+                power-domains = <&cpr>;
+                power-domain-names = "cpr";
+            };
+
+            CPU2: cpu@102 {
+                device_type = "cpu";
+                compatible = "arm,cortex-a53";
+                reg = <0x102>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                next-level-cache = <&L2_0>;
+                #cooling-cells = <2>;
+                clocks = <&apcs_glb>;
+                operating-points-v2 = <&cpu_opp_table>;
+                power-domains = <&cpr>;
+                power-domain-names = "cpr";
+            };
+
+            CPU3: cpu@103 {
+                device_type = "cpu";
+                compatible = "arm,cortex-a53";
+                reg = <0x103>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                next-level-cache = <&L2_0>;
+                #cooling-cells = <2>;
+                clocks = <&apcs_glb>;
+                operating-points-v2 = <&cpu_opp_table>;
+                power-domains = <&cpr>;
+                power-domain-names = "cpr";
+            };
+        };
+
+        cpu_opp_table: opp-table-cpu {
+            compatible = "operating-points-v2-kryo-cpu";
+            opp-shared;
+
+            opp-1094400000 {
+                opp-hz = /bits/ 64 <1094400000>;
+                required-opps = <&cpr_opp1>;
+            };
+            opp-1248000000 {
+                opp-hz = /bits/ 64 <1248000000>;
+                required-opps = <&cpr_opp2>;
+            };
+            opp-1401600000 {
+                opp-hz = /bits/ 64 <1401600000>;
+                required-opps = <&cpr_opp3>;
+            };
+        };
+
+        cpr_opp_table: opp-table-cpr {
+            compatible = "operating-points-v2-qcom-level";
+
+            cpr_opp1: opp1 {
+                opp-level = <1>;
+                qcom,opp-fuse-level = <1>;
+            };
+            cpr_opp2: opp2 {
+                opp-level = <2>;
+                qcom,opp-fuse-level = <2>;
+            };
+            cpr_opp3: opp3 {
+                opp-level = <3>;
+                qcom,opp-fuse-level = <3>;
+            };
+        };
+    };
diff --git a/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml b/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml
new file mode 100644
index 000000000000..8c2e9ac5f68d
--- /dev/null
+++ b/Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml
@@ -0,0 +1,257 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/opp/opp-v2-kryo-cpu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. NVMEM OPP bindings
+
+maintainers:
+  - Ilia Lin <ilia.lin@kernel.org>
+
+allOf:
+  - $ref: opp-v2-base.yaml#
+
+description: |
+  In certain Qualcomm Technologies, Inc. SoCs like APQ8096 and MSM8996,
+  the CPU frequencies subset and voltage value of each OPP varies based on
+  the silicon variant in use.
+  Qualcomm Technologies, Inc. Process Voltage Scaling Tables
+  defines the voltage and frequency value based on the msm-id in SMEM
+  and speedbin blown in the efuse combination.
+  The qcom-cpufreq-nvmem driver reads the msm-id and efuse value from the SoC
+  to provide the OPP framework with required information (existing HW bitmap).
+  This is used to determine the voltage and frequency value for each OPP of
+  operating-points-v2 table when it is parsed by the OPP framework.
+
+properties:
+  compatible:
+    const: operating-points-v2-kryo-cpu
+
+  nvmem-cells:
+    description: |
+      A phandle pointing to a nvmem-cells node representing the
+      efuse registers that has information about the
+      speedbin that is used to select the right frequency/voltage
+      value pair.
+
+  opp-shared: true
+
+patternProperties:
+  '^opp-?[0-9]+$':
+    type: object
+
+    properties:
+      opp-hz: true
+
+      opp-microvolt: true
+
+      opp-supported-hw:
+        description: |
+          A single 32 bit bitmap value, representing compatible HW.
+          Bitmap:
+          0:  MSM8996 V3, speedbin 0
+          1:  MSM8996 V3, speedbin 1
+          2:  MSM8996 V3, speedbin 2
+          3:  unused
+          4:  MSM8996 SG, speedbin 0
+          5:  MSM8996 SG, speedbin 1
+          6:  MSM8996 SG, speedbin 2
+          7-31:  unused
+        maximum: 0x77
+
+      clock-latency-ns: true
+
+      required-opps: true
+
+    required:
+      - opp-hz
+
+required:
+  - compatible
+
+if:
+  required:
+    - nvmem-cells
+then:
+  patternProperties:
+    '^opp-?[0-9]+$':
+      required:
+        - opp-supported-hw
+
+additionalProperties: false
+
+examples:
+  - |
+    / {
+        model = "Qualcomm Technologies, Inc. DB820c";
+        compatible = "arrow,apq8096-db820c", "qcom,apq8096-sbc", "qcom,apq8096";
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        cpus {
+            #address-cells = <2>;
+            #size-cells = <0>;
+
+            CPU0: cpu@0 {
+                device_type = "cpu";
+                compatible = "qcom,kryo";
+                reg = <0x0 0x0>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                capacity-dmips-mhz = <1024>;
+                clocks = <&kryocc 0>;
+                operating-points-v2 = <&cluster0_opp>;
+                #cooling-cells = <2>;
+                next-level-cache = <&L2_0>;
+                L2_0: l2-cache {
+                    compatible = "cache";
+                    cache-level = <2>;
+                };
+            };
+
+            CPU1: cpu@1 {
+                device_type = "cpu";
+                compatible = "qcom,kryo";
+                reg = <0x0 0x1>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                capacity-dmips-mhz = <1024>;
+                clocks = <&kryocc 0>;
+                operating-points-v2 = <&cluster0_opp>;
+                #cooling-cells = <2>;
+                next-level-cache = <&L2_0>;
+            };
+
+            CPU2: cpu@100 {
+                device_type = "cpu";
+                compatible = "qcom,kryo";
+                reg = <0x0 0x100>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                capacity-dmips-mhz = <1024>;
+                clocks = <&kryocc 1>;
+                operating-points-v2 = <&cluster1_opp>;
+                #cooling-cells = <2>;
+                next-level-cache = <&L2_1>;
+                L2_1: l2-cache {
+                    compatible = "cache";
+                    cache-level = <2>;
+                };
+            };
+
+            CPU3: cpu@101 {
+                device_type = "cpu";
+                compatible = "qcom,kryo";
+                reg = <0x0 0x101>;
+                enable-method = "psci";
+                cpu-idle-states = <&CPU_SLEEP_0>;
+                capacity-dmips-mhz = <1024>;
+                clocks = <&kryocc 1>;
+                operating-points-v2 = <&cluster1_opp>;
+                #cooling-cells = <2>;
+                next-level-cache = <&L2_1>;
+            };
+
+            cpu-map {
+                cluster0 {
+                    core0 {
+                        cpu = <&CPU0>;
+                    };
+
+                    core1 {
+                        cpu = <&CPU1>;
+                    };
+                };
+
+                cluster1 {
+                    core0 {
+                        cpu = <&CPU2>;
+                    };
+
+                    core1 {
+                        cpu = <&CPU3>;
+                    };
+                };
+            };
+        };
+
+        cluster0_opp: opp-table-0 {
+            compatible = "operating-points-v2-kryo-cpu";
+            nvmem-cells = <&speedbin_efuse>;
+            opp-shared;
+
+            opp-307200000 {
+                opp-hz = /bits/ 64 <307200000>;
+                opp-microvolt = <905000 905000 1140000>;
+                opp-supported-hw = <0x77>;
+                clock-latency-ns = <200000>;
+            };
+            opp-1593600000 {
+                opp-hz = /bits/ 64 <1593600000>;
+                opp-microvolt = <1140000 905000 1140000>;
+                opp-supported-hw = <0x71>;
+                clock-latency-ns = <200000>;
+            };
+            opp-2188800000 {
+                opp-hz = /bits/ 64 <2188800000>;
+                opp-microvolt = <1140000 905000 1140000>;
+                opp-supported-hw = <0x10>;
+                clock-latency-ns = <200000>;
+            };
+        };
+
+        cluster1_opp: opp-table-1 {
+            compatible = "operating-points-v2-kryo-cpu";
+            nvmem-cells = <&speedbin_efuse>;
+            opp-shared;
+
+            opp-307200000 {
+                opp-hz = /bits/ 64 <307200000>;
+                opp-microvolt = <905000 905000 1140000>;
+                opp-supported-hw = <0x77>;
+                clock-latency-ns = <200000>;
+            };
+            opp-1593600000 {
+                opp-hz = /bits/ 64 <1593600000>;
+                opp-microvolt = <1140000 905000 1140000>;
+                opp-supported-hw = <0x70>;
+                clock-latency-ns = <200000>;
+            };
+            opp-2150400000 {
+                opp-hz = /bits/ 64 <2150400000>;
+                opp-microvolt = <1140000 905000 1140000>;
+                opp-supported-hw = <0x31>;
+                clock-latency-ns = <200000>;
+            };
+            opp-2342400000 {
+                opp-hz = /bits/ 64 <2342400000>;
+                opp-microvolt = <1140000 905000 1140000>;
+                opp-supported-hw = <0x10>;
+                clock-latency-ns = <200000>;
+            };
+        };
+
+        smem {
+            compatible = "qcom,smem";
+            memory-region = <&smem_mem>;
+            hwlocks = <&tcsr_mutex 3>;
+        };
+
+        soc {
+            #address-cells = <1>;
+            #size-cells = <1>;
+
+            qfprom: qfprom@74000 {
+                compatible = "qcom,msm8996-qfprom", "qcom,qfprom";
+                reg = <0x00074000 0x8ff>;
+                #address-cells = <1>;
+                #size-cells = <1>;
+
+                speedbin_efuse: speedbin@133 {
+                    reg = <0x133 0x1>;
+                    bits = <5 3>;
+                };
+            };
+        };
+    };
diff --git a/Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt b/Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
deleted file mode 100644
index 64f07417ecfb..000000000000
--- a/Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
+++ /dev/null
@@ -1,796 +0,0 @@
-Qualcomm Technologies, Inc. NVMEM CPUFreq and OPP bindings
-===================================
-
-In Certain Qualcomm Technologies, Inc. SoCs like apq8096 and msm8996,
-the CPU frequencies subset and voltage value of each OPP varies based on
-the silicon variant in use.
-Qualcomm Technologies, Inc. Process Voltage Scaling Tables
-defines the voltage and frequency value based on the msm-id in SMEM
-and speedbin blown in the efuse combination.
-The qcom-cpufreq-nvmem driver reads the msm-id and efuse value from the SoC
-to provide the OPP framework with required information (existing HW bitmap).
-This is used to determine the voltage and frequency value for each OPP of
-operating-points-v2 table when it is parsed by the OPP framework.
-
-Required properties:
---------------------
-In 'cpu' nodes:
-- operating-points-v2: Phandle to the operating-points-v2 table to use.
-
-In 'operating-points-v2' table:
-- compatible: Should be
-	- 'operating-points-v2-kryo-cpu' for apq8096, msm8996, msm8974,
-					     apq8064, ipq8064, msm8960 and ipq8074.
-
-Optional properties:
---------------------
-In 'cpu' nodes:
-- power-domains: A phandle pointing to the PM domain specifier which provides
-		the performance states available for active state management.
-		Please refer to the power-domains bindings
-		Documentation/devicetree/bindings/power/power_domain.txt
-		and also examples below.
-- power-domain-names: Should be
-	- 'cpr' for qcs404.
-
-In 'operating-points-v2' table:
-- nvmem-cells: A phandle pointing to a nvmem-cells node representing the
-		efuse registers that has information about the
-		speedbin that is used to select the right frequency/voltage
-		value pair.
-		Please refer the for nvmem-cells
-		bindings Documentation/devicetree/bindings/nvmem/nvmem.txt
-		and also examples below.
-
-In every OPP node:
-- opp-supported-hw: A single 32 bit bitmap value, representing compatible HW.
-		    Bitmap:
-			0:	MSM8996 V3, speedbin 0
-			1:	MSM8996 V3, speedbin 1
-			2:	MSM8996 V3, speedbin 2
-			3:	unused
-			4:	MSM8996 SG, speedbin 0
-			5:	MSM8996 SG, speedbin 1
-			6:	MSM8996 SG, speedbin 2
-			7-31:	unused
-
-Example 1:
----------
-
-	cpus {
-		#address-cells = <2>;
-		#size-cells = <0>;
-
-		CPU0: cpu@0 {
-			device_type = "cpu";
-			compatible = "qcom,kryo";
-			reg = <0x0 0x0>;
-			enable-method = "psci";
-			clocks = <&kryocc 0>;
-			cpu-supply = <&pm8994_s11_saw>;
-			operating-points-v2 = <&cluster0_opp>;
-			#cooling-cells = <2>;
-			next-level-cache = <&L2_0>;
-			L2_0: l2-cache {
-			      compatible = "cache";
-			      cache-level = <2>;
-			};
-		};
-
-		CPU1: cpu@1 {
-			device_type = "cpu";
-			compatible = "qcom,kryo";
-			reg = <0x0 0x1>;
-			enable-method = "psci";
-			clocks = <&kryocc 0>;
-			cpu-supply = <&pm8994_s11_saw>;
-			operating-points-v2 = <&cluster0_opp>;
-			#cooling-cells = <2>;
-			next-level-cache = <&L2_0>;
-		};
-
-		CPU2: cpu@100 {
-			device_type = "cpu";
-			compatible = "qcom,kryo";
-			reg = <0x0 0x100>;
-			enable-method = "psci";
-			clocks = <&kryocc 1>;
-			cpu-supply = <&pm8994_s11_saw>;
-			operating-points-v2 = <&cluster1_opp>;
-			#cooling-cells = <2>;
-			next-level-cache = <&L2_1>;
-			L2_1: l2-cache {
-			      compatible = "cache";
-			      cache-level = <2>;
-			};
-		};
-
-		CPU3: cpu@101 {
-			device_type = "cpu";
-			compatible = "qcom,kryo";
-			reg = <0x0 0x101>;
-			enable-method = "psci";
-			clocks = <&kryocc 1>;
-			cpu-supply = <&pm8994_s11_saw>;
-			operating-points-v2 = <&cluster1_opp>;
-			#cooling-cells = <2>;
-			next-level-cache = <&L2_1>;
-		};
-
-		cpu-map {
-			cluster0 {
-				core0 {
-					cpu = <&CPU0>;
-				};
-
-				core1 {
-					cpu = <&CPU1>;
-				};
-			};
-
-			cluster1 {
-				core0 {
-					cpu = <&CPU2>;
-				};
-
-				core1 {
-					cpu = <&CPU3>;
-				};
-			};
-		};
-	};
-
-	cluster0_opp: opp_table0 {
-		compatible = "operating-points-v2-kryo-cpu";
-		nvmem-cells = <&speedbin_efuse>;
-		opp-shared;
-
-		opp-307200000 {
-			opp-hz = /bits/ 64 <307200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x77>;
-			clock-latency-ns = <200000>;
-		};
-		opp-384000000 {
-			opp-hz = /bits/ 64 <384000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-422400000 {
-			opp-hz = /bits/ 64 <422400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-460800000 {
-			opp-hz = /bits/ 64 <460800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-480000000 {
-			opp-hz = /bits/ 64 <480000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-537600000 {
-			opp-hz = /bits/ 64 <537600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-556800000 {
-			opp-hz = /bits/ 64 <556800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-614400000 {
-			opp-hz = /bits/ 64 <614400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-652800000 {
-			opp-hz = /bits/ 64 <652800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-691200000 {
-			opp-hz = /bits/ 64 <691200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-729600000 {
-			opp-hz = /bits/ 64 <729600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-768000000 {
-			opp-hz = /bits/ 64 <768000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-844800000 {
-			opp-hz = /bits/ 64 <844800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x77>;
-			clock-latency-ns = <200000>;
-		};
-		opp-902400000 {
-			opp-hz = /bits/ 64 <902400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-960000000 {
-			opp-hz = /bits/ 64 <960000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-979200000 {
-			opp-hz = /bits/ 64 <979200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1036800000 {
-			opp-hz = /bits/ 64 <1036800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1056000000 {
-			opp-hz = /bits/ 64 <1056000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1113600000 {
-			opp-hz = /bits/ 64 <1113600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1132800000 {
-			opp-hz = /bits/ 64 <1132800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1190400000 {
-			opp-hz = /bits/ 64 <1190400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1209600000 {
-			opp-hz = /bits/ 64 <1209600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1228800000 {
-			opp-hz = /bits/ 64 <1228800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1286400000 {
-			opp-hz = /bits/ 64 <1286400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1324800000 {
-			opp-hz = /bits/ 64 <1324800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x5>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1363200000 {
-			opp-hz = /bits/ 64 <1363200000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x72>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1401600000 {
-			opp-hz = /bits/ 64 <1401600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x5>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1440000000 {
-			opp-hz = /bits/ 64 <1440000000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1478400000 {
-			opp-hz = /bits/ 64 <1478400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x1>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1497600000 {
-			opp-hz = /bits/ 64 <1497600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x4>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1516800000 {
-			opp-hz = /bits/ 64 <1516800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1593600000 {
-			opp-hz = /bits/ 64 <1593600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x71>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1996800000 {
-			opp-hz = /bits/ 64 <1996800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x20>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2188800000 {
-			opp-hz = /bits/ 64 <2188800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x10>;
-			clock-latency-ns = <200000>;
-		};
-	};
-
-	cluster1_opp: opp_table1 {
-		compatible = "operating-points-v2-kryo-cpu";
-		nvmem-cells = <&speedbin_efuse>;
-		opp-shared;
-
-		opp-307200000 {
-			opp-hz = /bits/ 64 <307200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x77>;
-			clock-latency-ns = <200000>;
-		};
-		opp-384000000 {
-			opp-hz = /bits/ 64 <384000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-403200000 {
-			opp-hz = /bits/ 64 <403200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-460800000 {
-			opp-hz = /bits/ 64 <460800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-480000000 {
-			opp-hz = /bits/ 64 <480000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-537600000 {
-			opp-hz = /bits/ 64 <537600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-556800000 {
-			opp-hz = /bits/ 64 <556800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-614400000 {
-			opp-hz = /bits/ 64 <614400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-652800000 {
-			opp-hz = /bits/ 64 <652800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-691200000 {
-			opp-hz = /bits/ 64 <691200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-729600000 {
-			opp-hz = /bits/ 64 <729600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-748800000 {
-			opp-hz = /bits/ 64 <748800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-806400000 {
-			opp-hz = /bits/ 64 <806400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-825600000 {
-			opp-hz = /bits/ 64 <825600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-883200000 {
-			opp-hz = /bits/ 64 <883200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-902400000 {
-			opp-hz = /bits/ 64 <902400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-940800000 {
-			opp-hz = /bits/ 64 <940800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-979200000 {
-			opp-hz = /bits/ 64 <979200000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1036800000 {
-			opp-hz = /bits/ 64 <1036800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1056000000 {
-			opp-hz = /bits/ 64 <1056000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1113600000 {
-			opp-hz = /bits/ 64 <1113600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1132800000 {
-			opp-hz = /bits/ 64 <1132800000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1190400000 {
-			opp-hz = /bits/ 64 <1190400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1209600000 {
-			opp-hz = /bits/ 64 <1209600000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1248000000 {
-			opp-hz = /bits/ 64 <1248000000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1286400000 {
-			opp-hz = /bits/ 64 <1286400000>;
-			opp-microvolt = <905000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1324800000 {
-			opp-hz = /bits/ 64 <1324800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1363200000 {
-			opp-hz = /bits/ 64 <1363200000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1401600000 {
-			opp-hz = /bits/ 64 <1401600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1440000000 {
-			opp-hz = /bits/ 64 <1440000000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1478400000 {
-			opp-hz = /bits/ 64 <1478400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1516800000 {
-			opp-hz = /bits/ 64 <1516800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1555200000 {
-			opp-hz = /bits/ 64 <1555200000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1593600000 {
-			opp-hz = /bits/ 64 <1593600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1632000000 {
-			opp-hz = /bits/ 64 <1632000000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1670400000 {
-			opp-hz = /bits/ 64 <1670400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1708800000 {
-			opp-hz = /bits/ 64 <1708800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1747200000 {
-			opp-hz = /bits/ 64 <1747200000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x70>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1785600000 {
-			opp-hz = /bits/ 64 <1785600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x7>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1804800000 {
-			opp-hz = /bits/ 64 <1804800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x6>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1824000000 {
-			opp-hz = /bits/ 64 <1824000000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x71>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1900800000 {
-			opp-hz = /bits/ 64 <1900800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x74>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1920000000 {
-			opp-hz = /bits/ 64 <1920000000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x1>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1977600000 {
-			opp-hz = /bits/ 64 <1977600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x30>;
-			clock-latency-ns = <200000>;
-		};
-		opp-1996800000 {
-			opp-hz = /bits/ 64 <1996800000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x1>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2054400000 {
-			opp-hz = /bits/ 64 <2054400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x30>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2073600000 {
-			opp-hz = /bits/ 64 <2073600000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x1>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2150400000 {
-			opp-hz = /bits/ 64 <2150400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x31>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2246400000 {
-			opp-hz = /bits/ 64 <2246400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x10>;
-			clock-latency-ns = <200000>;
-		};
-		opp-2342400000 {
-			opp-hz = /bits/ 64 <2342400000>;
-			opp-microvolt = <1140000 905000 1140000>;
-			opp-supported-hw = <0x10>;
-			clock-latency-ns = <200000>;
-		};
-	};
-
-....
-
-reserved-memory {
-	#address-cells = <2>;
-	#size-cells = <2>;
-	ranges;
-....
-	smem_mem: smem-mem@86000000 {
-		reg = <0x0 0x86000000 0x0 0x200000>;
-		no-map;
-	};
-....
-};
-
-smem {
-	compatible = "qcom,smem";
-	memory-region = <&smem_mem>;
-	hwlocks = <&tcsr_mutex 3>;
-};
-
-soc {
-....
-	qfprom: qfprom@74000 {
-		compatible = "qcom,qfprom";
-		reg = <0x00074000 0x8ff>;
-		#address-cells = <1>;
-		#size-cells = <1>;
-		....
-		speedbin_efuse: speedbin@133 {
-			reg = <0x133 0x1>;
-			bits = <5 3>;
-		};
-	};
-};
-
-Example 2:
----------
-
-	cpus {
-		#address-cells = <1>;
-		#size-cells = <0>;
-
-		CPU0: cpu@100 {
-			device_type = "cpu";
-			compatible = "arm,cortex-a53";
-			reg = <0x100>;
-			....
-			clocks = <&apcs_glb>;
-			operating-points-v2 = <&cpu_opp_table>;
-			power-domains = <&cpr>;
-			power-domain-names = "cpr";
-		};
-
-		CPU1: cpu@101 {
-			device_type = "cpu";
-			compatible = "arm,cortex-a53";
-			reg = <0x101>;
-			....
-			clocks = <&apcs_glb>;
-			operating-points-v2 = <&cpu_opp_table>;
-			power-domains = <&cpr>;
-			power-domain-names = "cpr";
-		};
-
-		CPU2: cpu@102 {
-			device_type = "cpu";
-			compatible = "arm,cortex-a53";
-			reg = <0x102>;
-			....
-			clocks = <&apcs_glb>;
-			operating-points-v2 = <&cpu_opp_table>;
-			power-domains = <&cpr>;
-			power-domain-names = "cpr";
-		};
-
-		CPU3: cpu@103 {
-			device_type = "cpu";
-			compatible = "arm,cortex-a53";
-			reg = <0x103>;
-			....
-			clocks = <&apcs_glb>;
-			operating-points-v2 = <&cpu_opp_table>;
-			power-domains = <&cpr>;
-			power-domain-names = "cpr";
-		};
-	};
-
-	cpu_opp_table: cpu-opp-table {
-		compatible = "operating-points-v2-kryo-cpu";
-		opp-shared;
-
-		opp-1094400000 {
-			opp-hz = /bits/ 64 <1094400000>;
-			required-opps = <&cpr_opp1>;
-		};
-		opp-1248000000 {
-			opp-hz = /bits/ 64 <1248000000>;
-			required-opps = <&cpr_opp2>;
-		};
-		opp-1401600000 {
-			opp-hz = /bits/ 64 <1401600000>;
-			required-opps = <&cpr_opp3>;
-		};
-	};
-
-	cpr_opp_table: cpr-opp-table {
-		compatible = "operating-points-v2-qcom-level";
-
-		cpr_opp1: opp1 {
-			opp-level = <1>;
-			qcom,opp-fuse-level = <1>;
-		};
-		cpr_opp2: opp2 {
-			opp-level = <2>;
-			qcom,opp-fuse-level = <2>;
-		};
-		cpr_opp3: opp3 {
-			opp-level = <3>;
-			qcom,opp-fuse-level = <3>;
-		};
-	};
-
-....
-
-soc {
-....
-	cpr: power-controller@b018000 {
-		compatible = "qcom,qcs404-cpr", "qcom,cpr";
-		reg = <0x0b018000 0x1000>;
-		....
-		vdd-apc-supply = <&pms405_s3>;
-		#power-domain-cells = <0>;
-		operating-points-v2 = <&cpr_opp_table>;
-		....
-	};
-};
diff --git a/MAINTAINERS b/MAINTAINERS
index ea3e6c914384..1d840b89fd8a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15940,7 +15940,8 @@ QUALCOMM CPUFREQ DRIVER MSM8996/APQ8096
 M:	Ilia Lin <ilia.lin@kernel.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
-F:	Documentation/devicetree/bindings/opp/qcom-nvmem-cpufreq.txt
+F:	Documentation/devicetree/bindings/cpufreq/qcom-cpufreq-nvmem.yaml
+F:	Documentation/devicetree/bindings/opp/opp-v2-kryo-cpu.yaml
 F:	drivers/cpufreq/qcom-cpufreq-nvmem.c
 
 QUALCOMM CRYPTO DRIVERS

From 2b8382d2717088177bc60b6a14237ef2e955434f Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:26:24 +0000
Subject: [PATCH 019/229] arm64: dts: qcom: msm8996: Rename cluster OPP tables

Rename cluster OPP table node names to match the nodename pattern
defined in the opp-v2-base DT schema.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm64/boot/dts/qcom/msm8996.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi
index 91bc974aeb0a..036de52c54f1 100644
--- a/arch/arm64/boot/dts/qcom/msm8996.dtsi
+++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi
@@ -134,7 +134,7 @@
 		};
 	};
 
-	cluster0_opp: opp_table0 {
+	cluster0_opp: opp-table-cluster0 {
 		compatible = "operating-points-v2-kryo-cpu";
 		nvmem-cells = <&speedbin_efuse>;
 		opp-shared;
@@ -222,7 +222,7 @@
 		};
 	};
 
-	cluster1_opp: opp_table1 {
+	cluster1_opp: opp-table-cluster1 {
 		compatible = "operating-points-v2-kryo-cpu";
 		nvmem-cells = <&speedbin_efuse>;
 		opp-shared;

From 6b3abe0cfb7d68dc25172f7da8a60a5f35cb290d Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:26:44 +0000
Subject: [PATCH 020/229] arm64: dts: qcom: qcs404: Rename CPU and CPR OPP
 tables

Rename CPU and CPR OPP table node names to match the nodename pattern
defined in the opp-v2-base DT schema.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 arch/arm64/boot/dts/qcom/qcs404.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/qcs404.dtsi b/arch/arm64/boot/dts/qcom/qcs404.dtsi
index 6db753b49326..3f06f7cd3cf2 100644
--- a/arch/arm64/boot/dts/qcom/qcs404.dtsi
+++ b/arch/arm64/boot/dts/qcom/qcs404.dtsi
@@ -110,7 +110,7 @@
 		};
 	};
 
-	cpu_opp_table: cpu-opp-table {
+	cpu_opp_table: opp-table-cpu {
 		compatible = "operating-points-v2-kryo-cpu";
 		opp-shared;
 
@@ -128,7 +128,7 @@
 		};
 	};
 
-	cpr_opp_table: cpr-opp-table {
+	cpr_opp_table: opp-table-cpr {
 		compatible = "operating-points-v2-qcom-level";
 
 		cpr_opp1: opp1 {

From 8acf5cb92d9dc0a6eff7a22b01432d0379656e45 Mon Sep 17 00:00:00 2001
From: Yassine Oudjana <y.oudjana@protonmail.com>
Date: Thu, 3 Feb 2022 07:26:58 +0000
Subject: [PATCH 021/229] dt-bindings: power: avs: qcom,cpr: Convert to DT
 schema

Convert qcom,cpr.txt to DT schema format.

Signed-off-by: Yassine Oudjana <y.oudjana@protonmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../bindings/power/avs/qcom,cpr.txt           | 130 --------------
 .../bindings/power/avs/qcom,cpr.yaml          | 160 ++++++++++++++++++
 MAINTAINERS                                   |   2 +-
 3 files changed, 161 insertions(+), 131 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
 create mode 100644 Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml

diff --git a/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt b/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
deleted file mode 100644
index ab0d5ebbad4e..000000000000
--- a/Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
+++ /dev/null
@@ -1,130 +0,0 @@
-QCOM CPR (Core Power Reduction)
-
-CPR (Core Power Reduction) is a technology to reduce core power on a CPU
-or other device. Each OPP of a device corresponds to a "corner" that has
-a range of valid voltages for a particular frequency. While the device is
-running at a particular frequency, CPR monitors dynamic factors such as
-temperature, etc. and suggests adjustments to the voltage to save power
-and meet silicon characteristic requirements.
-
-- compatible:
-	Usage: required
-	Value type: <string>
-	Definition: should be "qcom,qcs404-cpr", "qcom,cpr" for qcs404
-
-- reg:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: base address and size of the rbcpr register region
-
-- interrupts:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: should specify the CPR interrupt
-
-- clocks:
-	Usage: required
-	Value type: <prop-encoded-array>
-	Definition: phandle to the reference clock
-
-- clock-names:
-	Usage: required
-	Value type: <stringlist>
-	Definition: must be "ref"
-
-- vdd-apc-supply:
-	Usage: required
-	Value type: <phandle>
-	Definition: phandle to the vdd-apc-supply regulator
-
-- #power-domain-cells:
-	Usage: required
-	Value type: <u32>
-	Definition: should be 0
-
-- operating-points-v2:
-	Usage: required
-	Value type: <phandle>
-	Definition: A phandle to the OPP table containing the
-		    performance states supported by the CPR
-		    power domain
-
-- acc-syscon:
-	Usage: optional
-	Value type: <phandle>
-	Definition: phandle to syscon for writing ACC settings
-
-- nvmem-cells:
-	Usage: required
-	Value type: <phandle>
-	Definition: phandle to nvmem cells containing the data
-		    that makes up a fuse corner, for each fuse corner.
-		    As well as the CPR fuse revision.
-
-- nvmem-cell-names:
-	Usage: required
-	Value type: <stringlist>
-	Definition: should be "cpr_quotient_offset1", "cpr_quotient_offset2",
-		    "cpr_quotient_offset3", "cpr_init_voltage1",
-		    "cpr_init_voltage2", "cpr_init_voltage3", "cpr_quotient1",
-		    "cpr_quotient2", "cpr_quotient3", "cpr_ring_osc1",
-		    "cpr_ring_osc2", "cpr_ring_osc3", "cpr_fuse_revision"
-		    for qcs404.
-
-Example:
-
-	cpr_opp_table: cpr-opp-table {
-		compatible = "operating-points-v2-qcom-level";
-
-		cpr_opp1: opp1 {
-			opp-level = <1>;
-			qcom,opp-fuse-level = <1>;
-		};
-		cpr_opp2: opp2 {
-			opp-level = <2>;
-			qcom,opp-fuse-level = <2>;
-		};
-		cpr_opp3: opp3 {
-			opp-level = <3>;
-			qcom,opp-fuse-level = <3>;
-		};
-	};
-
-	power-controller@b018000 {
-		compatible = "qcom,qcs404-cpr", "qcom,cpr";
-		reg = <0x0b018000 0x1000>;
-		interrupts = <0 15 IRQ_TYPE_EDGE_RISING>;
-		clocks = <&xo_board>;
-		clock-names = "ref";
-		vdd-apc-supply = <&pms405_s3>;
-		#power-domain-cells = <0>;
-		operating-points-v2 = <&cpr_opp_table>;
-		acc-syscon = <&tcsr>;
-
-		nvmem-cells = <&cpr_efuse_quot_offset1>,
-			<&cpr_efuse_quot_offset2>,
-			<&cpr_efuse_quot_offset3>,
-			<&cpr_efuse_init_voltage1>,
-			<&cpr_efuse_init_voltage2>,
-			<&cpr_efuse_init_voltage3>,
-			<&cpr_efuse_quot1>,
-			<&cpr_efuse_quot2>,
-			<&cpr_efuse_quot3>,
-			<&cpr_efuse_ring1>,
-			<&cpr_efuse_ring2>,
-			<&cpr_efuse_ring3>,
-			<&cpr_efuse_revision>;
-		nvmem-cell-names = "cpr_quotient_offset1",
-			"cpr_quotient_offset2",
-			"cpr_quotient_offset3",
-			"cpr_init_voltage1",
-			"cpr_init_voltage2",
-			"cpr_init_voltage3",
-			"cpr_quotient1",
-			"cpr_quotient2",
-			"cpr_quotient3",
-			"cpr_ring_osc1",
-			"cpr_ring_osc2",
-			"cpr_ring_osc3",
-			"cpr_fuse_revision";
-	};
diff --git a/Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml b/Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml
new file mode 100644
index 000000000000..3301fa0c2653
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/power/avs/qcom,cpr.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Core Power Reduction (CPR) bindings
+
+maintainers:
+  - Niklas Cassel <nks@flawful.org>
+
+description: |
+  CPR (Core Power Reduction) is a technology to reduce core power on a CPU
+  or other device. Each OPP of a device corresponds to a "corner" that has
+  a range of valid voltages for a particular frequency. While the device is
+  running at a particular frequency, CPR monitors dynamic factors such as
+  temperature, etc. and suggests adjustments to the voltage to save power
+  and meet silicon characteristic requirements.
+
+properties:
+  compatible:
+    items:
+      - enum:
+          - qcom,qcs404-cpr
+      - const: qcom,cpr
+
+  reg:
+    description: Base address and size of the RBCPR register region.
+    maxItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  clocks:
+    items:
+      - description: Reference clock.
+
+  clock-names:
+    items:
+      - const: ref
+
+  vdd-apc-supply:
+    description: APC regulator supply.
+
+  '#power-domain-cells':
+    const: 0
+
+  operating-points-v2:
+    description: |
+      A phandle to the OPP table containing the performance states
+      supported by the CPR power domain.
+
+  acc-syscon:
+    description: A phandle to the syscon used for writing ACC settings.
+
+  nvmem-cells:
+    items:
+      - description: Corner 1 quotient offset
+      - description: Corner 2 quotient offset
+      - description: Corner 3 quotient offset
+      - description: Corner 1 initial voltage
+      - description: Corner 2 initial voltage
+      - description: Corner 3 initial voltage
+      - description: Corner 1 quotient
+      - description: Corner 2 quotient
+      - description: Corner 3 quotient
+      - description: Corner 1 ring oscillator
+      - description: Corner 2 ring oscillator
+      - description: Corner 3 ring oscillator
+      - description: Fuse revision
+
+  nvmem-cell-names:
+    items:
+      - const: cpr_quotient_offset1
+      - const: cpr_quotient_offset2
+      - const: cpr_quotient_offset3
+      - const: cpr_init_voltage1
+      - const: cpr_init_voltage2
+      - const: cpr_init_voltage3
+      - const: cpr_quotient1
+      - const: cpr_quotient2
+      - const: cpr_quotient3
+      - const: cpr_ring_osc1
+      - const: cpr_ring_osc2
+      - const: cpr_ring_osc3
+      - const: cpr_fuse_revision
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+  - vdd-apc-supply
+  - '#power-domain-cells'
+  - operating-points-v2
+  - nvmem-cells
+  - nvmem-cell-names
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    cpr_opp_table: opp-table-cpr {
+        compatible = "operating-points-v2-qcom-level";
+
+        cpr_opp1: opp1 {
+            opp-level = <1>;
+            qcom,opp-fuse-level = <1>;
+        };
+        cpr_opp2: opp2 {
+            opp-level = <2>;
+            qcom,opp-fuse-level = <2>;
+        };
+        cpr_opp3: opp3 {
+            opp-level = <3>;
+            qcom,opp-fuse-level = <3>;
+        };
+    };
+
+    power-controller@b018000 {
+        compatible = "qcom,qcs404-cpr", "qcom,cpr";
+        reg = <0x0b018000 0x1000>;
+        interrupts = <0 15 IRQ_TYPE_EDGE_RISING>;
+        clocks = <&xo_board>;
+        clock-names = "ref";
+        vdd-apc-supply = <&pms405_s3>;
+        #power-domain-cells = <0>;
+        operating-points-v2 = <&cpr_opp_table>;
+        acc-syscon = <&tcsr>;
+
+        nvmem-cells = <&cpr_efuse_quot_offset1>,
+            <&cpr_efuse_quot_offset2>,
+            <&cpr_efuse_quot_offset3>,
+            <&cpr_efuse_init_voltage1>,
+            <&cpr_efuse_init_voltage2>,
+            <&cpr_efuse_init_voltage3>,
+            <&cpr_efuse_quot1>,
+            <&cpr_efuse_quot2>,
+            <&cpr_efuse_quot3>,
+            <&cpr_efuse_ring1>,
+            <&cpr_efuse_ring2>,
+            <&cpr_efuse_ring3>,
+            <&cpr_efuse_revision>;
+        nvmem-cell-names = "cpr_quotient_offset1",
+            "cpr_quotient_offset2",
+            "cpr_quotient_offset3",
+            "cpr_init_voltage1",
+            "cpr_init_voltage2",
+            "cpr_init_voltage3",
+            "cpr_quotient1",
+            "cpr_quotient2",
+            "cpr_quotient3",
+            "cpr_ring_osc1",
+            "cpr_ring_osc2",
+            "cpr_ring_osc3",
+            "cpr_fuse_revision";
+    };
diff --git a/MAINTAINERS b/MAINTAINERS
index 1d840b89fd8a..d48febc9bdbb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15933,7 +15933,7 @@ M:	Niklas Cassel <nks@flawful.org>
 L:	linux-pm@vger.kernel.org
 L:	linux-arm-msm@vger.kernel.org
 S:	Maintained
-F:	Documentation/devicetree/bindings/power/avs/qcom,cpr.txt
+F:	Documentation/devicetree/bindings/power/avs/qcom,cpr.yaml
 F:	drivers/soc/qcom/cpr.c
 
 QUALCOMM CPUFREQ DRIVER MSM8996/APQ8096

From b2638e56c2ced2ca258d22f939c47327b189e00c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 3 Feb 2022 14:56:13 +0200
Subject: [PATCH 022/229] device property: Don't split fwnode_get_irq*() APIs
 in the code

New fwnode_get_irq_byname() landed after an unrelated function
by ordering. Move fwnode_iomap(), so fwnode_get_irq*() APIs will
go together.

No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 32 ++++++++++++++++----------------
 include/linux/property.h |  6 +++---
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/base/property.c b/drivers/base/property.c
index fc59e0f7f9cc..c0e94cce9c29 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -895,6 +895,22 @@ int device_get_phy_mode(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(device_get_phy_mode);
 
+/**
+ * fwnode_iomap - Maps the memory mapped IO for a given fwnode
+ * @fwnode:	Pointer to the firmware node
+ * @index:	Index of the IO range
+ *
+ * Returns a pointer to the mapped memory.
+ */
+void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index)
+{
+	if (IS_ENABLED(CONFIG_OF_ADDRESS) && is_of_node(fwnode))
+		return of_iomap(to_of_node(fwnode), index);
+
+	return NULL;
+}
+EXPORT_SYMBOL(fwnode_iomap);
+
 /**
  * fwnode_irq_get - Get IRQ directly from a fwnode
  * @fwnode:	Pointer to the firmware node
@@ -919,22 +935,6 @@ int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index)
 }
 EXPORT_SYMBOL(fwnode_irq_get);
 
-/**
- * fwnode_iomap - Maps the memory mapped IO for a given fwnode
- * @fwnode:	Pointer to the firmware node
- * @index:	Index of the IO range
- *
- * Returns a pointer to the mapped memory.
- */
-void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index)
-{
-	if (IS_ENABLED(CONFIG_OF_ADDRESS) && is_of_node(fwnode))
-		return of_iomap(to_of_node(fwnode), index);
-
-	return NULL;
-}
-EXPORT_SYMBOL(fwnode_iomap);
-
 /**
  * fwnode_irq_get_byname - Get IRQ from a fwnode using its name
  * @fwnode:	Pointer to the firmware node
diff --git a/include/linux/property.h b/include/linux/property.h
index 95d56a562b6a..4cd4b326941f 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -123,8 +123,6 @@ void fwnode_handle_put(struct fwnode_handle *fwnode);
 int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index);
 int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name);
 
-void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index);
-
 unsigned int device_get_child_node_count(struct device *dev);
 
 static inline bool device_property_read_bool(struct device *dev,
@@ -388,8 +386,10 @@ enum dev_dma_attr device_get_dma_attr(struct device *dev);
 const void *device_get_match_data(struct device *dev);
 
 int device_get_phy_mode(struct device *dev);
-
 int fwnode_get_phy_mode(struct fwnode_handle *fwnode);
+
+void __iomem *fwnode_iomap(struct fwnode_handle *fwnode, int index);
+
 struct fwnode_handle *fwnode_graph_get_next_endpoint(
 	const struct fwnode_handle *fwnode, struct fwnode_handle *prev);
 struct fwnode_handle *

From 4327b9eaf8a4ddd2c534e4f4ed7c949cf3d2be1e Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Wed, 16 Feb 2022 08:11:01 -0800
Subject: [PATCH 023/229] livepatch: Skip livepatch tests if ftrace cannot be
 configured

livepatch has a set of selftests that are used to validate the behavior of
the livepatching subsystem.  One of the testcases in the livepatch
testsuite is test-ftrace.sh, which among other things, validates that
livepatching gracefully fails when ftrace is disabled.  In the event that
ftrace cannot be disabled using 'sysctl kernel.ftrace_enabled=0', the test
will fail later due to it unexpectedly successfully loading the
test_klp_livepatch module.

While the livepatch selftests are careful to remove any of the livepatch
test modules between testcases to avoid this situation, ftrace may still
fail to be disabled if another trace is active on the system that was
enabled with FTRACE_OPS_FL_PERMANENT.  For example, any active BPF programs
that use trampolines will cause this test to fail due to the trampoline
being implemented with register_ftrace_direct().  The following is an
example of such a trace:

tcp_drop (1) R I D      tramp: ftrace_regs_caller+0x0/0x58
(call_direct_funcs+0x0/0x30)
        direct-->bpf_trampoline_6442550536_0+0x0/0x1000

In order to make the test more resilient to system state that is out of its
control, this patch updates set_ftrace_enabled() to detect sysctl failures,
and skip the testrun when appropriate.

Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: David Vernet <void@manifault.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220216161100.3243100-1-void@manifault.com
---
 .../testing/selftests/livepatch/functions.sh  | 22 ++++++++++++++++---
 .../selftests/livepatch/test-ftrace.sh        |  3 ++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
index 846c7ed71556..9230b869371d 100644
--- a/tools/testing/selftests/livepatch/functions.sh
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -75,9 +75,25 @@ function set_dynamic_debug() {
 }
 
 function set_ftrace_enabled() {
-	result=$(sysctl -q kernel.ftrace_enabled="$1" 2>&1 && \
-		 sysctl kernel.ftrace_enabled 2>&1)
-	echo "livepatch: $result" > /dev/kmsg
+	local can_fail=0
+	if [[ "$1" == "--fail" ]] ; then
+		can_fail=1
+		shift
+	fi
+
+	local err=$(sysctl -q kernel.ftrace_enabled="$1" 2>&1)
+	local result=$(sysctl --values kernel.ftrace_enabled)
+
+	if [[ "$result" != "$1" ]] ; then
+		if [[ $can_fail -eq 1 ]] ; then
+			echo "livepatch: $err" > /dev/kmsg
+			return
+		fi
+
+		skip "failed to set kernel.ftrace_enabled = $1"
+	fi
+
+	echo "livepatch: kernel.ftrace_enabled = $result" > /dev/kmsg
 }
 
 function cleanup() {
diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh
index 552e165512f4..825540a5194d 100755
--- a/tools/testing/selftests/livepatch/test-ftrace.sh
+++ b/tools/testing/selftests/livepatch/test-ftrace.sh
@@ -25,7 +25,8 @@ if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]]
 	die "livepatch kselftest(s) failed"
 fi
 
-set_ftrace_enabled 0
+# Check that ftrace could not get disabled when a livepatch is enabled
+set_ftrace_enabled --fail 0
 if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then
 	echo -e "FAIL\n\n"
 	die "livepatch kselftest(s) failed"

From c377d4ba86e9136b4923124ac1ed858d77020aaf Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Wed, 9 Feb 2022 21:01:17 -0800
Subject: [PATCH 024/229] cpufreq: qcom-hw: Add support for per-core-dcvs

The OSM and EPSS hardware controls the frequency of each cluster in the
system based on requests from the OS and various limiting factors, such
as input from LMH.

In most systems the vote from the OS is done using a single register per
cluster, but some systems are configured to instead take one request per
core.  In this configuration a set of consecutive registers are used for
the OS to request the frequency of each of the cores within the cluster.
The information is then aggregated in the hardware and the frequency for
the cluster is determined.

As the current implementation ends up only requesting a frequency for
the first core in each cluster and only the vote of non-idle cores are
considered it's often the case that the cluster will be clocked (much)
lower than expected.

It's possible that there are benefits of performing the per-core
requests from the OS, but more investigation of the outcome is needed
before introducing such support. As such this patch extends the request
for the cluster to be written to all the cores.

The weight of the policy's related_cpus is used to determine how many
cores, and hence consecutive registers, each cluster has.

The OS is not permitted to disable the per-core dcvs feature.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index c2cda28682a5..f9d593ff4718 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -28,6 +28,7 @@
 
 struct qcom_cpufreq_soc_data {
 	u32 reg_enable;
+	u32 reg_dcvs_ctrl;
 	u32 reg_freq_lut;
 	u32 reg_volt_lut;
 	u32 reg_current_vote;
@@ -50,6 +51,8 @@ struct qcom_cpufreq_data {
 	bool cancel_throttle;
 	struct delayed_work throttle_work;
 	struct cpufreq_policy *policy;
+
+	bool per_core_dcvs;
 };
 
 static unsigned long cpu_hw_rate, xo_rate;
@@ -102,9 +105,14 @@ static int qcom_cpufreq_hw_target_index(struct cpufreq_policy *policy,
 	struct qcom_cpufreq_data *data = policy->driver_data;
 	const struct qcom_cpufreq_soc_data *soc_data = data->soc_data;
 	unsigned long freq = policy->freq_table[index].frequency;
+	unsigned int i;
 
 	writel_relaxed(index, data->base + soc_data->reg_perf_state);
 
+	if (data->per_core_dcvs)
+		for (i = 1; i < cpumask_weight(policy->related_cpus); i++)
+			writel_relaxed(index, data->base + soc_data->reg_perf_state + i * 4);
+
 	if (icc_scaling_enabled)
 		qcom_cpufreq_set_bw(policy, freq);
 
@@ -137,10 +145,15 @@ static unsigned int qcom_cpufreq_hw_fast_switch(struct cpufreq_policy *policy,
 	struct qcom_cpufreq_data *data = policy->driver_data;
 	const struct qcom_cpufreq_soc_data *soc_data = data->soc_data;
 	unsigned int index;
+	unsigned int i;
 
 	index = policy->cached_resolved_idx;
 	writel_relaxed(index, data->base + soc_data->reg_perf_state);
 
+	if (data->per_core_dcvs)
+		for (i = 1; i < cpumask_weight(policy->related_cpus); i++)
+			writel_relaxed(index, data->base + soc_data->reg_perf_state + i * 4);
+
 	return policy->freq_table[index].frequency;
 }
 
@@ -342,6 +355,7 @@ static irqreturn_t qcom_lmh_dcvs_handle_irq(int irq, void *data)
 
 static const struct qcom_cpufreq_soc_data qcom_soc_data = {
 	.reg_enable = 0x0,
+	.reg_dcvs_ctrl = 0xbc,
 	.reg_freq_lut = 0x110,
 	.reg_volt_lut = 0x114,
 	.reg_current_vote = 0x704,
@@ -351,6 +365,7 @@ static const struct qcom_cpufreq_soc_data qcom_soc_data = {
 
 static const struct qcom_cpufreq_soc_data epss_soc_data = {
 	.reg_enable = 0x0,
+	.reg_dcvs_ctrl = 0xb0,
 	.reg_freq_lut = 0x100,
 	.reg_volt_lut = 0x200,
 	.reg_perf_state = 0x320,
@@ -481,6 +496,9 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy)
 		goto error;
 	}
 
+	if (readl_relaxed(base + data->soc_data->reg_dcvs_ctrl) & 0x1)
+		data->per_core_dcvs = true;
+
 	qcom_get_related_cpus(index, policy->cpus);
 	if (cpumask_empty(policy->cpus)) {
 		dev_err(dev, "Domain-%d failed to get related CPUs\n", index);

From 72951a77c00fb23275c8164aeee409c06b6f197c Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Thu, 24 Feb 2022 19:57:06 -0800
Subject: [PATCH 025/229] cpufreq: blocklist Qualcomm sc8280xp and sa8540p in
 cpufreq-dt-platdev

The Qualcomm sc8280xp and sa8540p platforms also uses the
qcom-cpufreq-hw driver, so add them to the cpufreq-dt-platdev driver's
blocklist.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 6b808f805eab..96de1536e1cb 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -139,9 +139,11 @@ static const struct of_device_id blocklist[] __initconst = {
 	{ .compatible = "qcom,msm8996", },
 	{ .compatible = "qcom,qcs404", },
 	{ .compatible = "qcom,sa8155p" },
+	{ .compatible = "qcom,sa8540p" },
 	{ .compatible = "qcom,sc7180", },
 	{ .compatible = "qcom,sc7280", },
 	{ .compatible = "qcom,sc8180x", },
+	{ .compatible = "qcom,sc8280xp", },
 	{ .compatible = "qcom,sdm845", },
 	{ .compatible = "qcom,sm6350", },
 	{ .compatible = "qcom,sm8150", },

From 06cc5cf16591c3b1d63af2bbc9d33a66419ced98 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 6 Jan 2022 22:46:52 +0100
Subject: [PATCH 026/229] alpha: Remove usage of the deprecated
 "pci-dma-compat.h" API

In [1], Christoph Hellwig has proposed to remove the wrappers in
include/linux/pci-dma-compat.h.

Some reasons why this API should be removed have been given by Julia
Lawall in [2].

A coccinelle script has been used to perform the needed transformation.
It can be found in [3].

[1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/
[2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/
[3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/alpha/include/asm/floppy.h |  7 ++++---
 arch/alpha/kernel/pci_iommu.c   | 12 ++++++------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h
index 8dfdb3aa1d96..588758685439 100644
--- a/arch/alpha/include/asm/floppy.h
+++ b/arch/alpha/include/asm/floppy.h
@@ -43,17 +43,18 @@ alpha_fd_dma_setup(char *addr, unsigned long size, int mode, int io)
 	static int prev_dir;
 	int dir;
 
-	dir = (mode != DMA_MODE_READ) ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE;
+	dir = (mode != DMA_MODE_READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 
 	if (bus_addr 
 	    && (addr != prev_addr || size != prev_size || dir != prev_dir)) {
 		/* different from last time -- unmap prev */
-		pci_unmap_single(isa_bridge, bus_addr, prev_size, prev_dir);
+		dma_unmap_single(&isa_bridge->dev, bus_addr, prev_size,
+				 prev_dir);
 		bus_addr = 0;
 	}
 
 	if (!bus_addr)	/* need to map it */
-		bus_addr = pci_map_single(isa_bridge, addr, size, dir);
+		bus_addr = dma_map_single(&isa_bridge->dev, addr, size, dir);
 
 	/* remember this one as prev */
 	prev_addr = addr;
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 21f9ac101324..e83a02ed5267 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -333,7 +333,7 @@ static dma_addr_t alpha_pci_map_page(struct device *dev, struct page *page,
 	struct pci_dev *pdev = alpha_gendev_to_pci(dev);
 	int dac_allowed;
 
-	BUG_ON(dir == PCI_DMA_NONE);
+	BUG_ON(dir == DMA_NONE);
 
 	dac_allowed = pdev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0; 
 	return pci_map_single_1(pdev, (char *)page_address(page) + offset, 
@@ -356,7 +356,7 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	struct pci_iommu_arena *arena;
 	long dma_ofs, npages;
 
-	BUG_ON(dir == PCI_DMA_NONE);
+	BUG_ON(dir == DMA_NONE);
 
 	if (dma_addr >= __direct_map_base
 	    && dma_addr < __direct_map_base + __direct_map_size) {
@@ -460,7 +460,7 @@ static void alpha_pci_free_coherent(struct device *dev, size_t size,
 				    unsigned long attrs)
 {
 	struct pci_dev *pdev = alpha_gendev_to_pci(dev);
-	pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
+	dma_unmap_single(&pdev->dev, dma_addr, size, DMA_BIDIRECTIONAL);
 	free_pages((unsigned long)cpu_addr, get_order(size));
 
 	DBGA2("pci_free_consistent: [%llx,%zx] from %ps\n",
@@ -639,7 +639,7 @@ static int alpha_pci_map_sg(struct device *dev, struct scatterlist *sg,
 	dma_addr_t max_dma;
 	int dac_allowed;
 
-	BUG_ON(dir == PCI_DMA_NONE);
+	BUG_ON(dir == DMA_NONE);
 
 	dac_allowed = dev ? pci_dac_dma_supported(pdev, pdev->dma_mask) : 0;
 
@@ -702,7 +702,7 @@ static int alpha_pci_map_sg(struct device *dev, struct scatterlist *sg,
 	/* Some allocation failed while mapping the scatterlist
 	   entries.  Unmap them now.  */
 	if (out > start)
-		pci_unmap_sg(pdev, start, out - start, dir);
+		dma_unmap_sg(&pdev->dev, start, out - start, dir);
 	return -ENOMEM;
 }
 
@@ -722,7 +722,7 @@ static void alpha_pci_unmap_sg(struct device *dev, struct scatterlist *sg,
 	dma_addr_t max_dma;
 	dma_addr_t fbeg, fend;
 
-	BUG_ON(dir == PCI_DMA_NONE);
+	BUG_ON(dir == DMA_NONE);
 
 	if (! alpha_mv.mv_pci_tbi)
 		return;

From ffecba83be9c7ced229b9f1d75643d5a49f820c4 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 6 Jan 2022 22:51:05 +0100
Subject: [PATCH 027/229] agp/intel: Remove usage of the deprecated
 "pci-dma-compat.h" API

In [1], Christoph Hellwig has proposed to remove the wrappers in
include/linux/pci-dma-compat.h.

Some reasons why this API should be removed have been given by Julia
Lawall in [2].

A coccinelle script has been used to perform the needed transformation.
It can be found in [3].

[1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/
[2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/
[3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/char/agp/intel-gtt.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/char/agp/intel-gtt.c b/drivers/char/agp/intel-gtt.c
index c53cc9868cd8..79a1b65527c2 100644
--- a/drivers/char/agp/intel-gtt.c
+++ b/drivers/char/agp/intel-gtt.c
@@ -111,8 +111,8 @@ static int intel_gtt_map_memory(struct page **pages,
 	for_each_sg(st->sgl, sg, num_entries, i)
 		sg_set_page(sg, pages[i], PAGE_SIZE, 0);
 
-	if (!pci_map_sg(intel_private.pcidev,
-			st->sgl, st->nents, PCI_DMA_BIDIRECTIONAL))
+	if (!dma_map_sg(&intel_private.pcidev->dev, st->sgl, st->nents,
+			DMA_BIDIRECTIONAL))
 		goto err;
 
 	return 0;
@@ -127,8 +127,8 @@ static void intel_gtt_unmap_memory(struct scatterlist *sg_list, int num_sg)
 	struct sg_table st;
 	DBG("try unmapping %lu pages\n", (unsigned long)mem->page_count);
 
-	pci_unmap_sg(intel_private.pcidev, sg_list,
-		     num_sg, PCI_DMA_BIDIRECTIONAL);
+	dma_unmap_sg(&intel_private.pcidev->dev, sg_list, num_sg,
+		     DMA_BIDIRECTIONAL);
 
 	st.sgl = sg_list;
 	st.orig_nents = st.nents = num_sg;
@@ -303,9 +303,9 @@ static int intel_gtt_setup_scratch_page(void)
 	set_pages_uc(page, 1);
 
 	if (intel_private.needs_dmar) {
-		dma_addr = pci_map_page(intel_private.pcidev, page, 0,
-				    PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
-		if (pci_dma_mapping_error(intel_private.pcidev, dma_addr)) {
+		dma_addr = dma_map_page(&intel_private.pcidev->dev, page, 0,
+					PAGE_SIZE, DMA_BIDIRECTIONAL);
+		if (dma_mapping_error(&intel_private.pcidev->dev, dma_addr)) {
 			__free_page(page);
 			return -EINVAL;
 		}
@@ -552,9 +552,9 @@ static void intel_gtt_teardown_scratch_page(void)
 {
 	set_pages_wb(intel_private.scratch_page, 1);
 	if (intel_private.needs_dmar)
-		pci_unmap_page(intel_private.pcidev,
-			       intel_private.scratch_page_dma,
-			       PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+		dma_unmap_page(&intel_private.pcidev->dev,
+			       intel_private.scratch_page_dma, PAGE_SIZE,
+			       DMA_BIDIRECTIONAL);
 	__free_page(intel_private.scratch_page);
 }
 
@@ -1412,13 +1412,13 @@ int intel_gmch_probe(struct pci_dev *bridge_pdev, struct pci_dev *gpu_pdev,
 
 	if (bridge) {
 		mask = intel_private.driver->dma_mask_size;
-		if (pci_set_dma_mask(intel_private.pcidev, DMA_BIT_MASK(mask)))
+		if (dma_set_mask(&intel_private.pcidev->dev, DMA_BIT_MASK(mask)))
 			dev_err(&intel_private.pcidev->dev,
 				"set gfx device dma mask %d-bit failed!\n",
 				mask);
 		else
-			pci_set_consistent_dma_mask(intel_private.pcidev,
-						    DMA_BIT_MASK(mask));
+			dma_set_coherent_mask(&intel_private.pcidev->dev,
+					      DMA_BIT_MASK(mask));
 	}
 
 	if (intel_gtt_init() != 0) {

From 0fb3436b4b36cf69f4544385aa2bb8c5a4913509 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 6 Jan 2022 22:51:38 +0100
Subject: [PATCH 028/229] sparc: Remove usage of the deprecated
 "pci-dma-compat.h" API

In [1], Christoph Hellwig has proposed to remove the wrappers in
include/linux/pci-dma-compat.h.

Some reasons why this API should be removed have been given by Julia
Lawall in [2].

A coccinelle script has been used to perform the needed transformation.
It can be found in [3].

[1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/
[2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/
[3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 arch/sparc/kernel/ioport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 57a72c46eddb..4e4f3d3263e4 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -309,7 +309,7 @@ arch_initcall(sparc_register_ioport);
 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 		enum dma_data_direction dir)
 {
-	if (dir != PCI_DMA_TODEVICE &&
+	if (dir != DMA_TO_DEVICE &&
 	    sparc_cpu_model == sparc_leon &&
 	    !sparc_leon3_snooping_enabled())
 		leon_flush_dcache_all();

From 8c155674d9757be855547dc4eb6bcb82d52482e7 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 6 Jan 2022 22:52:46 +0100
Subject: [PATCH 029/229] rapidio/tsi721: Remove usage of the deprecated
 "pci-dma-compat.h" API

In [1], Christoph Hellwig has proposed to remove the wrappers in
include/linux/pci-dma-compat.h.

Some reasons why this API should be removed have been given by Julia
Lawall in [2].

A coccinelle script has been used to perform the needed transformation.
It can be found in [3].

[1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/
[2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/
[3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/rapidio/devices/tsi721.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c
index 4dd31dd9feea..b3134744fb55 100644
--- a/drivers/rapidio/devices/tsi721.c
+++ b/drivers/rapidio/devices/tsi721.c
@@ -2836,17 +2836,17 @@ static int tsi721_probe(struct pci_dev *pdev,
 	}
 
 	/* Configure DMA attributes. */
-	if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
-		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+	if (dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) {
+		err = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
 		if (err) {
 			tsi_err(&pdev->dev, "Unable to set DMA mask");
 			goto err_unmap_bars;
 		}
 
-		if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)))
+		if (dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)))
 			tsi_info(&pdev->dev, "Unable to set consistent DMA mask");
 	} else {
-		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		err = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
 		if (err)
 			tsi_info(&pdev->dev, "Unable to set consistent DMA mask");
 	}

From fba09099c6e506608e05e08ac717bf34501f821b Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 6 Jan 2022 22:53:11 +0100
Subject: [PATCH 030/229] media: v4l2-pci-skeleton: Remove usage of the
 deprecated "pci-dma-compat.h" API

In [1], Christoph Hellwig has proposed to remove the wrappers in
include/linux/pci-dma-compat.h.

Some reasons why this API should be removed have been given by Julia
Lawall in [2].

A coccinelle script has been used to perform the needed transformation.
It can be found in [3].

[1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/
[2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/
[3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 samples/v4l/v4l2-pci-skeleton.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
index 3fa6582b4a68..6311b7465220 100644
--- a/samples/v4l/v4l2-pci-skeleton.c
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -766,7 +766,7 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	ret = pci_enable_device(pdev);
 	if (ret)
 		return ret;
-	ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+	ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
 	if (ret) {
 		dev_err(&pdev->dev, "no suitable DMA available.\n");
 		goto disable_pci;

From 98c27f276be85a73f0babc61c9f8128b7ef593c6 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Mon, 17 Jan 2022 09:50:10 -0800
Subject: [PATCH 031/229] NFS: simplify check for freeing cn_resp

nfs42_files_from_same_server() is called to check if freeing
cn_resp is required, just do the free.

Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e79ae4cbc395..ba117592a95b 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -180,8 +180,8 @@ retry:
 	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count,
 				nss, cnrs, sync);
 out:
-	if (!nfs42_files_from_same_server(file_in, file_out))
-		kfree(cn_resp);
+	kfree(cn_resp);
+
 	if (ret == -EAGAIN)
 		goto retry;
 	return ret;

From 8786fde8421ce755a842051f9528674a1b1f0b9a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Sat, 22 Jan 2022 20:54:52 +0000
Subject: [PATCH 032/229] Convert NFS from readpages to readahead

NFS is one of the last two users of the deprecated ->readpages aop.
This conversion looks straightforward, but I have only compile-tested
it.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/file.c          |  2 +-
 fs/nfs/nfstrace.h      |  6 +++---
 fs/nfs/read.c          | 21 +++++++++++++--------
 include/linux/nfs_fs.h |  3 +--
 4 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 76d76acbc594..4d681683d13c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -514,7 +514,7 @@ static void nfs_swap_deactivate(struct file *file)
 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
-	.readpages = nfs_readpages,
+	.readahead = nfs_readahead,
 	.set_page_dirty = __set_page_dirty_nobuffers,
 	.writepage = nfs_writepage,
 	.writepages = nfs_writepages,
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 317ce27bdc4b..4611aa3a21a4 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -889,11 +889,11 @@ TRACE_EVENT(nfs_aop_readpage_done,
 TRACE_EVENT(nfs_aop_readahead,
 		TP_PROTO(
 			const struct inode *inode,
-			struct page *page,
+			loff_t pos,
 			unsigned int nr_pages
 		),
 
-		TP_ARGS(inode, page, nr_pages),
+		TP_ARGS(inode, pos, nr_pages),
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
@@ -911,7 +911,7 @@ TRACE_EVENT(nfs_aop_readahead,
 			__entry->fileid = nfsi->fileid;
 			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
 			__entry->version = inode_peek_iversion_raw(inode);
-			__entry->offset = page_index(page) << PAGE_SHIFT;
+			__entry->offset = pos;
 			__entry->nr_pages = nr_pages;
 		),
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index eb00229c1a50..2472f962a9a2 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -290,9 +290,8 @@ static void nfs_readpage_result(struct rpc_task *task,
 }
 
 static int
-readpage_async_filler(void *data, struct page *page)
+readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
 {
-	struct nfs_readdesc *desc = data;
 	struct inode *inode = page_file_mapping(page)->host;
 	unsigned int rsize = NFS_SERVER(inode)->rsize;
 	struct nfs_page *new;
@@ -397,14 +396,16 @@ out_unlock:
 	return ret;
 }
 
-int nfs_readpages(struct file *file, struct address_space *mapping,
-		struct list_head *pages, unsigned nr_pages)
+void nfs_readahead(struct readahead_control *ractl)
 {
+	unsigned int nr_pages = readahead_count(ractl);
+	struct file *file = ractl->file;
 	struct nfs_readdesc desc;
-	struct inode *inode = mapping->host;
+	struct inode *inode = ractl->mapping->host;
+	struct page *page;
 	int ret;
 
-	trace_nfs_aop_readahead(inode, lru_to_page(pages), nr_pages);
+	trace_nfs_aop_readahead(inode, readahead_pos(ractl), nr_pages);
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
 
 	ret = -ESTALE;
@@ -422,14 +423,18 @@ int nfs_readpages(struct file *file, struct address_space *mapping,
 	nfs_pageio_init_read(&desc.pgio, inode, false,
 			     &nfs_async_read_completion_ops);
 
-	ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
+	while ((page = readahead_page(ractl)) != NULL) {
+		ret = readpage_async_filler(&desc, page);
+		put_page(page);
+		if (ret)
+			break;
+	}
 
 	nfs_pageio_complete_read(&desc.pgio);
 
 	put_nfs_open_context(desc.ctx);
 out:
 	trace_nfs_aop_readahead_done(inode, nr_pages, ret);
-	return ret;
 }
 
 int __init nfs_init_readpagecache(void)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 68f81d8d36de..333ea05e2531 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -601,8 +601,7 @@ nfs_have_writebacks(struct inode *inode)
  * linux/fs/nfs/read.c
  */
 extern int  nfs_readpage(struct file *, struct page *);
-extern int  nfs_readpages(struct file *, struct address_space *,
-		struct list_head *, unsigned);
+void nfs_readahead(struct readahead_control *);
 
 /*
  * inline functions

From b7f114edd54326f730a754547e7cfb197b5bc132 Mon Sep 17 00:00:00 2001
From: Xin Xiong <xiongx18@fudan.edu.cn>
Date: Tue, 25 Jan 2022 21:10:45 +0800
Subject: [PATCH 033/229] NFSv4.2: fix reference count leaks in
 _nfs42_proc_copy_notify()

[You don't often get email from xiongx18@fudan.edu.cn. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.]

The reference counting issue happens in two error paths in the
function _nfs42_proc_copy_notify(). In both error paths, the function
simply returns the error code and forgets to balance the refcount of
object `ctx`, bumped by get_nfs_open_context() earlier, which may
cause refcount leaks.

Fix it by balancing refcount of the `ctx` object before the function
returns in both error paths.

Signed-off-by: Xin Xiong <xiongx18@fudan.edu.cn>
Signed-off-by: Xiyu Yang <xiyuyang19@fudan.edu.cn>
Signed-off-by: Xin Tan <tanxin.ctf@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42proc.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 32129446beca..ca878d021fab 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -591,8 +591,10 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 
 	ctx = get_nfs_open_context(nfs_file_open_context(src));
 	l_ctx = nfs_get_lock_context(ctx);
-	if (IS_ERR(l_ctx))
-		return PTR_ERR(l_ctx);
+	if (IS_ERR(l_ctx)) {
+		status = PTR_ERR(l_ctx);
+		goto out;
+	}
 
 	status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
 				     FMODE_READ);
@@ -600,7 +602,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 	if (status) {
 		if (status == -EAGAIN)
 			status = -NFS4ERR_BAD_STATEID;
-		return status;
+		goto out;
 	}
 
 	status = nfs4_call_sync(src_server->client, src_server, &msg,
@@ -609,6 +611,7 @@ static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
 	if (status == -ENOTSUPP)
 		src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
 
+out:
 	put_nfs_open_context(nfs_file_open_context(src));
 	return status;
 }

From ab22e2cbbccbcf65bed9524605ac5dacb89471e8 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Sun, 30 Jan 2022 22:44:10 +0000
Subject: [PATCH 034/229] SUNRPC: remove redundant pointer plainhdr

[You don't often get email from colin.i.king@gmail.com. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.]

Pointer plainhdr is being assigned a value that is never read, the
pointer is redundant and can be removed.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/auth_gss/gss_krb5_wrap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index e95c009bb869..5f96e75f9eec 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -409,7 +409,7 @@ static u32
 gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
 		     struct xdr_buf *buf, struct page **pages)
 {
-	u8		*ptr, *plainhdr;
+	u8		*ptr;
 	time64_t	now;
 	u8		flags = 0x00;
 	__be16		*be16ptr;
@@ -426,7 +426,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
 		return GSS_S_FAILURE;
 
 	/* construct gss token header */
-	ptr = plainhdr = buf->head[0].iov_base + offset;
+	ptr = buf->head[0].iov_base + offset;
 	*ptr++ = (unsigned char) ((KG2_TOK_WRAP>>8) & 0xff);
 	*ptr++ = (unsigned char) (KG2_TOK_WRAP & 0xff);
 

From 3e17898aca293a24dae757a440a50aa63ca29671 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 13:32:45 -0500
Subject: [PATCH 035/229] NFSv4: Protect the state recovery thread against
 direct reclaim

If memory allocation triggers a direct reclaim from the state recovery
thread, then we can deadlock. Use memalloc_nofs_save/restore to ensure
that doesn't happen.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4state.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f5a62c0d999b..0f4818627ef0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
+#include <linux/sched/mm.h>
 
 #include <linux/sunrpc/clnt.h>
 
@@ -2560,9 +2561,17 @@ static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
 
 static void nfs4_state_manager(struct nfs_client *clp)
 {
+	unsigned int memflags;
 	int status = 0;
 	const char *section = "", *section_sep = "";
 
+	/*
+	 * State recovery can deadlock if the direct reclaim code tries
+	 * start NFS writeback. So ensure memory allocations are all
+	 * GFP_NOFS.
+	 */
+	memflags = memalloc_nofs_save();
+
 	/* Ensure exclusive access to NFSv4 state */
 	do {
 		trace_nfs4_state_mgr(clp);
@@ -2657,6 +2666,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
 		}
 
+		memalloc_nofs_restore(memflags);
 		nfs4_end_drain_session(clp);
 		nfs4_clear_state_manager_bit(clp);
 
@@ -2674,6 +2684,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			return;
 		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
 			return;
+		memflags = memalloc_nofs_save();
 	} while (refcount_read(&clp->cl_count) > 1 && !signalled());
 	goto out_drain;
 
@@ -2686,6 +2697,7 @@ out_error:
 			clp->cl_hostname, -status);
 	ssleep(1);
 out_drain:
+	memalloc_nofs_restore(memflags);
 	nfs4_end_drain_session(clp);
 	nfs4_clear_state_manager_bit(clp);
 }

From d7867712d81c05680beff2c9645ff1e8fa59a41d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 13:04:10 -0500
Subject: [PATCH 036/229] NFS: Charge open/lock file contexts to kmemcg

Allow kmemcg to limit the number of open/lock file contexts, in the same
way that it limits the parent file descriptors.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c   | 2 +-
 fs/nfs/inode.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 75cb1cbe4cde..fbb4a522d716 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -73,7 +73,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
-	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
 	if (ctx != NULL) {
 		ctx->duped = 0;
 		ctx->attr_gencount = nfsi->attr_gencount;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d96baa4450e3..34209abe16c5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -952,7 +952,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
 	res = __nfs_find_lock_context(ctx);
 	rcu_read_unlock();
 	if (res == NULL) {
-		new = kmalloc(sizeof(*new), GFP_KERNEL);
+		new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
 		if (new == NULL)
 			return ERR_PTR(-ENOMEM);
 		nfs_init_lock_context(new);
@@ -1030,7 +1030,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
 {
 	struct nfs_open_context *ctx;
 
-	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 	nfs_sb_active(dentry->d_sb);

From 9c00fd9acba81070e9413a468df57cf89d6ee960 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 13:42:01 -0500
Subject: [PATCH 037/229] NFSv4: Charge NFSv4 open state trackers to kmemcg

Allow kmemcg to limit the number of NFSv4 delegation, lock and open
state trackers.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/delegation.c | 2 +-
 fs/nfs/nfs4state.c  | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7c9eb679dbdb..5c97cad741a7 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -439,7 +439,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
 	struct nfs_delegation *freeme = NULL;
 	int status = 0;
 
-	delegation = kmalloc(sizeof(*delegation), GFP_NOFS);
+	delegation = kmalloc(sizeof(*delegation), GFP_KERNEL_ACCOUNT);
 	if (delegation == NULL)
 		return -ENOMEM;
 	nfs4_stateid_copy(&delegation->stateid, stateid);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0f4818627ef0..87cb864a1ba2 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -667,7 +667,7 @@ nfs4_alloc_open_state(void)
 {
 	struct nfs4_state *state;
 
-	state = kzalloc(sizeof(*state), GFP_NOFS);
+	state = kzalloc(sizeof(*state), GFP_KERNEL_ACCOUNT);
 	if (!state)
 		return NULL;
 	refcount_set(&state->count, 1);
@@ -870,14 +870,15 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	struct nfs4_lock_state *lsp;
 	struct nfs_server *server = state->owner->so_server;
 
-	lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+	lsp = kzalloc(sizeof(*lsp), GFP_KERNEL_ACCOUNT);
 	if (lsp == NULL)
 		return NULL;
 	nfs4_init_seqid_counter(&lsp->ls_seqid);
 	refcount_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
 	lsp->ls_owner = fl_owner;
-	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
+	lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id,
+						0, 0, GFP_KERNEL_ACCOUNT);
 	if (lsp->ls_seqid.owner_id < 0)
 		goto out_free;
 	INIT_LIST_HEAD(&lsp->ls_locks);

From 5c60e89e71f864033a268ed66933dad0d92f1550 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 14:03:06 -0500
Subject: [PATCH 038/229] NFSv4.2: Fix up an invalid combination of memory
 allocation flags

We should use either GFP_KERNEL or GFP_NOFS, but not both. Also strip
GFP_KERNEL_ACCOUNT down to GFP_KERNEL. This memory is shrinkable, so
does not need to be limited by kmemcg.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xattr.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 1c4d2a05b401..ad3405c64b9e 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -199,7 +199,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
 		flags = NFS4_XATTR_ENTRY_EXTVAL;
 	}
 
-	buf = kmalloc(alloclen, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+	buf = kmalloc(alloclen, GFP_KERNEL);
 	if (buf == NULL)
 		return NULL;
 	entry = (struct nfs4_xattr_entry *)buf;
@@ -213,7 +213,7 @@ nfs4_xattr_alloc_entry(const char *name, const void *value,
 
 
 	if (flags & NFS4_XATTR_ENTRY_EXTVAL) {
-		valp = kvmalloc(len, GFP_KERNEL_ACCOUNT | GFP_NOFS);
+		valp = kvmalloc(len, GFP_KERNEL);
 		if (valp == NULL) {
 			kfree(buf);
 			return NULL;
@@ -289,8 +289,7 @@ nfs4_xattr_alloc_cache(void)
 {
 	struct nfs4_xattr_cache *cache;
 
-	cache = kmem_cache_alloc(nfs4_xattr_cache_cachep,
-	    GFP_KERNEL_ACCOUNT | GFP_NOFS);
+	cache = kmem_cache_alloc(nfs4_xattr_cache_cachep, GFP_KERNEL);
 	if (cache == NULL)
 		return NULL;
 

From da48f267f90d9dc9f930fd9a67753643657b404f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 13:57:38 -0500
Subject: [PATCH 039/229] NFS: Convert GFP_NOFS to GFP_KERNEL

Assume that sections that should not re-enter the filesystem are already
protected with memalloc_nofs_save/restore call, so relax those GFP_NOFS
instances which might be used by other contexts.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c     |  6 +++---
 fs/nfs/nfs4proc.c  | 15 +++++++--------
 fs/nfs/nfs4state.c |  2 +-
 fs/nfs/pnfs.c      |  4 ++--
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 34209abe16c5..8cf29c6cd9f9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1583,7 +1583,7 @@ struct nfs_fattr *nfs_alloc_fattr(void)
 {
 	struct nfs_fattr *fattr;
 
-	fattr = kmalloc(sizeof(*fattr), GFP_NOFS);
+	fattr = kmalloc(sizeof(*fattr), GFP_KERNEL);
 	if (fattr != NULL) {
 		nfs_fattr_init(fattr);
 		fattr->label = NULL;
@@ -1599,7 +1599,7 @@ struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server)
 	if (!fattr)
 		return NULL;
 
-	fattr->label = nfs4_label_alloc(server, GFP_NOFS);
+	fattr->label = nfs4_label_alloc(server, GFP_KERNEL);
 	if (IS_ERR(fattr->label)) {
 		kfree(fattr);
 		return NULL;
@@ -1613,7 +1613,7 @@ struct nfs_fh *nfs_alloc_fhandle(void)
 {
 	struct nfs_fh *fh;
 
-	fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS);
+	fh = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
 	if (fh != NULL)
 		fh->size = 0;
 	return fh;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e0db6c27619..b3793b82a5e7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5904,7 +5904,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		buflen = server->rsize;
 
 	npages = DIV_ROUND_UP(buflen, PAGE_SIZE) + 1;
-	pages = kmalloc_array(npages, sizeof(struct page *), GFP_NOFS);
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
@@ -6609,7 +6609,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
 	};
 	int status = 0;
 
-	data = kzalloc(sizeof(*data), GFP_NOFS);
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
 
@@ -6797,7 +6797,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	struct nfs4_state *state = lsp->ls_state;
 	struct inode *inode = state->inode;
 
-	p = kzalloc(sizeof(*p), GFP_NOFS);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (p == NULL)
 		return NULL;
 	p->arg.fh = NFS_FH(inode);
@@ -7202,8 +7202,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		task_setup_data.flags |= RPC_TASK_MOVEABLE;
 
 	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
-			fl->fl_u.nfs4_fl.owner,
-			recovery_type == NFS_LOCK_NEW ? GFP_KERNEL : GFP_NOFS);
+				   fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
 	if (IS_SETLKW(cmd))
@@ -7626,7 +7625,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 	if (server->nfs_client->cl_mvops->minor_version != 0)
 		return;
 
-	data = kmalloc(sizeof(*data), GFP_NOFS);
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return;
 	data->lsp = lsp;
@@ -9291,7 +9290,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
 		goto out_err;
 
 	ret = ERR_PTR(-ENOMEM);
-	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	calldata = kzalloc(sizeof(*calldata), GFP_KERNEL);
 	if (calldata == NULL)
 		goto out_put_clp;
 	nfs4_init_sequence(&calldata->args, &calldata->res, 0, is_privileged);
@@ -10222,7 +10221,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
 		&task_setup.rpc_client, &msg);
 
 	dprintk("NFS call  free_stateid %p\n", stateid);
-	data = kmalloc(sizeof(*data), GFP_NOFS);
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 	data->server = server;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 87cb864a1ba2..58054dfdf2b0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -821,7 +821,7 @@ static void __nfs4_close(struct nfs4_state *state,
 
 void nfs4_close_state(struct nfs4_state *state, fmode_t fmode)
 {
-	__nfs4_close(state, fmode, GFP_NOFS, 0);
+	__nfs4_close(state, fmode, GFP_KERNEL, 0);
 }
 
 void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 7c9090a28e5c..f089e11fd001 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1233,7 +1233,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
 	int status = 0;
 
 	*pcred = NULL;
-	lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
 		spin_lock(&ino->i_lock);
@@ -3250,7 +3250,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	struct nfs4_threshold *thp;
 
-	thp = kzalloc(sizeof(*thp), GFP_NOFS);
+	thp = kzalloc(sizeof(*thp), GFP_KERNEL);
 	if (!thp) {
 		dprintk("%s mdsthreshold allocation failed\n", __func__);
 		return NULL;

From 61345a42a2ff8b0539faf545a5ce17b6c339db38 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 14:16:55 -0500
Subject: [PATCH 040/229] NFSv4/flexfiles: Convert GFP_NOFS to GFP_KERNEL

Assume that the higher layers will have set memalloc_nofs_save/restore
as appropriate.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 17 +++++++++--------
 fs/nfs/nfs42proc.c                     |  2 +-
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index a553d59afa8b..e28f2177afb7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(inode, GFP_NOIO);
+		pnfs_report_layoutstat(inode, GFP_KERNEL);
 }
 
 static void
@@ -900,7 +900,7 @@ retry:
 						   req->wb_bytes,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_KERNEL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -959,7 +959,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 						   req->wb_bytes,
 						   IOMODE_RW,
 						   false,
-						   GFP_NOFS);
+						   GFP_KERNEL);
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -1258,7 +1258,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
-				       GFP_NOIO);
+				       GFP_KERNEL);
 
 	switch (status) {
 	case NFS4ERR_DELAY:
@@ -1973,7 +1973,7 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
 	struct inode *inode = lseg->pls_layout->plh_inode;
 	struct pnfs_commit_array *array, *new;
 
-	new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+	new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_KERNEL);
 	if (new) {
 		spin_lock(&inode->i_lock);
 		array = pnfs_add_commit_array(fl_cinfo, new, lseg);
@@ -2192,8 +2192,8 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
 	if (list_empty(&head))
 		return;
 
-	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX,
-			sizeof(*errors), GFP_NOFS);
+	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
+			       GFP_KERNEL);
 	if (errors != NULL) {
 		const struct nfs4_ff_layout_ds_err *pos;
 		size_t n = 0;
@@ -2444,7 +2444,8 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 	const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
 
 	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
-	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
+	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
+				      GFP_KERNEL);
 	if (!args->devinfo)
 		return -ENOMEM;
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ca878d021fab..30ea1cbd305b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1017,7 +1017,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
 		return -EOPNOTSUPP;
 	if (n > NFS42_LAYOUTERROR_MAX)
 		return -EINVAL;
-	data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS);
+	data = nfs42_alloc_layouterror_data(lseg, GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 	for (i = 0; i < n; i++) {

From 4fb547be355d4af349681ba4c3bab81d99f4f774 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 14:19:15 -0500
Subject: [PATCH 041/229] NFSv4.2/copyoffload: Convert GFP_NOFS to GFP_KERNEL

There doesn't seem to be any reason why the copy offload code can't use
GFP_KERNEL. It can't get called by direct reclaim.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/callback_proc.c |  2 +-
 fs/nfs/nfs42proc.c     | 10 +++++-----
 fs/nfs/nfs4file.c      |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c343666d9a42..39d1ec870d90 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -710,7 +710,7 @@ __be32 nfs4_callback_offload(void *data, void *dummy,
 	struct nfs4_copy_state *copy, *tmp_copy;
 	bool found = false;
 
-	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
 	if (!copy)
 		return htonl(NFS4ERR_SERVERFAULT);
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 30ea1cbd305b..882bf84484ac 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -181,7 +181,7 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 	struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
 	struct nfs_open_context *src_ctx = nfs_file_open_context(src);
 
-	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS);
+	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
 	if (!copy)
 		return -ENOMEM;
 
@@ -254,7 +254,7 @@ static int process_copy_commit(struct file *dst, loff_t pos_dst,
 	struct nfs_commitres cres;
 	int status = -ENOMEM;
 
-	cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+	cres.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
 	if (!cres.verf)
 		goto out;
 
@@ -357,7 +357,7 @@ static ssize_t _nfs42_proc_copy(struct file *src,
 	res->commit_res.verf = NULL;
 	if (args->sync) {
 		res->commit_res.verf =
-			kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS);
+			kzalloc(sizeof(struct nfs_writeverf), GFP_KERNEL);
 		if (!res->commit_res.verf)
 			return -ENOMEM;
 	}
@@ -552,7 +552,7 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
 	if (!(dst_server->caps & NFS_CAP_OFFLOAD_CANCEL))
 		return -EOPNOTSUPP;
 
-	data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_NOFS);
+	data = kzalloc(sizeof(struct nfs42_offloadcancel_data), GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
 
@@ -629,7 +629,7 @@ int nfs42_proc_copy_notify(struct file *src, struct file *dst,
 	if (!(src_server->caps & NFS_CAP_COPY_NOTIFY))
 		return -EOPNOTSUPP;
 
-	args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS);
+	args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_KERNEL);
 	if (args == NULL)
 		return -ENOMEM;
 
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index ba117592a95b..d258933cf8c8 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -165,7 +165,7 @@ retry:
 		if (sync)
 			return -EOPNOTSUPP;
 		cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res),
-				GFP_NOFS);
+				  GFP_KERNEL);
 		if (unlikely(cn_resp == NULL))
 			return -ENOMEM;
 
@@ -339,7 +339,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
 
 	res = ERR_PTR(-ENOMEM);
 	len = strlen(SSC_READ_NAME_BODY) + 16;
-	read_name = kzalloc(len, GFP_NOFS);
+	read_name = kzalloc(len, GFP_KERNEL);
 	if (read_name == NULL)
 		goto out;
 	snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);

From 0adc87940618648b3dcccc819c20068bd6b4ec93 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 12:49:44 -0500
Subject: [PATCH 042/229] SUNRPC: Convert GFP_NOFS to GFP_KERNEL

The sections which should not re-enter the filesystem are already
protected with memalloc_nofs_save/restore calls, so it is better to use
GFP_KERNEL in these calls to allow better performance for synchronous
RPC calls.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/auth_unix.c | 2 +-
 net/sunrpc/clnt.c      | 2 +-
 net/sunrpc/rpcb_clnt.c | 4 ++--
 net/sunrpc/sched.c     | 4 ++--
 net/sunrpc/xprt.c      | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index e7df1f782b2e..3600d8641644 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -43,7 +43,7 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS);
+	struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_KERNEL);
 
 	rpcauth_init_cred(ret, acred, auth, &unix_credops);
 	ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index c83fe618767c..97165a545cb3 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2793,7 +2793,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 		return -EINVAL;
 	}
 
-	data = kmalloc(sizeof(*data), GFP_NOFS);
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 	data->xps = xprt_switch_get(xps);
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 647b323cc1d5..0fdeb8666bfd 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -714,7 +714,7 @@ void rpcb_getport_async(struct rpc_task *task)
 		goto bailout_nofree;
 	}
 
-	map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS);
+	map = kzalloc(sizeof(struct rpcbind_args), GFP_KERNEL);
 	if (!map) {
 		status = -ENOMEM;
 		goto bailout_release_client;
@@ -730,7 +730,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	case RPCBVERS_4:
 	case RPCBVERS_3:
 		map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
-		map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS);
+		map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
 		if (!map->r_addr) {
 			status = -ENOMEM;
 			goto bailout_free_args;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index e2c835482791..52769b883c0a 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1021,7 +1021,7 @@ int rpc_malloc(struct rpc_task *task)
 	struct rpc_rqst *rqst = task->tk_rqstp;
 	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpc_buffer *buf;
-	gfp_t gfp = GFP_NOFS;
+	gfp_t gfp = GFP_KERNEL;
 
 	if (RPC_IS_SWAPPER(task))
 		gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -1095,7 +1095,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 static struct rpc_task *
 rpc_alloc_task(void)
 {
-	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_KERNEL);
 }
 
 /*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a02de2bddb28..9f0025e0742c 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1692,7 +1692,7 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
 		goto out;
 	++xprt->num_reqs;
 	spin_unlock(&xprt->reserve_lock);
-	req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS);
+	req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
 	spin_lock(&xprt->reserve_lock);
 	if (req != NULL)
 		goto out;

From 4c2883e77c5f30d34d04d3c9731988767eb9e898 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 14:43:09 -0500
Subject: [PATCH 043/229] SUNRPC/auth_gss: Convert GFP_NOFS to GFP_KERNEL

Assume that the upper layers have set memalloc_nofs_save/restore as
appropriate.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/auth_gss/auth_gss.c          | 22 +++++++++++-----------
 net/sunrpc/auth_gss/auth_gss_internal.h |  2 +-
 net/sunrpc/auth_gss/gss_krb5_crypto.c   | 10 +++++-----
 net/sunrpc/auth_gss/gss_krb5_seqnum.c   |  4 ++--
 4 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 5f42aa5fc612..affd64a54f02 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -145,7 +145,7 @@ gss_alloc_context(void)
 {
 	struct gss_cl_ctx *ctx;
 
-	ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		ctx->gc_proc = RPC_GSS_PROC_DATA;
 		ctx->gc_seq = 1;	/* NetApp 6.4R1 doesn't accept seq. no. 0 */
@@ -208,7 +208,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct
 		p = ERR_PTR(-EFAULT);
 		goto err;
 	}
-	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_NOFS);
+	ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx, NULL, GFP_KERNEL);
 	if (ret < 0) {
 		trace_rpcgss_import_ctx(ret);
 		p = ERR_PTR(ret);
@@ -510,7 +510,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
 	int vers;
 	int err = -ENOMEM;
 
-	gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
+	gss_msg = kzalloc(sizeof(*gss_msg), GFP_KERNEL);
 	if (gss_msg == NULL)
 		goto err;
 	vers = get_pipe_version(gss_auth->net);
@@ -526,7 +526,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
 	gss_msg->auth = gss_auth;
 	kref_get(&gss_auth->kref);
 	if (service_name) {
-		gss_msg->service_name = kstrdup_const(service_name, GFP_NOFS);
+		gss_msg->service_name = kstrdup_const(service_name, GFP_KERNEL);
 		if (!gss_msg->service_name) {
 			err = -ENOMEM;
 			goto err_put_pipe_version;
@@ -702,7 +702,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	if (mlen > MSG_BUF_MAXSIZE)
 		goto out;
 	err = -ENOMEM;
-	buf = kmalloc(mlen, GFP_NOFS);
+	buf = kmalloc(mlen, GFP_KERNEL);
 	if (!buf)
 		goto out;
 
@@ -1218,7 +1218,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
 	struct gss_cred *new;
 
 	/* Make a copy of the cred so that we can reference count it */
-	new = kzalloc(sizeof(*gss_cred), GFP_NOFS);
+	new = kzalloc(sizeof(*gss_cred), GFP_KERNEL);
 	if (new) {
 		struct auth_cred acred = {
 			.cred = gss_cred->gc_base.cr_cred,
@@ -1341,7 +1341,7 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS);
+	return rpcauth_lookup_credcache(auth, acred, flags, GFP_KERNEL);
 }
 
 static struct rpc_cred *
@@ -1667,7 +1667,7 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr)
 	if (!p)
 		goto validate_failed;
 
-	seq = kmalloc(4, GFP_NOFS);
+	seq = kmalloc(4, GFP_KERNEL);
 	if (!seq)
 		goto validate_failed;
 	*seq = cpu_to_be32(task->tk_rqstp->rq_seqno);
@@ -1777,11 +1777,11 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
 	rqstp->rq_enc_pages
 		= kmalloc_array(rqstp->rq_enc_pages_num,
 				sizeof(struct page *),
-				GFP_NOFS);
+				GFP_KERNEL);
 	if (!rqstp->rq_enc_pages)
 		goto out;
 	for (i=0; i < rqstp->rq_enc_pages_num; i++) {
-		rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS);
+		rqstp->rq_enc_pages[i] = alloc_page(GFP_KERNEL);
 		if (rqstp->rq_enc_pages[i] == NULL)
 			goto out_free;
 	}
@@ -1985,7 +1985,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
 	if (offset + len > rcv_buf->len)
 		goto unwrap_failed;
 	mic.len = len;
-	mic.data = kmalloc(len, GFP_NOFS);
+	mic.data = kmalloc(len, GFP_KERNEL);
 	if (!mic.data)
 		goto unwrap_failed;
 	if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
index f6d9631bd9d0..c53b329092d4 100644
--- a/net/sunrpc/auth_gss/auth_gss_internal.h
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
 	if (unlikely(q > end || q < p))
 		return ERR_PTR(-EFAULT);
 	if (len) {
-		dest->data = kmemdup(p, len, GFP_NOFS);
+		dest->data = kmemdup(p, len, GFP_KERNEL);
 		if (unlikely(dest->data == NULL))
 			return ERR_PTR(-ENOMEM);
 	} else
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 634b6c6e0dcb..3ea58175e159 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -161,7 +161,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
 		return GSS_S_FAILURE;
 	}
 
-	checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+	checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL);
 	if (checksumdata == NULL)
 		return GSS_S_FAILURE;
 
@@ -169,7 +169,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
 	if (IS_ERR(tfm))
 		goto out_free_cksum;
 
-	req = ahash_request_alloc(tfm, GFP_NOFS);
+	req = ahash_request_alloc(tfm, GFP_KERNEL);
 	if (!req)
 		goto out_free_ahash;
 
@@ -257,7 +257,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
 		return GSS_S_FAILURE;
 	}
 
-	checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS);
+	checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL);
 	if (!checksumdata)
 		return GSS_S_FAILURE;
 
@@ -265,7 +265,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
 	if (IS_ERR(tfm))
 		goto out_free_cksum;
 
-	req = ahash_request_alloc(tfm, GFP_NOFS);
+	req = ahash_request_alloc(tfm, GFP_KERNEL);
 	if (!req)
 		goto out_free_ahash;
 
@@ -554,7 +554,7 @@ gss_krb5_cts_crypt(struct crypto_sync_skcipher *cipher, struct xdr_buf *buf,
 		WARN_ON(0);
 		return -ENOMEM;
 	}
-	data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS);
+	data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
 
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index fb117817ff5d..3200b971a814 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -49,7 +49,7 @@ krb5_make_seq_num(struct krb5_ctx *kctx,
 	unsigned char *plain;
 	s32 code;
 
-	plain = kmalloc(8, GFP_NOFS);
+	plain = kmalloc(8, GFP_KERNEL);
 	if (!plain)
 		return -ENOMEM;
 
@@ -80,7 +80,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
 
 	dprintk("RPC:       krb5_get_seq_num:\n");
 
-	plain = kmalloc(8, GFP_NOFS);
+	plain = kmalloc(8, GFP_KERNEL);
 	if (!plain)
 		return -ENOMEM;
 

From 46442b850e5b3d846c8c82d251e47990dbd6457d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 29 Jan 2022 14:44:38 -0500
Subject: [PATCH 044/229] SUNRPC/xprtrdma: Convert GFP_NOFS to GFP_KERNEL

Assume that the upper layers have set memalloc_nofs_save/restore as
appropriate.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c | 2 +-
 net/sunrpc/xprtrdma/verbs.c    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 515dd7a66a04..3fcd8e1b2550 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -130,7 +130,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 	if (IS_ERR(frmr))
 		goto out_mr_err;
 
-	sg = kmalloc_array(depth, sizeof(*sg), GFP_NOFS);
+	sg = kmalloc_array(depth, sizeof(*sg), GFP_KERNEL);
 	if (!sg)
 		goto out_list_err;
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 7b5fce2faa10..2fbe9aaeec34 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -373,7 +373,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_ep *ep;
 	int rc;
 
-	ep = kzalloc(sizeof(*ep), GFP_NOFS);
+	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 	if (!ep)
 		return -ENOTCONN;
 	ep->re_xprt = &r_xprt->rx_xprt;
@@ -746,7 +746,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
 		struct rpcrdma_mr *mr;
 		int rc;
 
-		mr = kzalloc(sizeof(*mr), GFP_NOFS);
+		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 		if (!mr)
 			break;
 

From 43245eca6e670ebf65908b549641c1460a9cc944 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 2 Feb 2022 17:55:02 -0500
Subject: [PATCH 045/229] NFSv4.1 support for NFS4_RESULT_PRESERVER_UNLINKED

In 4.1+, the server is allowed to set a flag
NFS4_RESULT_PRESERVE_UNLINKED in reply to the OPEN, that tells
the client that it does not need to do a silly rename of an
opened file when it's being removed.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c              | 10 ++++++++--
 fs/nfs/nfs4proc.c         |  2 ++
 include/linux/nfs_fs.h    |  1 +
 include/uapi/linux/nfs4.h |  1 +
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fbb4a522d716..8b190c8e4a45 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1419,7 +1419,12 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 	if (flags & LOOKUP_REVAL)
 		goto out_force;
 out:
-	return (inode->i_nlink == 0) ? -ESTALE : 0;
+	if (inode->i_nlink > 0 ||
+	    (inode->i_nlink == 0 &&
+	     test_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(inode)->flags)))
+		return 0;
+	else
+		return -ESTALE;
 out_force:
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -2330,7 +2335,8 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	trace_nfs_unlink_enter(dir, dentry);
 	spin_lock(&dentry->d_lock);
-	if (d_count(dentry) > 1) {
+	if (d_count(dentry) > 1 && !test_bit(NFS_INO_PRESERVE_UNLINKED,
+					     &NFS_I(d_inode(dentry))->flags)) {
 		spin_unlock(&dentry->d_lock);
 		/* Start asynchronous writeout of the inode */
 		write_inode_now(d_inode(dentry), 0);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b3793b82a5e7..73a9b6de666c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3050,6 +3050,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
 	if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
 		set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
+	if (opendata->o_res.rflags & NFS4_OPEN_RESULT_PRESERVE_UNLINKED)
+		set_bit(NFS_INO_PRESERVE_UNLINKED, &NFS_I(state->inode)->flags);
 
 	dentry = opendata->dentry;
 	if (d_really_is_negative(dentry)) {
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 333ea05e2531..0e79dbbc759a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -277,6 +277,7 @@ struct nfs4_copy_state {
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
+#define NFS_INO_PRESERVE_UNLINKED (4)		/* preserve file if removed while open */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
 #define NFS_INO_FORCE_READDIR	(7)		/* force readdirplus */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index 800bb0ffa6e6..1d2043708bf1 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -45,6 +45,7 @@
 
 #define NFS4_OPEN_RESULT_CONFIRM		0x0002
 #define NFS4_OPEN_RESULT_LOCKTYPE_POSIX		0x0004
+#define NFS4_OPEN_RESULT_PRESERVE_UNLINKED	0x0008
 #define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK	0x0020
 
 #define NFS4_SHARE_ACCESS_MASK	0x000F

From 50c790a0b69bdc420f00f30bdf348d6c90194c78 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 9 Feb 2022 09:07:01 -0500
Subject: [PATCH 046/229] NFSv4: use unique client identifiers in network
 namespaces

In order to differentiate client state, assign a random uuid to the
uniquifing portion of the client identifier when a network namespace is
created.  Containers may still override this value if they wish to maintain
stable client identifiers by writing to /sys/fs/nfs/net/client/identifier,
either by udev rules or other means.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/sysfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index a6f740366963..886ed1eec2e5 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -151,6 +151,18 @@ static struct kobj_type nfs_netns_client_type = {
 	.namespace = nfs_netns_client_namespace,
 };
 
+static void assign_unique_clientid(struct nfs_netns_client *clp)
+{
+	unsigned char client_uuid[16];
+	char *uuid_str = kmalloc(UUID_STRING_LEN + 1, GFP_KERNEL);
+
+	if (uuid_str) {
+		generate_random_uuid(client_uuid);
+		sprintf(uuid_str, "%pU", client_uuid);
+		rcu_assign_pointer(clp->identifier, uuid_str);
+	}
+}
+
 static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
 		struct net *net)
 {
@@ -158,6 +170,8 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (p) {
+		if (net != &init_net)
+			assign_unique_clientid(p);
 		p->net = net;
 		p->kobject.kset = nfs_client_kset;
 		if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type,

From 88a6099fc3274a27814d26dd688fdc5cd7a480ee Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 9 Feb 2022 13:22:48 -0500
Subject: [PATCH 047/229] NFS: Replace last uses of NFS_INO_REVAL_PAGECACHE

Now that we have more fine grained attribute revalidation, let's just
get rid of NFS_INO_REVAL_PAGECACHE.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c         | 24 +++++++++++-------------
 fs/nfs/write.c         |  2 +-
 include/linux/nfs_fs.h |  8 +++-----
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8cf29c6cd9f9..3adf8b4a0079 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -236,19 +236,17 @@ static void nfs_zap_caches_locked(struct inode *inode)
 	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 	nfsi->attrtimeo_timestamp = jiffies;
 
-	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
-		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_DATA
-					| NFS_INO_INVALID_ACCESS
-					| NFS_INO_INVALID_ACL
-					| NFS_INO_INVALID_XATTR
-					| NFS_INO_REVAL_PAGECACHE);
-	} else
-		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
-					| NFS_INO_INVALID_ACCESS
-					| NFS_INO_INVALID_ACL
-					| NFS_INO_INVALID_XATTR
-					| NFS_INO_REVAL_PAGECACHE);
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+						     NFS_INO_INVALID_DATA |
+						     NFS_INO_INVALID_ACCESS |
+						     NFS_INO_INVALID_ACL |
+						     NFS_INO_INVALID_XATTR);
+	else
+		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR |
+						     NFS_INO_INVALID_ACCESS |
+						     NFS_INO_INVALID_ACL |
+						     NFS_INO_INVALID_XATTR);
 	nfs_zap_label_cache_locked(nfsi);
 }
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 987a187bd39a..f88b0eb9b18e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -306,7 +306,7 @@ static void nfs_set_pageerror(struct address_space *mapping)
 	/* Force file size revalidation */
 	spin_lock(&inode->i_lock);
 	nfs_set_cache_invalid(inode, NFS_INO_REVAL_FORCED |
-					     NFS_INO_REVAL_PAGECACHE |
+					     NFS_INO_INVALID_CHANGE |
 					     NFS_INO_INVALID_SIZE);
 	spin_unlock(&inode->i_lock);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0e79dbbc759a..ce3128e4bffa 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -356,11 +356,9 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&inode->i_lock);
-	nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE
-		| NFS_INO_INVALID_ACCESS
-		| NFS_INO_INVALID_ACL
-		| NFS_INO_INVALID_CHANGE
-		| NFS_INO_INVALID_CTIME;
+	nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
+				NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME |
+				NFS_INO_INVALID_SIZE;
 	if (S_ISDIR(inode->i_mode))
 		nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	spin_unlock(&inode->i_lock);

From 41e97b7f8a15d15da03ca15e6ff7b9b7ab7f588c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 9 Feb 2022 13:26:19 -0500
Subject: [PATCH 048/229] NFS: Remove unused flag NFS_INO_REVAL_PAGECACHE

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c         | 5 ++---
 fs/nfs/nfstrace.h      | 1 -
 include/linux/nfs_fs.h | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3adf8b4a0079..f9fc506ebb29 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -203,14 +203,13 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
 				   NFS_INO_INVALID_OTHER |
 				   NFS_INO_INVALID_XATTR);
 		flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
-	} else if (flags & NFS_INO_REVAL_PAGECACHE)
-		flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE;
+	}
 
 	if (!nfs_has_xattr_cache(nfsi))
 		flags &= ~NFS_INO_INVALID_XATTR;
 	if (flags & NFS_INO_INVALID_DATA)
 		nfs_fscache_invalidate(inode, 0);
-	flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED);
+	flags &= ~NFS_INO_REVAL_FORCED;
 
 	nfsi->cache_validity |= flags;
 
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 4611aa3a21a4..45a310b586ce 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -21,7 +21,6 @@
 			{ NFS_INO_INVALID_ATIME, "INVALID_ATIME" }, \
 			{ NFS_INO_INVALID_ACCESS, "INVALID_ACCESS" }, \
 			{ NFS_INO_INVALID_ACL, "INVALID_ACL" }, \
-			{ NFS_INO_REVAL_PAGECACHE, "REVAL_PAGECACHE" }, \
 			{ NFS_INO_REVAL_FORCED, "REVAL_FORCED" }, \
 			{ NFS_INO_INVALID_LABEL, "INVALID_LABEL" }, \
 			{ NFS_INO_INVALID_CHANGE, "INVALID_CHANGE" }, \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index ce3128e4bffa..72a732a5103c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -247,7 +247,6 @@ struct nfs4_copy_state {
 #define NFS_INO_INVALID_ATIME	BIT(2)		/* cached atime is invalid */
 #define NFS_INO_INVALID_ACCESS	BIT(3)		/* cached access cred invalid */
 #define NFS_INO_INVALID_ACL	BIT(4)		/* cached acls are invalid */
-#define NFS_INO_REVAL_PAGECACHE	BIT(5)		/* must revalidate pagecache */
 #define NFS_INO_REVAL_FORCED	BIT(6)		/* force revalidation ignoring a delegation */
 #define NFS_INO_INVALID_LABEL	BIT(7)		/* cached label is invalid */
 #define NFS_INO_INVALID_CHANGE	BIT(8)		/* cached change is invalid */

From b622ffe1d9ecbac71f0cddb52ff0831efdf8fb83 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 18:20:38 -0500
Subject: [PATCH 049/229] NFS: NFSv2/v3 clients should never be setting
 NFS_CAP_XATTR

Ensure that we always initialise the 'xattr_support' field in struct
nfs_fsinfo, so that nfs_server_set_fsinfo() doesn't declare our NFSv2/v3
client to be capable of supporting the NFSv4.2 xattr protocol by setting
the NFS_CAP_XATTR capability.

This configuration can cause nfs_do_access() to set access mode bits
that are unsupported by the NFSv3 ACCESS call, which may confuse
spec-compliant servers.

Reported-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: b78ef845c35d ("NFSv4.2: query the server for extended attribute support")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs3xdr.c | 1 +
 fs/nfs/proc.c    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9274c9c5efea..54a1d21cbcc6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2228,6 +2228,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr,
 	/* ignore properties */
 	result->lease_time = 0;
 	result->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+	result->xattr_support = 0;
 	return 0;
 }
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 73dcaa99fa9b..e3570c656b0f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -92,6 +92,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
 	info->maxfilesize = 0x7FFFFFFF;
 	info->lease_time = 0;
 	info->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED;
+	info->xattr_support = 0;
 	return 0;
 }
 

From f1ec501d08b78fd52e56124519e3e6cdecbfc16f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 23 Feb 2022 15:46:20 -0500
Subject: [PATCH 050/229] NFS: Remove unnecessary XATTR cache invalidation in
 nfs_fhget()

We should never expect the 'xattr_cache' to be non-null in that case,
hence nfs_set_cache_invalid() is just going to optimise it away.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f9fc506ebb29..7cecabf57b95 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -561,8 +561,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_gid = fattr->gid;
 		else if (fattr_supported & NFS_ATTR_FATTR_GROUP)
 			nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER);
-		if (nfs_server_capable(inode, NFS_CAP_XATTR))
-			nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
 		if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
 			inode->i_blocks = fattr->du.nfs2.blocks;
 		else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED &&

From 84631f84ac95b6ff6f08a41ffba1f93eaab4e9c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 23 Feb 2022 15:43:26 -0500
Subject: [PATCH 051/229] NFS: Clean up NFSv4.2 xattrs

Add a helper for the xattr mask so that we can get rid of the inlined
ifdefs.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c      |  7 ++-----
 fs/nfs/internal.h | 14 ++++++++++++++
 fs/nfs/nfs4proc.c |  9 ++-------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8b190c8e4a45..ebddc736eac2 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2995,11 +2995,8 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
 	/*
 	 * Determine which access bits we want to ask for...
 	 */
-	cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND;
-	if (nfs_server_capable(inode, NFS_CAP_XATTR)) {
-		cache.mask |= NFS_ACCESS_XAREAD | NFS_ACCESS_XAWRITE |
-		    NFS_ACCESS_XALIST;
-	}
+	cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND |
+		     nfs_access_xattr_mask(NFS_SERVER(inode));
 	if (S_ISDIR(inode->i_mode))
 		cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP;
 	else
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2de7c56a1fbe..b5398af53c7f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -388,6 +388,20 @@ int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t,
 int nfs_rename(struct user_namespace *, struct inode *, struct dentry *,
 	       struct inode *, struct dentry *, unsigned int);
 
+#ifdef CONFIG_NFS_V4_2
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+	if (!(server->caps & NFS_CAP_XATTR))
+		return 0;
+	return NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE | NFS4_ACCESS_XALIST;
+}
+#else
+static inline __u32 nfs_access_xattr_mask(const struct nfs_server *server)
+{
+	return 0;
+}
+#endif
+
 /* file.c */
 int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync);
 loff_t nfs_file_llseek(struct file *, loff_t, int);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 73a9b6de666c..8b875355824b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1392,13 +1392,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	case NFS4_OPEN_CLAIM_FH:
 		p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY |
 				  NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE |
-				  NFS4_ACCESS_EXECUTE;
-#ifdef CONFIG_NFS_V4_2
-		if (!(server->caps & NFS_CAP_XATTR))
-			break;
-		p->o_arg.access |= NFS4_ACCESS_XAREAD | NFS4_ACCESS_XAWRITE |
-				   NFS4_ACCESS_XALIST;
-#endif
+				  NFS4_ACCESS_EXECUTE |
+				  nfs_access_xattr_mask(server);
 	}
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
 	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);

From 6c984083ec2453dfd3fcf98f392f34500c73e3f2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 15 Feb 2022 15:58:38 -0500
Subject: [PATCH 052/229] NFS: Use of mapping_set_error() results in spurious
 errors

The use of mapping_set_error() in conjunction with calls to
filemap_check_errors() is problematic because every error gets reported
as either an EIO or an ENOSPC by filemap_check_errors() in functions
such as filemap_write_and_wait() or filemap_write_and_wait_range().
In almost all cases, we prefer to use the more nuanced wb errors.

Fixes: b8946d7bfb94 ("NFS: Revalidate the file mapping on all fatal writeback errors")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/write.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f88b0eb9b18e..74d258781205 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -316,7 +316,10 @@ static void nfs_mapping_set_error(struct page *page, int error)
 	struct address_space *mapping = page_file_mapping(page);
 
 	SetPageError(page);
-	mapping_set_error(mapping, error);
+	filemap_set_wb_err(mapping, error);
+	if (mapping->host)
+		errseq_set(&mapping->host->i_sb->s_wb_err,
+			   error == -ENOSPC ? -ENOSPC : -EIO);
 	nfs_set_pageerror(mapping);
 }
 

From b38e09b9b613ee608c87bc00979db891ee9f0538 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 28 Feb 2022 10:09:23 -0500
Subject: [PATCH 053/229] Revert "NFSv4: use unique client identifiers in
 network namespaces"

This reverts commit 50c790a0b69bdc420f00f30bdf348d6c90194c78.

The functionality is believed to be capable of causing regressions in
existing setups, so the author has requested that it be reverted.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/sysfs.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 886ed1eec2e5..a6f740366963 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -151,18 +151,6 @@ static struct kobj_type nfs_netns_client_type = {
 	.namespace = nfs_netns_client_namespace,
 };
 
-static void assign_unique_clientid(struct nfs_netns_client *clp)
-{
-	unsigned char client_uuid[16];
-	char *uuid_str = kmalloc(UUID_STRING_LEN + 1, GFP_KERNEL);
-
-	if (uuid_str) {
-		generate_random_uuid(client_uuid);
-		sprintf(uuid_str, "%pU", client_uuid);
-		rcu_assign_pointer(clp->identifier, uuid_str);
-	}
-}
-
 static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
 		struct net *net)
 {
@@ -170,8 +158,6 @@ static struct nfs_netns_client *nfs_netns_client_alloc(struct kobject *parent,
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (p) {
-		if (net != &init_net)
-			assign_unique_clientid(p);
 		p->net = net;
 		p->kobject.kset = nfs_client_kset;
 		if (kobject_init_and_add(&p->kobject, &nfs_netns_client_type,

From 64cfca85bacde54caa64e0ab855c48734894fa37 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 24 Feb 2022 10:59:37 -0500
Subject: [PATCH 054/229] NFS: Return valid errors from nfs2/3_decode_dirent()

Valid return values for decode_dirent() callback functions are:
 0: Success
 -EBADCOOKIE: End of directory
 -EAGAIN: End of xdr_stream

All errors need to map into one of those three values.

Fixes: 573c4e1ef53a ("NFS: Simplify ->decode_dirent() calling sequence")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs2xdr.c |  2 +-
 fs/nfs/nfs3xdr.c | 21 ++++++---------------
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 7fba7711e6b3..3d5ba43f44bb 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 
 	error = decode_filename_inline(xdr, &entry->name, &entry->len);
 	if (unlikely(error))
-		return error;
+		return -EAGAIN;
 
 	/*
 	 * The type (size and byte order) of nfscookie isn't defined in
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 54a1d21cbcc6..7ab60ad98776 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1967,7 +1967,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		       bool plus)
 {
 	struct user_namespace *userns = rpc_userns(entry->server->client);
-	struct nfs_entry old = *entry;
 	__be32 *p;
 	int error;
 	u64 new_cookie;
@@ -1987,15 +1986,15 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 
 	error = decode_fileid3(xdr, &entry->ino);
 	if (unlikely(error))
-		return error;
+		return -EAGAIN;
 
 	error = decode_inline_filename3(xdr, &entry->name, &entry->len);
 	if (unlikely(error))
-		return error;
+		return -EAGAIN;
 
 	error = decode_cookie3(xdr, &new_cookie);
 	if (unlikely(error))
-		return error;
+		return -EAGAIN;
 
 	entry->d_type = DT_UNKNOWN;
 
@@ -2003,7 +2002,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		entry->fattr->valid = 0;
 		error = decode_post_op_attr(xdr, entry->fattr, userns);
 		if (unlikely(error))
-			return error;
+			return -EAGAIN;
 		if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
 			entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
 
@@ -2018,11 +2017,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 			return -EAGAIN;
 		if (*p != xdr_zero) {
 			error = decode_nfs_fh3(xdr, entry->fh);
-			if (unlikely(error)) {
-				if (error == -E2BIG)
-					goto out_truncated;
-				return error;
-			}
+			if (unlikely(error))
+				return -EAGAIN;
 		} else
 			zero_nfs_fh3(entry->fh);
 	}
@@ -2031,11 +2027,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	entry->cookie = new_cookie;
 
 	return 0;
-
-out_truncated:
-	dprintk("NFS: directory entry contains invalid file handle\n");
-	*entry = old;
-	return -EAGAIN;
 }
 
 /*

From 1a93b82c59ab45f2d9f14c47f09d2e341ff02381 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 18 Feb 2022 07:07:08 -0500
Subject: [PATCH 055/229] NFS: constify nfs_server_capable() and
 nfs_have_writebacks()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/nfs_fs.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 72a732a5103c..6e10725887d1 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -363,7 +363,7 @@ static inline void nfs_mark_for_revalidate(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
-static inline int nfs_server_capable(struct inode *inode, int cap)
+static inline int nfs_server_capable(const struct inode *inode, int cap)
 {
 	return NFS_SERVER(inode)->caps & cap;
 }
@@ -587,12 +587,11 @@ extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
 extern void nfs_commit_free(struct nfs_commit_data *data);
 bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
 
-static inline int
-nfs_have_writebacks(struct inode *inode)
+static inline bool nfs_have_writebacks(const struct inode *inode)
 {
 	if (S_ISREG(inode->i_mode))
 		return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
-	return 0;
+	return false;
 }
 
 /*

From 2eef8a31112262f6f8e5d2b3076b9f288473eaf2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 20:38:19 -0500
Subject: [PATCH 056/229] NFS: Trace lookup revalidation failure

Enable tracing of lookup revalidation failures.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ebddc736eac2..1aa55cac9d9a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1474,9 +1474,7 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
 {
 	switch (error) {
 	case 1:
-		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
-			__func__, dentry);
-		return 1;
+		break;
 	case 0:
 		/*
 		 * We can't d_drop the root of a disconnected tree:
@@ -1485,13 +1483,10 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
 		 * inodes on unmount and further oopses.
 		 */
 		if (inode && IS_ROOT(dentry))
-			return 1;
-		dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
-				__func__, dentry);
-		return 0;
+			error = 1;
+		break;
 	}
-	dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n",
-				__func__, dentry, error);
+	trace_nfs_lookup_revalidate_exit(dir, dentry, 0, error);
 	return error;
 }
 
@@ -1623,9 +1618,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
 		goto out_bad;
 
 	trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
-	error = nfs_lookup_revalidate_dentry(dir, dentry, inode);
-	trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error);
-	return error;
+	return nfs_lookup_revalidate_dentry(dir, dentry, inode);
 out_valid:
 	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
 out_bad:

From d1e32ea35502bcbf9241c54338882f18b6fa6452 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 25 Feb 2022 10:22:30 -0500
Subject: [PATCH 057/229] NFS: Initialise the readdir verifier as best we can
 in nfs_opendir()

For the purpose of ensuring that opendir() followed by seekdir() work as
correctly as possible, try to initialise the readdir verifier in
nfs_opendir().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1aa55cac9d9a..1dfbd05081ad 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -89,6 +89,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 						      NFS_INO_REVAL_FORCED);
 		list_add(&ctx->list, &nfsi->open_files);
 		clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+		memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
 		spin_unlock(&dir->i_lock);
 		return ctx;
 	}

From 281f31b2e5a2f1c5ae82dbe0b14c9d57401e0967 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 12:10:36 -0500
Subject: [PATCH 058/229] NFS: Use kzalloc() to avoid initialising the
 nfs_open_dir_context

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1dfbd05081ad..379f88b158fb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,18 +69,15 @@ const struct address_space_operations nfs_dir_aops = {
 	.freepage = nfs_readdir_clear_array,
 };
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir)
+static struct nfs_open_dir_context *
+alloc_nfs_open_dir_context(struct inode *dir)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
-	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
 	if (ctx != NULL) {
-		ctx->duped = 0;
 		ctx->attr_gencount = nfsi->attr_gencount;
-		ctx->dir_cookie = 0;
-		ctx->dup_cookie = 0;
-		ctx->page_index = 0;
-		ctx->eof = false;
 		spin_lock(&dir->i_lock);
 		if (list_empty(&nfsi->open_files) &&
 		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))

From 0b2662b7e7fddaaf6d6c055763270765cdb97a0d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 10:39:26 -0500
Subject: [PATCH 059/229] NFS: Calculate page offsets algorithmically

Instead of relying on counting the page offsets as we walk through the
page cache, switch to calculating them algorithmically.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 379f88b158fb..6f0a38db6c37 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -249,17 +249,20 @@ static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
 	return ret;
 }
 
+static size_t nfs_readdir_array_maxentries(void)
+{
+	return (PAGE_SIZE - sizeof(struct nfs_cache_array)) /
+	       sizeof(struct nfs_cache_array_entry);
+}
+
 /*
  * Check that the next array entry lies entirely within the page bounds
  */
 static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
 {
-	struct nfs_cache_array_entry *cache_entry;
-
 	if (array->page_full)
 		return -ENOSPC;
-	cache_entry = &array->array[array->size + 1];
-	if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
+	if (array->size == nfs_readdir_array_maxentries()) {
 		array->page_full = 1;
 		return -ENOSPC;
 	}
@@ -318,6 +321,11 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
 	return page;
 }
 
+static loff_t nfs_readdir_page_offset(struct page *page)
+{
+	return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries();
+}
+
 static u64 nfs_readdir_page_last_cookie(struct page *page)
 {
 	struct nfs_cache_array *array;
@@ -448,7 +456,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 		if (array->array[i].cookie == desc->dir_cookie) {
 			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
 
-			new_pos = desc->current_index + i;
+			new_pos = nfs_readdir_page_offset(desc->page) + i;
 			if (desc->attr_gencount != nfsi->attr_gencount ||
 			    !nfs_readdir_inode_mapping_valid(nfsi)) {
 				desc->duped = 0;

From 895519c19fae563b4bec6615bb6a7750e0c18d4a Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 28 Feb 2022 11:58:00 -0800
Subject: [PATCH 060/229] MAINTAINERS: Add additional file to uncore frequency
 control

Add additional Documentation file in the list of files.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1ba1e4af2cbc..4f9acc183cdc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9920,6 +9920,7 @@ INTEL UNCORE FREQUENCY CONTROL
 M:	Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
 L:	platform-driver-x86@vger.kernel.org
 S:	Maintained
+F:	Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst
 F:	drivers/platform/x86/intel/uncore-frequency.c
 
 INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER

From d09e673f497164442ad7976a870d7cc857783fd4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 08:31:28 -0500
Subject: [PATCH 061/229] NFS: Store the change attribute in the directory page
 cache

Use the change attribute and the first cookie in a directory page cache
entry to validate that the page is up to date.

Suggested-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 68 ++++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 31 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6f0a38db6c37..a1767f755460 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,6 +140,7 @@ struct nfs_cache_array_entry {
 };
 
 struct nfs_cache_array {
+	u64 change_attr;
 	u64 last_cookie;
 	unsigned int size;
 	unsigned char page_full : 1,
@@ -176,12 +177,14 @@ static void nfs_readdir_array_init(struct nfs_cache_array *array)
 	memset(array, 0, sizeof(struct nfs_cache_array));
 }
 
-static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
+static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
+					u64 change_attr)
 {
 	struct nfs_cache_array *array;
 
 	array = kmap_atomic(page);
 	nfs_readdir_array_init(array);
+	array->change_attr = change_attr;
 	array->last_cookie = last_cookie;
 	array->cookies_are_ordered = 1;
 	kunmap_atomic(array);
@@ -208,7 +211,7 @@ nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
 {
 	struct page *page = alloc_page(gfp_flags);
 	if (page)
-		nfs_readdir_page_init_array(page, last_cookie);
+		nfs_readdir_page_init_array(page, last_cookie, 0);
 	return page;
 }
 
@@ -305,19 +308,43 @@ out:
 	return ret;
 }
 
+static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
+				      u64 change_attr)
+{
+	struct nfs_cache_array *array = kmap_atomic(page);
+	int ret = true;
+
+	if (array->change_attr != change_attr)
+		ret = false;
+	if (array->size > 0 && array->array[0].cookie != last_cookie)
+		ret = false;
+	kunmap_atomic(array);
+	return ret;
+}
+
+static void nfs_readdir_page_unlock_and_put(struct page *page)
+{
+	unlock_page(page);
+	put_page(page);
+}
+
 static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
 						pgoff_t index, u64 last_cookie)
 {
 	struct page *page;
+	u64 change_attr;
 
 	page = grab_cache_page(mapping, index);
-	if (page && !PageUptodate(page)) {
-		nfs_readdir_page_init_array(page, last_cookie);
-		if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
-			nfs_zap_mapping(mapping->host, mapping);
-		SetPageUptodate(page);
+	if (!page)
+		return NULL;
+	change_attr = inode_peek_iversion_raw(mapping->host);
+	if (PageUptodate(page)) {
+		if (nfs_readdir_page_validate(page, last_cookie, change_attr))
+			return page;
+		nfs_readdir_clear_array(page);
 	}
-
+	nfs_readdir_page_init_array(page, last_cookie, change_attr);
+	SetPageUptodate(page);
 	return page;
 }
 
@@ -357,12 +384,6 @@ static void nfs_readdir_page_set_eof(struct page *page)
 	kunmap_atomic(array);
 }
 
-static void nfs_readdir_page_unlock_and_put(struct page *page)
-{
-	unlock_page(page);
-	put_page(page);
-}
-
 static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
 					      pgoff_t index, u64 cookie)
 {
@@ -419,16 +440,6 @@ out_eof:
 	return -EBADCOOKIE;
 }
 
-static bool
-nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
-{
-	if (nfsi->cache_validity & (NFS_INO_INVALID_CHANGE |
-				    NFS_INO_INVALID_DATA))
-		return false;
-	smp_rmb();
-	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
-}
-
 static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
 					      u64 cookie)
 {
@@ -457,8 +468,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
 
 			new_pos = nfs_readdir_page_offset(desc->page) + i;
-			if (desc->attr_gencount != nfsi->attr_gencount ||
-			    !nfs_readdir_inode_mapping_valid(nfsi)) {
+			if (desc->attr_gencount != nfsi->attr_gencount) {
 				desc->duped = 0;
 				desc->attr_gencount = nfsi->attr_gencount;
 			} else if (new_pos < desc->prev_index) {
@@ -1095,11 +1105,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	 * to either find the entry with the appropriate number or
 	 * revalidate the cookie.
 	 */
-	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
-		res = nfs_revalidate_mapping(inode, file->f_mapping);
-		if (res < 0)
-			goto out;
-	}
+	nfs_revalidate_mapping(inode, file->f_mapping);
 
 	res = -ENOMEM;
 	desc = kzalloc(sizeof(*desc), GFP_KERNEL);

From 728dd0ab37421396927749fc8cec9c2009c526c8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 08:59:33 -0500
Subject: [PATCH 062/229] NFS: Don't re-read the entire page cache to find the
 next cookie

If the page cache entry that was last read gets invalidated for some
reason, then make sure we can re-create it on the next call to readdir.
This, combined with the cache page validation, allows us to reuse the
cached value of page-index on successive calls to nfs_readdir.

Credit is due to Benjamin Coddington for showing that the concept works,
and that it allows for improved cache sharing between processes even in
the case where pages are lost due to LRU or active invalidation.

Suggested-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 10 +++++++---
 include/linux/nfs_fs.h |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1767f755460..93f70698e401 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1120,6 +1120,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->dup_cookie = dir_ctx->dup_cookie;
 	desc->duped = dir_ctx->duped;
 	page_index = dir_ctx->page_index;
+	desc->page_index = page_index;
+	desc->last_cookie = dir_ctx->last_cookie;
 	desc->attr_gencount = dir_ctx->attr_gencount;
 	desc->eof = dir_ctx->eof;
 	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
@@ -1168,6 +1170,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	spin_lock(&file->f_lock);
 	dir_ctx->dir_cookie = desc->dir_cookie;
 	dir_ctx->dup_cookie = desc->dup_cookie;
+	dir_ctx->last_cookie = desc->last_cookie;
 	dir_ctx->duped = desc->duped;
 	dir_ctx->attr_gencount = desc->attr_gencount;
 	dir_ctx->page_index = desc->page_index;
@@ -1209,10 +1212,11 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 	}
 	if (offset != filp->f_pos) {
 		filp->f_pos = offset;
-		if (nfs_readdir_use_cookie(filp))
-			dir_ctx->dir_cookie = offset;
-		else
+		if (!nfs_readdir_use_cookie(filp)) {
 			dir_ctx->dir_cookie = 0;
+			dir_ctx->page_index = 0;
+		} else
+			dir_ctx->dir_cookie = offset;
 		if (offset == 0)
 			memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
 		dir_ctx->duped = 0;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 6e10725887d1..1c533f2c1f36 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -105,6 +105,7 @@ struct nfs_open_dir_context {
 	__be32	verf[NFS_DIR_VERIFIER_SIZE];
 	__u64 dir_cookie;
 	__u64 dup_cookie;
+	__u64 last_cookie;
 	pgoff_t page_index;
 	signed char duped;
 	bool eof;

From c8f0523ba398b72ffdb6e41930c089b75a6e2acf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 26 Feb 2022 09:38:19 -0500
Subject: [PATCH 063/229] NFS: Don't advance the page pointer unless the page
 is full

When we hit the end of the data in the readdir page, we don't want to
start filling a new page, unless this one is full.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 93f70698e401..60f7feee0a16 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -417,6 +417,18 @@ bool nfs_readdir_use_cookie(const struct file *filp)
 	return true;
 }
 
+static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
+					struct nfs_readdir_descriptor *desc)
+{
+	if (array->page_full) {
+		desc->last_cookie = array->last_cookie;
+		desc->current_index += array->size;
+		desc->cache_entry_index = 0;
+		desc->page_index++;
+	} else
+		desc->last_cookie = array->array[0].cookie;
+}
+
 static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
 				      struct nfs_readdir_descriptor *desc)
 {
@@ -428,6 +440,7 @@ static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
 	if (diff >= array->size) {
 		if (array->page_is_eof)
 			goto out_eof;
+		nfs_readdir_seek_next_array(array, desc);
 		return -EAGAIN;
 	}
 
@@ -500,7 +513,8 @@ check_eof:
 		status = -EBADCOOKIE;
 		if (desc->dir_cookie == array->last_cookie)
 			desc->eof = true;
-	}
+	} else
+		nfs_readdir_seek_next_array(array, desc);
 out:
 	return status;
 }
@@ -517,11 +531,6 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
 	else
 		status = nfs_readdir_search_for_cookie(array, desc);
 
-	if (status == -EAGAIN) {
-		desc->last_cookie = array->last_cookie;
-		desc->current_index += array->size;
-		desc->page_index++;
-	}
 	kunmap_atomic(array);
 	return status;
 }
@@ -998,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
 {
 	struct file	*file = desc->file;
 	struct nfs_cache_array *array;
-	unsigned int i = 0;
+	unsigned int i;
 
 	array = kmap(desc->page);
 	for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -1011,10 +1020,13 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
 			break;
 		}
 		memcpy(desc->verf, verf, sizeof(desc->verf));
-		if (i < (array->size-1))
-			desc->dir_cookie = array->array[i+1].cookie;
-		else
+		if (i == array->size - 1) {
 			desc->dir_cookie = array->last_cookie;
+			nfs_readdir_seek_next_array(array, desc);
+		} else {
+			desc->dir_cookie = array->array[i + 1].cookie;
+			desc->last_cookie = array->array[0].cookie;
+		}
 		if (nfs_readdir_use_cookie(file))
 			desc->ctx->pos = desc->dir_cookie;
 		else

From 580f236737d13ee25d5b0b1d124f50014fe6833b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 7 Feb 2022 13:37:00 -0500
Subject: [PATCH 064/229] NFS: Adjust the amount of readahead performed by NFS
 readdir

The current NFS readdir code will always try to maximise the amount of
readahead it performs on the assumption that we can cache anything that
isn't immediately read by the process.
There are several cases where this assumption breaks down, including
when the 'ls -l' heuristic kicks in to try to force use of readdirplus
as a batch replacement for lookup/getattr.

This patch therefore tries to tone down the amount of readahead we
perform, and adjust it to try to match the amount of data being
requested by user space.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 53 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs_fs.h |  1 +
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 60f7feee0a16..520dc3ec4aef 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,6 +69,8 @@ const struct address_space_operations nfs_dir_aops = {
 	.freepage = nfs_readdir_clear_array,
 };
 
+#define NFS_INIT_DTSIZE PAGE_SIZE
+
 static struct nfs_open_dir_context *
 alloc_nfs_open_dir_context(struct inode *dir)
 {
@@ -78,6 +80,7 @@ alloc_nfs_open_dir_context(struct inode *dir)
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
 	if (ctx != NULL) {
 		ctx->attr_gencount = nfsi->attr_gencount;
+		ctx->dtsize = NFS_INIT_DTSIZE;
 		spin_lock(&dir->i_lock);
 		if (list_empty(&nfsi->open_files) &&
 		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -154,6 +157,7 @@ struct nfs_readdir_descriptor {
 	struct page	*page;
 	struct dir_context *ctx;
 	pgoff_t		page_index;
+	pgoff_t		page_index_max;
 	u64		dir_cookie;
 	u64		last_cookie;
 	u64		dup_cookie;
@@ -166,12 +170,36 @@ struct nfs_readdir_descriptor {
 	unsigned long	gencount;
 	unsigned long	attr_gencount;
 	unsigned int	cache_entry_index;
+	unsigned int	buffer_fills;
+	unsigned int	dtsize;
 	signed char duped;
 	bool plus;
 	bool eob;
 	bool eof;
 };
 
+static void nfs_set_dtsize(struct nfs_readdir_descriptor *desc, unsigned int sz)
+{
+	struct nfs_server *server = NFS_SERVER(file_inode(desc->file));
+	unsigned int maxsize = server->dtsize;
+
+	if (sz > maxsize)
+		sz = maxsize;
+	if (sz < NFS_MIN_FILE_IO_SIZE)
+		sz = NFS_MIN_FILE_IO_SIZE;
+	desc->dtsize = sz;
+}
+
+static void nfs_shrink_dtsize(struct nfs_readdir_descriptor *desc)
+{
+	nfs_set_dtsize(desc, desc->dtsize >> 1);
+}
+
+static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
+{
+	nfs_set_dtsize(desc, desc->dtsize << 1);
+}
+
 static void nfs_readdir_array_init(struct nfs_cache_array *array)
 {
 	memset(array, 0, sizeof(struct nfs_cache_array));
@@ -784,6 +812,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 				break;
 			arrays++;
 			*arrays = page = new;
+			desc->page_index_max++;
 		} else {
 			new = nfs_readdir_page_get_next(mapping,
 							page->index + 1,
@@ -793,6 +822,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 			if (page != *arrays)
 				nfs_readdir_page_unlock_and_put(page);
 			page = new;
+			desc->page_index_max = new->index;
 		}
 		status = nfs_readdir_add_to_array(entry, page);
 	} while (!status && !entry->eof);
@@ -858,7 +888,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	struct nfs_entry *entry;
 	size_t array_size;
 	struct inode *inode = file_inode(desc->file);
-	size_t dtsize = NFS_SERVER(inode)->dtsize;
+	unsigned int dtsize = desc->dtsize;
 	int status = -ENOMEM;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -894,6 +924,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 
 		status = nfs_readdir_page_filler(desc, entry, pages, pglen,
 						 arrays, narrays);
+		desc->buffer_fills++;
 	} while (!status && nfs_readdir_page_needs_filling(page) &&
 		page_mapping(page));
 
@@ -941,6 +972,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	if (!desc->page)
 		return -ENOMEM;
 	if (nfs_readdir_page_needs_filling(desc->page)) {
+		/* Grow the dtsize if we had to go back for more pages */
+		if (desc->page_index == desc->page_index_max)
+			nfs_grow_dtsize(desc);
+		desc->page_index_max = desc->page_index;
 		res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
 					       &desc->page, 1);
 		if (res < 0) {
@@ -1075,6 +1110,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	desc->cache_entry_index = 0;
 	desc->last_cookie = desc->dir_cookie;
 	desc->duped = 0;
+	desc->page_index_max = 0;
 
 	status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
 
@@ -1084,10 +1120,22 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	}
 	desc->page = NULL;
 
+	/*
+	 * Grow the dtsize if we have to go back for more pages,
+	 * or shrink it if we're reading too many.
+	 */
+	if (!desc->eof) {
+		if (!desc->eob)
+			nfs_grow_dtsize(desc);
+		else if (desc->buffer_fills == 1 &&
+			 i < (desc->page_index_max >> 1))
+			nfs_shrink_dtsize(desc);
+	}
 
 	for (i = 0; i < sz && arrays[i]; i++)
 		nfs_readdir_page_array_free(arrays[i]);
 out:
+	desc->page_index_max = -1;
 	kfree(arrays);
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
 	return status;
@@ -1126,6 +1174,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->file = file;
 	desc->ctx = ctx;
 	desc->plus = nfs_use_readdirplus(inode, ctx);
+	desc->page_index_max = -1;
 
 	spin_lock(&file->f_lock);
 	desc->dir_cookie = dir_ctx->dir_cookie;
@@ -1136,6 +1185,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->last_cookie = dir_ctx->last_cookie;
 	desc->attr_gencount = dir_ctx->attr_gencount;
 	desc->eof = dir_ctx->eof;
+	nfs_set_dtsize(desc, dir_ctx->dtsize);
 	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
 	spin_unlock(&file->f_lock);
 
@@ -1187,6 +1237,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	dir_ctx->attr_gencount = desc->attr_gencount;
 	dir_ctx->page_index = desc->page_index;
 	dir_ctx->eof = desc->eof;
+	dir_ctx->dtsize = desc->dtsize;
 	memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
 	spin_unlock(&file->f_lock);
 out_free:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 1c533f2c1f36..691a27936849 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -107,6 +107,7 @@ struct nfs_open_dir_context {
 	__u64 dup_cookie;
 	__u64 last_cookie;
 	pgoff_t page_index;
+	unsigned int dtsize;
 	signed char duped;
 	bool eof;
 };

From 6c34f05b754622f473b546fd19ad3a89bd65bd89 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Feb 2022 16:23:12 -0500
Subject: [PATCH 065/229] NFS: If the cookie verifier changes, we must
 invalidate the page cache

Ensure that if the cookie verifier changes when we use the zero-valued
cookie, then we invalidate any cached pages.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 520dc3ec4aef..9998d7d17367 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -990,9 +990,14 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 		/*
 		 * Set the cookie verifier if the page cache was empty
 		 */
-		if (desc->page_index == 0)
+		if (desc->last_cookie == 0 &&
+		    memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
 			memcpy(nfsi->cookieverf, verf,
 			       sizeof(nfsi->cookieverf));
+			invalidate_inode_pages2_range(desc->file->f_mapping,
+						      desc->page_index_max + 1,
+						      -1);
+		}
 	}
 	res = nfs_readdir_search_array(desc);
 	if (res == 0)

From 9ff89c25d8addeee8eea84fa828f1d2ad659cc54 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 7 Feb 2022 15:07:01 -0500
Subject: [PATCH 066/229] NFS: Simplify nfs_readdir_xdr_to_array()

Recent changes to readdir mean that we can cope with partially filled
page cache entries, so we no longer need to rely on looping in
nfs_readdir_xdr_to_array().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9998d7d17367..9d086ab4f889 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -889,6 +889,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	size_t array_size;
 	struct inode *inode = file_inode(desc->file);
 	unsigned int dtsize = desc->dtsize;
+	unsigned int pglen;
 	int status = -ENOMEM;
 
 	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
@@ -906,28 +907,20 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	if (!pages)
 		goto out;
 
-	do {
-		unsigned int pglen;
-		status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie,
-						pages, dtsize,
-						verf_res);
-		if (status < 0)
-			break;
-
-		pglen = status;
-		if (pglen == 0) {
-			nfs_readdir_page_set_eof(page);
-			break;
-		}
-
-		verf_arg = verf_res;
+	status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
+					dtsize, verf_res);
+	if (status < 0)
+		goto free_pages;
 
+	pglen = status;
+	if (pglen != 0)
 		status = nfs_readdir_page_filler(desc, entry, pages, pglen,
 						 arrays, narrays);
-		desc->buffer_fills++;
-	} while (!status && nfs_readdir_page_needs_filling(page) &&
-		page_mapping(page));
+	else
+		nfs_readdir_page_set_eof(page);
+	desc->buffer_fills++;
 
+free_pages:
 	nfs_readdir_free_pages(pages, array_size);
 out:
 	nfs_free_fattr(entry->fattr);

From 9c3f4d988c23d099095c8b75cbd449e0466fa102 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 17 Feb 2022 13:02:37 -0500
Subject: [PATCH 067/229] NFS: Reduce use of uncached readdir

When reading a very large directory, we want to try to keep the page
cache up to date if doing so is inexpensive. With the change to allow
readdir to continue reading even when the cache is incomplete, we no
longer need to fall back to uncached readdir in order to scale to large
directories.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9d086ab4f889..dc6acfd14fc7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -999,28 +999,11 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	return res;
 }
 
-static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
-{
-	struct address_space *mapping = desc->file->f_mapping;
-	struct inode *dir = file_inode(desc->file);
-	unsigned int dtsize = NFS_SERVER(dir)->dtsize;
-	loff_t size = i_size_read(dir);
-
-	/*
-	 * Default to uncached readdir if the page cache is empty, and
-	 * we're looking for a non-zero cookie in a large directory.
-	 */
-	return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
-}
-
 /* Search for desc->dir_cookie from the beginning of the page cache */
 static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 {
 	int res;
 
-	if (nfs_readdir_dont_search_cache(desc))
-		return -EBADCOOKIE;
-
 	do {
 		if (desc->page_index == 0) {
 			desc->current_index = 0;
@@ -1273,10 +1256,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 	}
 	if (offset != filp->f_pos) {
 		filp->f_pos = offset;
-		if (!nfs_readdir_use_cookie(filp)) {
+		dir_ctx->page_index = 0;
+		if (!nfs_readdir_use_cookie(filp))
 			dir_ctx->dir_cookie = 0;
-			dir_ctx->page_index = 0;
-		} else
+		else
 			dir_ctx->dir_cookie = offset;
 		if (offset == 0)
 			memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));

From 230bc98f7a2a49eb472d184bdec91fd3096384b3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 17 Feb 2022 11:08:24 -0500
Subject: [PATCH 068/229] NFS: Improve heuristic for readdirplus

The heuristic for readdirplus is designed to try to detect 'ls -l' and
similar patterns. It does so by looking for cache hit/miss patterns in
both the attribute cache and in the dcache of the files in a given
directory, and then sets a flag for the readdirplus code to interpret.

The problem with this approach is that a single attribute or dcache miss
can cause the NFS code to force a refresh of the attributes for the
entire set of files contained in the directory.

To be able to make a more nuanced decision, let's sample the number of
hits and misses in the set of open directory descriptors. That allows us
to set thresholds at which we start preferring READDIRPLUS over regular
READDIR, or at which we start to force a re-read of the remaining
readdir cache using READDIRPLUS.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 80 +++++++++++++++++++++++++++---------------
 fs/nfs/inode.c         |  4 +--
 fs/nfs/internal.h      |  4 +--
 fs/nfs/nfstrace.h      |  1 -
 include/linux/nfs_fs.h |  5 +--
 5 files changed, 58 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index dc6acfd14fc7..098fc1bdaac8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -87,8 +87,7 @@ alloc_nfs_open_dir_context(struct inode *dir)
 			nfs_set_cache_invalid(dir,
 					      NFS_INO_INVALID_DATA |
 						      NFS_INO_REVAL_FORCED);
-		list_add(&ctx->list, &nfsi->open_files);
-		clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+		list_add_tail_rcu(&ctx->list, &nfsi->open_files);
 		memcpy(ctx->verf, nfsi->cookieverf, sizeof(ctx->verf));
 		spin_unlock(&dir->i_lock);
 		return ctx;
@@ -99,9 +98,9 @@ alloc_nfs_open_dir_context(struct inode *dir)
 static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
 {
 	spin_lock(&dir->i_lock);
-	list_del(&ctx->list);
+	list_del_rcu(&ctx->list);
 	spin_unlock(&dir->i_lock);
-	kfree(ctx);
+	kfree_rcu(ctx, rcu_head);
 }
 
 /*
@@ -594,7 +593,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
 			desc->plus = arg.plus = false;
 			goto again;
 		}
@@ -644,51 +642,63 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 	return 1;
 }
 
-static
-bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+#define NFS_READDIR_CACHE_USAGE_THRESHOLD (8UL)
+
+static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
+				unsigned int cache_hits,
+				unsigned int cache_misses)
 {
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
-	if (test_and_clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags))
-		return true;
-	if (ctx->pos == 0)
+	if (ctx->pos == 0 ||
+	    cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
 		return true;
 	return false;
 }
 
 /*
- * This function is called by the lookup and getattr code to request the
+ * This function is called by the getattr code to request the
  * use of readdirplus to accelerate any future lookups in the same
  * directory.
  */
-void nfs_advise_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_hit(struct inode *dir)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_open_dir_context *ctx;
 
 	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
-	    !list_empty(&nfsi->open_files))
-		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+	    S_ISDIR(dir->i_mode)) {
+		rcu_read_lock();
+		list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+			atomic_inc(&ctx->cache_hits);
+		rcu_read_unlock();
+	}
 }
 
 /*
  * This function is mainly for use by nfs_getattr().
  *
  * If this is an 'ls -l', we want to force use of readdirplus.
- * Do this by checking if there is an active file descriptor
- * and calling nfs_advise_use_readdirplus, then forcing a
- * cache flush.
  */
-void nfs_force_use_readdirplus(struct inode *dir)
+void nfs_readdir_record_entry_cache_miss(struct inode *dir)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
+	struct nfs_open_dir_context *ctx;
 
 	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
-	    !list_empty(&nfsi->open_files)) {
-		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
-		set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+	    S_ISDIR(dir->i_mode)) {
+		rcu_read_lock();
+		list_for_each_entry_rcu (ctx, &nfsi->open_files, list)
+			atomic_inc(&ctx->cache_misses);
+		rcu_read_unlock();
 	}
 }
 
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir)
+{
+	nfs_readdir_record_entry_cache_miss(dir);
+}
+
 static
 void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry,
 		unsigned long dir_verifier)
@@ -1122,6 +1132,19 @@ out:
 	return status;
 }
 
+#define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
+
+static void nfs_readdir_handle_cache_misses(struct inode *inode,
+					    struct nfs_readdir_descriptor *desc,
+					    pgoff_t page_index,
+					    unsigned int cache_misses)
+{
+	if (desc->ctx->pos == 0 ||
+	    cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
+		return;
+	invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+}
+
 /* The file offset position represents the dirent entry number.  A
    last cookie cache takes care of the common case of reading the
    whole directory.
@@ -1133,6 +1156,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	struct nfs_readdir_descriptor *desc;
+	unsigned int cache_hits, cache_misses;
 	pgoff_t page_index;
 	int res;
 
@@ -1154,7 +1178,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		goto out;
 	desc->file = file;
 	desc->ctx = ctx;
-	desc->plus = nfs_use_readdirplus(inode, ctx);
 	desc->page_index_max = -1;
 
 	spin_lock(&file->f_lock);
@@ -1168,6 +1191,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->eof = dir_ctx->eof;
 	nfs_set_dtsize(desc, dir_ctx->dtsize);
 	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+	cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
+	cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
 	spin_unlock(&file->f_lock);
 
 	if (desc->eof) {
@@ -1175,9 +1200,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		goto out_free;
 	}
 
-	if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
-	    list_is_singular(&nfsi->open_files))
-		invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+	desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
+	nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);
 
 	do {
 		res = readdir_search_pagecache(desc);
@@ -1196,7 +1220,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 		}
 		if (res == -ETOOSMALL && desc->plus) {
-			clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
 			nfs_zap_caches(inode);
 			desc->page_index = 0;
 			desc->plus = false;
@@ -1610,7 +1633,7 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
 	nfs_set_verifier(dentry, dir_verifier);
 
 	/* set a readdirplus hint that we had a cache miss */
-	nfs_force_use_readdirplus(dir);
+	nfs_lookup_advise_force_readdirplus(dir);
 	ret = 1;
 out:
 	nfs_free_fattr(fattr);
@@ -1667,7 +1690,6 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
 				nfs_mark_dir_for_revalidate(dir);
 			goto out_bad;
 		}
-		nfs_advise_use_readdirplus(dir);
 		goto out_valid;
 	}
 
@@ -1872,7 +1894,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 		goto out;
 
 	/* Notify readdir to use READDIRPLUS */
-	nfs_force_use_readdirplus(dir);
+	nfs_lookup_advise_force_readdirplus(dir);
 
 no_entry:
 	res = d_splice_alias(inode, dentry);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 7cecabf57b95..bbf4357ff727 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -787,7 +787,7 @@ static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
 	if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
 		return;
 	parent = dget_parent(dentry);
-	nfs_force_use_readdirplus(d_inode(parent));
+	nfs_readdir_record_entry_cache_miss(d_inode(parent));
 	dput(parent);
 }
 
@@ -798,7 +798,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
 	if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
 		return;
 	parent = dget_parent(dentry);
-	nfs_advise_use_readdirplus(d_inode(parent));
+	nfs_readdir_record_entry_cache_hit(d_inode(parent));
 	dput(parent);
 }
 
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b5398af53c7f..194840a97e3a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -366,8 +366,8 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const struct nfs_client_initdata *);
 
 /* dir.c */
-extern void nfs_advise_use_readdirplus(struct inode *dir);
-extern void nfs_force_use_readdirplus(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_hit(struct inode *dir);
+extern void nfs_readdir_record_entry_cache_miss(struct inode *dir);
 extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
 					    struct shrink_control *sc);
 extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 45a310b586ce..3672f6703ee7 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,7 +36,6 @@
 
 #define nfs_show_nfsi_flags(v) \
 	__print_flags(v, "|", \
-			{ BIT(NFS_INO_ADVISE_RDPLUS), "ADVISE_RDPLUS" }, \
 			{ BIT(NFS_INO_STALE), "STALE" }, \
 			{ BIT(NFS_INO_ACL_LRU_SET), "ACL_LRU_SET" }, \
 			{ BIT(NFS_INO_INVALIDATING), "INVALIDATING" }, \
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 691a27936849..20a4cf0acad2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -101,6 +101,8 @@ struct nfs_open_context {
 
 struct nfs_open_dir_context {
 	struct list_head list;
+	atomic_t cache_hits;
+	atomic_t cache_misses;
 	unsigned long attr_gencount;
 	__be32	verf[NFS_DIR_VERIFIER_SIZE];
 	__u64 dir_cookie;
@@ -110,6 +112,7 @@ struct nfs_open_dir_context {
 	unsigned int dtsize;
 	signed char duped;
 	bool eof;
+	struct rcu_head rcu_head;
 };
 
 /*
@@ -274,13 +277,11 @@ struct nfs4_copy_state {
 /*
  * Bit offsets in flags field
  */
-#define NFS_INO_ADVISE_RDPLUS	(0)		/* advise readdirplus */
 #define NFS_INO_STALE		(1)		/* possible stale inode */
 #define NFS_INO_ACL_LRU_SET	(2)		/* Inode is on the LRU list */
 #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
 #define NFS_INO_PRESERVE_UNLINKED (4)		/* preserve file if removed while open */
 #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
-#define NFS_INO_FORCE_READDIR	(7)		/* force readdirplus */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
 #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */

From ad1e109a4109ce0cbdfebfbe1958d0c333166d5c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 17 Feb 2022 15:46:23 -0500
Subject: [PATCH 069/229] NFS: Don't ask for readdirplus unless it can help
 nfs_getattr()

If attribute caching is turned off, then use of readdirplus is not going
to help stat() performance.
Readdirplus also doesn't help if a file is being written to, since we
will have to flush those writes in order to sync the mtime/ctime.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bbf4357ff727..e51d86707fca 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -780,26 +780,32 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
 }
 EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
 
+/*
+ * Don't request help from readdirplus if the file is being written to,
+ * or if attribute caching is turned off
+ */
+static bool nfs_getattr_readdirplus_enable(const struct inode *inode)
+{
+	return nfs_server_capable(inode, NFS_CAP_READDIRPLUS) &&
+	       !nfs_have_writebacks(inode) && NFS_MAXATTRTIMEO(inode) > 5 * HZ;
+}
+
 static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry)
 {
-	struct dentry *parent;
-
-	if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
-		return;
-	parent = dget_parent(dentry);
-	nfs_readdir_record_entry_cache_miss(d_inode(parent));
-	dput(parent);
+	if (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+		nfs_readdir_record_entry_cache_miss(d_inode(parent));
+		dput(parent);
+	}
 }
 
 static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
 {
-	struct dentry *parent;
-
-	if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS))
-		return;
-	parent = dget_parent(dentry);
-	nfs_readdir_record_entry_cache_hit(d_inode(parent));
-	dput(parent);
+	if (!IS_ROOT(dentry)) {
+		struct dentry *parent = dget_parent(dentry);
+		nfs_readdir_record_entry_cache_hit(d_inode(parent));
+		dput(parent);
+	}
 }
 
 static u32 nfs_get_valid_attrmask(struct inode *inode)
@@ -835,6 +841,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 	int err = 0;
 	bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
 	bool do_update = false;
+	bool readdirplus_enabled = nfs_getattr_readdirplus_enable(inode);
 
 	trace_nfs_getattr_enter(inode);
 
@@ -843,7 +850,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 			STATX_INO | STATX_SIZE | STATX_BLOCKS;
 
 	if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
-		nfs_readdirplus_parent_cache_hit(path->dentry);
+		if (readdirplus_enabled)
+			nfs_readdirplus_parent_cache_hit(path->dentry);
 		goto out_no_revalidate;
 	}
 
@@ -893,15 +901,12 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
 
 	if (do_update) {
-		/* Update the attribute cache */
-		if (!(server->flags & NFS_MOUNT_NOAC))
+		if (readdirplus_enabled)
 			nfs_readdirplus_parent_cache_miss(path->dentry);
-		else
-			nfs_readdirplus_parent_cache_hit(path->dentry);
 		err = __nfs_revalidate_inode(server, inode);
 		if (err)
 			goto out;
-	} else
+	} else if (readdirplus_enabled)
 		nfs_readdirplus_parent_cache_hit(path->dentry);
 out_no_revalidate:
 	/* Only return attributes that were revalidated. */

From c49c68944f2d4c3827cd8ac7f70da277674d11ad Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 18 Feb 2022 12:04:06 -0500
Subject: [PATCH 070/229] NFSv4: Ask for a full XDR buffer of readdir goodness

Instead of pretending that we know the ratio of directory info vs
readdirplus attribute info, just set the 'dircount' field to the same
value as the 'maxcount' field.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs3xdr.c | 7 ++++---
 fs/nfs/nfs4xdr.c | 6 +++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 7ab60ad98776..d6779ceeb39e 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1261,6 +1261,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
 static void encode_readdirplus3args(struct xdr_stream *xdr,
 				    const struct nfs3_readdirargs *args)
 {
+	uint32_t dircount = args->count;
+	uint32_t maxcount = args->count;
 	__be32 *p;
 
 	encode_nfs_fh3(xdr, args->fh);
@@ -1273,9 +1275,8 @@ static void encode_readdirplus3args(struct xdr_stream *xdr,
 	 * readdirplus: need dircount + buffer size.
 	 * We just make sure we make dircount big enough
 	 */
-	*p++ = cpu_to_be32(args->count >> 3);
-
-	*p = cpu_to_be32(args->count);
+	*p++ = cpu_to_be32(dircount);
+	*p = cpu_to_be32(maxcount);
 }
 
 static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8e70b92df4cc..b7780b97dc4d 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1605,7 +1605,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 		FATTR4_WORD0_RDATTR_ERROR,
 		FATTR4_WORD1_MOUNTED_ON_FILEID,
 	};
-	uint32_t dircount = readdir->count >> 1;
+	uint32_t dircount = readdir->count;
+	uint32_t maxcount = readdir->count;
 	__be32 *p, verf[2];
 	uint32_t attrlen = 0;
 	unsigned int i;
@@ -1618,7 +1619,6 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 			FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
 			FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
 		attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
-		dircount >>= 1;
 	}
 	/* Use mounted_on_fileid only if the server supports it */
 	if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
@@ -1634,7 +1634,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 	encode_nfs4_verifier(xdr, &readdir->verifier);
 	p = reserve_space(xdr, 12 + (attrlen << 2));
 	*p++ = cpu_to_be32(dircount);
-	*p++ = cpu_to_be32(readdir->count);
+	*p++ = cpu_to_be32(maxcount);
 	*p++ = cpu_to_be32(attrlen);
 	for (i = 0; i < attrlen; i++)
 		*p++ = cpu_to_be32(attrs[i]);

From 2c2c336506e9bd4056fca25301b8a06fb7aefd32 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 09:56:45 -0500
Subject: [PATCH 071/229] NFS: Readdirplus can't help lookup for case
 insensitive filesystems

If the filesystem is case insensitive, then readdirplus can't help with
cache misses, since it won't return case folded variants of the filename.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 098fc1bdaac8..dcfc44411787 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -696,6 +696,8 @@ void nfs_readdir_record_entry_cache_miss(struct inode *dir)
 
 static void nfs_lookup_advise_force_readdirplus(struct inode *dir)
 {
+	if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
+		return;
 	nfs_readdir_record_entry_cache_miss(dir);
 }
 

From 0b3cc71b5ab31ef90eb9b8b2d8ca580fbf88c590 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 10:06:05 -0500
Subject: [PATCH 072/229] NFS: Don't request readdirplus when revalidation was
 forced

If the revalidation was forced, due to the presence of a LOOKUP_EXCL or
a LOOKUP_REVAL flag, then readdirplus won't help. It also can't help
when we're doing a path component lookup.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index dcfc44411787..cf7974642a19 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -694,10 +694,13 @@ void nfs_readdir_record_entry_cache_miss(struct inode *dir)
 	}
 }
 
-static void nfs_lookup_advise_force_readdirplus(struct inode *dir)
+static void nfs_lookup_advise_force_readdirplus(struct inode *dir,
+						unsigned int flags)
 {
 	if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE))
 		return;
+	if (flags & (LOOKUP_EXCL | LOOKUP_PARENT | LOOKUP_REVAL))
+		return;
 	nfs_readdir_record_entry_cache_miss(dir);
 }
 
@@ -1596,15 +1599,17 @@ nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry,
 	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
 }
 
-static int
-nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
-			     struct inode *inode)
+static int nfs_lookup_revalidate_dentry(struct inode *dir,
+					struct dentry *dentry,
+					struct inode *inode, unsigned int flags)
 {
 	struct nfs_fh *fhandle;
 	struct nfs_fattr *fattr;
 	unsigned long dir_verifier;
 	int ret;
 
+	trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
 	ret = -ENOMEM;
 	fhandle = nfs_alloc_fhandle();
 	fattr = nfs_alloc_fattr_with_label(NFS_SERVER(inode));
@@ -1625,6 +1630,10 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
 		}
 		goto out;
 	}
+
+	/* Request help from readdirplus */
+	nfs_lookup_advise_force_readdirplus(dir, flags);
+
 	ret = 0;
 	if (nfs_compare_fh(NFS_FH(inode), fhandle))
 		goto out;
@@ -1634,8 +1643,6 @@ nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry,
 	nfs_setsecurity(inode, fattr);
 	nfs_set_verifier(dentry, dir_verifier);
 
-	/* set a readdirplus hint that we had a cache miss */
-	nfs_lookup_advise_force_readdirplus(dir);
 	ret = 1;
 out:
 	nfs_free_fattr(fattr);
@@ -1701,8 +1708,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
 	if (NFS_STALE(inode))
 		goto out_bad;
 
-	trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
-	return nfs_lookup_revalidate_dentry(dir, dentry, inode);
+	return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
 out_valid:
 	return nfs_lookup_revalidate_done(dir, dentry, inode, 1);
 out_bad:
@@ -1896,7 +1902,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
 		goto out;
 
 	/* Notify readdir to use READDIRPLUS */
-	nfs_lookup_advise_force_readdirplus(dir);
+	nfs_lookup_advise_force_readdirplus(dir, flags);
 
 no_entry:
 	res = d_splice_alias(inode, dentry);
@@ -2159,7 +2165,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
 reval_dentry:
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
-	return nfs_lookup_revalidate_dentry(dir, dentry, inode);
+	return nfs_lookup_revalidate_dentry(dir, dentry, inode, flags);
 
 full_reval:
 	return nfs_do_lookup_revalidate(dir, dentry, flags);

From 310e3187450db2ca5699758296d23fc2ce3f37f0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 19:24:38 -0500
Subject: [PATCH 073/229] NFS: Add basic readdir tracing

Add tracing to track how often the client goes to the server for updated
readdir information.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c      | 13 ++++++++-
 fs/nfs/nfstrace.h | 68 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cf7974642a19..d591d20f7534 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -984,10 +984,14 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 		if (desc->page_index == desc->page_index_max)
 			nfs_grow_dtsize(desc);
 		desc->page_index_max = desc->page_index;
+		trace_nfs_readdir_cache_fill(desc->file, nfsi->cookieverf,
+					     desc->last_cookie,
+					     desc->page->index, desc->dtsize);
 		res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
 					       &desc->page, 1);
 		if (res < 0) {
 			nfs_readdir_page_unlock_and_put_cached(desc);
+			trace_nfs_readdir_cache_fill_done(inode, res);
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
 				invalidate_inode_pages2(desc->file->f_mapping);
 				desc->page_index = 0;
@@ -1108,7 +1112,14 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	desc->duped = 0;
 	desc->page_index_max = 0;
 
+	trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
+				   -1, desc->dtsize);
+
 	status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
+	if (status < 0) {
+		trace_nfs_readdir_uncached_done(file_inode(desc->file), status);
+		goto out_free;
+	}
 
 	for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
 		desc->page = arrays[i];
@@ -1127,7 +1138,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 			 i < (desc->page_index_max >> 1))
 			nfs_shrink_dtsize(desc);
 	}
-
+out_free:
 	for (i = 0; i < sz && arrays[i]; i++)
 		nfs_readdir_page_array_free(arrays[i]);
 out:
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 3672f6703ee7..c2d0543ecb2d 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -160,6 +160,8 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
 DEFINE_NFS_INODE_EVENT(nfs_access_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
+DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);
 
 TRACE_EVENT(nfs_access_exit,
 		TP_PROTO(
@@ -271,6 +273,72 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
 DEFINE_NFS_UPDATE_SIZE_EVENT(update);
 DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
 
+DECLARE_EVENT_CLASS(nfs_readdir_event,
+		TP_PROTO(
+			const struct file *file,
+			const __be32 *verifier,
+			u64 cookie,
+			pgoff_t page_index,
+			unsigned int dtsize
+		),
+
+		TP_ARGS(file, verifier, cookie, page_index, dtsize),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(u64, version)
+			__array(char, verifier, NFS4_VERIFIER_SIZE)
+			__field(u64, cookie)
+			__field(pgoff_t, index)
+			__field(unsigned int, dtsize)
+		),
+
+		TP_fast_assign(
+			const struct inode *dir = file_inode(file);
+			const struct nfs_inode *nfsi = NFS_I(dir);
+
+			__entry->dev = dir->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->version = inode_peek_iversion_raw(dir);
+			if (cookie != 0)
+				memcpy(__entry->verifier, verifier,
+				       NFS4_VERIFIER_SIZE);
+			else
+				memset(__entry->verifier, 0,
+				       NFS4_VERIFIER_SIZE);
+			__entry->cookie = cookie;
+			__entry->index = page_index;
+			__entry->dtsize = dtsize;
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+			"cookie=%s:0x%llx cache_index=%lu dtsize=%u",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid, __entry->fhandle,
+			__entry->version, show_nfs4_verifier(__entry->verifier),
+			(unsigned long long)__entry->cookie, __entry->index,
+			__entry->dtsize
+		)
+);
+
+#define DEFINE_NFS_READDIR_EVENT(name) \
+	DEFINE_EVENT(nfs_readdir_event, name, \
+			TP_PROTO( \
+				const struct file *file, \
+				const __be32 *verifier, \
+				u64 cookie, \
+				pgoff_t page_index, \
+				unsigned int dtsize \
+				), \
+			TP_ARGS(file, verifier, cookie, page_index, dtsize))
+
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_cache_fill);
+DEFINE_NFS_READDIR_EVENT(nfs_readdir_uncached);
+
 DECLARE_EVENT_CLASS(nfs_lookup_event,
 		TP_PROTO(
 			const struct inode *dir,

From eace45a18ccb34a746c3060601103aa14ca29923 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 19:19:35 -0500
Subject: [PATCH 074/229] NFS: Trace effects of readdirplus on the dcache

Trace the effects of readdirplus on attribute and dentry revalidation.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c      | 5 +++++
 fs/nfs/nfstrace.h | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d591d20f7534..8b25a39b1761 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -754,8 +754,12 @@ again:
 			status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
 			if (!status)
 				nfs_setsecurity(d_inode(dentry), entry->fattr);
+			trace_nfs_readdir_lookup_revalidate(d_inode(parent),
+							    dentry, 0, status);
 			goto out;
 		} else {
+			trace_nfs_readdir_lookup_revalidate_failed(
+				d_inode(parent), dentry, 0);
 			d_invalidate(dentry);
 			dput(dentry);
 			dentry = NULL;
@@ -777,6 +781,7 @@ again:
 		dentry = alias;
 	}
 	nfs_set_verifier(dentry, dir_verifier);
+	trace_nfs_readdir_lookup(d_inode(parent), dentry, 0);
 out:
 	dput(dentry);
 }
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index c2d0543ecb2d..7c1102b991d0 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -432,6 +432,9 @@ DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_enter);
 DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_exit);
 DEFINE_NFS_LOOKUP_EVENT(nfs_lookup_revalidate_enter);
 DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_lookup_revalidate_exit);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup);
+DEFINE_NFS_LOOKUP_EVENT(nfs_readdir_lookup_revalidate_failed);
+DEFINE_NFS_LOOKUP_EVENT_DONE(nfs_readdir_lookup_revalidate);
 
 TRACE_EVENT(nfs_atomic_open_enter,
 		TP_PROTO(

From 11d03d0a1ed8dbdabab1b5ab21861ad5cad4aef2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 19 Feb 2022 19:09:21 -0500
Subject: [PATCH 075/229] NFS: Trace effects of the readdirplus heuristic

Enable tracking of when the readdirplus heuristic causes a page cache
invalidation.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c      | 11 ++++++++++-
 fs/nfs/nfstrace.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8b25a39b1761..8a246df98db5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1000,6 +1000,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
 				invalidate_inode_pages2(desc->file->f_mapping);
 				desc->page_index = 0;
+				trace_nfs_readdir_invalidate_cache_range(
+					inode, 0, MAX_LFS_FILESIZE);
 				return -EAGAIN;
 			}
 			return res;
@@ -1014,6 +1016,9 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 			invalidate_inode_pages2_range(desc->file->f_mapping,
 						      desc->page_index_max + 1,
 						      -1);
+			trace_nfs_readdir_invalidate_cache_range(
+				inode, desc->page_index_max + 1,
+				MAX_LFS_FILESIZE);
 		}
 	}
 	res = nfs_readdir_search_array(desc);
@@ -1163,7 +1168,11 @@ static void nfs_readdir_handle_cache_misses(struct inode *inode,
 	if (desc->ctx->pos == 0 ||
 	    cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
 		return;
-	invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
+	if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0)
+		return;
+	trace_nfs_readdir_invalidate_cache_range(
+		inode, (loff_t)(page_index + 1) << PAGE_SHIFT,
+		MAX_LFS_FILESIZE);
 }
 
 /* The file offset position represents the dirent entry number.  A
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 7c1102b991d0..ec2645d20abf 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -273,6 +273,56 @@ DEFINE_NFS_UPDATE_SIZE_EVENT(wcc);
 DEFINE_NFS_UPDATE_SIZE_EVENT(update);
 DEFINE_NFS_UPDATE_SIZE_EVENT(grow);
 
+DECLARE_EVENT_CLASS(nfs_inode_range_event,
+		TP_PROTO(
+			const struct inode *inode,
+			loff_t range_start,
+			loff_t range_end
+		),
+
+		TP_ARGS(inode, range_start, range_end),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(u64, version)
+			__field(loff_t, range_start)
+			__field(loff_t, range_end)
+		),
+
+		TP_fast_assign(
+			const struct nfs_inode *nfsi = NFS_I(inode);
+
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
+			__entry->fileid = nfsi->fileid;
+			__entry->version = inode_peek_iversion_raw(inode);
+			__entry->range_start = range_start;
+			__entry->range_end = range_end;
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu "
+			"range=[%lld, %lld]",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle, __entry->version,
+			__entry->range_start, __entry->range_end
+		)
+);
+
+#define DEFINE_NFS_INODE_RANGE_EVENT(name) \
+	DEFINE_EVENT(nfs_inode_range_event, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				loff_t range_start, \
+				loff_t range_end \
+			), \
+			TP_ARGS(inode, range_start, range_end))
+
+DEFINE_NFS_INODE_RANGE_EVENT(nfs_readdir_invalidate_cache_range);
+
 DECLARE_EVENT_CLASS(nfs_readdir_event,
 		TP_PROTO(
 			const struct file *file,

From 9332cf14e2db4253bec827da66bd95e6c0f6a2f3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 26 Feb 2022 18:38:41 -0500
Subject: [PATCH 076/229] NFS: Clean up page array initialisation/free

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8a246df98db5..4983950de2ad 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -199,20 +199,17 @@ static void nfs_grow_dtsize(struct nfs_readdir_descriptor *desc)
 	nfs_set_dtsize(desc, desc->dtsize << 1);
 }
 
-static void nfs_readdir_array_init(struct nfs_cache_array *array)
-{
-	memset(array, 0, sizeof(struct nfs_cache_array));
-}
-
 static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
 					u64 change_attr)
 {
 	struct nfs_cache_array *array;
 
 	array = kmap_atomic(page);
-	nfs_readdir_array_init(array);
 	array->change_attr = change_attr;
 	array->last_cookie = last_cookie;
+	array->size = 0;
+	array->page_full = 0;
+	array->page_is_eof = 0;
 	array->cookies_are_ordered = 1;
 	kunmap_atomic(array);
 }
@@ -220,16 +217,15 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie,
 /*
  * we are freeing strings created by nfs_add_to_readdir_array()
  */
-static
-void nfs_readdir_clear_array(struct page *page)
+static void nfs_readdir_clear_array(struct page *page)
 {
 	struct nfs_cache_array *array;
-	int i;
+	unsigned int i;
 
 	array = kmap_atomic(page);
 	for (i = 0; i < array->size; i++)
 		kfree(array->array[i].name);
-	nfs_readdir_array_init(array);
+	array->size = 0;
 	kunmap_atomic(array);
 }
 

From f648022faa68ef76058aa121d1aa3a967d59cae8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 23 Feb 2022 11:31:51 -0500
Subject: [PATCH 077/229] NFS: Convert readdir page cache to use a cookie based
 index

Instead of using a linear index to address the pages, use the cookie of
the first entry, since that is what we use to match the page anyway.

This allows us to avoid re-reading the entire cache on a seekdir() type
of operation. The latter is very common when re-exporting NFS, and is a
major performance drain.

The change does affect our duplicate cookie detection, since we can no
longer rely on the page index as a linear offset for detecting whether
we looped backwards. However since we no longer do a linear search
through all the pages on each call to nfs_readdir(), this is less of a
concern than it was previously.
The other downside is that invalidate_mapping_pages() no longer can use
the page index to avoid clearing pages that have been read. A subsequent
patch will restore the functionality this provides to the 'ls -l'
heuristic.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/Kconfig         |   4 ++
 fs/nfs/dir.c           | 149 ++++++++++++++++++-----------------------
 include/linux/nfs_fs.h |   2 -
 3 files changed, 69 insertions(+), 86 deletions(-)

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 14a72224b657..47a53b3362b6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -4,6 +4,10 @@ config NFS_FS
 	depends on INET && FILE_LOCKING && MULTIUSER
 	select LOCKD
 	select SUNRPC
+	select CRYPTO
+	select CRYPTO_HASH
+	select XXHASH
+	select CRYPTO_XXHASH
 	select NFS_ACL_SUPPORT if NFS_V3_ACL
 	help
 	  Choose Y here if you want to access files residing on other
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4983950de2ad..8c2552d89310 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -39,6 +39,7 @@
 #include <linux/sched.h>
 #include <linux/kmemleak.h>
 #include <linux/xattr.h>
+#include <linux/xxhash.h>
 
 #include "delegation.h"
 #include "iostat.h"
@@ -159,9 +160,7 @@ struct nfs_readdir_descriptor {
 	pgoff_t		page_index_max;
 	u64		dir_cookie;
 	u64		last_cookie;
-	u64		dup_cookie;
 	loff_t		current_index;
-	loff_t		prev_index;
 
 	__be32		verf[NFS_DIR_VERIFIER_SIZE];
 	unsigned long	dir_verifier;
@@ -171,7 +170,6 @@ struct nfs_readdir_descriptor {
 	unsigned int	cache_entry_index;
 	unsigned int	buffer_fills;
 	unsigned int	dtsize;
-	signed char duped;
 	bool plus;
 	bool eob;
 	bool eof;
@@ -331,6 +329,28 @@ out:
 	return ret;
 }
 
+#define NFS_READDIR_COOKIE_MASK (U32_MAX >> 14)
+/*
+ * Hash algorithm allowing content addressible access to sequences
+ * of directory cookies. Content is addressed by the value of the
+ * cookie index of the first readdir entry in a page.
+ *
+ * The xxhash algorithm is chosen because it is fast, and is supposed
+ * to result in a decent flat distribution of hashes.
+ *
+ * We then select only the first 18 bits to avoid issues with excessive
+ * memory use for the page cache XArray. 18 bits should allow the caching
+ * of 262144 pages of sequences of readdir entries. Since each page holds
+ * 127 readdir entries for a typical 64-bit system, that works out to a
+ * cache of ~ 33 million entries per directory.
+ */
+static pgoff_t nfs_readdir_page_cookie_hash(u64 cookie)
+{
+	if (cookie == 0)
+		return 0;
+	return xxhash(&cookie, sizeof(cookie), 0) & NFS_READDIR_COOKIE_MASK;
+}
+
 static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
 				      u64 change_attr)
 {
@@ -352,15 +372,15 @@ static void nfs_readdir_page_unlock_and_put(struct page *page)
 }
 
 static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
-						pgoff_t index, u64 last_cookie)
+						u64 last_cookie,
+						u64 change_attr)
 {
+	pgoff_t index = nfs_readdir_page_cookie_hash(last_cookie);
 	struct page *page;
-	u64 change_attr;
 
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return NULL;
-	change_attr = inode_peek_iversion_raw(mapping->host);
 	if (PageUptodate(page)) {
 		if (nfs_readdir_page_validate(page, last_cookie, change_attr))
 			return page;
@@ -371,11 +391,6 @@ static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
 	return page;
 }
 
-static loff_t nfs_readdir_page_offset(struct page *page)
-{
-	return (loff_t)page->index * (loff_t)nfs_readdir_array_maxentries();
-}
-
 static u64 nfs_readdir_page_last_cookie(struct page *page)
 {
 	struct nfs_cache_array *array;
@@ -408,11 +423,11 @@ static void nfs_readdir_page_set_eof(struct page *page)
 }
 
 static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
-					      pgoff_t index, u64 cookie)
+					      u64 cookie, u64 change_attr)
 {
 	struct page *page;
 
-	page = nfs_readdir_page_get_locked(mapping, index, cookie);
+	page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
 	if (page) {
 		if (nfs_readdir_page_last_cookie(page) == cookie)
 			return page;
@@ -452,6 +467,13 @@ static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
 		desc->last_cookie = array->array[0].cookie;
 }
 
+static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)
+{
+	desc->current_index = 0;
+	desc->last_cookie = 0;
+	desc->page_index = 0;
+}
+
 static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
 				      struct nfs_readdir_descriptor *desc)
 {
@@ -492,8 +514,7 @@ static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
 static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 					 struct nfs_readdir_descriptor *desc)
 {
-	int i;
-	loff_t new_pos;
+	unsigned int i;
 	int status = -EAGAIN;
 
 	if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
@@ -501,32 +522,10 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == desc->dir_cookie) {
-			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-
-			new_pos = nfs_readdir_page_offset(desc->page) + i;
-			if (desc->attr_gencount != nfsi->attr_gencount) {
-				desc->duped = 0;
-				desc->attr_gencount = nfsi->attr_gencount;
-			} else if (new_pos < desc->prev_index) {
-				if (desc->duped > 0
-				    && desc->dup_cookie == desc->dir_cookie) {
-					if (printk_ratelimit()) {
-						pr_notice("NFS: directory %pD2 contains a readdir loop."
-								"Please contact your server vendor.  "
-								"The file: %s has duplicate cookie %llu\n",
-								desc->file, array->array[i].name, desc->dir_cookie);
-					}
-					status = -ELOOP;
-					goto out;
-				}
-				desc->dup_cookie = desc->dir_cookie;
-				desc->duped = -1;
-			}
 			if (nfs_readdir_use_cookie(desc->file))
 				desc->ctx->pos = desc->dir_cookie;
 			else
-				desc->ctx->pos = new_pos;
-			desc->prev_index = new_pos;
+				desc->ctx->pos = desc->current_index + i;
 			desc->cache_entry_index = i;
 			return 0;
 		}
@@ -538,7 +537,6 @@ check_eof:
 			desc->eof = true;
 	} else
 		nfs_readdir_seek_next_array(array, desc);
-out:
 	return status;
 }
 
@@ -785,10 +783,9 @@ out:
 /* Perform conversion from xdr to cache array */
 static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 				   struct nfs_entry *entry,
-				   struct page **xdr_pages,
-				   unsigned int buflen,
-				   struct page **arrays,
-				   size_t narrays)
+				   struct page **xdr_pages, unsigned int buflen,
+				   struct page **arrays, size_t narrays,
+				   u64 change_attr)
 {
 	struct address_space *mapping = desc->file->f_mapping;
 	struct xdr_stream stream;
@@ -828,18 +825,16 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 				break;
 			arrays++;
 			*arrays = page = new;
-			desc->page_index_max++;
 		} else {
-			new = nfs_readdir_page_get_next(mapping,
-							page->index + 1,
-							entry->prev_cookie);
+			new = nfs_readdir_page_get_next(
+				mapping, entry->prev_cookie, change_attr);
 			if (!new)
 				break;
 			if (page != *arrays)
 				nfs_readdir_page_unlock_and_put(page);
 			page = new;
-			desc->page_index_max = new->index;
 		}
+		desc->page_index_max++;
 		status = nfs_readdir_add_to_array(entry, page);
 	} while (!status && !entry->eof);
 
@@ -899,6 +894,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 				    __be32 *verf_arg, __be32 *verf_res,
 				    struct page **arrays, size_t narrays)
 {
+	u64 change_attr;
 	struct page **pages;
 	struct page *page = *arrays;
 	struct nfs_entry *entry;
@@ -923,6 +919,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	if (!pages)
 		goto out;
 
+	change_attr = inode_peek_iversion_raw(inode);
 	status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, pages,
 					dtsize, verf_res);
 	if (status < 0)
@@ -931,7 +928,7 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 	pglen = status;
 	if (pglen != 0)
 		status = nfs_readdir_page_filler(desc, entry, pages, pglen,
-						 arrays, narrays);
+						 arrays, narrays, change_attr);
 	else
 		nfs_readdir_page_set_eof(page);
 	desc->buffer_fills++;
@@ -961,9 +958,11 @@ nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
 static struct page *
 nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
 {
-	return nfs_readdir_page_get_locked(desc->file->f_mapping,
-					   desc->page_index,
-					   desc->last_cookie);
+	struct address_space *mapping = desc->file->f_mapping;
+	u64 change_attr = inode_peek_iversion_raw(mapping->host);
+
+	return nfs_readdir_page_get_locked(mapping, desc->last_cookie,
+					   change_attr);
 }
 
 /*
@@ -995,7 +994,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 			trace_nfs_readdir_cache_fill_done(inode, res);
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
 				invalidate_inode_pages2(desc->file->f_mapping);
-				desc->page_index = 0;
+				nfs_readdir_rewind_search(desc);
 				trace_nfs_readdir_invalidate_cache_range(
 					inode, 0, MAX_LFS_FILESIZE);
 				return -EAGAIN;
@@ -1009,12 +1008,10 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 		    memcmp(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf))) {
 			memcpy(nfsi->cookieverf, verf,
 			       sizeof(nfsi->cookieverf));
-			invalidate_inode_pages2_range(desc->file->f_mapping,
-						      desc->page_index_max + 1,
+			invalidate_inode_pages2_range(desc->file->f_mapping, 1,
 						      -1);
 			trace_nfs_readdir_invalidate_cache_range(
-				inode, desc->page_index_max + 1,
-				MAX_LFS_FILESIZE);
+				inode, 1, MAX_LFS_FILESIZE);
 		}
 	}
 	res = nfs_readdir_search_array(desc);
@@ -1030,11 +1027,6 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 	int res;
 
 	do {
-		if (desc->page_index == 0) {
-			desc->current_index = 0;
-			desc->prev_index = 0;
-			desc->last_cookie = 0;
-		}
 		res = find_and_lock_cache_page(desc);
 	} while (res == -EAGAIN);
 	return res;
@@ -1072,8 +1064,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
 			desc->ctx->pos = desc->dir_cookie;
 		else
 			desc->ctx->pos++;
-		if (desc->duped != 0)
-			desc->duped = 1;
 	}
 	if (array->page_is_eof)
 		desc->eof = !desc->eob;
@@ -1115,7 +1105,6 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	desc->page_index = 0;
 	desc->cache_entry_index = 0;
 	desc->last_cookie = desc->dir_cookie;
-	desc->duped = 0;
 	desc->page_index_max = 0;
 
 	trace_nfs_readdir_uncached(desc->file, desc->verf, desc->last_cookie,
@@ -1148,6 +1137,8 @@ out_free:
 	for (i = 0; i < sz && arrays[i]; i++)
 		nfs_readdir_page_array_free(arrays[i]);
 out:
+	if (!nfs_readdir_use_cookie(desc->file))
+		nfs_readdir_rewind_search(desc);
 	desc->page_index_max = -1;
 	kfree(arrays);
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
@@ -1158,17 +1149,14 @@ out:
 
 static void nfs_readdir_handle_cache_misses(struct inode *inode,
 					    struct nfs_readdir_descriptor *desc,
-					    pgoff_t page_index,
 					    unsigned int cache_misses)
 {
 	if (desc->ctx->pos == 0 ||
 	    cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
 		return;
-	if (invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1) == 0)
+	if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0)
 		return;
-	trace_nfs_readdir_invalidate_cache_range(
-		inode, (loff_t)(page_index + 1) << PAGE_SHIFT,
-		MAX_LFS_FILESIZE);
+	trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE);
 }
 
 /* The file offset position represents the dirent entry number.  A
@@ -1183,7 +1171,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	struct nfs_readdir_descriptor *desc;
 	unsigned int cache_hits, cache_misses;
-	pgoff_t page_index;
 	int res;
 
 	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1208,10 +1195,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 
 	spin_lock(&file->f_lock);
 	desc->dir_cookie = dir_ctx->dir_cookie;
-	desc->dup_cookie = dir_ctx->dup_cookie;
-	desc->duped = dir_ctx->duped;
-	page_index = dir_ctx->page_index;
-	desc->page_index = page_index;
+	desc->page_index = dir_ctx->page_index;
 	desc->last_cookie = dir_ctx->last_cookie;
 	desc->attr_gencount = dir_ctx->attr_gencount;
 	desc->eof = dir_ctx->eof;
@@ -1227,7 +1211,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	}
 
 	desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
-	nfs_readdir_handle_cache_misses(inode, desc, page_index, cache_misses);
+	nfs_readdir_handle_cache_misses(inode, desc, cache_misses);
 
 	do {
 		res = readdir_search_pagecache(desc);
@@ -1247,7 +1231,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		}
 		if (res == -ETOOSMALL && desc->plus) {
 			nfs_zap_caches(inode);
-			desc->page_index = 0;
 			desc->plus = false;
 			desc->eof = false;
 			continue;
@@ -1261,9 +1244,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 
 	spin_lock(&file->f_lock);
 	dir_ctx->dir_cookie = desc->dir_cookie;
-	dir_ctx->dup_cookie = desc->dup_cookie;
 	dir_ctx->last_cookie = desc->last_cookie;
-	dir_ctx->duped = desc->duped;
 	dir_ctx->attr_gencount = desc->attr_gencount;
 	dir_ctx->page_index = desc->page_index;
 	dir_ctx->eof = desc->eof;
@@ -1306,13 +1287,13 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 	if (offset != filp->f_pos) {
 		filp->f_pos = offset;
 		dir_ctx->page_index = 0;
-		if (!nfs_readdir_use_cookie(filp))
+		if (!nfs_readdir_use_cookie(filp)) {
 			dir_ctx->dir_cookie = 0;
-		else
+			dir_ctx->last_cookie = 0;
+		} else {
 			dir_ctx->dir_cookie = offset;
-		if (offset == 0)
-			memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
-		dir_ctx->duped = 0;
+			dir_ctx->last_cookie = offset;
+		}
 		dir_ctx->eof = false;
 	}
 	spin_unlock(&filp->f_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 20a4cf0acad2..42aad886d3c0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -106,11 +106,9 @@ struct nfs_open_dir_context {
 	unsigned long attr_gencount;
 	__be32	verf[NFS_DIR_VERIFIER_SIZE];
 	__u64 dir_cookie;
-	__u64 dup_cookie;
 	__u64 last_cookie;
 	pgoff_t page_index;
 	unsigned int dtsize;
-	signed char duped;
 	bool eof;
 	struct rcu_head rcu_head;
 };

From b0365ccb0712efacf99936e94e92eb7ae63de4d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 23 Feb 2022 13:29:59 -0500
Subject: [PATCH 078/229] NFS: Fix up forced readdirplus

Avoid clearing the entire readdir page cache if we're just doing forced
readdirplus for the 'ls -l' heuristic.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 56 +++++++++++++++++++++++++++++-------------
 fs/nfs/nfstrace.h      |  1 +
 include/linux/nfs_fs.h |  1 +
 3 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8c2552d89310..f6aac1e8a8b9 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -170,6 +170,7 @@ struct nfs_readdir_descriptor {
 	unsigned int	cache_entry_index;
 	unsigned int	buffer_fills;
 	unsigned int	dtsize;
+	bool clear_cache;
 	bool plus;
 	bool eob;
 	bool eof;
@@ -227,6 +228,13 @@ static void nfs_readdir_clear_array(struct page *page)
 	kunmap_atomic(array);
 }
 
+static void nfs_readdir_page_reinit_array(struct page *page, u64 last_cookie,
+					  u64 change_attr)
+{
+	nfs_readdir_clear_array(page);
+	nfs_readdir_page_init_array(page, last_cookie, change_attr);
+}
+
 static struct page *
 nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
 {
@@ -428,12 +436,11 @@ static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
 	struct page *page;
 
 	page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
-	if (page) {
-		if (nfs_readdir_page_last_cookie(page) == cookie)
-			return page;
-		nfs_readdir_page_unlock_and_put(page);
-	}
-	return NULL;
+	if (!page)
+		return NULL;
+	if (nfs_readdir_page_last_cookie(page) != cookie)
+		nfs_readdir_page_reinit_array(page, cookie, change_attr);
+	return page;
 }
 
 static inline
@@ -960,9 +967,15 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
 {
 	struct address_space *mapping = desc->file->f_mapping;
 	u64 change_attr = inode_peek_iversion_raw(mapping->host);
+	u64 cookie = desc->last_cookie;
+	struct page *page;
 
-	return nfs_readdir_page_get_locked(mapping, desc->last_cookie,
-					   change_attr);
+	page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
+	if (!page)
+		return NULL;
+	if (desc->clear_cache && !nfs_readdir_page_needs_filling(page))
+		nfs_readdir_page_reinit_array(page, cookie, change_attr);
+	return page;
 }
 
 /*
@@ -1013,6 +1026,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 			trace_nfs_readdir_invalidate_cache_range(
 				inode, 1, MAX_LFS_FILESIZE);
 		}
+		desc->clear_cache = false;
 	}
 	res = nfs_readdir_search_array(desc);
 	if (res == 0)
@@ -1147,16 +1161,17 @@ out:
 
 #define NFS_READDIR_CACHE_MISS_THRESHOLD (16UL)
 
-static void nfs_readdir_handle_cache_misses(struct inode *inode,
+static bool nfs_readdir_handle_cache_misses(struct inode *inode,
 					    struct nfs_readdir_descriptor *desc,
-					    unsigned int cache_misses)
+					    unsigned int cache_misses,
+					    bool force_clear)
 {
-	if (desc->ctx->pos == 0 ||
-	    cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD)
-		return;
-	if (invalidate_mapping_pages(inode->i_mapping, 0, -1) == 0)
-		return;
-	trace_nfs_readdir_invalidate_cache_range(inode, 0, MAX_LFS_FILESIZE);
+	if (desc->ctx->pos == 0 || !desc->plus)
+		return false;
+	if (cache_misses <= NFS_READDIR_CACHE_MISS_THRESHOLD && !force_clear)
+		return false;
+	trace_nfs_readdir_force_readdirplus(inode);
+	return true;
 }
 
 /* The file offset position represents the dirent entry number.  A
@@ -1171,6 +1186,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	struct nfs_open_dir_context *dir_ctx = file->private_data;
 	struct nfs_readdir_descriptor *desc;
 	unsigned int cache_hits, cache_misses;
+	bool force_clear;
 	int res;
 
 	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -1203,6 +1219,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
 	cache_hits = atomic_xchg(&dir_ctx->cache_hits, 0);
 	cache_misses = atomic_xchg(&dir_ctx->cache_misses, 0);
+	force_clear = dir_ctx->force_clear;
 	spin_unlock(&file->f_lock);
 
 	if (desc->eof) {
@@ -1211,7 +1228,9 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	}
 
 	desc->plus = nfs_use_readdirplus(inode, ctx, cache_hits, cache_misses);
-	nfs_readdir_handle_cache_misses(inode, desc, cache_misses);
+	force_clear = nfs_readdir_handle_cache_misses(inode, desc, cache_misses,
+						      force_clear);
+	desc->clear_cache = force_clear;
 
 	do {
 		res = readdir_search_pagecache(desc);
@@ -1240,6 +1259,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 
 		nfs_do_filldir(desc, nfsi->cookieverf);
 		nfs_readdir_page_unlock_and_put_cached(desc);
+		if (desc->page_index == desc->page_index_max)
+			desc->clear_cache = force_clear;
 	} while (!desc->eob && !desc->eof);
 
 	spin_lock(&file->f_lock);
@@ -1247,6 +1268,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	dir_ctx->last_cookie = desc->last_cookie;
 	dir_ctx->attr_gencount = desc->attr_gencount;
 	dir_ctx->page_index = desc->page_index;
+	dir_ctx->force_clear = force_clear;
 	dir_ctx->eof = desc->eof;
 	dir_ctx->dtsize = desc->dtsize;
 	memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index ec2645d20abf..59f4ca803fd0 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -160,6 +160,7 @@ DEFINE_NFS_INODE_EVENT(nfs_fsync_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit);
 DEFINE_NFS_INODE_EVENT(nfs_access_enter);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid);
+DEFINE_NFS_INODE_EVENT(nfs_readdir_force_readdirplus);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_cache_fill_done);
 DEFINE_NFS_INODE_EVENT_DONE(nfs_readdir_uncached_done);
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 42aad886d3c0..3893386ceaed 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -109,6 +109,7 @@ struct nfs_open_dir_context {
 	__u64 last_cookie;
 	pgoff_t page_index;
 	unsigned int dtsize;
+	bool force_clear;
 	bool eof;
 	struct rcu_head rcu_head;
 };

From 0adf85b445c7fbc5d2df1f8c1bc54d62c4340237 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 27 Feb 2022 12:46:24 -0500
Subject: [PATCH 079/229] NFS: Optimise away the previous cookie field

Replace the 'previous cookie' field in struct nfs_entry with the
array->last_cookie.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c            | 26 ++++++++++++++------------
 fs/nfs/nfs2xdr.c        |  1 -
 fs/nfs/nfs3xdr.c        |  1 -
 fs/nfs/nfs4xdr.c        |  1 -
 include/linux/nfs_xdr.h |  3 +--
 5 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f6aac1e8a8b9..033249a72e92 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -301,19 +301,20 @@ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
 	return 0;
 }
 
-static
-int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
+static int nfs_readdir_page_array_append(struct page *page,
+					 const struct nfs_entry *entry,
+					 u64 *cookie)
 {
 	struct nfs_cache_array *array;
 	struct nfs_cache_array_entry *cache_entry;
 	const char *name;
-	int ret;
+	int ret = -ENOMEM;
 
 	name = nfs_readdir_copy_name(entry->name, entry->len);
-	if (!name)
-		return -ENOMEM;
 
 	array = kmap_atomic(page);
+	if (!name)
+		goto out;
 	ret = nfs_readdir_array_can_expand(array);
 	if (ret) {
 		kfree(name);
@@ -321,7 +322,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 	}
 
 	cache_entry = &array->array[array->size];
-	cache_entry->cookie = entry->prev_cookie;
+	cache_entry->cookie = array->last_cookie;
 	cache_entry->ino = entry->ino;
 	cache_entry->d_type = entry->d_type;
 	cache_entry->name_len = entry->len;
@@ -333,6 +334,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 	if (entry->eof != 0)
 		nfs_readdir_array_set_eof(array);
 out:
+	*cookie = array->last_cookie;
 	kunmap_atomic(array);
 	return ret;
 }
@@ -798,6 +800,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 	struct xdr_stream stream;
 	struct xdr_buf buf;
 	struct page *scratch, *new, *page = *arrays;
+	u64 cookie;
 	int status;
 
 	scratch = alloc_page(GFP_KERNEL);
@@ -819,22 +822,21 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 			nfs_prime_dcache(file_dentry(desc->file), entry,
 					desc->dir_verifier);
 
-		status = nfs_readdir_add_to_array(entry, page);
+		status = nfs_readdir_page_array_append(page, entry, &cookie);
 		if (status != -ENOSPC)
 			continue;
 
 		if (page->mapping != mapping) {
 			if (!--narrays)
 				break;
-			new = nfs_readdir_page_array_alloc(entry->prev_cookie,
-							   GFP_KERNEL);
+			new = nfs_readdir_page_array_alloc(cookie, GFP_KERNEL);
 			if (!new)
 				break;
 			arrays++;
 			*arrays = page = new;
 		} else {
-			new = nfs_readdir_page_get_next(
-				mapping, entry->prev_cookie, change_attr);
+			new = nfs_readdir_page_get_next(mapping, cookie,
+							change_attr);
 			if (!new)
 				break;
 			if (page != *arrays)
@@ -842,7 +844,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 			page = new;
 		}
 		desc->page_index_max++;
-		status = nfs_readdir_add_to_array(entry, page);
+		status = nfs_readdir_page_array_append(page, entry, &cookie);
 	} while (!status && !entry->eof);
 
 	switch (status) {
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 3d5ba43f44bb..05c3b4b2b3dd 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -955,7 +955,6 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	 * The type (size and byte order) of nfscookie isn't defined in
 	 * RFC 1094.  This implementation assumes that it's an XDR uint32.
 	 */
-	entry->prev_cookie = entry->cookie;
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		return -EAGAIN;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index d6779ceeb39e..3b0b650c9c5a 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2024,7 +2024,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 			zero_nfs_fh3(entry->fh);
 	}
 
-	entry->prev_cookie = entry->cookie;
 	entry->cookie = new_cookie;
 
 	return 0;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b7780b97dc4d..86a5f6516928 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7508,7 +7508,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 	if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE)
 		entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
 
-	entry->prev_cookie = entry->cookie;
 	entry->cookie = new_cookie;
 
 	return 0;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 728cb0c1f0b6..82f7c2730b9a 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -745,8 +745,7 @@ struct nfs_auth_info {
  */
 struct nfs_entry {
 	__u64			ino;
-	__u64			cookie,
-				prev_cookie;
+	__u64			cookie;
 	const char *		name;
 	unsigned int		len;
 	int			eof;

From 612896ec5a4edbf98c4a631503899da04df76480 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 24 Feb 2022 11:48:35 -0500
Subject: [PATCH 080/229] NFS: Cache all entries in the readdirplus reply

Even if we're not able to cache all the entries in the readdir buffer,
let's ensure that we do prime the dcache.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 033249a72e92..7e12102b29e7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -789,6 +789,21 @@ out:
 	dput(dentry);
 }
 
+static int nfs_readdir_entry_decode(struct nfs_readdir_descriptor *desc,
+				    struct nfs_entry *entry,
+				    struct xdr_stream *stream)
+{
+	int ret;
+
+	if (entry->fattr->label)
+		entry->fattr->label->len = NFS4_MAXLABELLEN;
+	ret = xdr_decode(desc, entry, stream);
+	if (ret || !desc->plus)
+		return ret;
+	nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier);
+	return 0;
+}
+
 /* Perform conversion from xdr to cache array */
 static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 				   struct nfs_entry *entry,
@@ -811,17 +826,10 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 	xdr_set_scratch_page(&stream, scratch);
 
 	do {
-		if (entry->fattr->label)
-			entry->fattr->label->len = NFS4_MAXLABELLEN;
-
-		status = xdr_decode(desc, entry, &stream);
+		status = nfs_readdir_entry_decode(desc, entry, &stream);
 		if (status != 0)
 			break;
 
-		if (desc->plus)
-			nfs_prime_dcache(file_dentry(desc->file), entry,
-					desc->dir_verifier);
-
 		status = nfs_readdir_page_array_append(page, entry, &cookie);
 		if (status != -ENOSPC)
 			continue;
@@ -849,15 +857,19 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 
 	switch (status) {
 	case -EBADCOOKIE:
-		if (entry->eof) {
-			nfs_readdir_page_set_eof(page);
-			status = 0;
-		}
-		break;
-	case -ENOSPC:
+		if (!entry->eof)
+			break;
+		nfs_readdir_page_set_eof(page);
+		fallthrough;
 	case -EAGAIN:
 		status = 0;
 		break;
+	case -ENOSPC:
+		status = 0;
+		if (!desc->plus)
+			break;
+		while (!nfs_readdir_entry_decode(desc, entry, &stream))
+			;
 	}
 
 	if (page != *arrays)

From 0409ab77728d705f376ae28fd7114161295e7ed2 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 2 Mar 2022 11:29:13 +0000
Subject: [PATCH 081/229] dt-bindings: opp: Add "opp-microwatt" entry in the
 OPP

Add new entry for the OPP which provides information about power
expressed in micro-Watts. It is useful for the Energy Model framework.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../devicetree/bindings/opp/opp-v2-base.yaml  | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/Documentation/devicetree/bindings/opp/opp-v2-base.yaml b/Documentation/devicetree/bindings/opp/opp-v2-base.yaml
index 15a76bcd6d42..04a592c0f862 100644
--- a/Documentation/devicetree/bindings/opp/opp-v2-base.yaml
+++ b/Documentation/devicetree/bindings/opp/opp-v2-base.yaml
@@ -93,6 +93,21 @@ patternProperties:
         minItems: 1
         maxItems: 8   # Should be enough regulators
 
+      opp-microwatt:
+        description: |
+          The power for the OPP in micro-Watts.
+
+          Entries for multiple regulators shall be provided in the same field
+          separated by angular brackets <>. If current values aren't required
+          for a regulator, then it shall be filled with 0. If power values
+          aren't required for any of the regulators, then this field is not
+          required. The OPP binding doesn't provide any provisions to relate the
+          values to their power supplies or the order in which the supplies need
+          to be configured and that is left for the implementation specific
+          binding.
+        minItems: 1
+        maxItems: 8   # Should be enough regulators
+
       opp-level:
         description:
           A value representing the performance level of the device.
@@ -203,6 +218,14 @@ patternProperties:
         minItems: 1
         maxItems: 8   # Should be enough regulators
 
+      '^opp-microwatt':
+        description:
+          Named opp-microwatt property. Similar to opp-microamp property,
+          but for microwatt instead.
+        $ref: /schemas/types.yaml#/definitions/uint32-array
+        minItems: 1
+        maxItems: 8   # Should be enough regulators
+
     dependencies:
       opp-avg-kBps: [ opp-peak-kBps ]
 

From 4f9a7a1dc2a294c5c5c4b0246e2281e6ec88fb91 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 2 Mar 2022 11:29:14 +0000
Subject: [PATCH 082/229] OPP: Add "opp-microwatt" supporting code

Add new property to the OPP: power value. The OPP entry in the DT can have
"opp-microwatt". Add the needed code to handle this new property in the
existing infrastructure.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c     | 25 ++++++++++++++++++++++
 drivers/opp/debugfs.c  |  3 +++
 drivers/opp/of.c       | 47 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/pm_opp.h | 12 ++++++++++-
 4 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 3057beabd370..740407252298 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -113,6 +113,31 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
 
+/**
+ * dev_pm_opp_get_power() - Gets the power corresponding to an opp
+ * @opp:	opp for which power has to be returned for
+ *
+ * Return: power in micro watt corresponding to the opp, else
+ * return 0
+ *
+ * This is useful only for devices with single power supply.
+ */
+unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp)
+{
+	unsigned long opp_power = 0;
+	int i;
+
+	if (IS_ERR_OR_NULL(opp)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+	for (i = 0; i < opp->opp_table->regulator_count; i++)
+		opp_power += opp->supplies[i].u_watt;
+
+	return opp_power;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_power);
+
 /**
  * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
  * @opp:	opp for which frequency has to be returned for
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
index b5f2f9f39392..3fcc1f97f2d1 100644
--- a/drivers/opp/debugfs.c
+++ b/drivers/opp/debugfs.c
@@ -100,6 +100,9 @@ static void opp_debug_create_supplies(struct dev_pm_opp *opp,
 
 		debugfs_create_ulong("u_amp", S_IRUGO, d,
 				     &opp->supplies[i].u_amp);
+
+		debugfs_create_ulong("u_watt", S_IRUGO, d,
+				     &opp->supplies[i].u_watt);
 	}
 }
 
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 2f40afa4e65c..7bff30f27dc1 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -575,8 +575,9 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
 static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 			      struct opp_table *opp_table)
 {
-	u32 *microvolt, *microamp = NULL;
-	int supplies = opp_table->regulator_count, vcount, icount, ret, i, j;
+	u32 *microvolt, *microamp = NULL, *microwatt = NULL;
+	int supplies = opp_table->regulator_count;
+	int vcount, icount, pcount, ret, i, j;
 	struct property *prop = NULL;
 	char name[NAME_MAX];
 
@@ -688,6 +689,43 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 		}
 	}
 
+	/* Search for "opp-microwatt" */
+	sprintf(name, "opp-microwatt");
+	prop = of_find_property(opp->np, name, NULL);
+
+	if (prop) {
+		pcount = of_property_count_u32_elems(opp->np, name);
+		if (pcount < 0) {
+			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
+				name, pcount);
+			ret = pcount;
+			goto free_microamp;
+		}
+
+		if (pcount != supplies) {
+			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+				__func__, name, pcount, supplies);
+			ret = -EINVAL;
+			goto free_microamp;
+		}
+
+		microwatt = kmalloc_array(pcount, sizeof(*microwatt),
+					  GFP_KERNEL);
+		if (!microwatt) {
+			ret = -EINVAL;
+			goto free_microamp;
+		}
+
+		ret = of_property_read_u32_array(opp->np, name, microwatt,
+						 pcount);
+		if (ret) {
+			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
+				name, ret);
+			ret = -EINVAL;
+			goto free_microwatt;
+		}
+	}
+
 	for (i = 0, j = 0; i < supplies; i++) {
 		opp->supplies[i].u_volt = microvolt[j++];
 
@@ -701,8 +739,13 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 
 		if (microamp)
 			opp->supplies[i].u_amp = microamp[i];
+
+		if (microwatt)
+			opp->supplies[i].u_watt = microwatt[i];
 	}
 
+free_microwatt:
+	kfree(microwatt);
 free_microamp:
 	kfree(microamp);
 free_microvolt:
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 879c138c7b8e..0d85a63a1f78 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -32,14 +32,17 @@ enum dev_pm_opp_event {
  * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
  * @u_volt_max:	Maximum voltage in microvolts corresponding to this OPP
  * @u_amp:	Maximum current drawn by the device in microamperes
+ * @u_watt:	Power used by the device in microwatts
  *
- * This structure stores the voltage/current values for a single power supply.
+ * This structure stores the voltage/current/power values for a single power
+ * supply.
  */
 struct dev_pm_opp_supply {
 	unsigned long u_volt;
 	unsigned long u_volt_min;
 	unsigned long u_volt_max;
 	unsigned long u_amp;
+	unsigned long u_watt;
 };
 
 /**
@@ -94,6 +97,8 @@ void dev_pm_opp_put_opp_table(struct opp_table *opp_table);
 
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
 
+unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp);
+
 unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp);
 
 unsigned int dev_pm_opp_get_level(struct dev_pm_opp *opp);
@@ -186,6 +191,11 @@ static inline unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 	return 0;
 }
 
+static inline unsigned long dev_pm_opp_get_power(struct dev_pm_opp *opp)
+{
+	return 0;
+}
+
 static inline unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
 {
 	return 0;

From caeea9e6671984c3865918459d756b961a24bb49 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 2 Mar 2022 11:29:15 +0000
Subject: [PATCH 083/229] PM: EM: add macro to set .active_power() callback
 conditionally

The Energy Model is able to use new power values coming from DT. Add a new
macro which is helpful in setting the .active_power() callback
conditionally in setup time. The dual-macro implementation handles both
kernel configurations: w/ EM and w/o EM built-in.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 include/linux/energy_model.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 6377adc3b78d..9f3c400bc52d 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -116,6 +116,7 @@ struct em_data_callback {
 			    struct device *dev);
 };
 #define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
+#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = cb)
 
 struct em_perf_domain *em_cpu_get(int cpu);
 struct em_perf_domain *em_pd_get(struct device *dev);
@@ -264,6 +265,7 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
 #else
 struct em_data_callback {};
 #define EM_DATA_CB(_active_power_cb) { }
+#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0)
 
 static inline
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,

From 32bf8bc9a077680566b2b4f88b9b46e6cc740179 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 2 Mar 2022 11:29:16 +0000
Subject: [PATCH 084/229] OPP: Add support of "opp-microwatt" for EM
 registration

The Energy Model (EM) can be created based on DT entry:
'dynamic-power-coefficient'. It's a 'simple' EM which is limited to the
dynamic power. It has to fit into the math formula which requires also
information about voltage. Some of the platforms don't expose voltage
information, thus it's not possible to use EM registration using DT.

This patch aims to fix it. It introduces new implementation of the EM
registration callback. The new mechanism relies on the new OPP feature
allowing to get power (which is coming from "opp-microwatt" DT property)
expressed in micro-Watts.

The patch also opens new opportunity to better support platforms, which
have a decent static power. It allows to register the EM based on real
power measurements which models total power (static + dynamic), so better
reflects real HW.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/of.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index 7bff30f27dc1..440ab5a03df9 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1438,6 +1438,38 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node);
 
+/*
+ * Callback function provided to the Energy Model framework upon registration.
+ * It provides the power used by @dev at @kHz if it is the frequency of an
+ * existing OPP, or at the frequency of the first OPP above @kHz otherwise
+ * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
+ * frequency and @mW to the associated power.
+ *
+ * Returns 0 on success or a proper -EINVAL value in case of error.
+ */
+static int __maybe_unused
+_get_dt_power(unsigned long *mW, unsigned long *kHz, struct device *dev)
+{
+	struct dev_pm_opp *opp;
+	unsigned long opp_freq, opp_power;
+
+	/* Find the right frequency and related OPP */
+	opp_freq = *kHz * 1000;
+	opp = dev_pm_opp_find_freq_ceil(dev, &opp_freq);
+	if (IS_ERR(opp))
+		return -EINVAL;
+
+	opp_power = dev_pm_opp_get_power(opp);
+	dev_pm_opp_put(opp);
+	if (!opp_power)
+		return -EINVAL;
+
+	*kHz = opp_freq / 1000;
+	*mW = opp_power / 1000;
+
+	return 0;
+}
+
 /*
  * Callback function provided to the Energy Model framework upon registration.
  * This computes the power estimated by @dev at @kHz if it is the frequency
@@ -1488,6 +1520,24 @@ static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz,
 	return 0;
 }
 
+static bool _of_has_opp_microwatt_property(struct device *dev)
+{
+	unsigned long power, freq = 0;
+	struct dev_pm_opp *opp;
+
+	/* Check if at least one OPP has needed property */
+	opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+	if (IS_ERR(opp))
+		return false;
+
+	power = dev_pm_opp_get_power(opp);
+	dev_pm_opp_put(opp);
+	if (!power)
+		return false;
+
+	return true;
+}
+
 /**
  * dev_pm_opp_of_register_em() - Attempt to register an Energy Model
  * @dev		: Device for which an Energy Model has to be registered
@@ -1501,7 +1551,7 @@ static int __maybe_unused _get_power(unsigned long *mW, unsigned long *kHz,
  */
 int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus)
 {
-	struct em_data_callback em_cb = EM_DATA_CB(_get_power);
+	struct em_data_callback em_cb;
 	struct device_node *np;
 	int ret, nr_opp;
 	u32 cap;
@@ -1517,6 +1567,12 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus)
 		goto failed;
 	}
 
+	/* First, try to find more precised Energy Model in DT */
+	if (_of_has_opp_microwatt_property(dev)) {
+		EM_SET_ACTIVE_POWER_CB(em_cb, _get_dt_power);
+		goto register_em;
+	}
+
 	np = of_node_get(dev->of_node);
 	if (!np) {
 		ret = -EINVAL;
@@ -1538,6 +1594,9 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus)
 		goto failed;
 	}
 
+	EM_SET_ACTIVE_POWER_CB(em_cb, _get_power);
+
+register_em:
 	ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true);
 	if (ret)
 		goto failed;

From f48a0c475c2aec8f2274703e1dc7be503f40f7cc Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 2 Mar 2022 11:29:17 +0000
Subject: [PATCH 085/229] Documentation: EM: Describe new registration method
 using DT

The new registration method allows to get power values from the DT OPP
definition. The new OPP entry property "opp-microwatt" contains total
power expressed in micro-Watts. Align the EM documentation with this
new possible registration method of EM.

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 Documentation/power/energy-model.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
index 5ac62a7b4b7c..49549aab41b4 100644
--- a/Documentation/power/energy-model.rst
+++ b/Documentation/power/energy-model.rst
@@ -113,6 +113,16 @@ to: return warning/error, stop working or panic.
 See Section 3. for an example of driver implementing this
 callback, or Section 2.4 for further documentation on this API
 
+Registration of EM using DT
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The  EM can also be registered using OPP framework and information in DT
+"operating-points-v2". Each OPP entry in DT can be extended with a property
+"opp-microwatt" containing micro-Watts power value. This OPP DT property
+allows a platform to register EM power values which are reflecting total power
+(static + dynamic). These power values might be coming directly from
+experiments and measurements.
+
 Registration of 'simple' EM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

From f5ff79fddf0efecca538046b5cc20fb3ded2ec4f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 26 Feb 2022 16:40:21 +0100
Subject: [PATCH 086/229] dma-mapping: remove CONFIG_DMA_REMAP

CONFIG_DMA_REMAP is used to build a few helpers around the core
vmalloc code, and to use them in case there is a highmem page in
dma-direct, and to make dma coherent allocations be able to use
non-contiguous pages allocations for DMA allocations in the dma-iommu
layer.

Right now it needs to be explicitly selected by architectures, and
is only done so by architectures that require remapping to deal
with devices that are not DMA coherent.  Make it unconditional for
builds with CONFIG_MMU as it is very little extra code, but makes
it much more likely that large DMA allocations succeed on x86.

This fixes hot plugging a NVMe thunderbolt SSD for me, which tries
to allocate a 1MB buffer that is otherwise hard to obtain due to
memory fragmentation on a heavily used laptop.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm/Kconfig          |  2 +-
 arch/xtensa/Kconfig       |  2 +-
 drivers/iommu/dma-iommu.c | 14 +++++---------
 kernel/dma/Kconfig        |  7 +------
 kernel/dma/Makefile       |  2 +-
 kernel/dma/direct.c       | 18 +++++++-----------
 6 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index fabe39169b12..11fbf66ef8f8 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -47,7 +47,7 @@ config ARM
 	select DMA_DECLARE_COHERENT
 	select DMA_GLOBAL_POOL if !MMU
 	select DMA_OPS
-	select DMA_REMAP if MMU
+	select DMA_NONCOHERENT_MMAP if MMU
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
 	select GENERIC_ALLOCATOR
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 8ac599aa6d99..76438ee313d1 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -17,7 +17,7 @@ config XTENSA
 	select BUILDTIME_TABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
-	select DMA_REMAP if MMU
+	select DMA_NONCOHERENT_MMAP if MMU
 	select GENERIC_ATOMIC64
 	select GENERIC_IRQ_SHOW
 	select GENERIC_PCI_IOMAP
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index d85d54f2b549..cebced7ddf39 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -852,7 +852,6 @@ out_unmap:
 	return NULL;
 }
 
-#ifdef CONFIG_DMA_REMAP
 static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev,
 		size_t size, enum dma_data_direction dir, gfp_t gfp,
 		unsigned long attrs)
@@ -882,7 +881,6 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size,
 	sg_free_table(&sh->sgt);
 	kfree(sh);
 }
-#endif /* CONFIG_DMA_REMAP */
 
 static void iommu_dma_sync_single_for_cpu(struct device *dev,
 		dma_addr_t dma_handle, size_t size, enum dma_data_direction dir)
@@ -1276,7 +1274,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr)
 	    dma_free_from_pool(dev, cpu_addr, alloc_size))
 		return;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+	if (is_vmalloc_addr(cpu_addr)) {
 		/*
 		 * If it the address is remapped, then it's either non-coherent
 		 * or highmem CMA, or an iommu_dma_alloc_remap() construction.
@@ -1318,7 +1316,7 @@ static void *iommu_dma_alloc_pages(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && (!coherent || PageHighMem(page))) {
+	if (!coherent || PageHighMem(page)) {
 		pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
 
 		cpu_addr = dma_common_contiguous_remap(page, alloc_size,
@@ -1350,7 +1348,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size,
 
 	gfp |= __GFP_ZERO;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && gfpflags_allow_blocking(gfp) &&
+	if (gfpflags_allow_blocking(gfp) &&
 	    !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) {
 		return iommu_dma_alloc_remap(dev, size, handle, gfp,
 				dma_pgprot(dev, PAGE_KERNEL, attrs), attrs);
@@ -1391,7 +1389,7 @@ static int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
 	if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
 		return -ENXIO;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+	if (is_vmalloc_addr(cpu_addr)) {
 		struct page **pages = dma_common_find_pages(cpu_addr);
 
 		if (pages)
@@ -1413,7 +1411,7 @@ static int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
 	struct page *page;
 	int ret;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+	if (is_vmalloc_addr(cpu_addr)) {
 		struct page **pages = dma_common_find_pages(cpu_addr);
 
 		if (pages) {
@@ -1445,10 +1443,8 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.free			= iommu_dma_free,
 	.alloc_pages		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
-#ifdef CONFIG_DMA_REMAP
 	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
 	.free_noncontiguous	= iommu_dma_free_noncontiguous,
-#endif
 	.mmap			= iommu_dma_mmap,
 	.get_sgtable		= iommu_dma_get_sgtable,
 	.map_page		= iommu_dma_map_page,
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..56866aaa2ae1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL
 	select DMA_DECLARE_COHERENT
 	bool
 
-config DMA_REMAP
-	bool
-	depends on MMU
-	select DMA_NONCOHERENT_MMAP
-
 config DMA_DIRECT_REMAP
 	bool
-	select DMA_REMAP
 	select DMA_COHERENT_POOL
+	select DMA_NONCOHERENT_MMAP
 
 config DMA_CMA
 	bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT)	+= coherent.o
 obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)		+= pool.o
-obj-$(CONFIG_DMA_REMAP)			+= remap.o
+obj-$(CONFIG_MMU)			+= remap.o
 obj-$(CONFIG_DMA_MAP_BENCHMARK)		+= map_benchmark.o
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50f48e9e4598..35a1d29d6a2e 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -265,17 +265,13 @@ void *dma_direct_alloc(struct device *dev, size_t size,
 	page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
 	if (!page)
 		return NULL;
+
+	/*
+	 * dma_alloc_contiguous can return highmem pages depending on a
+	 * combination the cma= arguments and per-arch setup.  These need to be
+	 * remapped to return a kernel virtual address.
+	 */
 	if (PageHighMem(page)) {
-		/*
-		 * Depending on the cma= arguments and per-arch setup,
-		 * dma_alloc_contiguous could return highmem pages.
-		 * Without remapping there is no way to return them here, so
-		 * log an error and fail.
-		 */
-		if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
-			dev_info(dev, "Rejecting highmem page from CMA.\n");
-			goto out_free_pages;
-		}
 		remap = true;
 		set_uncached = false;
 	}
@@ -349,7 +345,7 @@ void dma_direct_free(struct device *dev, size_t size,
 	    dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
 		return;
 
-	if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+	if (is_vmalloc_addr(cpu_addr)) {
 		vunmap(cpu_addr);
 	} else {
 		if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))

From 80e4390981618e290616dbd06ea190d4576f219d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 28 Feb 2022 14:04:53 -0800
Subject: [PATCH 087/229] dma-debug: fix return value of __setup handlers

When valid kernel command line parameters
  dma_debug=off dma_debug_entries=100
are used, they are reported as Unknown parameters and added to init's
environment strings, polluting it.

  Unknown kernel command line parameters "BOOT_IMAGE=/boot/bzImage-517rc5
    dma_debug=off dma_debug_entries=100", will be passed to user space.

and

 Run /sbin/init as init process
   with arguments:
     /sbin/init
   with environment:
     HOME=/
     TERM=linux
     BOOT_IMAGE=/boot/bzImage-517rc5
     dma_debug=off
     dma_debug_entries=100

Return 1 from these __setup handlers to indicate that the command line
option has been handled.

Fixes: 59d3daafa1726 ("dma-debug: add kernel command line parameters")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: Igor Zhbanov <i.zhbanov@omprussia.ru>
Link: lore.kernel.org/r/64644a2f-4a20-bab3-1e15-3b2cdd0defe3@omprussia.ru
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: iommu@lists.linux-foundation.org
Cc: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/dma/debug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..f8ff598596b8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str)
 		global_disable = true;
 	}
 
-	return 0;
+	return 1;
 }
 
 static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str)
 		return -EINVAL;
 	if (!get_option(&str, &nr_prealloc_entries))
 		nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
-	return 0;
+	return 1;
 }
 
 __setup("dma_debug=", dma_debug_cmdline);

From 9a61d0838cd0a81529badfba7bfa39e81d5529d3 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Fri, 25 Feb 2022 20:00:21 +0530
Subject: [PATCH 088/229] drivers/nvdimm: Add nvdimm pmu structure

A structure is added called nvdimm_pmu, for performance
stats reporting support of nvdimm devices. It can be used to add
device pmu data such as pmu data structure for performance
stats, nvdimm device pointer along with cpumask attributes.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@in.ibm.com>
Link: https://lore.kernel.org/r/20220225143024.47947-2-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/nd.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/linux/nd.h b/include/linux/nd.h
index 8a8c63edb1b2..ad186e828263 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -8,6 +8,7 @@
 #include <linux/ndctl.h>
 #include <linux/device.h>
 #include <linux/badblocks.h>
+#include <linux/perf_event.h>
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
@@ -23,6 +24,25 @@ enum nvdimm_claim_class {
 	NVDIMM_CCLASS_UNKNOWN,
 };
 
+/**
+ * struct nvdimm_pmu - data structure for nvdimm perf driver
+ * @pmu: pmu data structure for nvdimm performance stats.
+ * @dev: nvdimm device pointer.
+ * @cpu: designated cpu for counter access.
+ * @node: node for cpu hotplug notifier link.
+ * @cpuhp_state: state for cpu hotplug notification.
+ * @arch_cpumask: cpumask to get designated cpu for counter access.
+ */
+struct nvdimm_pmu {
+	struct pmu pmu;
+	struct device *dev;
+	int cpu;
+	struct hlist_node node;
+	enum cpuhp_state cpuhp_state;
+	/* cpumask provided by arch/platform specific code */
+	struct cpumask arch_cpumask;
+};
+
 struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;

From 0fab1ba6ad6ba1f76380f92ead95c6e861ef8116 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Fri, 25 Feb 2022 20:00:22 +0530
Subject: [PATCH 089/229] drivers/nvdimm: Add perf interface to expose nvdimm
 performance stats

A common interface is added to get performance stats reporting
support for nvdimm devices. Added interface defines supported
event list, config fields for the event attributes and their
corresponding bit values which are exported via sysfs.

Interface also added support for pmu register/unregister functions,
cpu hotplug feature along with macros for handling events addition
via sysfs. It adds attribute groups for format, cpumask and events
to the pmu structure.

User could use the standard perf tool to access perf events exposed
via nvdimm pmu.

[Declare pmu functions in nd.h file to resolve implicit-function-declaration
warning and make hotplug function static as reported by kernel test robot]

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Link: https://lore.kernel.org/all/202202241242.zqzGkguy-lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Madhavan Srinivasan <maddy@in.ibm.com>
Link: https://lore.kernel.org/r/20220225143024.47947-3-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/Makefile  |   1 +
 drivers/nvdimm/nd_perf.c | 328 +++++++++++++++++++++++++++++++++++++++
 include/linux/nd.h       |  24 +++
 3 files changed, 353 insertions(+)
 create mode 100644 drivers/nvdimm/nd_perf.c

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 29203f3d3069..25dba6095612 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -18,6 +18,7 @@ nd_e820-y := e820.o
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
+libnvdimm-y += nd_perf.o
 libnvdimm-y += dimm.o
 libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c
new file mode 100644
index 000000000000..314415894acf
--- /dev/null
+++ b/drivers/nvdimm/nd_perf.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nd_perf.c: NVDIMM Device Performance Monitoring Unit support
+ *
+ * Perf interface to expose nvdimm performance stats.
+ *
+ * Copyright (C) 2021 IBM Corporation
+ */
+
+#define pr_fmt(fmt) "nvdimm_pmu: " fmt
+
+#include <linux/nd.h>
+
+#define EVENT(_name, _code)     enum{_name = _code}
+
+/*
+ * NVDIMM Events codes.
+ */
+
+/* Controller Reset Count */
+EVENT(CTL_RES_CNT,		0x1);
+/* Controller Reset Elapsed Time */
+EVENT(CTL_RES_TM,		0x2);
+/* Power-on Seconds */
+EVENT(POWERON_SECS,		0x3);
+/* Life Remaining */
+EVENT(MEM_LIFE,		0x4);
+/* Critical Resource Utilization */
+EVENT(CRI_RES_UTIL,		0x5);
+/* Host Load Count */
+EVENT(HOST_L_CNT,		0x6);
+/* Host Store Count */
+EVENT(HOST_S_CNT,		0x7);
+/* Host Store Duration */
+EVENT(HOST_S_DUR,		0x8);
+/* Host Load Duration */
+EVENT(HOST_L_DUR,		0x9);
+/* Media Read Count */
+EVENT(MED_R_CNT,		0xa);
+/* Media Write Count */
+EVENT(MED_W_CNT,		0xb);
+/* Media Read Duration */
+EVENT(MED_R_DUR,		0xc);
+/* Media Write Duration */
+EVENT(MED_W_DUR,		0xd);
+/* Cache Read Hit Count */
+EVENT(CACHE_RH_CNT,		0xe);
+/* Cache Write Hit Count */
+EVENT(CACHE_WH_CNT,		0xf);
+/* Fast Write Count */
+EVENT(FAST_W_CNT,		0x10);
+
+NVDIMM_EVENT_ATTR(ctl_res_cnt,		CTL_RES_CNT);
+NVDIMM_EVENT_ATTR(ctl_res_tm,		CTL_RES_TM);
+NVDIMM_EVENT_ATTR(poweron_secs,		POWERON_SECS);
+NVDIMM_EVENT_ATTR(mem_life,		MEM_LIFE);
+NVDIMM_EVENT_ATTR(cri_res_util,		CRI_RES_UTIL);
+NVDIMM_EVENT_ATTR(host_l_cnt,		HOST_L_CNT);
+NVDIMM_EVENT_ATTR(host_s_cnt,		HOST_S_CNT);
+NVDIMM_EVENT_ATTR(host_s_dur,		HOST_S_DUR);
+NVDIMM_EVENT_ATTR(host_l_dur,		HOST_L_DUR);
+NVDIMM_EVENT_ATTR(med_r_cnt,		MED_R_CNT);
+NVDIMM_EVENT_ATTR(med_w_cnt,		MED_W_CNT);
+NVDIMM_EVENT_ATTR(med_r_dur,		MED_R_DUR);
+NVDIMM_EVENT_ATTR(med_w_dur,		MED_W_DUR);
+NVDIMM_EVENT_ATTR(cache_rh_cnt,		CACHE_RH_CNT);
+NVDIMM_EVENT_ATTR(cache_wh_cnt,		CACHE_WH_CNT);
+NVDIMM_EVENT_ATTR(fast_w_cnt,		FAST_W_CNT);
+
+static struct attribute *nvdimm_events_attr[] = {
+	NVDIMM_EVENT_PTR(CTL_RES_CNT),
+	NVDIMM_EVENT_PTR(CTL_RES_TM),
+	NVDIMM_EVENT_PTR(POWERON_SECS),
+	NVDIMM_EVENT_PTR(MEM_LIFE),
+	NVDIMM_EVENT_PTR(CRI_RES_UTIL),
+	NVDIMM_EVENT_PTR(HOST_L_CNT),
+	NVDIMM_EVENT_PTR(HOST_S_CNT),
+	NVDIMM_EVENT_PTR(HOST_S_DUR),
+	NVDIMM_EVENT_PTR(HOST_L_DUR),
+	NVDIMM_EVENT_PTR(MED_R_CNT),
+	NVDIMM_EVENT_PTR(MED_W_CNT),
+	NVDIMM_EVENT_PTR(MED_R_DUR),
+	NVDIMM_EVENT_PTR(MED_W_DUR),
+	NVDIMM_EVENT_PTR(CACHE_RH_CNT),
+	NVDIMM_EVENT_PTR(CACHE_WH_CNT),
+	NVDIMM_EVENT_PTR(FAST_W_CNT),
+	NULL
+};
+
+static struct attribute_group nvdimm_pmu_events_group = {
+	.name = "events",
+	.attrs = nvdimm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-4");
+
+static struct attribute *nvdimm_pmu_format_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group nvdimm_pmu_format_group = {
+	.name = "format",
+	.attrs = nvdimm_pmu_format_attr,
+};
+
+ssize_t nvdimm_events_sysfs_show(struct device *dev,
+				 struct device_attribute *attr, char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sprintf(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+static ssize_t nvdimm_pmu_cpumask_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct nvdimm_pmu *nd_pmu;
+
+	nd_pmu = container_of(pmu, struct nvdimm_pmu, pmu);
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(nd_pmu->cpu));
+}
+
+static int nvdimm_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvdimm_pmu *nd_pmu;
+	u32 target;
+	int nodeid;
+	const struct cpumask *cpumask;
+
+	nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+	/* Clear it, incase given cpu is set in nd_pmu->arch_cpumask */
+	cpumask_test_and_clear_cpu(cpu, &nd_pmu->arch_cpumask);
+
+	/*
+	 * If given cpu is not same as current designated cpu for
+	 * counter access, just return.
+	 */
+	if (cpu != nd_pmu->cpu)
+		return 0;
+
+	/* Check for any active cpu in nd_pmu->arch_cpumask */
+	target = cpumask_any(&nd_pmu->arch_cpumask);
+
+	/*
+	 * Incase we don't have any active cpu in nd_pmu->arch_cpumask,
+	 * check in given cpu's numa node list.
+	 */
+	if (target >= nr_cpu_ids) {
+		nodeid = cpu_to_node(cpu);
+		cpumask = cpumask_of_node(nodeid);
+		target = cpumask_any_but(cpumask, cpu);
+	}
+	nd_pmu->cpu = target;
+
+	/* Migrate nvdimm pmu events to the new target cpu if valid */
+	if (target >= 0 && target < nr_cpu_ids)
+		perf_pmu_migrate_context(&nd_pmu->pmu, cpu, target);
+
+	return 0;
+}
+
+static int nvdimm_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvdimm_pmu *nd_pmu;
+
+	nd_pmu = hlist_entry_safe(node, struct nvdimm_pmu, node);
+
+	if (nd_pmu->cpu >= nr_cpu_ids)
+		nd_pmu->cpu = cpu;
+
+	return 0;
+}
+
+static int create_cpumask_attr_group(struct nvdimm_pmu *nd_pmu)
+{
+	struct perf_pmu_events_attr *pmu_events_attr;
+	struct attribute **attrs_group;
+	struct attribute_group *nvdimm_pmu_cpumask_group;
+
+	pmu_events_attr = kzalloc(sizeof(*pmu_events_attr), GFP_KERNEL);
+	if (!pmu_events_attr)
+		return -ENOMEM;
+
+	attrs_group = kzalloc(2 * sizeof(struct attribute *), GFP_KERNEL);
+	if (!attrs_group) {
+		kfree(pmu_events_attr);
+		return -ENOMEM;
+	}
+
+	/* Allocate memory for cpumask attribute group */
+	nvdimm_pmu_cpumask_group = kzalloc(sizeof(*nvdimm_pmu_cpumask_group), GFP_KERNEL);
+	if (!nvdimm_pmu_cpumask_group) {
+		kfree(pmu_events_attr);
+		kfree(attrs_group);
+		return -ENOMEM;
+	}
+
+	sysfs_attr_init(&pmu_events_attr->attr.attr);
+	pmu_events_attr->attr.attr.name = "cpumask";
+	pmu_events_attr->attr.attr.mode = 0444;
+	pmu_events_attr->attr.show = nvdimm_pmu_cpumask_show;
+	attrs_group[0] = &pmu_events_attr->attr.attr;
+	attrs_group[1] = NULL;
+
+	nvdimm_pmu_cpumask_group->attrs = attrs_group;
+	nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR] = nvdimm_pmu_cpumask_group;
+	return 0;
+}
+
+static int nvdimm_pmu_cpu_hotplug_init(struct nvdimm_pmu *nd_pmu)
+{
+	int nodeid, rc;
+	const struct cpumask *cpumask;
+
+	/*
+	 * Incase of cpu hotplug feature, arch specific code
+	 * can provide required cpumask which can be used
+	 * to get designatd cpu for counter access.
+	 * Check for any active cpu in nd_pmu->arch_cpumask.
+	 */
+	if (!cpumask_empty(&nd_pmu->arch_cpumask)) {
+		nd_pmu->cpu = cpumask_any(&nd_pmu->arch_cpumask);
+	} else {
+		/* pick active cpu from the cpumask of device numa node. */
+		nodeid = dev_to_node(nd_pmu->dev);
+		cpumask = cpumask_of_node(nodeid);
+		nd_pmu->cpu = cpumask_any(cpumask);
+	}
+
+	rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "perf/nvdimm:online",
+				     nvdimm_pmu_cpu_online, nvdimm_pmu_cpu_offline);
+
+	if (rc < 0)
+		return rc;
+
+	nd_pmu->cpuhp_state = rc;
+
+	/* Register the pmu instance for cpu hotplug */
+	rc = cpuhp_state_add_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node);
+	if (rc) {
+		cpuhp_remove_multi_state(nd_pmu->cpuhp_state);
+		return rc;
+	}
+
+	/* Create cpumask attribute group */
+	rc = create_cpumask_attr_group(nd_pmu);
+	if (rc) {
+		cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node);
+		cpuhp_remove_multi_state(nd_pmu->cpuhp_state);
+		return rc;
+	}
+
+	return 0;
+}
+
+static void nvdimm_pmu_free_hotplug_memory(struct nvdimm_pmu *nd_pmu)
+{
+	cpuhp_state_remove_instance_nocalls(nd_pmu->cpuhp_state, &nd_pmu->node);
+	cpuhp_remove_multi_state(nd_pmu->cpuhp_state);
+
+	if (nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR])
+		kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]->attrs);
+	kfree(nd_pmu->pmu.attr_groups[NVDIMM_PMU_CPUMASK_ATTR]);
+}
+
+int register_nvdimm_pmu(struct nvdimm_pmu *nd_pmu, struct platform_device *pdev)
+{
+	int rc;
+
+	if (!nd_pmu || !pdev)
+		return -EINVAL;
+
+	/* event functions like add/del/read/event_init and pmu name should not be NULL */
+	if (WARN_ON_ONCE(!(nd_pmu->pmu.event_init && nd_pmu->pmu.add &&
+			   nd_pmu->pmu.del && nd_pmu->pmu.read && nd_pmu->pmu.name)))
+		return -EINVAL;
+
+	nd_pmu->pmu.attr_groups = kzalloc((NVDIMM_PMU_NULL_ATTR + 1) *
+					  sizeof(struct attribute_group *), GFP_KERNEL);
+	if (!nd_pmu->pmu.attr_groups)
+		return -ENOMEM;
+
+	/*
+	 * Add platform_device->dev pointer to nvdimm_pmu to access
+	 * device data in events functions.
+	 */
+	nd_pmu->dev = &pdev->dev;
+
+	/* Fill attribute groups for the nvdimm pmu device */
+	nd_pmu->pmu.attr_groups[NVDIMM_PMU_FORMAT_ATTR] = &nvdimm_pmu_format_group;
+	nd_pmu->pmu.attr_groups[NVDIMM_PMU_EVENT_ATTR] = &nvdimm_pmu_events_group;
+	nd_pmu->pmu.attr_groups[NVDIMM_PMU_NULL_ATTR] = NULL;
+
+	/* Fill attribute group for cpumask */
+	rc = nvdimm_pmu_cpu_hotplug_init(nd_pmu);
+	if (rc) {
+		pr_info("cpu hotplug feature failed for device: %s\n", nd_pmu->pmu.name);
+		kfree(nd_pmu->pmu.attr_groups);
+		return rc;
+	}
+
+	rc = perf_pmu_register(&nd_pmu->pmu, nd_pmu->pmu.name, -1);
+	if (rc) {
+		kfree(nd_pmu->pmu.attr_groups);
+		nvdimm_pmu_free_hotplug_memory(nd_pmu);
+		return rc;
+	}
+
+	pr_info("%s NVDIMM performance monitor support registered\n",
+		nd_pmu->pmu.name);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_nvdimm_pmu);
+
+void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu)
+{
+	perf_pmu_unregister(&nd_pmu->pmu);
+	nvdimm_pmu_free_hotplug_memory(nd_pmu);
+	kfree(nd_pmu);
+}
+EXPORT_SYMBOL_GPL(unregister_nvdimm_pmu);
diff --git a/include/linux/nd.h b/include/linux/nd.h
index ad186e828263..4813c7089e5c 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/badblocks.h>
 #include <linux/perf_event.h>
+#include <linux/platform_device.h>
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
@@ -24,6 +25,19 @@ enum nvdimm_claim_class {
 	NVDIMM_CCLASS_UNKNOWN,
 };
 
+#define NVDIMM_EVENT_VAR(_id)  event_attr_##_id
+#define NVDIMM_EVENT_PTR(_id)  (&event_attr_##_id.attr.attr)
+
+#define NVDIMM_EVENT_ATTR(_name, _id)				\
+	PMU_EVENT_ATTR(_name, NVDIMM_EVENT_VAR(_id), _id,	\
+			nvdimm_events_sysfs_show)
+
+/* Event attribute array index */
+#define NVDIMM_PMU_FORMAT_ATTR	0
+#define NVDIMM_PMU_EVENT_ATTR	1
+#define NVDIMM_PMU_CPUMASK_ATTR	2
+#define NVDIMM_PMU_NULL_ATTR	3
+
 /**
  * struct nvdimm_pmu - data structure for nvdimm perf driver
  * @pmu: pmu data structure for nvdimm performance stats.
@@ -43,6 +57,16 @@ struct nvdimm_pmu {
 	struct cpumask arch_cpumask;
 };
 
+extern ssize_t nvdimm_events_sysfs_show(struct device *dev,
+					struct device_attribute *attr,
+					char *page);
+
+int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm, struct platform_device *pdev);
+void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu);
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu);
+int perf_pmu_register(struct pmu *pmu, const char *name, int type);
+void perf_pmu_unregister(struct pmu *pmu);
+
 struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;

From 4c08d4bbc089a95f3f38389c2b79dbc6ab24f10b Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Fri, 25 Feb 2022 20:00:23 +0530
Subject: [PATCH 090/229] powerpc/papr_scm: Add perf interface support

Performance monitoring support for papr-scm nvdimm devices
via perf interface is added which includes addition of pmu
functions like add/del/read/event_init for nvdimm_pmu struture.

A new parameter 'priv' in added to the pdev_archdata structure to save
nvdimm_pmu device pointer, to handle the unregistering of pmu device.

papr_scm_pmu_register function populates the nvdimm_pmu structure
with name, capabilities, cpumask along with event handling
functions. Finally the populated nvdimm_pmu structure is passed to
register the pmu device. Event handling functions internally uses
hcall to get events and counter data.

Result in power9 machine with 2 nvdimm device:

Ex: List all event by perf list

command:# perf list nmem

  nmem0/cache_rh_cnt/                                [Kernel PMU event]
  nmem0/cache_wh_cnt/                                [Kernel PMU event]
  nmem0/cri_res_util/                                [Kernel PMU event]
  nmem0/ctl_res_cnt/                                 [Kernel PMU event]
  nmem0/ctl_res_tm/                                  [Kernel PMU event]
  nmem0/fast_w_cnt/                                  [Kernel PMU event]
  nmem0/host_l_cnt/                                  [Kernel PMU event]
  nmem0/host_l_dur/                                  [Kernel PMU event]
  nmem0/host_s_cnt/                                  [Kernel PMU event]
  nmem0/host_s_dur/                                  [Kernel PMU event]
  nmem0/med_r_cnt/                                   [Kernel PMU event]
  nmem0/med_r_dur/                                   [Kernel PMU event]
  nmem0/med_w_cnt/                                   [Kernel PMU event]
  nmem0/med_w_dur/                                   [Kernel PMU event]
  nmem0/mem_life/                                    [Kernel PMU event]
  nmem0/poweron_secs/                                [Kernel PMU event]
  ...
  nmem1/mem_life/                                    [Kernel PMU event]
  nmem1/poweron_secs/                                [Kernel PMU event]

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
[Add numa_map_to_online_node function call to get online node id]
Reported-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@in.ibm.com>
Link: https://lore.kernel.org/r/20220225143024.47947-4-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/include/asm/device.h         |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c | 225 ++++++++++++++++++++++
 2 files changed, 230 insertions(+)

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 219559d65864..47ed639f3b8f 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -48,6 +48,11 @@ struct dev_archdata {
 
 struct pdev_archdata {
 	u64 dma_mask;
+	/*
+	 * Pointer to nvdimm_pmu structure, to handle the unregistering
+	 * of pmu device
+	 */
+	void *priv;
 };
 
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index f48e87ac89c9..4dd513d7c029 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -19,6 +19,7 @@
 #include <asm/papr_pdsm.h>
 #include <asm/mce.h>
 #include <asm/unaligned.h>
+#include <linux/perf_event.h>
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -68,6 +69,8 @@
 #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
 #define PAPR_SCM_PERF_STATS_VERSION 0x1
 
+#define to_nvdimm_pmu(_pmu)	container_of(_pmu, struct nvdimm_pmu, pmu)
+
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
 	u8 stat_id[8];
@@ -120,6 +123,9 @@ struct papr_scm_priv {
 
 	/* length of the stat buffer as expected by phyp */
 	size_t stat_buffer_len;
+
+	 /* array to have event_code and stat_id mappings */
+	char **nvdimm_events_map;
 };
 
 static int papr_scm_pmem_flush(struct nd_region *nd_region,
@@ -340,6 +346,218 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 	return 0;
 }
 
+static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count)
+{
+	struct papr_scm_perf_stat *stat;
+	struct papr_scm_perf_stats *stats;
+	struct papr_scm_priv *p = (struct papr_scm_priv *)dev->driver_data;
+	int rc, size;
+
+	/* Allocate request buffer enough to hold single performance stat */
+	size = sizeof(struct papr_scm_perf_stats) +
+		sizeof(struct papr_scm_perf_stat);
+
+	if (!p || !p->nvdimm_events_map)
+		return -EINVAL;
+
+	stats = kzalloc(size, GFP_KERNEL);
+	if (!stats)
+		return -ENOMEM;
+
+	stat = &stats->scm_statistic[0];
+	memcpy(&stat->stat_id,
+	       p->nvdimm_events_map[event->attr.config],
+		sizeof(stat->stat_id));
+	stat->stat_val = 0;
+
+	rc = drc_pmem_query_stats(p, stats, 1);
+	if (rc < 0) {
+		kfree(stats);
+		return rc;
+	}
+
+	*count = be64_to_cpu(stat->stat_val);
+	kfree(stats);
+	return 0;
+}
+
+static int papr_scm_pmu_event_init(struct perf_event *event)
+{
+	struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+	struct papr_scm_priv *p;
+
+	if (!nd_pmu)
+		return -EINVAL;
+
+	/* test the event attr type for PMU enumeration */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* it does not support event sampling mode */
+	if (is_sampling_event(event))
+		return -EOPNOTSUPP;
+
+	/* no branch sampling */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
+	p = (struct papr_scm_priv *)nd_pmu->dev->driver_data;
+	if (!p)
+		return -EINVAL;
+
+	/* Invalid eventcode */
+	if (event->attr.config == 0 || event->attr.config > 16)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int papr_scm_pmu_add(struct perf_event *event, int flags)
+{
+	u64 count;
+	int rc;
+	struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+	if (!nd_pmu)
+		return -EINVAL;
+
+	if (flags & PERF_EF_START) {
+		rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &count);
+		if (rc)
+			return rc;
+
+		local64_set(&event->hw.prev_count, count);
+	}
+
+	return 0;
+}
+
+static void papr_scm_pmu_read(struct perf_event *event)
+{
+	u64 prev, now;
+	int rc;
+	struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+
+	if (!nd_pmu)
+		return;
+
+	rc = papr_scm_pmu_get_value(event, nd_pmu->dev, &now);
+	if (rc)
+		return;
+
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
+}
+
+static void papr_scm_pmu_del(struct perf_event *event, int flags)
+{
+	papr_scm_pmu_read(event);
+}
+
+static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu *nd_pmu)
+{
+	struct papr_scm_perf_stat *stat;
+	struct papr_scm_perf_stats *stats;
+	char *statid;
+	int index, rc, count;
+	u32 available_events;
+
+	if (!p->stat_buffer_len)
+		return -ENOENT;
+
+	available_events = (p->stat_buffer_len  - sizeof(struct papr_scm_perf_stats))
+			/ sizeof(struct papr_scm_perf_stat);
+
+	/* Allocate the buffer for phyp where stats are written */
+	stats = kzalloc(p->stat_buffer_len, GFP_KERNEL);
+	if (!stats) {
+		rc = -ENOMEM;
+		return rc;
+	}
+
+	/* Allocate memory to nvdimm_event_map */
+	p->nvdimm_events_map = kcalloc(available_events, sizeof(char *), GFP_KERNEL);
+	if (!p->nvdimm_events_map) {
+		rc = -ENOMEM;
+		goto out_stats;
+	}
+
+	/* Called to get list of events supported */
+	rc = drc_pmem_query_stats(p, stats, 0);
+	if (rc)
+		goto out_nvdimm_events_map;
+
+	for (index = 0, stat = stats->scm_statistic, count = 0;
+		     index < available_events; index++, ++stat) {
+		statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL);
+		if (!statid) {
+			rc = -ENOMEM;
+			goto out_nvdimm_events_map;
+		}
+
+		strcpy(statid, stat->stat_id);
+		p->nvdimm_events_map[count] = statid;
+		count++;
+	}
+	p->nvdimm_events_map[count] = NULL;
+	kfree(stats);
+	return 0;
+
+out_nvdimm_events_map:
+	kfree(p->nvdimm_events_map);
+out_stats:
+	kfree(stats);
+	return rc;
+}
+
+static void papr_scm_pmu_register(struct papr_scm_priv *p)
+{
+	struct nvdimm_pmu *nd_pmu;
+	int rc, nodeid;
+
+	nd_pmu = kzalloc(sizeof(*nd_pmu), GFP_KERNEL);
+	if (!nd_pmu) {
+		rc = -ENOMEM;
+		goto pmu_err_print;
+	}
+
+	rc = papr_scm_pmu_check_events(p, nd_pmu);
+	if (rc)
+		goto pmu_check_events_err;
+
+	nd_pmu->pmu.task_ctx_nr = perf_invalid_context;
+	nd_pmu->pmu.name = nvdimm_name(p->nvdimm);
+	nd_pmu->pmu.event_init = papr_scm_pmu_event_init;
+	nd_pmu->pmu.read = papr_scm_pmu_read;
+	nd_pmu->pmu.add = papr_scm_pmu_add;
+	nd_pmu->pmu.del = papr_scm_pmu_del;
+
+	nd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_INTERRUPT |
+				PERF_PMU_CAP_NO_EXCLUDE;
+
+	/*updating the cpumask variable */
+	nodeid = numa_map_to_online_node(dev_to_node(&p->pdev->dev));
+	nd_pmu->arch_cpumask = *cpumask_of_node(nodeid);
+
+	rc = register_nvdimm_pmu(nd_pmu, p->pdev);
+	if (rc)
+		goto pmu_register_err;
+
+	/*
+	 * Set archdata.priv value to nvdimm_pmu structure, to handle the
+	 * unregistering of pmu device.
+	 */
+	p->pdev->archdata.priv = nd_pmu;
+	return;
+
+pmu_register_err:
+	kfree(p->nvdimm_events_map);
+pmu_check_events_err:
+	kfree(nd_pmu);
+pmu_err_print:
+	dev_info(&p->pdev->dev, "nvdimm pmu didn't register rc=%d\n", rc);
+}
+
 /*
  * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the
  * health information.
@@ -1236,6 +1454,7 @@ static int papr_scm_probe(struct platform_device *pdev)
 		goto err2;
 
 	platform_set_drvdata(pdev, p);
+	papr_scm_pmu_register(p);
 
 	return 0;
 
@@ -1254,6 +1473,12 @@ static int papr_scm_remove(struct platform_device *pdev)
 
 	nvdimm_bus_unregister(p->bus);
 	drc_pmem_unbind(p);
+
+	if (pdev->archdata.priv)
+		unregister_nvdimm_pmu(pdev->archdata.priv);
+
+	pdev->archdata.priv = NULL;
+	kfree(p->nvdimm_events_map);
 	kfree(p->bus_desc.provider_name);
 	kfree(p);
 

From 2bec6d9aa89cbe97deb6fbc64708212b780605a4 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Fri, 25 Feb 2022 20:00:24 +0530
Subject: [PATCH 091/229] docs: ABI: sysfs-bus-nvdimm: Document sysfs event
 format entries for nvdimm pmu

Details are added for the event, cpumask and format attributes
in the ABI documentation.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Nageswara R Sastry <rnsastry@linux.ibm.com>
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Reviewed-by: Madhavan Srinivasan <maddy@in.ibm.com>
Link: https://lore.kernel.org/r/20220225143024.47947-5-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-nvdimm | 35 ++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm b/Documentation/ABI/testing/sysfs-bus-nvdimm
index bff84a16812a..1c1f5acbf53d 100644
--- a/Documentation/ABI/testing/sysfs-bus-nvdimm
+++ b/Documentation/ABI/testing/sysfs-bus-nvdimm
@@ -6,3 +6,38 @@ Description:
 
 The libnvdimm sub-system implements a common sysfs interface for
 platform nvdimm resources. See Documentation/driver-api/nvdimm/.
+
+What:           /sys/bus/event_source/devices/nmemX/format
+Date:           February 2022
+KernelVersion:  5.18
+Contact:        Kajol Jain <kjain@linux.ibm.com>
+Description:	(RO) Attribute group to describe the magic bits
+		that go into perf_event_attr.config for a particular pmu.
+		(See ABI/testing/sysfs-bus-event_source-devices-format).
+
+		Each attribute under this group defines a bit range of the
+		perf_event_attr.config. Supported attribute is listed
+		below::
+		  event  = "config:0-4"  - event ID
+
+		For example::
+			ctl_res_cnt = "event=0x1"
+
+What:           /sys/bus/event_source/devices/nmemX/events
+Date:           February 2022
+KernelVersion:  5.18
+Contact:        Kajol Jain <kjain@linux.ibm.com>
+Description:	(RO) Attribute group to describe performance monitoring events
+                for the nvdimm memory device. Each attribute in this group
+                describes a single performance monitoring event supported by
+                this nvdimm pmu.  The name of the file is the name of the event.
+                (See ABI/testing/sysfs-bus-event_source-devices-events). A
+                listing of the events supported by a given nvdimm provider type
+                can be found in Documentation/driver-api/nvdimm/$provider.
+
+What:          /sys/bus/event_source/devices/nmemX/cpumask
+Date:          February 2022
+KernelVersion:  5.18
+Contact:        Kajol Jain <kjain@linux.ibm.com>
+Description:	(RO) This sysfs file exposes the cpumask which is designated to
+		to retrieve nvdimm pmu event counter data.

From 8ddde07a3d285a0f3cec14924446608320fdc013 Mon Sep 17 00:00:00 2001
From: Tian Tao <tiantao6@hisilicon.com>
Date: Tue, 8 Mar 2022 16:59:10 +0800
Subject: [PATCH 092/229] dma-mapping: benchmark: extract a common header file
 for map_benchmark definition

kernel/dma/map_benchmark.c and selftests/dma/dma_map_benchmark.c
have duplicate map_benchmark definitions, which tends to lead to
inconsistent changes to map_benchmark on both sides, extract a
common header file to avoid this problem.

Signed-off-by: Tian Tao <tiantao6@hisilicon.com>
Acked-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/map_benchmark.h                 | 31 +++++++++++++++++++
 kernel/dma/map_benchmark.c                    | 25 +--------------
 .../testing/selftests/dma/dma_map_benchmark.c | 25 +--------------
 3 files changed, 33 insertions(+), 48 deletions(-)
 create mode 100644 include/linux/map_benchmark.h

diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h
new file mode 100644
index 000000000000..62674c83bde4
--- /dev/null
+++ b/include/linux/map_benchmark.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2022 HiSilicon Limited.
+ */
+
+#ifndef _KERNEL_DMA_BENCHMARK_H
+#define _KERNEL_DMA_BENCHMARK_H
+
+#define DMA_MAP_BENCHMARK       _IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS     1024
+#define DMA_MAP_MAX_SECONDS     300
+#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
+
+#define DMA_MAP_BIDIRECTIONAL   0
+#define DMA_MAP_TO_DEVICE       1
+#define DMA_MAP_FROM_DEVICE     2
+
+struct map_benchmark {
+	__u64 avg_map_100ns; /* average map latency in 100ns */
+	__u64 map_stddev; /* standard deviation of map latency */
+	__u64 avg_unmap_100ns; /* as above */
+	__u64 unmap_stddev;
+	__u32 threads; /* how many threads will do map/unmap in parallel */
+	__u32 seconds; /* how long the test will last */
+	__s32 node; /* which numa node this benchmark will run on */
+	__u32 dma_bits; /* DMA addressing capability */
+	__u32 dma_dir; /* DMA data direction */
+	__u32 dma_trans_ns; /* time for DMA transmission in ns */
+	__u32 granule;  /* how many PAGE_SIZE will do map/unmap once a time */
+};
+#endif /* _KERNEL_DMA_BENCHMARK_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..0520a8f4fb1d 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
+#include <linux/map_benchmark.h>
 #include <linux/math64.h>
 #include <linux/module.h>
 #include <linux/pci.h>
@@ -18,30 +19,6 @@
 #include <linux/slab.h>
 #include <linux/timekeeping.h>
 
-#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS	1024
-#define DMA_MAP_MAX_SECONDS	300
-#define DMA_MAP_MAX_TRANS_DELAY	(10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL	0
-#define DMA_MAP_TO_DEVICE	1
-#define DMA_MAP_FROM_DEVICE	2
-
-struct map_benchmark {
-	__u64 avg_map_100ns; /* average map latency in 100ns */
-	__u64 map_stddev; /* standard deviation of map latency */
-	__u64 avg_unmap_100ns; /* as above */
-	__u64 unmap_stddev;
-	__u32 threads; /* how many threads will do map/unmap in parallel */
-	__u32 seconds; /* how long the test will last */
-	__s32 node; /* which numa node this benchmark will run on */
-	__u32 dma_bits; /* DMA addressing capability */
-	__u32 dma_dir; /* DMA data direction */
-	__u32 dma_trans_ns; /* time for DMA transmission in ns */
-	__u32 granule;	/* how many PAGE_SIZE will do map/unmap once a time */
-	__u8 expansion[76];	/* For future use */
-};
-
 struct map_benchmark_data {
 	struct map_benchmark bparam;
 	struct device *dev;
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
index 485dff51bad2..c3b3c09e995e 100644
--- a/tools/testing/selftests/dma/dma_map_benchmark.c
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -10,40 +10,17 @@
 #include <unistd.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <linux/map_benchmark.h>
 #include <linux/types.h>
 
 #define NSEC_PER_MSEC	1000000L
 
-#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS	1024
-#define DMA_MAP_MAX_SECONDS     300
-#define DMA_MAP_MAX_TRANS_DELAY	(10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL	0
-#define DMA_MAP_TO_DEVICE	1
-#define DMA_MAP_FROM_DEVICE	2
-
 static char *directions[] = {
 	"BIDIRECTIONAL",
 	"TO_DEVICE",
 	"FROM_DEVICE",
 };
 
-struct map_benchmark {
-	__u64 avg_map_100ns; /* average map latency in 100ns */
-	__u64 map_stddev; /* standard deviation of map latency */
-	__u64 avg_unmap_100ns; /* as above */
-	__u64 unmap_stddev;
-	__u32 threads; /* how many threads will do map/unmap in parallel */
-	__u32 seconds; /* how long the test will last */
-	__s32 node; /* which numa node this benchmark will run on */
-	__u32 dma_bits; /* DMA addressing capability */
-	__u32 dma_dir; /* DMA data direction */
-	__u32 dma_trans_ns; /* time for DMA transmission in ns */
-	__u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
-	__u8 expansion[76];	/* For future use */
-};
-
 int main(int argc, char **argv)
 {
 	struct map_benchmark map;

From b537bf429a682131f94df166e6daf39ec48fac03 Mon Sep 17 00:00:00 2001
From: Wang Qing <wangqing@vivo.com>
Date: Sun, 27 Feb 2022 19:15:03 -0800
Subject: [PATCH 093/229] xen: use time_is_before_eq_jiffies() instead of open
 coding it

Use the helper function time_is_{before,after}_jiffies() to improve
code readability.

Signed-off-by: Wang Qing <wangqing@vivo.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/1646018104-61415-1-git-send-email-wangqing@vivo.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/balloon.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index a2c4fc49c483..dfe26fa17e95 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -59,6 +59,7 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/moduleparam.h>
+#include <linux/jiffies.h>
 
 #include <asm/page.h>
 #include <asm/tlb.h>
@@ -794,7 +795,7 @@ static int __init balloon_wait_finish(void)
 		if (balloon_state == BP_ECANCELED) {
 			pr_warn_once("Initial ballooning failed, %ld pages need to be freed.\n",
 				     -credit);
-			if (jiffies - last_changed >= HZ * balloon_boot_timeout)
+			if (time_is_before_eq_jiffies(last_changed + HZ * balloon_boot_timeout))
 				panic("Initial ballooning failed!\n");
 		}
 

From eed05744322da07dd7e419432dcedf3c2e017179 Mon Sep 17 00:00:00 2001
From: Dongli Zhang <dongli.zhang@oracle.com>
Date: Wed, 2 Mar 2022 08:40:32 -0800
Subject: [PATCH 094/229] xen: delay xen_hvm_init_time_ops() if kdump is boot
 on vcpu>=32

The sched_clock() can be used very early since commit 857baa87b642
("sched/clock: Enable sched clock early"). In addition, with commit
38669ba205d1 ("x86/xen/time: Output xen sched_clock time from 0"), kdump
kernel in Xen HVM guest may panic at very early stage when accessing
&__this_cpu_read(xen_vcpu)->time as in below:

setup_arch()
 -> init_hypervisor_platform()
     -> x86_init.hyper.init_platform = xen_hvm_guest_init()
         -> xen_hvm_init_time_ops()
             -> xen_clocksource_read()
                 -> src = &__this_cpu_read(xen_vcpu)->time;

This is because Xen HVM supports at most MAX_VIRT_CPUS=32 'vcpu_info'
embedded inside 'shared_info' during early stage until xen_vcpu_setup() is
used to allocate/relocate 'vcpu_info' for boot cpu at arbitrary address.

However, when Xen HVM guest panic on vcpu >= 32, since
xen_vcpu_info_reset(0) would set per_cpu(xen_vcpu, cpu) = NULL when
vcpu >= 32, xen_clocksource_read() on vcpu >= 32 would panic.

This patch calls xen_hvm_init_time_ops() again later in
xen_hvm_smp_prepare_boot_cpu() after the 'vcpu_info' for boot vcpu is
registered when the boot vcpu is >= 32.

This issue can be reproduced on purpose via below command at the guest
side when kdump/kexec is enabled:

"taskset -c 33 echo c > /proc/sysrq-trigger"

The bugfix for PVM is not implemented due to the lack of testing
environment.

[boris: xen_hvm_init_time_ops() returns on errors instead of jumping to end]

Cc: Joe Jin <joe.jin@oracle.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220302164032.14569-3-dongli.zhang@oracle.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/smp_hvm.c |  6 ++++++
 arch/x86/xen/time.c    | 24 +++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index 6ff3c887e0b9..b70afdff419c 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -19,6 +19,12 @@ static void __init xen_hvm_smp_prepare_boot_cpu(void)
 	 */
 	xen_vcpu_setup(0);
 
+	/*
+	 * Called again in case the kernel boots on vcpu >= MAX_VIRT_CPUS.
+	 * Refer to comments in xen_hvm_init_time_ops().
+	 */
+	xen_hvm_init_time_ops();
+
 	/*
 	 * The alternative logic (which patches the unlock/lock) runs before
 	 * the smp bootup up code is activated. Hence we need to set this up
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index d9c945ee1100..9ef0a5cca96e 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -558,6 +558,11 @@ static void xen_hvm_setup_cpu_clockevents(void)
 
 void __init xen_hvm_init_time_ops(void)
 {
+	static bool hvm_time_initialized;
+
+	if (hvm_time_initialized)
+		return;
+
 	/*
 	 * vector callback is needed otherwise we cannot receive interrupts
 	 * on cpu > 0 and at this point we don't know how many cpus are
@@ -567,7 +572,22 @@ void __init xen_hvm_init_time_ops(void)
 		return;
 
 	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
-		pr_info("Xen doesn't support pvclock on HVM, disable pv timer");
+		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
+		return;
+	}
+
+	/*
+	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
+	 * The __this_cpu_read(xen_vcpu) is still NULL when Xen HVM guest
+	 * boots on vcpu >= MAX_VIRT_CPUS (e.g., kexec), To access
+	 * __this_cpu_read(xen_vcpu) via xen_clocksource_read() will panic.
+	 *
+	 * The xen_hvm_init_time_ops() should be called again later after
+	 * __this_cpu_read(xen_vcpu) is available.
+	 */
+	if (!__this_cpu_read(xen_vcpu)) {
+		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
+			xen_vcpu_nr(0));
 		return;
 	}
 
@@ -577,6 +597,8 @@ void __init xen_hvm_init_time_ops(void)
 	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
 
 	x86_platform.set_wallclock = xen_set_wallclock;
+
+	hvm_time_initialized = true;
 }
 #endif
 

From b359b3a0296a8a9c853c7ee98f9728942d7f0f4e Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Mon, 7 Mar 2022 14:25:54 +0800
Subject: [PATCH 095/229] x86/xen: Fix kerneldoc warning

Fix the following W=1 kernel warnings:

arch/x86/xen/setup.c:725: warning: expecting prototype for
machine_specific_memory_setup(). Prototype was for xen_memory_setup()
instead.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220307062554.8334-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index af216feb63d9..81aa46f770c5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -719,7 +719,7 @@ static void __init xen_reserve_xen_mfnlist(void)
 }
 
 /**
- * machine_specific_memory_setup - Hook for machine specific memory setup.
+ * xen_memory_setup - Hook for machine specific memory setup.
  **/
 char * __init xen_memory_setup(void)
 {

From 982e4430beb94e65c482d36a11dbb779f20c38a7 Mon Sep 17 00:00:00 2001
From: zhanglianjie <zhanglianjie@uniontech.com>
Date: Sat, 5 Mar 2022 21:38:23 +0800
Subject: [PATCH 096/229] drivers/xen: use helper macro __ATTR_RW

Use helper macro __ATTR_RW to define HYPERVISOR_ATTR_RW to make code more clear.
Minor readability improvement.

Remove extra whitespace [boris: added this comment]

Signed-off-by: zhanglianjie <zhanglianjie@uniontech.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220305133823.158961-1-zhanglianjie@uniontech.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/sys-hypervisor.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index feb1d16252e7..fcb0792f090e 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -22,11 +22,10 @@
 #endif
 
 #define HYPERVISOR_ATTR_RO(_name) \
-static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name)
 
 #define HYPERVISOR_ATTR_RW(_name) \
-static struct hyp_sysfs_attr _name##_attr = \
-	__ATTR(_name, 0644, _name##_show, _name##_store)
+static struct hyp_sysfs_attr _name##_attr = __ATTR_RW(_name)
 
 struct hyp_sysfs_attr {
 	struct attribute attr;

From 19397e8b546d20226153dafe5dce34c4393752c4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 11:17:32 -0600
Subject: [PATCH 097/229] ptrace: Move ptrace_report_syscall into ptrace.h

Move ptrace_report_syscall from tracehook.h into ptrace.h where it
belongs.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-1-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ptrace.h    | 27 +++++++++++++++++++++++++++
 include/linux/tracehook.h | 26 --------------------------
 2 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 8aee2945ff08..91b1074edb4c 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -413,4 +413,31 @@ static inline void user_single_step_report(struct pt_regs *regs)
 extern int task_current_syscall(struct task_struct *target, struct syscall_info *info);
 
 extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact);
+
+/*
+ * ptrace report for syscall entry and exit looks identical.
+ */
+static inline int ptrace_report_syscall(unsigned long message)
+{
+	int ptrace = current->ptrace;
+
+	if (!(ptrace & PT_PTRACED))
+		return 0;
+
+	current->ptrace_message = message;
+	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+
+	/*
+	 * this isn't the same as continuing with a signal, but it will do
+	 * for normal use.  strace only continues with a signal if the
+	 * stopping signal is not SIGTRAP.  -brl
+	 */
+	if (current->exit_code) {
+		send_sig(current->exit_code, current, 1);
+		current->exit_code = 0;
+	}
+
+	current->ptrace_message = 0;
+	return fatal_signal_pending(current);
+}
 #endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 88c007ab5ebc..998bc3863559 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -51,32 +51,6 @@
 #include <linux/blk-cgroup.h>
 struct linux_binprm;
 
-/*
- * ptrace report for syscall entry and exit looks identical.
- */
-static inline int ptrace_report_syscall(unsigned long message)
-{
-	int ptrace = current->ptrace;
-
-	if (!(ptrace & PT_PTRACED))
-		return 0;
-
-	current->ptrace_message = message;
-	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
-
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-
-	current->ptrace_message = 0;
-	return fatal_signal_pending(current);
-}
 
 /**
  * tracehook_report_syscall_entry - task is about to attempt a system call

From 42da6b7e7db5316dac541baf4ed8168bc3bb3f74 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 11:27:10 -0600
Subject: [PATCH 098/229] ptrace/arm: Rename tracehook_report_syscall
 report_syscall

Make the arm and arm64 code more concise and less confusing by
renaming the architecture specific tracehook_report_syscall to
report_syscall.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-2-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/arm/kernel/ptrace.c   | 7 +++----
 arch/arm64/kernel/ptrace.c | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 43b963ea4a0e..e5aa3237853d 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -831,8 +831,7 @@ enum ptrace_syscall_dir {
 	PTRACE_SYSCALL_EXIT,
 };
 
-static void tracehook_report_syscall(struct pt_regs *regs,
-				    enum ptrace_syscall_dir dir)
+static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
 {
 	unsigned long ip;
 
@@ -856,7 +855,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 	int scno;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+		report_syscall(regs, PTRACE_SYSCALL_ENTER);
 
 	/* Do seccomp after ptrace; syscall may have changed. */
 #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
@@ -897,5 +896,5 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 		trace_sys_exit(regs, regs_return_value(regs));
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
+		report_syscall(regs, PTRACE_SYSCALL_EXIT);
 }
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 39dbdfdc38d3..b7845575f86f 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1792,8 +1792,7 @@ enum ptrace_syscall_dir {
 	PTRACE_SYSCALL_EXIT,
 };
 
-static void tracehook_report_syscall(struct pt_regs *regs,
-				     enum ptrace_syscall_dir dir)
+static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
 {
 	int regno;
 	unsigned long saved_reg;
@@ -1842,7 +1841,7 @@ int syscall_trace_enter(struct pt_regs *regs)
 	unsigned long flags = read_thread_flags();
 
 	if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) {
-		tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER);
+		report_syscall(regs, PTRACE_SYSCALL_ENTER);
 		if (flags & _TIF_SYSCALL_EMU)
 			return NO_SYSCALL;
 	}
@@ -1870,7 +1869,7 @@ void syscall_trace_exit(struct pt_regs *regs)
 		trace_sys_exit(regs, syscall_get_return_value(current, regs));
 
 	if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP))
-		tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT);
+		report_syscall(regs, PTRACE_SYSCALL_EXIT);
 
 	rseq_syscall(regs);
 }

From 153474ba1a4aed0a7b797b4c2be8c35c7a4e57bd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 11:46:37 -0600
Subject: [PATCH 099/229] ptrace: Create ptrace_report_syscall_{entry,exit} in
 ptrace.h

Rename tracehook_report_syscall_{entry,exit} to
ptrace_report_syscall_{entry,exit} and place them in ptrace.h

There is no longer any generic tracehook infractructure so make
these ptrace specific functions ptrace specific.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-3-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/Kconfig                        |  2 +-
 arch/alpha/kernel/ptrace.c          |  5 ++-
 arch/arc/kernel/ptrace.c            |  5 ++-
 arch/arm/kernel/ptrace.c            |  5 ++-
 arch/arm64/kernel/ptrace.c          |  7 ++--
 arch/csky/kernel/ptrace.c           |  5 ++-
 arch/h8300/kernel/ptrace.c          |  5 ++-
 arch/hexagon/kernel/traps.c         |  6 ++--
 arch/ia64/kernel/ptrace.c           |  4 +--
 arch/m68k/kernel/ptrace.c           |  6 ++--
 arch/microblaze/kernel/ptrace.c     |  5 ++-
 arch/mips/kernel/ptrace.c           |  5 ++-
 arch/nds32/include/asm/syscall.h    |  2 +-
 arch/nds32/kernel/ptrace.c          |  5 ++-
 arch/nios2/kernel/ptrace.c          |  5 ++-
 arch/openrisc/kernel/ptrace.c       |  5 ++-
 arch/parisc/kernel/ptrace.c         |  7 ++--
 arch/powerpc/kernel/ptrace/ptrace.c |  8 ++---
 arch/riscv/kernel/ptrace.c          |  5 ++-
 arch/sh/kernel/ptrace_32.c          |  5 ++-
 arch/sparc/kernel/ptrace_32.c       |  5 ++-
 arch/sparc/kernel/ptrace_64.c       |  5 ++-
 arch/um/kernel/ptrace.c             |  5 ++-
 arch/xtensa/kernel/ptrace.c         |  5 ++-
 include/asm-generic/syscall.h       |  2 +-
 include/linux/entry-common.h        |  6 ++--
 include/linux/ptrace.h              | 51 +++++++++++++++++++++++++++++
 include/linux/tracehook.h           | 51 -----------------------------
 include/uapi/linux/ptrace.h         |  2 +-
 kernel/entry/common.c               |  1 +
 30 files changed, 109 insertions(+), 126 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 678a80713b21..a517a949eb1d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -217,7 +217,7 @@ config TRACE_IRQFLAGS_SUPPORT
 #	asm/syscall.h		supplying asm-generic/syscall.h interface
 #	linux/regset.h		user_regset interfaces
 #	CORE_DUMP_USE_REGSET	#define'd in linux/elf.h
-#	TIF_SYSCALL_TRACE	calls tracehook_report_syscall_{entry,exit}
+#	TIF_SYSCALL_TRACE	calls ptrace_report_syscall_{entry,exit}
 #	TIF_NOTIFY_RESUME	calls tracehook_notify_resume()
 #	signal delivery		calls tracehook_signal_handler()
 #
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index 8c43212ae38e..a1a239ea002d 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -15,7 +15,6 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/signal.h>
-#include <linux/tracehook.h>
 #include <linux/audit.h>
 
 #include <linux/uaccess.h>
@@ -323,7 +322,7 @@ asmlinkage unsigned long syscall_trace_enter(void)
 	unsigned long ret = 0;
 	struct pt_regs *regs = current_pt_regs();
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(current_pt_regs()))
+	    ptrace_report_syscall_entry(current_pt_regs()))
 		ret = -1UL;
 	audit_syscall_entry(regs->r0, regs->r16, regs->r17, regs->r18, regs->r19);
 	return ret ?: current_pt_regs()->r0;
@@ -334,5 +333,5 @@ syscall_trace_leave(void)
 {
 	audit_syscall_exit(current_pt_regs());
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(current_pt_regs(), 0);
+		ptrace_report_syscall_exit(current_pt_regs(), 0);
 }
diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c
index 883391977fdf..54b419ac8bda 100644
--- a/arch/arc/kernel/ptrace.c
+++ b/arch/arc/kernel/ptrace.c
@@ -4,7 +4,6 @@
  */
 
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/sched/task_stack.h>
 #include <linux/regset.h>
 #include <linux/unistd.h>
@@ -258,7 +257,7 @@ long arch_ptrace(struct task_struct *child, long request,
 
 asmlinkage int syscall_trace_entry(struct pt_regs *regs)
 {
-	if (tracehook_report_syscall_entry(regs))
+	if (ptrace_report_syscall_entry(regs))
 		return ULONG_MAX;
 
 	return regs->r8;
@@ -266,5 +265,5 @@ asmlinkage int syscall_trace_entry(struct pt_regs *regs)
 
 asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 {
-	tracehook_report_syscall_exit(regs, 0);
+	ptrace_report_syscall_exit(regs, 0);
 }
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index e5aa3237853d..bfe88c6e60d5 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -22,7 +22,6 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/regset.h>
 #include <linux/audit.h>
-#include <linux/tracehook.h>
 #include <linux/unistd.h>
 
 #include <asm/syscall.h>
@@ -843,8 +842,8 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
 	regs->ARM_ip = dir;
 
 	if (dir == PTRACE_SYSCALL_EXIT)
-		tracehook_report_syscall_exit(regs, 0);
-	else if (tracehook_report_syscall_entry(regs))
+		ptrace_report_syscall_exit(regs, 0);
+	else if (ptrace_report_syscall_entry(regs))
 		current_thread_info()->abi_syscall = -1;
 
 	regs->ARM_ip = ip;
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b7845575f86f..230a47b9189e 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -27,7 +27,6 @@
 #include <linux/perf_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/regset.h>
-#include <linux/tracehook.h>
 #include <linux/elf.h>
 
 #include <asm/compat.h>
@@ -1818,11 +1817,11 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
 	regs->regs[regno] = dir;
 
 	if (dir == PTRACE_SYSCALL_ENTER) {
-		if (tracehook_report_syscall_entry(regs))
+		if (ptrace_report_syscall_entry(regs))
 			forget_syscall(regs);
 		regs->regs[regno] = saved_reg;
 	} else if (!test_thread_flag(TIF_SINGLESTEP)) {
-		tracehook_report_syscall_exit(regs, 0);
+		ptrace_report_syscall_exit(regs, 0);
 		regs->regs[regno] = saved_reg;
 	} else {
 		regs->regs[regno] = saved_reg;
@@ -1832,7 +1831,7 @@ static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
 		 * tracer modifications to the registers may have rewound the
 		 * state machine.
 		 */
-		tracehook_report_syscall_exit(regs, 1);
+		ptrace_report_syscall_exit(regs, 1);
 	}
 }
 
diff --git a/arch/csky/kernel/ptrace.c b/arch/csky/kernel/ptrace.c
index 1a5f54e0d272..0f7e7b653c72 100644
--- a/arch/csky/kernel/ptrace.c
+++ b/arch/csky/kernel/ptrace.c
@@ -12,7 +12,6 @@
 #include <linux/sched/task_stack.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/user.h>
 
@@ -321,7 +320,7 @@ long arch_ptrace(struct task_struct *child, long request,
 asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		if (tracehook_report_syscall_entry(regs))
+		if (ptrace_report_syscall_entry(regs))
 			return -1;
 
 	if (secure_computing() == -1)
@@ -339,7 +338,7 @@ asmlinkage void syscall_trace_exit(struct pt_regs *regs)
 	audit_syscall_exit(regs);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, 0);
+		ptrace_report_syscall_exit(regs, 0);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
 		trace_sys_exit(regs, syscall_get_return_value(current, regs));
diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c
index a11db009d0ea..a9898b27b756 100644
--- a/arch/h8300/kernel/ptrace.c
+++ b/arch/h8300/kernel/ptrace.c
@@ -12,7 +12,6 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/audit.h>
-#include <linux/tracehook.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
 
@@ -174,7 +173,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(regs))
+	    ptrace_report_syscall_entry(regs))
 		/*
 		 * Tracing decided this syscall should not happen.
 		 * We'll return a bogus call number to get an ENOSYS
@@ -196,5 +195,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c
index 1240f038cce0..6447763ce5a9 100644
--- a/arch/hexagon/kernel/traps.c
+++ b/arch/hexagon/kernel/traps.c
@@ -14,7 +14,7 @@
 #include <linux/kdebug.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
-#include <linux/tracehook.h>
+#include <linux/ptrace.h>
 #include <asm/traps.h>
 #include <asm/vm_fault.h>
 #include <asm/syscall.h>
@@ -348,7 +348,7 @@ void do_trap0(struct pt_regs *regs)
 
 		/* allow strace to catch syscall args  */
 		if (unlikely(test_thread_flag(TIF_SYSCALL_TRACE) &&
-			tracehook_report_syscall_entry(regs)))
+			ptrace_report_syscall_entry(regs)))
 			return;  /*  return -ENOSYS somewhere?  */
 
 		/* Interrupts should be re-enabled for syscall processing */
@@ -386,7 +386,7 @@ void do_trap0(struct pt_regs *regs)
 
 		/* allow strace to get the syscall return state  */
 		if (unlikely(test_thread_flag(TIF_SYSCALL_TRACE)))
-			tracehook_report_syscall_exit(regs, 0);
+			ptrace_report_syscall_exit(regs, 0);
 
 		break;
 	case TRAP_DEBUG:
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 6a1439eaa050..6af64aae087d 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1217,7 +1217,7 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
 		     struct pt_regs regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		if (tracehook_report_syscall_entry(&regs))
+		if (ptrace_report_syscall_entry(&regs))
 			return -ENOSYS;
 
 	/* copy user rbs to kernel rbs */
@@ -1243,7 +1243,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(&regs, step);
+		ptrace_report_syscall_exit(&regs, step);
 
 	/* copy user rbs to kernel rbs */
 	if (test_thread_flag(TIF_RESTORE_RSE))
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index aa3a0b8d07e9..a0c99fe3118e 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -19,7 +19,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/signal.h>
-#include <linux/tracehook.h>
+#include <linux/ptrace.h>
 
 #include <linux/uaccess.h>
 #include <asm/page.h>
@@ -282,13 +282,13 @@ asmlinkage int syscall_trace_enter(void)
 	int ret = 0;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		ret = tracehook_report_syscall_entry(task_pt_regs(current));
+		ret = ptrace_report_syscall_entry(task_pt_regs(current));
 	return ret;
 }
 
 asmlinkage void syscall_trace_leave(void)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(task_pt_regs(current), 0);
+		ptrace_report_syscall_exit(task_pt_regs(current), 0);
 }
 #endif /* CONFIG_COLDFIRE */
diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c
index badd286882ae..5234d0c1dcaa 100644
--- a/arch/microblaze/kernel/ptrace.c
+++ b/arch/microblaze/kernel/ptrace.c
@@ -33,7 +33,6 @@
 #include <linux/elf.h>
 #include <linux/audit.h>
 #include <linux/seccomp.h>
-#include <linux/tracehook.h>
 
 #include <linux/errno.h>
 #include <asm/processor.h>
@@ -140,7 +139,7 @@ asmlinkage unsigned long do_syscall_trace_enter(struct pt_regs *regs)
 	secure_computing_strict(regs->r12);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(regs))
+	    ptrace_report_syscall_entry(regs))
 		/*
 		 * Tracing decided this syscall should not happen.
 		 * We'll return a bogus call number to get an ENOSYS
@@ -161,7 +160,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
 
 void ptrace_disable(struct task_struct *child)
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index db7c5be1d4a3..567aec4abac0 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -27,7 +27,6 @@
 #include <linux/smp.h>
 #include <linux/security.h>
 #include <linux/stddef.h>
-#include <linux/tracehook.h>
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/ftrace.h>
@@ -1317,7 +1316,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 	current_thread_info()->syscall = syscall;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE)) {
-		if (tracehook_report_syscall_entry(regs))
+		if (ptrace_report_syscall_entry(regs))
 			return -1;
 		syscall = current_thread_info()->syscall;
 	}
@@ -1376,7 +1375,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 		trace_sys_exit(regs, regs_return_value(regs));
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, 0);
+		ptrace_report_syscall_exit(regs, 0);
 
 	user_enter();
 }
diff --git a/arch/nds32/include/asm/syscall.h b/arch/nds32/include/asm/syscall.h
index 90aa56c94af1..04d55ce18d50 100644
--- a/arch/nds32/include/asm/syscall.h
+++ b/arch/nds32/include/asm/syscall.h
@@ -39,7 +39,7 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
  *
  * It's only valid to call this when @task is stopped for system
  * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT),
- * after tracehook_report_syscall_entry() returned nonzero to prevent
+ * after ptrace_report_syscall_entry() returned nonzero to prevent
  * the system call from taking place.
  *
  * This rolls back the register state in @regs so it's as if the
diff --git a/arch/nds32/kernel/ptrace.c b/arch/nds32/kernel/ptrace.c
index d0eda870fbc2..6a6988cf689d 100644
--- a/arch/nds32/kernel/ptrace.c
+++ b/arch/nds32/kernel/ptrace.c
@@ -3,7 +3,6 @@
 
 #include <linux/ptrace.h>
 #include <linux/regset.h>
-#include <linux/tracehook.h>
 #include <linux/elf.h>
 #include <linux/sched/task_stack.h>
 
@@ -103,7 +102,7 @@ void user_disable_single_step(struct task_struct *child)
 asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE)) {
-		if (tracehook_report_syscall_entry(regs))
+		if (ptrace_report_syscall_entry(regs))
 			forget_syscall(regs);
 	}
 	return regs->syscallno;
@@ -113,6 +112,6 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
 	int step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 
 }
diff --git a/arch/nios2/kernel/ptrace.c b/arch/nios2/kernel/ptrace.c
index a6ea9e1b4f61..cd62f310778b 100644
--- a/arch/nios2/kernel/ptrace.c
+++ b/arch/nios2/kernel/ptrace.c
@@ -15,7 +15,6 @@
 #include <linux/regset.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/user.h>
 
@@ -134,7 +133,7 @@ asmlinkage int do_syscall_trace_enter(void)
 	int ret = 0;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		ret = tracehook_report_syscall_entry(task_pt_regs(current));
+		ret = ptrace_report_syscall_entry(task_pt_regs(current));
 
 	return ret;
 }
@@ -142,5 +141,5 @@ asmlinkage int do_syscall_trace_enter(void)
 asmlinkage void do_syscall_trace_exit(void)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(task_pt_regs(current), 0);
+		ptrace_report_syscall_exit(task_pt_regs(current), 0);
 }
diff --git a/arch/openrisc/kernel/ptrace.c b/arch/openrisc/kernel/ptrace.c
index 4d60ae2a12fa..b971740fc2aa 100644
--- a/arch/openrisc/kernel/ptrace.c
+++ b/arch/openrisc/kernel/ptrace.c
@@ -22,7 +22,6 @@
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/regset.h>
-#include <linux/tracehook.h>
 #include <linux/elf.h>
 
 #include <asm/thread_info.h>
@@ -159,7 +158,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 	long ret = 0;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(regs))
+	    ptrace_report_syscall_entry(regs))
 		/*
 		 * Tracing decided this syscall should not happen.
 		 * We'll return a bogus call number to get an ENOSYS
@@ -181,5 +180,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 65de6c4c9354..96ef6a6b66e5 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -15,7 +15,6 @@
 #include <linux/elf.h>
 #include <linux/errno.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/personality.h>
 #include <linux/regset.h>
@@ -316,7 +315,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 long do_syscall_trace_enter(struct pt_regs *regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE)) {
-		int rc = tracehook_report_syscall_entry(regs);
+		int rc = ptrace_report_syscall_entry(regs);
 
 		/*
 		 * As tracesys_next does not set %r28 to -ENOSYS
@@ -327,7 +326,7 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 		if (rc) {
 			/*
 			 * A nonzero return code from
-			 * tracehook_report_syscall_entry() tells us
+			 * ptrace_report_syscall_entry() tells us
 			 * to prevent the syscall execution.  Skip
 			 * the syscall call and the syscall restart handling.
 			 *
@@ -381,7 +380,7 @@ void do_syscall_trace_exit(struct pt_regs *regs)
 #endif
 
 	if (stepping || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, stepping);
+		ptrace_report_syscall_exit(regs, stepping);
 }
 
 
diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c
index c43f77e2ac31..f394b0d6473f 100644
--- a/arch/powerpc/kernel/ptrace/ptrace.c
+++ b/arch/powerpc/kernel/ptrace/ptrace.c
@@ -16,7 +16,7 @@
  */
 
 #include <linux/regset.h>
-#include <linux/tracehook.h>
+#include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/context_tracking.h>
 #include <linux/syscalls.h>
@@ -263,12 +263,12 @@ long do_syscall_trace_enter(struct pt_regs *regs)
 	flags = read_thread_flags() & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE);
 
 	if (flags) {
-		int rc = tracehook_report_syscall_entry(regs);
+		int rc = ptrace_report_syscall_entry(regs);
 
 		if (unlikely(flags & _TIF_SYSCALL_EMU)) {
 			/*
 			 * A nonzero return code from
-			 * tracehook_report_syscall_entry() tells us to prevent
+			 * ptrace_report_syscall_entry() tells us to prevent
 			 * the syscall execution, but we are not going to
 			 * execute it anyway.
 			 *
@@ -334,7 +334,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
 
 void __init pt_regs_check(void);
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index a89243730153..793c7da0554b 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -17,7 +17,6 @@
 #include <linux/regset.h>
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
-#include <linux/tracehook.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -241,7 +240,7 @@ long arch_ptrace(struct task_struct *child, long request,
 __visible int do_syscall_trace_enter(struct pt_regs *regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		if (tracehook_report_syscall_entry(regs))
+		if (ptrace_report_syscall_entry(regs))
 			return -1;
 
 	/*
@@ -266,7 +265,7 @@ __visible void do_syscall_trace_exit(struct pt_regs *regs)
 	audit_syscall_exit(regs);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, 0);
+		ptrace_report_syscall_exit(regs, 0);
 
 #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
 	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index 5281685f6ad1..d417988d9770 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -20,7 +20,6 @@
 #include <linux/io.h>
 #include <linux/audit.h>
 #include <linux/seccomp.h>
-#include <linux/tracehook.h>
 #include <linux/elf.h>
 #include <linux/regset.h>
 #include <linux/hw_breakpoint.h>
@@ -456,7 +455,7 @@ long arch_ptrace(struct task_struct *child, long request,
 asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
 {
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(regs)) {
+	    ptrace_report_syscall_entry(regs)) {
 		regs->regs[0] = -ENOSYS;
 		return -1;
 	}
@@ -484,5 +483,5 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 
 	step = test_thread_flag(TIF_SINGLESTEP);
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c
index 5318174a0268..e7db48acb838 100644
--- a/arch/sparc/kernel/ptrace_32.c
+++ b/arch/sparc/kernel/ptrace_32.c
@@ -21,7 +21,6 @@
 #include <linux/signal.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
-#include <linux/tracehook.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -439,9 +438,9 @@ asmlinkage int syscall_trace(struct pt_regs *regs, int syscall_exit_p)
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE)) {
 		if (syscall_exit_p)
-			tracehook_report_syscall_exit(regs, 0);
+			ptrace_report_syscall_exit(regs, 0);
 		else
-			ret = tracehook_report_syscall_entry(regs);
+			ret = ptrace_report_syscall_entry(regs);
 	}
 
 	return ret;
diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c
index 2b92155db8a5..86a7eb5c27ba 100644
--- a/arch/sparc/kernel/ptrace_64.c
+++ b/arch/sparc/kernel/ptrace_64.c
@@ -25,7 +25,6 @@
 #include <linux/audit.h>
 #include <linux/signal.h>
 #include <linux/regset.h>
-#include <linux/tracehook.h>
 #include <trace/syscall.h>
 #include <linux/compat.h>
 #include <linux/elf.h>
@@ -1095,7 +1094,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
 		user_exit();
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		ret = tracehook_report_syscall_entry(regs);
+		ret = ptrace_report_syscall_entry(regs);
 
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->u_regs[UREG_G1]);
@@ -1118,7 +1117,7 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 		trace_sys_exit(regs, regs->u_regs[UREG_I0]);
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, 0);
+		ptrace_report_syscall_exit(regs, 0);
 
 	if (test_thread_flag(TIF_NOHZ))
 		user_enter();
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index b425f47bddbb..bfaf6ab1ac03 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -6,7 +6,6 @@
 #include <linux/audit.h>
 #include <linux/ptrace.h>
 #include <linux/sched.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <asm/ptrace-abi.h>
 
@@ -135,7 +134,7 @@ int syscall_trace_enter(struct pt_regs *regs)
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return 0;
 
-	return tracehook_report_syscall_entry(regs);
+	return ptrace_report_syscall_entry(regs);
 }
 
 void syscall_trace_leave(struct pt_regs *regs)
@@ -151,7 +150,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 
-	tracehook_report_syscall_exit(regs, 0);
+	ptrace_report_syscall_exit(regs, 0);
 	/* force do_signal() --> is_syscall() */
 	if (ptraced & PT_PTRACED)
 		set_thread_flag(TIF_SIGPENDING);
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index bb3f4797d212..323c678a691f 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -26,7 +26,6 @@
 #include <linux/security.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 
 #define CREATE_TRACE_POINTS
@@ -550,7 +549,7 @@ int do_syscall_trace_enter(struct pt_regs *regs)
 		regs->areg[2] = -ENOSYS;
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    tracehook_report_syscall_entry(regs)) {
+	    ptrace_report_syscall_entry(regs)) {
 		regs->areg[2] = -ENOSYS;
 		regs->syscall = NO_SYSCALL;
 		return 0;
@@ -583,5 +582,5 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 	step = test_thread_flag(TIF_SINGLESTEP);
 
 	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h
index 81695eb02a12..5a80fe728dc8 100644
--- a/include/asm-generic/syscall.h
+++ b/include/asm-generic/syscall.h
@@ -44,7 +44,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs);
  *
  * It's only valid to call this when @task is stopped for system
  * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or
- * %SYSCALL_WORK_SYSCALL_AUDIT), after tracehook_report_syscall_entry()
+ * %SYSCALL_WORK_SYSCALL_AUDIT), after ptrace_report_syscall_entry()
  * returned nonzero to prevent the system call from taking place.
  *
  * This rolls back the register state in @regs so it's as if the
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 2e2b8d6140ed..a670e9fba7a9 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -3,7 +3,7 @@
 #define __LINUX_ENTRYCOMMON_H
 
 #include <linux/static_call_types.h>
-#include <linux/tracehook.h>
+#include <linux/ptrace.h>
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
@@ -95,7 +95,7 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs
 #ifndef arch_syscall_enter_tracehook
 static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs)
 {
-	return tracehook_report_syscall_entry(regs);
+	return ptrace_report_syscall_entry(regs);
 }
 #endif
 
@@ -294,7 +294,7 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step);
 #ifndef arch_syscall_exit_tracehook
 static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step)
 {
-	tracehook_report_syscall_exit(regs, step);
+	ptrace_report_syscall_exit(regs, step);
 }
 #endif
 
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 91b1074edb4c..5310f43e4762 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -440,4 +440,55 @@ static inline int ptrace_report_syscall(unsigned long message)
 	current->ptrace_message = 0;
 	return fatal_signal_pending(current);
 }
+
+/**
+ * ptrace_report_syscall_entry - task is about to attempt a system call
+ * @regs:		user register state of current task
+ *
+ * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or
+ * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just
+ * entered the kernel for a system call.  Full user register state is
+ * available here.  Changing the values in @regs can affect the system
+ * call number and arguments to be tried.  It is safe to block here,
+ * preventing the system call from beginning.
+ *
+ * Returns zero normally, or nonzero if the calling arch code should abort
+ * the system call.  That must prevent normal entry so no system call is
+ * made.  If @task ever returns to user mode after this, its register state
+ * is unspecified, but should be something harmless like an %ENOSYS error
+ * return.  It should preserve enough information so that syscall_rollback()
+ * can work (see asm-generic/syscall.h).
+ *
+ * Called without locks, just after entering kernel mode.
+ */
+static inline __must_check int ptrace_report_syscall_entry(
+	struct pt_regs *regs)
+{
+	return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY);
+}
+
+/**
+ * ptrace_report_syscall_exit - task has just finished a system call
+ * @regs:		user register state of current task
+ * @step:		nonzero if simulating single-step or block-step
+ *
+ * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when
+ * the current task has just finished an attempted system call.  Full
+ * user register state is available here.  It is safe to block here,
+ * preventing signals from being processed.
+ *
+ * If @step is nonzero, this report is also in lieu of the normal
+ * trap that would follow the system call instruction because
+ * user_enable_block_step() or user_enable_single_step() was used.
+ * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set.
+ *
+ * Called without locks, just before checking for pending signals.
+ */
+static inline void ptrace_report_syscall_exit(struct pt_regs *regs, int step)
+{
+	if (step)
+		user_single_step_report(regs);
+	else
+		ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT);
+}
 #endif
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 998bc3863559..819e82ac09bd 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -52,57 +52,6 @@
 struct linux_binprm;
 
 
-/**
- * tracehook_report_syscall_entry - task is about to attempt a system call
- * @regs:		user register state of current task
- *
- * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or
- * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just
- * entered the kernel for a system call.  Full user register state is
- * available here.  Changing the values in @regs can affect the system
- * call number and arguments to be tried.  It is safe to block here,
- * preventing the system call from beginning.
- *
- * Returns zero normally, or nonzero if the calling arch code should abort
- * the system call.  That must prevent normal entry so no system call is
- * made.  If @task ever returns to user mode after this, its register state
- * is unspecified, but should be something harmless like an %ENOSYS error
- * return.  It should preserve enough information so that syscall_rollback()
- * can work (see asm-generic/syscall.h).
- *
- * Called without locks, just after entering kernel mode.
- */
-static inline __must_check int tracehook_report_syscall_entry(
-	struct pt_regs *regs)
-{
-	return ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_ENTRY);
-}
-
-/**
- * tracehook_report_syscall_exit - task has just finished a system call
- * @regs:		user register state of current task
- * @step:		nonzero if simulating single-step or block-step
- *
- * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when
- * the current task has just finished an attempted system call.  Full
- * user register state is available here.  It is safe to block here,
- * preventing signals from being processed.
- *
- * If @step is nonzero, this report is also in lieu of the normal
- * trap that would follow the system call instruction because
- * user_enable_block_step() or user_enable_single_step() was used.
- * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set.
- *
- * Called without locks, just before checking for pending signals.
- */
-static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
-{
-	if (step)
-		user_single_step_report(regs);
-	else
-		ptrace_report_syscall(PTRACE_EVENTMSG_SYSCALL_EXIT);
-}
-
 /**
  * tracehook_signal_handler - signal handler setup is complete
  * @stepping:		nonzero if debugger single-step or block-step in use
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 3747bf816f9a..b7af92e07d1f 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -114,7 +114,7 @@ struct ptrace_rseq_configuration {
 
 /*
  * These values are stored in task->ptrace_message
- * by tracehook_report_syscall_* to describe the current syscall-stop.
+ * by ptrace_report_syscall_* to describe the current syscall-stop.
  */
 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..f52e57c4d6d8 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
+#include <linux/tracehook.h>
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>

From 0cfcb2b9ef48bbcaf5d43b9f1893f63a938e8176 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:00:55 -0600
Subject: [PATCH 100/229] ptrace: Remove arch_syscall_{enter,exit}_tracehook

These functions are alwasy one-to-one wrappers around
ptrace_report_syscall_entry and ptrace_report_syscall_exit.
So directly call the functions they are wrapping instead.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-4-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/entry-common.h | 43 ++----------------------------------
 kernel/entry/common.c        |  4 ++--
 2 files changed, 4 insertions(+), 43 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index a670e9fba7a9..9efbdda61f7a 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -79,26 +79,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs);
 static __always_inline void arch_check_user_regs(struct pt_regs *regs) {}
 #endif
 
-/**
- * arch_syscall_enter_tracehook - Wrapper around tracehook_report_syscall_entry()
- * @regs:	Pointer to currents pt_regs
- *
- * Returns: 0 on success or an error code to skip the syscall.
- *
- * Defaults to tracehook_report_syscall_entry(). Can be replaced by
- * architecture specific code.
- *
- * Invoked from syscall_enter_from_user_mode()
- */
-static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs);
-
-#ifndef arch_syscall_enter_tracehook
-static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs)
-{
-	return ptrace_report_syscall_entry(regs);
-}
-#endif
-
 /**
  * enter_from_user_mode - Establish state when coming from user mode
  *
@@ -157,7 +137,7 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
  * It handles the following work items:
  *
  *  1) syscall_work flag dependent invocations of
- *     arch_syscall_enter_tracehook(), __secure_computing(), trace_sys_enter()
+ *     ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter()
  *  2) Invocation of audit_syscall_entry()
  */
 long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall);
@@ -279,25 +259,6 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  */
 void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
 
-/**
- * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit()
- * @regs:	Pointer to currents pt_regs
- * @step:	Indicator for single step
- *
- * Defaults to tracehook_report_syscall_exit(). Can be replaced by
- * architecture specific code.
- *
- * Invoked from syscall_exit_to_user_mode()
- */
-static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step);
-
-#ifndef arch_syscall_exit_tracehook
-static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step)
-{
-	ptrace_report_syscall_exit(regs, step);
-}
-#endif
-
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
  *
@@ -347,7 +308,7 @@ void syscall_exit_to_user_mode_work(struct pt_regs *regs);
  *	- rseq syscall exit
  *      - audit
  *	- syscall tracing
- *	- tracehook (single stepping)
+ *	- ptrace (single stepping)
  *
  *  2) Preparatory work
  *	- Exit to user mode loop (common TIF handling). Invokes
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f52e57c4d6d8..f0b1daa1e8da 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -59,7 +59,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
 
 	/* Handle ptrace */
 	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
-		ret = arch_syscall_enter_tracehook(regs);
+		ret = ptrace_report_syscall_entry(regs);
 		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
 			return -1L;
 	}
@@ -253,7 +253,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
 
 	step = report_single_step(work);
 	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
-		arch_syscall_exit_tracehook(regs, step);
+		ptrace_report_syscall_exit(regs, step);
 }
 
 /*

From c145137dc990fd67b52fbc52faae5ba46f168cca Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:04:27 -0600
Subject: [PATCH 101/229] ptrace: Remove tracehook_signal_handler

The two line function tracehook_signal_handler is only called from
signal_delivered.  Expand it inline in signal_delivered and remove it.
Just to make it easier to understand what is going on.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-5-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/Kconfig              |  1 -
 include/linux/tracehook.h | 17 -----------------
 kernel/signal.c           |  3 ++-
 3 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a517a949eb1d..6382520ef0a5 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -219,7 +219,6 @@ config TRACE_IRQFLAGS_SUPPORT
 #	CORE_DUMP_USE_REGSET	#define'd in linux/elf.h
 #	TIF_SYSCALL_TRACE	calls ptrace_report_syscall_{entry,exit}
 #	TIF_NOTIFY_RESUME	calls tracehook_notify_resume()
-#	signal delivery		calls tracehook_signal_handler()
 #
 config HAVE_ARCH_TRACEHOOK
 	bool
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 819e82ac09bd..b77bf4917196 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -52,23 +52,6 @@
 struct linux_binprm;
 
 
-/**
- * tracehook_signal_handler - signal handler setup is complete
- * @stepping:		nonzero if debugger single-step or block-step in use
- *
- * Called by the arch code after a signal handler has been set up.
- * Register and stack state reflects the user handler about to run.
- * Signal mask changes have already been made.
- *
- * Called without locks, shortly before returning to user mode
- * (or handling more signals).
- */
-static inline void tracehook_signal_handler(int stepping)
-{
-	if (stepping)
-		ptrace_notify(SIGTRAP);
-}
-
 /**
  * set_notify_resume - cause tracehook_notify_resume() to be called
  * @task:		task that will call tracehook_notify_resume()
diff --git a/kernel/signal.c b/kernel/signal.c
index 38602738866e..0e0bd1c1068b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2898,7 +2898,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 	set_current_blocked(&blocked);
 	if (current->sas_ss_flags & SS_AUTODISARM)
 		sas_ss_reset(current);
-	tracehook_signal_handler(stepping);
+	if (stepping)
+		ptrace_notify(SIGTRAP);
 }
 
 void signal_setup_done(int failed, struct ksignal *ksig, int stepping)

From 8ca07e17c9dd4c4afcb4a3f2ea8f0a0d41c0f982 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Jan 2022 13:55:47 -0600
Subject: [PATCH 102/229] task_work: Remove unnecessary include from
 posix_timers.h

Break a header file circular dependency by removing the unnecessary
include of task_work.h from posix_timers.h.

sched.h -> posix-timers.h
posix-timers.h -> task_work.h
task_work.h -> sched.h

Add missing includes of task_work.h to:
arch/x86/mm/tlb.c
kernel/time/posix-cpu-timers.c

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-6-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/x86/mm/tlb.c              | 1 +
 include/linux/posix-timers.h   | 1 -
 kernel/time/posix-cpu-timers.c | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index a6cf56a14939..6eb4d91d5365 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -9,6 +9,7 @@
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
 #include <linux/sched/smt.h>
+#include <linux/task_work.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 5bbcd280bfd2..83539bb2f023 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -6,7 +6,6 @@
 #include <linux/list.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
-#include <linux/task_work.h>
 
 struct kernel_siginfo;
 struct task_struct;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..9190d9eb236d 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,7 @@
 #include <linux/workqueue.h>
 #include <linux/compat.h>
 #include <linux/sched/deadline.h>
+#include <linux/task_work.h>
 
 #include "posix-timers.h"
 

From 7f62d40d9cb50fd146fe8ff071f98fa3c1855083 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 08:52:41 -0600
Subject: [PATCH 103/229] task_work: Introduce task_work_pending

Wrap the test of task->task_works in a helper function to make
it clear what is being tested.

All of the other readers of task->task_work use READ_ONCE and this is
even necessary on current as other processes can update
task->task_work.  So for consistency I have added READ_ONCE into
task_work_pending.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-7-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/io_uring.c             | 6 +++---
 include/linux/task_work.h | 5 +++++
 include/linux/tracehook.h | 4 ++--
 kernel/signal.c           | 4 ++--
 kernel/task_work.c        | 2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e54c4127422e..e85261079a78 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2590,7 +2590,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 
 static inline bool io_run_task_work(void)
 {
-	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
+	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
 		__set_current_state(TASK_RUNNING);
 		tracehook_notify_signal();
 		return true;
@@ -7602,7 +7602,7 @@ static int io_sq_thread(void *data)
 		}
 
 		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
-		if (!io_sqd_events_pending(sqd) && !current->task_works) {
+		if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
 			bool needs_sched = true;
 
 			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
@@ -10321,7 +10321,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 
 		hlist_for_each_entry(req, list, hash_node)
 			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
-					req->task->task_works != NULL);
+					task_work_pending(req->task));
 	}
 
 	seq_puts(m, "CqOverflowList:\n");
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 5b8a93f288bb..897494b597ba 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -19,6 +19,11 @@ enum task_work_notify_mode {
 	TWA_SIGNAL,
 };
 
+static inline bool task_work_pending(struct task_struct *task)
+{
+	return READ_ONCE(task->task_works);
+}
+
 int task_work_add(struct task_struct *task, struct callback_head *twork,
 			enum task_work_notify_mode mode);
 
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index b77bf4917196..fa834a22e86e 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -90,7 +90,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	 * hlist_add_head(task->task_works);
 	 */
 	smp_mb__after_atomic();
-	if (unlikely(current->task_works))
+	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 #ifdef CONFIG_KEYS_REQUEST_CACHE
@@ -115,7 +115,7 @@ static inline void tracehook_notify_signal(void)
 {
 	clear_thread_flag(TIF_NOTIFY_SIGNAL);
 	smp_mb__after_atomic();
-	if (current->task_works)
+	if (task_work_pending(current))
 		task_work_run();
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 0e0bd1c1068b..3b4cf25fb9b3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2344,7 +2344,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 void ptrace_notify(int exit_code)
 {
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
-	if (unlikely(current->task_works))
+	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
@@ -2626,7 +2626,7 @@ bool get_signal(struct ksignal *ksig)
 	struct signal_struct *signal = current->signal;
 	int signr;
 
-	if (unlikely(current->task_works))
+	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	/*
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 1698fbe6f0e1..cc6fccb0e24d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -78,7 +78,7 @@ task_work_cancel_match(struct task_struct *task,
 	struct callback_head *work;
 	unsigned long flags;
 
-	if (likely(!task->task_works))
+	if (likely(!task_work_pending(task)))
 		return NULL;
 	/*
 	 * If cmpxchg() fails we continue without updating pprev.

From 8ba62d37949e248c698c26e0d82d72fda5d33ebf Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 09:51:14 -0600
Subject: [PATCH 104/229] task_work: Call tracehook_notify_signal from
 get_signal on all architectures

Always handle TIF_NOTIFY_SIGNAL in get_signal.  With commit 35d0b389f3b2
("task_work: unconditionally run task_work from get_signal()") always
calling task_work_run all of the work of tracehook_notify_signal is
already happening except clearing TIF_NOTIFY_SIGNAL.

Factor clear_notify_signal out of tracehook_notify_signal and use it in
get_signal so that get_signal only needs one call of task_work_run.

To keep the semantics in sync update xfer_to_guest_mode_work (which
does not call get_signal) to call tracehook_notify_signal if either
_TIF_SIGPENDING or _TIF_NOTIFY_SIGNAL.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-8-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/s390/kernel/signal.c    |  4 ++--
 arch/x86/kernel/signal.c     |  4 ++--
 include/linux/entry-common.h |  2 +-
 include/linux/tracehook.h    |  9 +++++++--
 kernel/entry/common.c        | 12 ++----------
 kernel/entry/kvm.c           |  2 +-
 kernel/signal.c              | 14 +++-----------
 7 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 307f5d99514d..ea9e5e8182cd 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -453,7 +453,7 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset,
  * stack-frames in one go after that.
  */
 
-void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
+void arch_do_signal_or_restart(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 	sigset_t *oldset = sigmask_to_save();
@@ -466,7 +466,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
 	current->thread.system_call =
 		test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0;
 
-	if (has_signal && get_signal(&ksig)) {
+	if (get_signal(&ksig)) {
 		/* Whee!  Actually deliver the signal.  */
 		if (current->thread.system_call) {
 			regs->int_code = current->thread.system_call;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ec71e06ae364..de3d5b5724d8 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -861,11 +861,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
+void arch_do_signal_or_restart(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
-	if (has_signal && get_signal(&ksig)) {
+	if (get_signal(&ksig)) {
 		/* Whee! Actually deliver the signal.  */
 		handle_signal(&ksig, regs);
 		return;
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 9efbdda61f7a..3537fd25f14e 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -257,7 +257,7 @@ static __always_inline void arch_exit_to_user_mode(void) { }
  *
  * Invoked from exit_to_user_mode_loop().
  */
-void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal);
+void arch_do_signal_or_restart(struct pt_regs *regs);
 
 /**
  * exit_to_user_mode - Fixup state when exiting to user mode
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index fa834a22e86e..b44a7820c468 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -106,6 +106,12 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	rseq_handle_notify_resume(NULL, regs);
 }
 
+static inline void clear_notify_signal(void)
+{
+	clear_thread_flag(TIF_NOTIFY_SIGNAL);
+	smp_mb__after_atomic();
+}
+
 /*
  * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
  * is currently used by TWA_SIGNAL based task_work, which requires breaking
@@ -113,8 +119,7 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
  */
 static inline void tracehook_notify_signal(void)
 {
-	clear_thread_flag(TIF_NOTIFY_SIGNAL);
-	smp_mb__after_atomic();
+	clear_notify_signal();
 	if (task_work_pending(current))
 		task_work_run();
 }
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f0b1daa1e8da..79eaf9b4b10d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -139,15 +139,7 @@ void noinstr exit_to_user_mode(void)
 }
 
 /* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
-	if (ti_work & _TIF_NOTIFY_SIGNAL)
-		tracehook_notify_signal();
-
-	arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
 
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 					    unsigned long ti_work)
@@ -170,7 +162,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 			klp_update_patch_state(current);
 
 		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
-			handle_signal_work(regs, ti_work);
+			arch_do_signal_or_restart(regs);
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
 			tracehook_notify_resume(regs);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 96d476e06c77..cabf36a489e4 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,7 +8,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 	do {
 		int ret;
 
-		if (ti_work & _TIF_NOTIFY_SIGNAL)
+		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 			tracehook_notify_signal();
 
 		if (ti_work & _TIF_SIGPENDING) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 3b4cf25fb9b3..8632b88982c9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2626,20 +2626,12 @@ bool get_signal(struct ksignal *ksig)
 	struct signal_struct *signal = current->signal;
 	int signr;
 
+	clear_notify_signal();
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
-	/*
-	 * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
-	 * that the arch handlers don't all have to do it. If we get here
-	 * without TIF_SIGPENDING, just exit after running signal work.
-	 */
-	if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
-		if (test_thread_flag(TIF_NOTIFY_SIGNAL))
-			tracehook_notify_signal();
-		if (!task_sigpending(current))
-			return false;
-	}
+	if (!task_sigpending(current))
+		return false;
 
 	if (unlikely(uprobe_deny_signal()))
 		return false;

From 7c5d8fa6fbb12a3f0eefe8762bfede508e147cb3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 11:18:54 -0600
Subject: [PATCH 105/229] task_work: Decouple TIF_NOTIFY_SIGNAL and task_work

There are a small handful of reasons besides pending signals that the
kernel might want to break out of interruptible sleeps.  The flag
TIF_NOTIFY_SIGNAL and the helpers that set and clear TIF_NOTIFY_SIGNAL
provide that the infrastructure for breaking out of interruptible
sleeps and entering the return to user space slow path for those
cases.

Expand tracehook_notify_signal inline in it's callers and remove it,
which makes clear that TIF_NOTIFY_SIGNAL and task_work are separate
concepts.

Update the comment on set_notify_signal to more accurately describe
it's purpose.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-9-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/io-wq.c                |  4 +++-
 fs/io_uring.c             |  4 +++-
 include/linux/tracehook.h | 15 ++-------------
 kernel/entry/kvm.c        |  7 +++++--
 4 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index bb7f161bb19c..8b9147873c2c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -515,7 +515,9 @@ static bool io_flush_signals(void)
 {
 	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) {
 		__set_current_state(TASK_RUNNING);
-		tracehook_notify_signal();
+		clear_notify_signal();
+		if (task_work_pending(current))
+			task_work_run();
 		return true;
 	}
 	return false;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e85261079a78..d5fbae1030f9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2592,7 +2592,9 @@ static inline bool io_run_task_work(void)
 {
 	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) {
 		__set_current_state(TASK_RUNNING);
-		tracehook_notify_signal();
+		clear_notify_signal();
+		if (task_work_pending(current))
+			task_work_run();
 		return true;
 	}
 
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index b44a7820c468..e5d676e841e3 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -113,19 +113,8 @@ static inline void clear_notify_signal(void)
 }
 
 /*
- * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This
- * is currently used by TWA_SIGNAL based task_work, which requires breaking
- * wait loops to ensure that task_work is noticed and run.
- */
-static inline void tracehook_notify_signal(void)
-{
-	clear_notify_signal();
-	if (task_work_pending(current))
-		task_work_run();
-}
-
-/*
- * Called when we have work to process from exit_to_user_mode_loop()
+ * Called to break out of interruptible wait loops, and enter the
+ * exit_to_user_mode_loop().
  */
 static inline void set_notify_signal(struct task_struct *task)
 {
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index cabf36a489e4..3ab5f98988c3 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 	do {
 		int ret;
 
-		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
-			tracehook_notify_signal();
+		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+			clear_notify_signal();
+			if (task_work_pending(current))
+				task_work_run();
+		}
 
 		if (ti_work & _TIF_SIGPENDING) {
 			kvm_handle_signal_exit(vcpu);

From 593febb143d17aa2096cd123c9d62b6981eb1d97 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 11:46:54 -0600
Subject: [PATCH 106/229] signal: Move set_notify_signal and
 clear_notify_signal into sched/signal.h

The header tracehook.h is no place for code to live.  The functions
set_notify_signal and clear_notify_signal are not about signals.  They
are about interruptions that act like signals.  The fundamental signal
primitives wind up calling set_notify_signal and clear_notify_signal.
Which means they need to be maintained with the signal code.

Since set_notify_signal and clear_notify_signal must be maintained
with the signal subsystem move them into sched/signal.h and claim them
as part of the signal subsystem.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-10-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/sched/signal.h | 17 +++++++++++++++++
 include/linux/tracehook.h    | 17 -----------------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index b6ecb9fc4cd2..3c8b34876744 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -349,6 +349,23 @@ extern void sigqueue_free(struct sigqueue *);
 extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type);
 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 
+static inline void clear_notify_signal(void)
+{
+	clear_thread_flag(TIF_NOTIFY_SIGNAL);
+	smp_mb__after_atomic();
+}
+
+/*
+ * Called to break out of interruptible wait loops, and enter the
+ * exit_to_user_mode_loop().
+ */
+static inline void set_notify_signal(struct task_struct *task)
+{
+	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+	    !wake_up_state(task, TASK_INTERRUPTIBLE))
+		kick_process(task);
+}
+
 static inline int restart_syscall(void)
 {
 	set_tsk_thread_flag(current, TIF_SIGPENDING);
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index e5d676e841e3..1b7365aef8da 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -106,21 +106,4 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
 	rseq_handle_notify_resume(NULL, regs);
 }
 
-static inline void clear_notify_signal(void)
-{
-	clear_thread_flag(TIF_NOTIFY_SIGNAL);
-	smp_mb__after_atomic();
-}
-
-/*
- * Called to break out of interruptible wait loops, and enter the
- * exit_to_user_mode_loop().
- */
-static inline void set_notify_signal(struct task_struct *task)
-{
-	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
-	    !wake_up_state(task, TASK_INTERRUPTIBLE))
-		kick_process(task);
-}
-
 #endif	/* <linux/tracehook.h> */

From d3c51a0c8944e3a5bba458358b4c1f9ae2de0133 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 28 Jan 2022 14:36:58 -0600
Subject: [PATCH 107/229] resume_user_mode: Remove #ifdef TIF_NOTIFY_RESUME in
 set_notify_resume

Every architecture defines TIF_NOTIFY_RESUME so remove the unnecessary
ifdef.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-11-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/tracehook.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 1b7365aef8da..946404ebe10b 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -63,10 +63,8 @@ struct linux_binprm;
  */
 static inline void set_notify_resume(struct task_struct *task)
 {
-#ifdef TIF_NOTIFY_RESUME
 	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
 		kick_process(task);
-#endif
 }
 
 /**

From 03248addadf1a5ef0a03cbcd5ec905b49adb9658 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 12:20:45 -0600
Subject: [PATCH 108/229] resume_user_mode: Move to resume_user_mode.h

Move set_notify_resume and tracehook_notify_resume into resume_user_mode.h.
While doing that rename tracehook_notify_resume to resume_user_mode_work.

Update all of the places that included tracehook.h for these functions to
include resume_user_mode.h instead.

Update all of the callers of tracehook_notify_resume to call
resume_user_mode_work.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-12-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 arch/Kconfig                     |  2 +-
 arch/alpha/kernel/signal.c       |  4 +-
 arch/arc/kernel/signal.c         |  4 +-
 arch/arm/kernel/signal.c         |  4 +-
 arch/arm64/kernel/signal.c       |  4 +-
 arch/csky/kernel/signal.c        |  4 +-
 arch/h8300/kernel/signal.c       |  4 +-
 arch/hexagon/kernel/process.c    |  4 +-
 arch/hexagon/kernel/signal.c     |  1 -
 arch/ia64/kernel/process.c       |  4 +-
 arch/ia64/kernel/ptrace.c        |  2 +-
 arch/ia64/kernel/signal.c        |  1 -
 arch/m68k/kernel/signal.c        |  4 +-
 arch/microblaze/kernel/signal.c  |  4 +-
 arch/mips/kernel/signal.c        |  4 +-
 arch/nds32/kernel/signal.c       |  4 +-
 arch/nios2/kernel/signal.c       |  4 +-
 arch/openrisc/kernel/signal.c    |  4 +-
 arch/parisc/kernel/signal.c      |  4 +-
 arch/powerpc/kernel/signal.c     |  4 +-
 arch/riscv/kernel/signal.c       |  4 +-
 arch/sh/kernel/signal_32.c       |  4 +-
 arch/sparc/kernel/signal32.c     |  1 -
 arch/sparc/kernel/signal_32.c    |  4 +-
 arch/sparc/kernel/signal_64.c    |  4 +-
 arch/um/kernel/process.c         |  4 +-
 arch/xtensa/kernel/signal.c      |  4 +-
 block/blk-cgroup.c               |  2 +-
 include/linux/entry-kvm.h        |  2 +-
 include/linux/resume_user_mode.h | 64 ++++++++++++++++++++++++++++++++
 include/linux/tracehook.h        | 51 -------------------------
 kernel/entry/common.c            |  4 +-
 kernel/entry/kvm.c               |  2 +-
 kernel/task_work.c               |  2 +-
 mm/memcontrol.c                  |  2 +-
 35 files changed, 117 insertions(+), 107 deletions(-)
 create mode 100644 include/linux/resume_user_mode.h

diff --git a/arch/Kconfig b/arch/Kconfig
index 6382520ef0a5..2e3979c3d66d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -218,7 +218,7 @@ config TRACE_IRQFLAGS_SUPPORT
 #	linux/regset.h		user_regset interfaces
 #	CORE_DUMP_USE_REGSET	#define'd in linux/elf.h
 #	TIF_SYSCALL_TRACE	calls ptrace_report_syscall_{entry,exit}
-#	TIF_NOTIFY_RESUME	calls tracehook_notify_resume()
+#	TIF_NOTIFY_RESUME	calls resume_user_mode_work()
 #
 config HAVE_ARCH_TRACEHOOK
 	bool
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index d8ed71d5bed3..6f47f256fe80 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -22,7 +22,7 @@
 #include <linux/binfmts.h>
 #include <linux/bitops.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <linux/uaccess.h>
 #include <asm/sigcontext.h>
@@ -531,7 +531,7 @@ do_work_pending(struct pt_regs *regs, unsigned long thread_flags,
 				do_signal(regs, r0, r19);
 				r0 = 0;
 			} else {
-				tracehook_notify_resume(regs);
+				resume_user_mode_work(regs);
 			}
 		}
 		local_irq_disable();
diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c
index cb2f88502baf..f748483628f2 100644
--- a/arch/arc/kernel/signal.c
+++ b/arch/arc/kernel/signal.c
@@ -49,7 +49,7 @@
 #include <linux/personality.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/sched/task_stack.h>
 
 #include <asm/ucontext.h>
@@ -438,5 +438,5 @@ void do_notify_resume(struct pt_regs *regs)
 	 * user mode
 	 */
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index c532a6041066..459abc5d1819 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -9,7 +9,7 @@
 #include <linux/signal.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/uprobes.h>
 #include <linux/syscalls.h>
 
@@ -627,7 +627,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 			} else if (thread_flags & _TIF_UPROBE) {
 				uprobe_notify_resume(regs);
 			} else {
-				tracehook_notify_resume(regs);
+				resume_user_mode_work(regs);
 			}
 		}
 		local_irq_disable();
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index d8aaf4b6f432..413c51de9d10 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -17,7 +17,7 @@
 #include <linux/uaccess.h>
 #include <linux/sizes.h>
 #include <linux/string.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/ratelimit.h>
 #include <linux/syscalls.h>
 
@@ -941,7 +941,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags)
 				do_signal(regs);
 
 			if (thread_flags & _TIF_NOTIFY_RESUME)
-				tracehook_notify_resume(regs);
+				resume_user_mode_work(regs);
 
 			if (thread_flags & _TIF_FOREIGN_FPSTATE)
 				fpsimd_restore_current_state();
diff --git a/arch/csky/kernel/signal.c b/arch/csky/kernel/signal.c
index c7b763d2f526..7a3149a27e4d 100644
--- a/arch/csky/kernel/signal.c
+++ b/arch/csky/kernel/signal.c
@@ -3,7 +3,7 @@
 #include <linux/signal.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/traps.h>
 #include <asm/ucontext.h>
@@ -265,5 +265,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 		do_signal(regs);
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index 75a1c36b105a..0716fc8a8ce2 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -39,7 +39,7 @@
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/binfmts.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/setup.h>
 #include <linux/uaccess.h>
@@ -283,5 +283,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 		do_signal(regs);
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c
index 232dfd8956aa..ae3f728eeca0 100644
--- a/arch/hexagon/kernel/process.c
+++ b/arch/hexagon/kernel/process.c
@@ -14,7 +14,7 @@
 #include <linux/tick.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 /*
  * Program thread launch.  Often defined as a macro in processor.h,
@@ -178,7 +178,7 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 	}
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 		return 1;
 	}
 
diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c
index 94cc7ff52dce..bcba31e9e0ae 100644
--- a/arch/hexagon/kernel/signal.c
+++ b/arch/hexagon/kernel/signal.c
@@ -7,7 +7,6 @@
 
 #include <linux/linkage.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
 #include <linux/sched/task_stack.h>
 
 #include <asm/registers.h>
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 834df24a88f1..d7a256bd9d6b 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -32,7 +32,7 @@
 #include <linux/delay.h>
 #include <linux/kdebug.h>
 #include <linux/utsname.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/rcupdate.h>
 
 #include <asm/cpu.h>
@@ -179,7 +179,7 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall)
 
 	if (test_thread_flag(TIF_NOTIFY_RESUME)) {
 		local_irq_enable();	/* force interrupt enable */
-		tracehook_notify_resume(&scr->pt);
+		resume_user_mode_work(&scr->pt);
 	}
 
 	/* copy user rbs to kernel rbs */
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 6af64aae087d..a19acd9f5e1f 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -23,7 +23,7 @@
 #include <linux/signal.h>
 #include <linux/regset.h>
 #include <linux/elf.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/processor.h>
 #include <asm/ptrace_offsets.h>
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index c1b299760bf7..51cf6a7ec158 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/smp.h>
diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index 338817d0cb3f..49533f65958a 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -43,7 +43,7 @@
 #include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/extable.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/setup.h>
 #include <linux/uaccess.h>
@@ -1109,5 +1109,5 @@ void do_notify_resume(struct pt_regs *regs)
 		do_signal(regs);
 
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c
index 23e8a9336a29..561eb82d7af6 100644
--- a/arch/microblaze/kernel/signal.c
+++ b/arch/microblaze/kernel/signal.c
@@ -31,7 +31,7 @@
 #include <linux/personality.h>
 #include <linux/percpu.h>
 #include <linux/linkage.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <asm/entry.h>
 #include <asm/ucontext.h>
 #include <linux/uaccess.h>
@@ -311,5 +311,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, int in_syscall)
 		do_signal(regs, in_syscall);
 
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 5bce782e694c..1a99f26bf99f 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -25,7 +25,7 @@
 #include <linux/compiler.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/abi.h>
 #include <asm/asm.h>
@@ -916,7 +916,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused,
 		do_signal(regs);
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 
 	user_enter();
 }
diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c
index 7e3ca430a223..551caef595cb 100644
--- a/arch/nds32/kernel/signal.c
+++ b/arch/nds32/kernel/signal.c
@@ -6,7 +6,7 @@
 #include <linux/ptrace.h>
 #include <linux/personality.h>
 #include <linux/freezer.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/uaccess.h>
 
 #include <asm/cacheflush.h>
@@ -380,5 +380,5 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags)
 		do_signal(regs);
 
 	if (thread_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/nios2/kernel/signal.c b/arch/nios2/kernel/signal.c
index 2009ae2d3c3b..530b60c99545 100644
--- a/arch/nios2/kernel/signal.c
+++ b/arch/nios2/kernel/signal.c
@@ -15,7 +15,7 @@
 #include <linux/uaccess.h>
 #include <linux/unistd.h>
 #include <linux/personality.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/ucontext.h>
 #include <asm/cacheflush.h>
@@ -319,7 +319,7 @@ asmlinkage int do_notify_resume(struct pt_regs *regs)
 			return restart;
 		}
 	} else if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 
 	return 0;
 }
diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c
index 92c5b70740f5..80f69740c731 100644
--- a/arch/openrisc/kernel/signal.c
+++ b/arch/openrisc/kernel/signal.c
@@ -21,7 +21,7 @@
 #include <linux/ptrace.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <asm/processor.h>
 #include <asm/syscall.h>
@@ -309,7 +309,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 				}
 				syscall = 0;
 			} else {
-				tracehook_notify_resume(regs);
+				resume_user_mode_work(regs);
 			}
 		}
 		local_irq_disable();
diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c
index 46b1050640b8..2f7ebe9add20 100644
--- a/arch/parisc/kernel/signal.c
+++ b/arch/parisc/kernel/signal.c
@@ -22,7 +22,7 @@
 #include <linux/errno.h>
 #include <linux/wait.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/compat.h>
@@ -602,5 +602,5 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall)
 		do_signal(regs, in_syscall);
 
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index b93b87df499d..f7f8620663c7 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -9,7 +9,7 @@
  * this archive for more details.
  */
 
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/signal.h>
 #include <linux/uprobes.h>
 #include <linux/key.h>
@@ -294,7 +294,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 	}
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
 
 static unsigned long get_tm_stackpointer(struct task_struct *tsk)
diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c
index c2d5ecbe5526..d80bf5896c6f 100644
--- a/arch/riscv/kernel/signal.c
+++ b/arch/riscv/kernel/signal.c
@@ -9,7 +9,7 @@
 #include <linux/signal.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/linkage.h>
 
 #include <asm/ucontext.h>
@@ -317,5 +317,5 @@ asmlinkage __visible void do_notify_resume(struct pt_regs *regs,
 		do_signal(regs);
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c
index dd3092911efa..90f495d35db2 100644
--- a/arch/sh/kernel/signal_32.c
+++ b/arch/sh/kernel/signal_32.c
@@ -25,7 +25,7 @@
 #include <linux/personality.h>
 #include <linux/binfmts.h>
 #include <linux/io.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <asm/ucontext.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -503,5 +503,5 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0,
 		do_signal(regs, save_r0);
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index 6cc124a3bb98..f9fe502b81c6 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -20,7 +20,6 @@
 #include <linux/binfmts.h>
 #include <linux/compat.h>
 #include <linux/bitops.h>
-#include <linux/tracehook.h>
 
 #include <linux/uaccess.h>
 #include <asm/ptrace.h>
diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c
index ffab16369bea..80c89b362d8b 100644
--- a/arch/sparc/kernel/signal_32.c
+++ b/arch/sparc/kernel/signal_32.c
@@ -19,7 +19,7 @@
 #include <linux/smp.h>
 #include <linux/binfmts.h>	/* do_coredum */
 #include <linux/bitops.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 #include <linux/uaccess.h>
 #include <asm/ptrace.h>
@@ -524,7 +524,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0,
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs, orig_i0);
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
 
 asmlinkage int do_sys_sigstack(struct sigstack __user *ssptr,
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 2a78d2af1265..8b9fc76cd3e0 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -15,7 +15,7 @@
 #include <linux/errno.h>
 #include <linux/wait.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/unistd.h>
 #include <linux/mm.h>
 #include <linux/tty.h>
@@ -552,7 +552,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
 		do_signal(regs, orig_i0);
 	if (thread_info_flags & _TIF_NOTIFY_RESUME)
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 	user_enter();
 }
 
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 4a420778ed87..80504680be08 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -23,7 +23,7 @@
 #include <linux/seq_file.h>
 #include <linux/tick.h>
 #include <linux/threads.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <asm/current.h>
 #include <asm/mmu_context.h>
 #include <linux/uaccess.h>
@@ -104,7 +104,7 @@ void interrupt_end(void)
 	    test_thread_flag(TIF_NOTIFY_SIGNAL))
 		do_signal(regs);
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
 
 int get_current_pid(void)
diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c
index f6c949895b3e..6f68649e86ba 100644
--- a/arch/xtensa/kernel/signal.c
+++ b/arch/xtensa/kernel/signal.c
@@ -19,7 +19,7 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/personality.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/sched/task_stack.h>
 
 #include <asm/ucontext.h>
@@ -511,5 +511,5 @@ void do_notify_resume(struct pt_regs *regs)
 		do_signal(regs);
 
 	if (test_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(regs);
+		resume_user_mode_work(regs);
 }
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 650f7e27989f..4d8be1634bc6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -28,7 +28,7 @@
 #include <linux/atomic.h>
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/part_stat.h>
 #include "blk.h"
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 07c878d6e323..6813171afccb 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -3,7 +3,7 @@
 #define __LINUX_ENTRYKVM_H
 
 #include <linux/static_call_types.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
new file mode 100644
index 000000000000..285189454449
--- /dev/null
+++ b/include/linux/resume_user_mode.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef LINUX_RESUME_USER_MODE_H
+#define LINUX_RESUME_USER_MODE_H
+
+#include <linux/sched.h>
+#include <linux/task_work.h>
+#include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
+
+/**
+ * set_notify_resume - cause resume_user_mode_work() to be called
+ * @task:		task that will call resume_user_mode_work()
+ *
+ * Calling this arranges that @task will call resume_user_mode_work()
+ * before returning to user mode.  If it's already running in user mode,
+ * it will enter the kernel and call resume_user_mode_work() soon.
+ * If it's blocked, it will not be woken.
+ */
+static inline void set_notify_resume(struct task_struct *task)
+{
+	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
+		kick_process(task);
+}
+
+
+/**
+ * resume_user_mode_work - Perform work before returning to user mode
+ * @regs:		user-mode registers of @current task
+ *
+ * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
+ * about to return to user mode, and the user state in @regs can be
+ * inspected or adjusted.  The caller in arch code has cleared
+ * %TIF_NOTIFY_RESUME before the call.  If the flag gets set again
+ * asynchronously, this will be called again before we return to
+ * user mode.
+ *
+ * Called without locks.
+ */
+static inline void resume_user_mode_work(struct pt_regs *regs)
+{
+	clear_thread_flag(TIF_NOTIFY_RESUME);
+	/*
+	 * This barrier pairs with task_work_add()->set_notify_resume() after
+	 * hlist_add_head(task->task_works);
+	 */
+	smp_mb__after_atomic();
+	if (unlikely(task_work_pending(current)))
+		task_work_run();
+
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+	if (unlikely(current->cached_requested_key)) {
+		key_put(current->cached_requested_key);
+		current->cached_requested_key = NULL;
+	}
+#endif
+
+	mem_cgroup_handle_over_high();
+	blkcg_maybe_throttle_current();
+
+	rseq_handle_notify_resume(NULL, regs);
+}
+
+#endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 946404ebe10b..9f6b3fd1880a 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -52,56 +52,5 @@
 struct linux_binprm;
 
 
-/**
- * set_notify_resume - cause tracehook_notify_resume() to be called
- * @task:		task that will call tracehook_notify_resume()
- *
- * Calling this arranges that @task will call tracehook_notify_resume()
- * before returning to user mode.  If it's already running in user mode,
- * it will enter the kernel and call tracehook_notify_resume() soon.
- * If it's blocked, it will not be woken.
- */
-static inline void set_notify_resume(struct task_struct *task)
-{
-	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
-		kick_process(task);
-}
-
-/**
- * tracehook_notify_resume - report when about to return to user mode
- * @regs:		user-mode registers of @current task
- *
- * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
- * about to return to user mode, and the user state in @regs can be
- * inspected or adjusted.  The caller in arch code has cleared
- * %TIF_NOTIFY_RESUME before the call.  If the flag gets set again
- * asynchronously, this will be called again before we return to
- * user mode.
- *
- * Called without locks.
- */
-static inline void tracehook_notify_resume(struct pt_regs *regs)
-{
-	clear_thread_flag(TIF_NOTIFY_RESUME);
-	/*
-	 * This barrier pairs with task_work_add()->set_notify_resume() after
-	 * hlist_add_head(task->task_works);
-	 */
-	smp_mb__after_atomic();
-	if (unlikely(task_work_pending(current)))
-		task_work_run();
-
-#ifdef CONFIG_KEYS_REQUEST_CACHE
-	if (unlikely(current->cached_requested_key)) {
-		key_put(current->cached_requested_key);
-		current->cached_requested_key = NULL;
-	}
-#endif
-
-	mem_cgroup_handle_over_high();
-	blkcg_maybe_throttle_current();
-
-	rseq_handle_notify_resume(NULL, regs);
-}
 
 #endif	/* <linux/tracehook.h> */
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 79eaf9b4b10d..a86823cad853 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,7 +2,7 @@
 
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/highmem.h>
 #include <linux/livepatch.h>
 #include <linux/audit.h>
@@ -165,7 +165,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
 			arch_do_signal_or_restart(regs);
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
-			tracehook_notify_resume(regs);
+			resume_user_mode_work(regs);
 
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 3ab5f98988c3..9d09f489b60e 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -23,7 +23,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
 			schedule();
 
 		if (ti_work & _TIF_NOTIFY_RESUME)
-			tracehook_notify_resume(NULL);
+			resume_user_mode_work(NULL);
 
 		ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
 		if (ret)
diff --git a/kernel/task_work.c b/kernel/task_work.c
index cc6fccb0e24d..c59e1a49bc40 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/spinlock.h>
 #include <linux/task_work.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 
 static struct callback_head work_exited; /* all we need is ->next == NULL */
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09d342c7cbd0..2aaa400f34d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
 #include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"

From 355f841a3f8ca980c9682937a5257d3a1f6fc09d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 9 Feb 2022 12:47:08 -0600
Subject: [PATCH 109/229] tracehook: Remove tracehook.h

Now that all of the definitions have moved out of tracehook.h into
ptrace.h, sched/signal.h, resume_user_mode.h there is nothing left in
tracehook.h so remove it.

Update the few files that were depending upon tracehook.h to bring in
definitions to use the headers they need directly.

Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20220309162454.123006-13-ebiederm@xmission.com
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 MAINTAINERS                          |  1 -
 arch/s390/include/asm/entry-common.h |  1 -
 arch/s390/kernel/ptrace.c            |  1 -
 arch/s390/kernel/signal.c            |  1 -
 arch/x86/kernel/ptrace.c             |  1 -
 arch/x86/kernel/signal.c             |  1 -
 fs/coredump.c                        |  1 -
 fs/exec.c                            |  1 -
 fs/io-wq.c                           |  2 +-
 fs/io_uring.c                        |  1 -
 fs/proc/array.c                      |  1 -
 fs/proc/base.c                       |  1 -
 include/linux/tracehook.h            | 56 ----------------------------
 kernel/exit.c                        |  3 +-
 kernel/livepatch/transition.c        |  1 -
 kernel/seccomp.c                     |  1 -
 kernel/signal.c                      |  2 +-
 security/apparmor/domain.c           |  1 -
 security/selinux/hooks.c             |  1 -
 19 files changed, 4 insertions(+), 74 deletions(-)
 delete mode 100644 include/linux/tracehook.h

diff --git a/MAINTAINERS b/MAINTAINERS
index ea3e6c914384..2f16a23a26a2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15623,7 +15623,6 @@ F:	arch/*/ptrace*.c
 F:	include/asm-generic/syscall.h
 F:	include/linux/ptrace.h
 F:	include/linux/regset.h
-F:	include/linux/tracehook.h
 F:	include/uapi/linux/ptrace.h
 F:	include/uapi/linux/ptrace.h
 F:	kernel/ptrace.c
diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h
index 17aead80aadb..eabab24b71dd 100644
--- a/arch/s390/include/asm/entry-common.h
+++ b/arch/s390/include/asm/entry-common.h
@@ -5,7 +5,6 @@
 #include <linux/sched.h>
 #include <linux/audit.h>
 #include <linux/randomize_kstack.h>
-#include <linux/tracehook.h>
 #include <linux/processor.h>
 #include <linux/uaccess.h>
 #include <asm/timex.h>
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 0ea3d02b378d..641fa36f6101 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -21,7 +21,6 @@
 #include <linux/signal.h>
 #include <linux/elf.h>
 #include <linux/regset.h>
-#include <linux/tracehook.h>
 #include <linux/seccomp.h>
 #include <linux/compat.h>
 #include <trace/syscall.h>
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index ea9e5e8182cd..8b7b5f80f722 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -25,7 +25,6 @@
 #include <linux/tty.h>
 #include <linux/personality.h>
 #include <linux/binfmts.h>
-#include <linux/tracehook.h>
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <asm/ucontext.h>
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 6d2244c94799..419768d7605e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -13,7 +13,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/security.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index de3d5b5724d8..e439eb14325f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -18,7 +18,6 @@
 #include <linux/kstrtox.h>
 #include <linux/errno.h>
 #include <linux/wait.h>
-#include <linux/tracehook.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/personality.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c060c0a2d72..f54c5e316df3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -31,7 +31,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
-#include <linux/tracehook.h>
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..e23e2d430485 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,7 +56,6 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
-#include <linux/tracehook.h>
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
 #include <linux/fs_struct.h>
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 8b9147873c2c..cb3cb1833ef6 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -13,7 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
 #include <linux/audit.h>
 #include <uapi/linux/io_uring.h>
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d5fbae1030f9..6c7eacc0ebd6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,7 +78,6 @@
 #include <linux/task_work.h>
 #include <linux/pagemap.h>
 #include <linux/io_uring.h>
-#include <linux/tracehook.h>
 #include <linux/audit.h>
 #include <linux/security.h>
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index fd8b0c12b2cb..eb815759842c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -88,7 +88,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/prctl.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/string_helpers.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d654ce7150fd..01fb37ecc89f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,7 +74,6 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/ptrace.h>
-#include <linux/tracehook.h>
 #include <linux/printk.h>
 #include <linux/cache.h>
 #include <linux/cgroup.h>
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
deleted file mode 100644
index 9f6b3fd1880a..000000000000
--- a/include/linux/tracehook.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Tracing hooks
- *
- * Copyright (C) 2008-2009 Red Hat, Inc.  All rights reserved.
- *
- * This file defines hook entry points called by core code where
- * user tracing/debugging support might need to do something.  These
- * entry points are called tracehook_*().  Each hook declared below
- * has a detailed kerneldoc comment giving the context (locking et
- * al) from which it is called, and the meaning of its return value.
- *
- * Each function here typically has only one call site, so it is ok
- * to have some nontrivial tracehook_*() inlines.  In all cases, the
- * fast path when no tracing is enabled should be very short.
- *
- * The purpose of this file and the tracehook_* layer is to consolidate
- * the interface that the kernel core and arch code uses to enable any
- * user debugging or tracing facility (such as ptrace).  The interfaces
- * here are carefully documented so that maintainers of core and arch
- * code do not need to think about the implementation details of the
- * tracing facilities.  Likewise, maintainers of the tracing code do not
- * need to understand all the calling core or arch code in detail, just
- * documented circumstances of each call, such as locking conditions.
- *
- * If the calling core code changes so that locking is different, then
- * it is ok to change the interface documented here.  The maintainer of
- * core code changing should notify the maintainers of the tracing code
- * that they need to work out the change.
- *
- * Some tracehook_*() inlines take arguments that the current tracing
- * implementations might not necessarily use.  These function signatures
- * are chosen to pass in all the information that is on hand in the
- * caller and might conceivably be relevant to a tracer, so that the
- * core code won't have to be updated when tracing adds more features.
- * If a call site changes so that some of those parameters are no longer
- * already on hand without extra work, then the tracehook_* interface
- * can change so there is no make-work burden on the core code.  The
- * maintainer of core code changing should notify the maintainers of the
- * tracing code that they need to work out the change.
- */
-
-#ifndef _LINUX_TRACEHOOK_H
-#define _LINUX_TRACEHOOK_H	1
-
-#include <linux/sched.h>
-#include <linux/ptrace.h>
-#include <linux/security.h>
-#include <linux/task_work.h>
-#include <linux/memcontrol.h>
-#include <linux/blk-cgroup.h>
-struct linux_binprm;
-
-
-
-#endif	/* <linux/tracehook.h> */
diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..9326d1f97fc7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,8 @@
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..df808d97d84f 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,7 +9,6 @@
 
 #include <linux/cpu.h>
 #include <linux/stacktrace.h>
-#include <linux/tracehook.h>
 #include "core.h"
 #include "patch.h"
 #include "transition.h"
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4d8f44a17727..63198086ee83 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -39,7 +39,6 @@
 #include <linux/pid.h>
 #include <linux/ptrace.h>
 #include <linux/capability.h>
-#include <linux/tracehook.h>
 #include <linux/uaccess.h>
 #include <linux/anon_inodes.h>
 #include <linux/lockdep.h>
diff --git a/kernel/signal.c b/kernel/signal.c
index 8632b88982c9..c2dee5420567 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,7 +32,7 @@
 #include <linux/signal.h>
 #include <linux/signalfd.h>
 #include <linux/ratelimit.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
 #include <linux/capability.h>
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 583680f6cd81..a29e69d2c300 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -14,7 +14,6 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/syscalls.h>
-#include <linux/tracehook.h>
 #include <linux/personality.h>
 #include <linux/xattr.h>
 #include <linux/user_namespace.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5b6895e4fc29..4d2cd6b9f6fc 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -25,7 +25,6 @@
 #include <linux/kd.h>
 #include <linux/kernel.h>
 #include <linux/kernel_read_file.h>
-#include <linux/tracehook.h>
 #include <linux/errno.h>
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>

From cac2ed0a1b0653d95e8714667385214b06f67c0f Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 9 Mar 2022 20:45:40 +0530
Subject: [PATCH 110/229] dt-bindings: dvfs: Use MediaTek CPUFREQ HW as an
 example

Qcom CPUFREQ HW don't have the support for generic performance domains yet.
So use MediaTek CPUFREQ HW that has the support available in mainline.

This also silences the below dtschema warnings for "cpufreq-qcom-hw.yaml":

Documentation/devicetree/bindings/dvfs/performance-domain.example.dt.yaml: performance-controller@12340000: reg: [[305397760, 4096]] is too short
        From schema: Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
Documentation/devicetree/bindings/dvfs/performance-domain.example.dt.yaml: performance-controller@12340000: 'clocks' is a required property
        From schema: Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
Documentation/devicetree/bindings/dvfs/performance-domain.example.dt.yaml: performance-controller@12340000: 'clock-names' is a required property
        From schema: Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
Documentation/devicetree/bindings/dvfs/performance-domain.example.dt.yaml: performance-controller@12340000: '#freq-domain-cells' is a required property
        From schema: Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
Documentation/devicetree/bindings/dvfs/performance-domain.example.dt.yaml: performance-controller@12340000: '#performance-domain-cells' does not match any of the regexes: 'pinctrl-[0-9]+'
        From schema: Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml

Cc: Hector Yuan <hector.yuan@mediatek.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Acked-by: Sudeep Holla <sudeep.holla@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../bindings/dvfs/performance-domain.yaml          | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/Documentation/devicetree/bindings/dvfs/performance-domain.yaml b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
index c8b91207f34d..9e0bcf1a89fe 100644
--- a/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
+++ b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml
@@ -52,10 +52,16 @@ additionalProperties: true
 
 examples:
   - |
-    performance: performance-controller@12340000 {
-        compatible = "qcom,cpufreq-hw";
-        reg = <0x12340000 0x1000>;
-        #performance-domain-cells = <1>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        performance: performance-controller@11bc00 {
+            compatible = "mediatek,cpufreq-hw";
+            reg = <0 0x0011bc10 0 0x120>, <0 0x0011bd30 0 0x120>;
+
+            #performance-domain-cells = <1>;
+        };
     };
 
     // The node above defines a performance controller that is a performance

From b7f2b0d3511a6bbf9387f08f370f9125663e18d8 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 9 Mar 2022 20:45:41 +0530
Subject: [PATCH 111/229] dt-bindings: cpufreq: cpufreq-qcom-hw: Convert to
 YAML bindings

Convert Qualcomm cpufreq devicetree binding to YAML.

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@somainline.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 .../bindings/cpufreq/cpufreq-qcom-hw.txt      | 172 ---------------
 .../bindings/cpufreq/cpufreq-qcom-hw.yaml     | 201 ++++++++++++++++++
 2 files changed, 201 insertions(+), 172 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt
 create mode 100644 Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml

diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt
deleted file mode 100644
index 9299028ee712..000000000000
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-Qualcomm Technologies, Inc. CPUFREQ Bindings
-
-CPUFREQ HW is a hardware engine used by some Qualcomm Technologies, Inc. (QTI)
-SoCs to manage frequency in hardware. It is capable of controlling frequency
-for multiple clusters.
-
-Properties:
-- compatible
-	Usage:		required
-	Value type:	<string>
-	Definition:	must be "qcom,cpufreq-hw" or "qcom,cpufreq-epss".
-
-- clocks
-	Usage:		required
-	Value type:	<phandle> From common clock binding.
-	Definition:	clock handle for XO clock and GPLL0 clock.
-
-- clock-names
-	Usage:		required
-	Value type:	<string> From common clock binding.
-	Definition:	must be "xo", "alternate".
-
-- reg
-	Usage:		required
-	Value type:	<prop-encoded-array>
-	Definition:	Addresses and sizes for the memory of the HW bases in
-			each frequency domain.
-- reg-names
-	Usage:		Optional
-	Value type:	<string>
-	Definition:	Frequency domain name i.e.
-			"freq-domain0", "freq-domain1".
-
-- #freq-domain-cells:
-	Usage:		required.
-	Definition:	Number of cells in a freqency domain specifier.
-
-* Property qcom,freq-domain
-Devices supporting freq-domain must set their "qcom,freq-domain" property with
-phandle to a cpufreq_hw followed by the Domain ID(0/1) in the CPU DT node.
-
-
-Example:
-
-Example 1: Dual-cluster, Quad-core per cluster. CPUs within a cluster switch
-DCVS state together.
-
-/ {
-	cpus {
-		#address-cells = <2>;
-		#size-cells = <0>;
-
-		CPU0: cpu@0 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x0>;
-			enable-method = "psci";
-			next-level-cache = <&L2_0>;
-			qcom,freq-domain = <&cpufreq_hw 0>;
-			L2_0: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-				L3_0: l3-cache {
-				      compatible = "cache";
-				};
-			};
-		};
-
-		CPU1: cpu@100 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x100>;
-			enable-method = "psci";
-			next-level-cache = <&L2_100>;
-			qcom,freq-domain = <&cpufreq_hw 0>;
-			L2_100: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU2: cpu@200 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x200>;
-			enable-method = "psci";
-			next-level-cache = <&L2_200>;
-			qcom,freq-domain = <&cpufreq_hw 0>;
-			L2_200: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU3: cpu@300 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x300>;
-			enable-method = "psci";
-			next-level-cache = <&L2_300>;
-			qcom,freq-domain = <&cpufreq_hw 0>;
-			L2_300: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU4: cpu@400 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x400>;
-			enable-method = "psci";
-			next-level-cache = <&L2_400>;
-			qcom,freq-domain = <&cpufreq_hw 1>;
-			L2_400: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU5: cpu@500 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x500>;
-			enable-method = "psci";
-			next-level-cache = <&L2_500>;
-			qcom,freq-domain = <&cpufreq_hw 1>;
-			L2_500: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU6: cpu@600 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x600>;
-			enable-method = "psci";
-			next-level-cache = <&L2_600>;
-			qcom,freq-domain = <&cpufreq_hw 1>;
-			L2_600: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-
-		CPU7: cpu@700 {
-			device_type = "cpu";
-			compatible = "qcom,kryo385";
-			reg = <0x0 0x700>;
-			enable-method = "psci";
-			next-level-cache = <&L2_700>;
-			qcom,freq-domain = <&cpufreq_hw 1>;
-			L2_700: l2-cache {
-				compatible = "cache";
-				next-level-cache = <&L3_0>;
-			};
-		};
-	};
-
- soc {
-	cpufreq_hw: cpufreq@17d43000 {
-		compatible = "qcom,cpufreq-hw";
-		reg = <0x17d43000 0x1400>, <0x17d45800 0x1400>;
-		reg-names = "freq-domain0", "freq-domain1";
-
-		clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GPLL0>;
-		clock-names = "xo", "alternate";
-
-		#freq-domain-cells = <1>;
-	};
-}
diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
new file mode 100644
index 000000000000..2f1b8b6852a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml
@@ -0,0 +1,201 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/cpufreq/cpufreq-qcom-hw.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies, Inc. CPUFREQ
+
+maintainers:
+  - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+
+description: |
+
+  CPUFREQ HW is a hardware engine used by some Qualcomm Technologies, Inc. (QTI)
+  SoCs to manage frequency in hardware. It is capable of controlling frequency
+  for multiple clusters.
+
+properties:
+  compatible:
+    oneOf:
+      - description: v1 of CPUFREQ HW
+        items:
+          - const: qcom,cpufreq-hw
+
+      - description: v2 of CPUFREQ HW (EPSS)
+        items:
+          - enum:
+              - qcom,sm8250-cpufreq-epss
+          - const: qcom,cpufreq-epss
+
+  reg:
+    minItems: 2
+    items:
+      - description: Frequency domain 0 register region
+      - description: Frequency domain 1 register region
+      - description: Frequency domain 2 register region
+
+  reg-names:
+    minItems: 2
+    items:
+      - const: freq-domain0
+      - const: freq-domain1
+      - const: freq-domain2
+
+  clocks:
+    items:
+      - description: XO Clock
+      - description: GPLL0 Clock
+
+  clock-names:
+    items:
+      - const: xo
+      - const: alternate
+
+  '#freq-domain-cells':
+    const: 1
+
+required:
+  - compatible
+  - reg
+  - clocks
+  - clock-names
+  - '#freq-domain-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/clock/qcom,gcc-sdm845.h>
+    #include <dt-bindings/clock/qcom,rpmh.h>
+
+    // Example 1: Dual-cluster, Quad-core per cluster. CPUs within a cluster
+    // switch DCVS state together.
+    cpus {
+      #address-cells = <2>;
+      #size-cells = <0>;
+
+      CPU0: cpu@0 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x0>;
+        enable-method = "psci";
+        next-level-cache = <&L2_0>;
+        qcom,freq-domain = <&cpufreq_hw 0>;
+        L2_0: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+          L3_0: l3-cache {
+            compatible = "cache";
+          };
+        };
+      };
+
+      CPU1: cpu@100 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x100>;
+        enable-method = "psci";
+        next-level-cache = <&L2_100>;
+        qcom,freq-domain = <&cpufreq_hw 0>;
+        L2_100: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU2: cpu@200 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x200>;
+        enable-method = "psci";
+        next-level-cache = <&L2_200>;
+        qcom,freq-domain = <&cpufreq_hw 0>;
+        L2_200: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU3: cpu@300 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x300>;
+        enable-method = "psci";
+        next-level-cache = <&L2_300>;
+        qcom,freq-domain = <&cpufreq_hw 0>;
+        L2_300: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU4: cpu@400 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x400>;
+        enable-method = "psci";
+        next-level-cache = <&L2_400>;
+        qcom,freq-domain = <&cpufreq_hw 1>;
+        L2_400: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU5: cpu@500 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x500>;
+        enable-method = "psci";
+        next-level-cache = <&L2_500>;
+        qcom,freq-domain = <&cpufreq_hw 1>;
+        L2_500: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU6: cpu@600 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x600>;
+        enable-method = "psci";
+        next-level-cache = <&L2_600>;
+        qcom,freq-domain = <&cpufreq_hw 1>;
+        L2_600: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+
+      CPU7: cpu@700 {
+        device_type = "cpu";
+        compatible = "qcom,kryo385";
+        reg = <0x0 0x700>;
+        enable-method = "psci";
+        next-level-cache = <&L2_700>;
+        qcom,freq-domain = <&cpufreq_hw 1>;
+        L2_700: l2-cache {
+          compatible = "cache";
+          next-level-cache = <&L3_0>;
+        };
+      };
+    };
+
+    soc {
+      #address-cells = <1>;
+      #size-cells = <1>;
+
+      cpufreq@17d43000 {
+        compatible = "qcom,cpufreq-hw";
+        reg = <0x17d43000 0x1400>, <0x17d45800 0x1400>;
+        reg-names = "freq-domain0", "freq-domain1";
+
+        clocks = <&rpmhcc RPMH_CXO_CLK>, <&gcc GPLL0>;
+        clock-names = "xo", "alternate";
+
+        #freq-domain-cells = <1>;
+      };
+    };
+...

From d9d290d7e659e9db3e4518040cc18b97f5535f4a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:21 -0800
Subject: [PATCH 112/229] nvdimm/region: Fix default alignment for small
 regions

In preparation for removing BLK aperture support the NVDIMM unit tests
discovered that the default alignment can be set higher than the
capacity of the region. Fall back to PAGE_SIZE in that case.

Given this has not been seen in the wild, elide notifying -stable.

Fixes: 2522afb86a8c ("libnvdimm/region: Introduce an 'align' attribute")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688416128.2879318.17890707310125575258.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/region_devs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 9ccf3d608799..70ad891a76ba 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1025,6 +1025,9 @@ static unsigned long default_align(struct nd_region *nd_region)
 		}
 	}
 
+	if (nd_region->ndr_size < MEMREMAP_COMPAT_ALIGN_MAX)
+		align = PAGE_SIZE;
+
 	mappings = max_t(u16, 1, nd_region->ndr_mappings);
 	div_u64_rem(align, mappings, &remainder);
 	if (remainder)

From f8669f1d6a86a6b17104ceca9340ded280307ac1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:26 -0800
Subject: [PATCH 113/229] nvdimm/blk: Delete the block-aperture window driver

Block Aperture Window support was an attempt to layer an error model
over PMEM for platforms that did not support machine-check-recovery.
However, it was abandoned before it ever shipped, and only ever existed
in the ACPI specification. Meanwhile Linux has carried a large pile of
dead code for non-shipping infrastructure. For years it has been off to
the side out of the way, but now CXL and recent directions with DAX
support have the potential to collide with this code.

In preparation for adding discontiguous namespace support, a
pre-requisite for the nvdimm subsystem to replace device-mapper for
striping + concatenation use cases, delete BLK aperture support.

On the obscure chance that some hardware vendor shipped support for this
mode, note that the driver will still keep BLK space reserved in the
label area. So an end user in this case would still have the opportunity
to report the regression to get BLK-mode support restored without
risking the data they have on that device.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688416668.2879318.16903178375774275120.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/driver-api/nvdimm/nvdimm.rst | 400 +++++----------------
 drivers/nvdimm/Kconfig                     |  25 +-
 drivers/nvdimm/Makefile                    |   3 -
 drivers/nvdimm/blk.c                       | 335 -----------------
 tools/testing/nvdimm/Kbuild                |   4 -
 tools/testing/nvdimm/config_check.c        |   1 -
 6 files changed, 89 insertions(+), 679 deletions(-)
 delete mode 100644 drivers/nvdimm/blk.c

diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst
index 1d8302b89bd4..7917f6471092 100644
--- a/Documentation/driver-api/nvdimm/nvdimm.rst
+++ b/Documentation/driver-api/nvdimm/nvdimm.rst
@@ -14,10 +14,8 @@ Version 13
 	Overview
 	    Supporting Documents
 	    Git Trees
-	LIBNVDIMM PMEM and BLK
-	Why BLK?
-	    PMEM vs BLK
-	        BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+	LIBNVDIMM PMEM
+	        PMEM-REGIONs, Atomic Sectors, and DAX
 	Example NVDIMM Platform
 	LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
 	    LIBNDCTL: Context
@@ -53,19 +51,12 @@ PMEM:
   block device composed of PMEM is capable of DAX.  A PMEM address range
   may span an interleave of several DIMMs.
 
-BLK:
-  A set of one or more programmable memory mapped apertures provided
-  by a DIMM to access its media.  This indirection precludes the
-  performance benefit of interleaving, but enables DIMM-bounded failure
-  modes.
-
 DPA:
   DIMM Physical Address, is a DIMM-relative offset.  With one DIMM in
   the system there would be a 1:1 system-physical-address:DPA association.
   Once more DIMMs are added a memory controller interleave must be
   decoded to determine the DPA associated with a given
-  system-physical-address.  BLK capacity always has a 1:1 relationship
-  with a single-DIMM's DPA range.
+  system-physical-address.
 
 DAX:
   File system extensions to bypass the page cache and block layer to
@@ -84,30 +75,30 @@ BTT:
   Block Translation Table: Persistent memory is byte addressable.
   Existing software may have an expectation that the power-fail-atomicity
   of writes is at least one sector, 512 bytes.  The BTT is an indirection
-  table with atomic update semantics to front a PMEM/BLK block device
+  table with atomic update semantics to front a PMEM block device
   driver and present arbitrary atomic sector sizes.
 
 LABEL:
   Metadata stored on a DIMM device that partitions and identifies
-  (persistently names) storage between PMEM and BLK.  It also partitions
-  BLK storage to host BTTs with different parameters per BLK-partition.
-  Note that traditional partition tables, GPT/MBR, are layered on top of a
-  BLK or PMEM device.
+  (persistently names) capacity allocated to different PMEM namespaces. It
+  also indicates whether an address abstraction like a BTT is applied to
+  the namepsace.  Note that traditional partition tables, GPT/MBR, are
+  layered on top of a PMEM namespace, or an address abstraction like BTT
+  if present, but partition support is deprecated going forward.
 
 
 Overview
 ========
 
-The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
-PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
-and BLK mode access.  These three modes of operation are described by
-the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6.  While the LIBNVDIMM
-implementation is generic and supports pre-NFIT platforms, it was guided
-by the superset of capabilities need to support this ACPI 6 definition
-for NVDIMM resources.  The bulk of the kernel implementation is in place
-to handle the case where DPA accessible via PMEM is aliased with DPA
-accessible via BLK.  When that occurs a LABEL is needed to reserve DPA
-for exclusive access via one mode a time.
+The LIBNVDIMM subsystem provides support for PMEM described by platform
+firmware or a device driver. On ACPI based systems the platform firmware
+conveys persistent memory resource via the ACPI NFIT "NVDIMM Firmware
+Interface Table" in ACPI 6. While the LIBNVDIMM subsystem implementation
+is generic and supports pre-NFIT platforms, it was guided by the
+superset of capabilities need to support this ACPI 6 definition for
+NVDIMM resources. The original implementation supported the
+block-window-aperture capability described in the NFIT, but that support
+has since been abandoned and never shipped in a product.
 
 Supporting Documents
 --------------------
@@ -125,107 +116,38 @@ Git Trees
 ---------
 
 LIBNVDIMM:
-	https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
+	https://git.kernel.org/cgit/linux/kernel/git/nvdimm/nvdimm.git
 LIBNDCTL:
 	https://github.com/pmem/ndctl.git
-PMEM:
-	https://github.com/01org/prd
 
 
-LIBNVDIMM PMEM and BLK
-======================
+LIBNVDIMM PMEM
+==============
 
 Prior to the arrival of the NFIT, non-volatile memory was described to a
 system in various ad-hoc ways.  Usually only the bare minimum was
 provided, namely, a single system-physical-address range where writes
 are expected to be durable after a system power loss.  Now, the NFIT
 specification standardizes not only the description of PMEM, but also
-BLK and platform message-passing entry points for control and
-configuration.
+platform message-passing entry points for control and configuration.
 
-For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
-device driver:
+PMEM (nd_pmem.ko): Drives a system-physical-address range.  This range is
+contiguous in system memory and may be interleaved (hardware memory controller
+striped) across multiple DIMMs.  When interleaved the platform may optionally
+provide details of which DIMMs are participating in the interleave.
 
-    1. PMEM (nd_pmem.ko): Drives a system-physical-address range.  This
-       range is contiguous in system memory and may be interleaved (hardware
-       memory controller striped) across multiple DIMMs.  When interleaved the
-       platform may optionally provide details of which DIMMs are participating
-       in the interleave.
+It is worth noting that when the labeling capability is detected (a EFI
+namespace label index block is found), then no block device is created
+by default as userspace needs to do at least one allocation of DPA to
+the PMEM range.  In contrast ND_NAMESPACE_IO ranges, once registered,
+can be immediately attached to nd_pmem. This latter mode is called
+label-less or "legacy".
 
-       Note that while LIBNVDIMM describes system-physical-address ranges that may
-       alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
-       alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
-       distinction.  The different device-types are an implementation detail
-       that userspace can exploit to implement policies like "only interface
-       with address ranges from certain DIMMs".  It is worth noting that when
-       aliasing is present and a DIMM lacks a label, then no block device can
-       be created by default as userspace needs to do at least one allocation
-       of DPA to the PMEM range.  In contrast ND_NAMESPACE_IO ranges, once
-       registered, can be immediately attached to nd_pmem.
+PMEM-REGIONs, Atomic Sectors, and DAX
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-    2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
-       defined apertures.  A set of apertures will access just one DIMM.
-       Multiple windows (apertures) allow multiple concurrent accesses, much like
-       tagged-command-queuing, and would likely be used by different threads or
-       different CPUs.
-
-       The NFIT specification defines a standard format for a BLK-aperture, but
-       the spec also allows for vendor specific layouts, and non-NFIT BLK
-       implementations may have other designs for BLK I/O.  For this reason
-       "nd_blk" calls back into platform-specific code to perform the I/O.
-
-       One such implementation is defined in the "Driver Writer's Guide" and "DSM
-       Interface Example".
-
-
-Why BLK?
-========
-
-While PMEM provides direct byte-addressable CPU-load/store access to
-NVDIMM storage, it does not provide the best system RAS (recovery,
-availability, and serviceability) model.  An access to a corrupted
-system-physical-address address causes a CPU exception while an access
-to a corrupted address through an BLK-aperture causes that block window
-to raise an error status in a register.  The latter is more aligned with
-the standard error model that host-bus-adapter attached disks present.
-
-Also, if an administrator ever wants to replace a memory it is easier to
-service a system at DIMM module boundaries.  Compare this to PMEM where
-data could be interleaved in an opaque hardware specific manner across
-several DIMMs.
-
-PMEM vs BLK
------------
-
-BLK-apertures solve these RAS problems, but their presence is also the
-major contributing factor to the complexity of the ND subsystem.  They
-complicate the implementation because PMEM and BLK alias in DPA space.
-Any given DIMM's DPA-range may contribute to one or more
-system-physical-address sets of interleaved DIMMs, *and* may also be
-accessed in its entirety through its BLK-aperture.  Accessing a DPA
-through a system-physical-address while simultaneously accessing the
-same DPA through a BLK-aperture has undefined results.  For this reason,
-DIMMs with this dual interface configuration include a DSM function to
-store/retrieve a LABEL.  The LABEL effectively partitions the DPA-space
-into exclusive system-physical-address and BLK-aperture accessible
-regions.  For simplicity a DIMM is allowed a PMEM "region" per each
-interleave set in which it is a member.  The remaining DPA space can be
-carved into an arbitrary number of BLK devices with discontiguous
-extents.
-
-BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-One of the few
-reasons to allow multiple BLK namespaces per REGION is so that each
-BLK-namespace can be configured with a BTT with unique atomic sector
-sizes.  While a PMEM device can host a BTT the LABEL specification does
-not provide for a sector size to be specified for a PMEM namespace.
-
-This is due to the expectation that the primary usage model for PMEM is
-via DAX, and the BTT is incompatible with DAX.  However, for the cases
-where an application or filesystem still needs atomic sector update
-guarantees it can register a BTT on a PMEM device or partition.  See
+For the cases where an application or filesystem still needs atomic sector
+update guarantees it can register a BTT on a PMEM device or partition.  See
 LIBNVDIMM/NDCTL: Block Translation Table "btt"
 
 
@@ -236,51 +158,40 @@ For the remainder of this document the following diagram will be
 referenced for any example sysfs layouts::
 
 
-                               (a)               (b)           DIMM   BLK-REGION
+                               (a)               (b)           DIMM
             +-------------------+--------+--------+--------+
-  +------+  |       pm0.0       | blk2.0 | pm1.0  | blk2.1 |    0      region2
+  +------+  |       pm0.0       |  free  | pm1.0  |  free  |    0
   | imc0 +--+- - - region0- - - +--------+        +--------+
-  +--+---+  |       pm0.0       | blk3.0 | pm1.0  | blk3.1 |    1      region3
+  +--+---+  |       pm0.0       |  free  | pm1.0  |  free  |    1
      |      +-------------------+--------v        v--------+
   +--+---+                               |                 |
   | cpu0 |                                     region1
   +--+---+                               |                 |
      |      +----------------------------^        ^--------+
-  +--+---+  |           blk4.0           | pm1.0  | blk4.0 |    2      region4
+  +--+---+  |           free             | pm1.0  |  free  |    2
   | imc1 +--+----------------------------|        +--------+
-  +------+  |           blk5.0           | pm1.0  | blk5.0 |    3      region5
+  +------+  |           free             | pm1.0  |  free  |    3
             +----------------------------+--------+--------+
 
 In this platform we have four DIMMs and two memory controllers in one
-socket.  Each unique interface (BLK or PMEM) to DPA space is identified
-by a region device with a dynamically assigned id (REGION0 - REGION5).
+socket.  Each PMEM interleave set is identified by a region device with
+a dynamically assigned id.
 
     1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
        single PMEM namespace is created in the REGION0-SPA-range that spans most
        of DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
-       interleaved system-physical-address range is reclaimed as BLK-aperture
-       accessed space starting at DPA-offset (a) into each DIMM.  In that
-       reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
-       REGION3 where "blk2.0" and "blk3.0" are just human readable names that
-       could be set to any user-desired name in the LABEL.
+       interleaved system-physical-address range is left free for
+       another PMEM namespace to be defined.
 
     2. In the last portion of DIMM0 and DIMM1 we have an interleaved
        system-physical-address range, REGION1, that spans those two DIMMs as
        well as DIMM2 and DIMM3.  Some of REGION1 is allocated to a PMEM namespace
-       named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
-       each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
-       "blk5.0".
-
-    3. The portion of DIMM2 and DIMM3 that do not participate in the REGION1
-       interleaved system-physical-address range (i.e. the DPA address past
-       offset (b) are also included in the "blk4.0" and "blk5.0" namespaces.
-       Note, that this example shows that BLK-aperture namespaces don't need to
-       be contiguous in DPA-space.
+       named "pm1.0".
 
     This bus is provided by the kernel under the device
     /sys/devices/platform/nfit_test.0 when the nfit_test.ko module from
-    tools/testing/nvdimm is loaded.  This not only test LIBNVDIMM but the
-    acpi_nfit.ko driver as well.
+    tools/testing/nvdimm is loaded. This module is a unit test for
+    LIBNVDIMM and the  acpi_nfit.ko driver.
 
 
 LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
@@ -469,17 +380,14 @@ identified by an "nfit_handle" a 32-bit value where:
 LIBNVDIMM/LIBNDCTL: Region
 --------------------------
 
-A generic REGION device is registered for each PMEM range or BLK-aperture
-set.  Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
-sets on the "nfit_test.0" bus.  The primary role of regions are to be a
-container of "mappings".  A mapping is a tuple of <DIMM,
-DPA-start-offset, length>.
+A generic REGION device is registered for each PMEM interleave-set /
+range. Per the example there are 2 PMEM regions on the "nfit_test.0"
+bus. The primary role of regions are to be a container of "mappings".  A
+mapping is a tuple of <DIMM, DPA-start-offset, length>.
 
-LIBNVDIMM provides a built-in driver for these REGION devices.  This driver
-is responsible for reconciling the aliased DPA mappings across all
-regions, parsing the LABEL, if present, and then emitting NAMESPACE
-devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
-nd_blk device driver to consume.
+LIBNVDIMM provides a built-in driver for REGION devices.  This driver
+is responsible for all parsing LABELs, if present, and then emitting NAMESPACE
+devices for the nd_pmem driver to consume.
 
 In addition to the generic attributes of "mapping"s, "interleave_ways"
 and "size" the REGION device also exports some convenience attributes.
@@ -493,8 +401,6 @@ LIBNVDIMM: region::
 
 	struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
 			struct nd_region_desc *ndr_desc);
-	struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
-			struct nd_region_desc *ndr_desc);
 
 ::
 
@@ -527,8 +433,9 @@ LIBNDCTL: region enumeration example
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Sample region retrieval routines based on NFIT-unique data like
-"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
-BLK::
+"spa_index" (interleave set id).
+
+::
 
 	static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
 			unsigned int spa_index)
@@ -544,139 +451,23 @@ BLK::
 		return NULL;
 	}
 
-	static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
-			unsigned int handle)
-	{
-		struct ndctl_region *region;
-
-		ndctl_region_foreach(bus, region) {
-			struct ndctl_mapping *map;
-
-			if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
-				continue;
-			ndctl_mapping_foreach(region, map) {
-				struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
-
-				if (ndctl_dimm_get_handle(dimm) == handle)
-					return region;
-			}
-		}
-		return NULL;
-	}
-
-
-Why Not Encode the Region Type into the Region Name?
-----------------------------------------------------
-
-At first glance it seems since NFIT defines just PMEM and BLK interface
-types that we should simply name REGION devices with something derived
-from those type names.  However, the ND subsystem explicitly keeps the
-REGION name generic and expects userspace to always consider the
-region-attributes for four reasons:
-
-    1. There are already more than two REGION and "namespace" types.  For
-       PMEM there are two subtypes.  As mentioned previously we have PMEM where
-       the constituent DIMM devices are known and anonymous PMEM.  For BLK
-       regions the NFIT specification already anticipates vendor specific
-       implementations.  The exact distinction of what a region contains is in
-       the region-attributes not the region-name or the region-devtype.
-
-    2. A region with zero child-namespaces is a possible configuration.  For
-       example, the NFIT allows for a DCR to be published without a
-       corresponding BLK-aperture.  This equates to a DIMM that can only accept
-       control/configuration messages, but no i/o through a descendant block
-       device.  Again, this "type" is advertised in the attributes ('mappings'
-       == 0) and the name does not tell you much.
-
-    3. What if a third major interface type arises in the future?  Outside
-       of vendor specific implementations, it's not difficult to envision a
-       third class of interface type beyond BLK and PMEM.  With a generic name
-       for the REGION level of the device-hierarchy old userspace
-       implementations can still make sense of new kernel advertised
-       region-types.  Userspace can always rely on the generic region
-       attributes like "mappings", "size", etc and the expected child devices
-       named "namespace".  This generic format of the device-model hierarchy
-       allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
-       future-proof.
-
-    4. There are more robust mechanisms for determining the major type of a
-       region than a device name.  See the next section, How Do I Determine the
-       Major Type of a Region?
-
-How Do I Determine the Major Type of a Region?
-----------------------------------------------
-
-Outside of the blanket recommendation of "use libndctl", or simply
-looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
-"nstype" integer attribute, here are some other options.
-
-1. module alias lookup
-^^^^^^^^^^^^^^^^^^^^^^
-
-    The whole point of region/namespace device type differentiation is to
-    decide which block-device driver will attach to a given LIBNVDIMM namespace.
-    One can simply use the modalias to lookup the resulting module.  It's
-    important to note that this method is robust in the presence of a
-    vendor-specific driver down the road.  If a vendor-specific
-    implementation wants to supplant the standard nd_blk driver it can with
-    minimal impact to the rest of LIBNVDIMM.
-
-    In fact, a vendor may also want to have a vendor-specific region-driver
-    (outside of nd_region).  For example, if a vendor defined its own LABEL
-    format it would need its own region driver to parse that LABEL and emit
-    the resulting namespaces.  The output from module resolution is more
-    accurate than a region-name or region-devtype.
-
-2. udev
-^^^^^^^
-
-    The kernel "devtype" is registered in the udev database::
-
-	# udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
-	P: /devices/platform/nfit_test.0/ndbus0/region0
-	E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
-	E: DEVTYPE=nd_pmem
-	E: MODALIAS=nd:t2
-	E: SUBSYSTEM=nd
-
-	# udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
-	P: /devices/platform/nfit_test.0/ndbus0/region4
-	E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
-	E: DEVTYPE=nd_blk
-	E: MODALIAS=nd:t3
-	E: SUBSYSTEM=nd
-
-    ...and is available as a region attribute, but keep in mind that the
-    "devtype" does not indicate sub-type variations and scripts should
-    really be understanding the other attributes.
-
-3. type specific attributes
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-    As it currently stands a BLK-aperture region will never have a
-    "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region.  A
-    BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
-    that does not allow I/O.  A PMEM region with a "mappings" value of zero
-    is a simple system-physical-address range.
-
 
 LIBNVDIMM/LIBNDCTL: Namespace
 -----------------------------
 
-A REGION, after resolving DPA aliasing and LABEL specified boundaries,
-surfaces one or more "namespace" devices.  The arrival of a "namespace"
-device currently triggers either the nd_blk or nd_pmem driver to load
-and register a disk/block device.
+A REGION, after resolving DPA aliasing and LABEL specified boundaries, surfaces
+one or more "namespace" devices.  The arrival of a "namespace" device currently
+triggers the nd_pmem driver to load and register a disk/block device.
 
 LIBNVDIMM: namespace
 ^^^^^^^^^^^^^^^^^^^^
 
-Here is a sample layout from the three major types of NAMESPACE where
-namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
-attribute), namespace2.0 represents a BLK namespace (note it has a
-'sector_size' attribute) that, and namespace6.0 represents an anonymous
-PMEM namespace (note that has no 'uuid' attribute due to not support a
-LABEL)::
+Here is a sample layout from the 2 major types of NAMESPACE where namespace0.0
+represents DIMM-info-backed PMEM (note that it has a 'uuid' attribute), and
+namespace1.0 represents an anonymous PMEM namespace (note that has no 'uuid'
+attribute due to not support a LABEL)
+
+::
 
 	/sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
 	|-- alt_name
@@ -691,20 +482,7 @@ LABEL)::
 	|-- type
 	|-- uevent
 	`-- uuid
-	/sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
-	|-- alt_name
-	|-- devtype
-	|-- dpa_extents
-	|-- force_raw
-	|-- modalias
-	|-- numa_node
-	|-- sector_size
-	|-- size
-	|-- subsystem -> ../../../../../../bus/nd
-	|-- type
-	|-- uevent
-	`-- uuid
-	/sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
+	/sys/devices/platform/nfit_test.1/ndbus1/region1/namespace1.0
 	|-- block
 	|   `-- pmem0
 	|-- devtype
@@ -786,9 +564,9 @@ Why the Term "namespace"?
 LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
 -------------------------------------------------
 
-A BTT (design document: https://pmem.io/2014/09/23/btt.html) is a stacked
-block device driver that fronts either the whole block device or a
-partition of a block device emitted by either a PMEM or BLK NAMESPACE.
+A BTT (design document: https://pmem.io/2014/09/23/btt.html) is a
+personality driver for a namespace that fronts entire namespace as an
+'address abstraction'.
 
 LIBNVDIMM: btt layout
 ^^^^^^^^^^^^^^^^^^^^^
@@ -815,7 +593,9 @@ LIBNDCTL: btt creation example
 Similar to namespaces an idle BTT device is automatically created per
 region.  Each time this "seed" btt device is configured and enabled a new
 seed is created.  Creating a BTT configuration involves two steps of
-finding and idle BTT and assigning it to consume a PMEM or BLK namespace::
+finding and idle BTT and assigning it to consume a namespace.
+
+::
 
 	static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
 	{
@@ -863,25 +643,15 @@ For the given example above, here is the view of the objects as seen by the
 LIBNDCTL API::
 
               +---+
-              |CTX|    +---------+   +--------------+  +---------------+
-              +-+-+  +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
-                |    | +---------+   +--------------+  +---------------+
-  +-------+     |    | +---------+   +--------------+  +---------------+
-  | DIMM0 <-+   |    +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
-  +-------+ |   |    | +---------+   +--------------+  +---------------+
+              |CTX|
+              +-+-+
+                |
+  +-------+     |
+  | DIMM0 <-+   |      +---------+   +--------------+  +---------------+
+  +-------+ |   |    +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
   | DIMM1 <-+ +-v--+ | +---------+   +--------------+  +---------------+
-  +-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6  "blk2.0" |
-  | DIMM2 <-+ +----+ | +---------+ | +--------------+  +----------------------+
-  +-------+ |        |             +-> NAMESPACE2.1 +--> ND5  "blk2.1" | BTT2 |
-  | DIMM3 <-+        |               +--------------+  +----------------------+
-  +-------+          | +---------+   +--------------+  +---------------+
-                     +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4  "blk3.0" |
-                     | +---------+ | +--------------+  +----------------------+
-                     |             +-> NAMESPACE3.1 +--> ND3  "blk3.1" | BTT1 |
-                     |               +--------------+  +----------------------+
-                     | +---------+   +--------------+  +---------------+
-                     +-> REGION4 +---> NAMESPACE4.0 +--> ND2  "blk4.0" |
-                     | +---------+   +--------------+  +---------------+
-                     | +---------+   +--------------+  +----------------------+
-                     +-> REGION5 +---> NAMESPACE5.0 +--> ND1  "blk5.0" | BTT0 |
-                       +---------+   +--------------+  +---------------+------+
+  +-------+ +-+BUS0+-| +---------+   +--------------+  +----------------------+
+  | DIMM2 <-+ +----+ +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" | BTT1 |
+  +-------+ |        | +---------+   +--------------+  +---------------+------+
+  | DIMM3 <-+
+  +-------+
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 347fe7afa583..5a29046e3319 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -10,12 +10,9 @@ menuconfig LIBNVDIMM
 	  ACPI-6-NFIT defined resources.  On platforms that define an
 	  NFIT, or otherwise can discover NVDIMM resources, a libnvdimm
 	  bus is registered to advertise PMEM (persistent memory)
-	  namespaces (/dev/pmemX) and BLK (sliding mmio window(s))
-	  namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a
+	  namespaces (/dev/pmemX). A PMEM namespace refers to a
 	  memory resource that may span multiple DIMMs and support DAX
-	  (see CONFIG_DAX).  A BLK namespace refers to an NVDIMM control
-	  region which exposes an mmio register set for windowed access
-	  mode to non-volatile memory.
+	  (see CONFIG_DAX).
 
 if LIBNVDIMM
 
@@ -38,19 +35,6 @@ config BLK_DEV_PMEM
 
 	  Say Y if you want to use an NVDIMM
 
-config ND_BLK
-	tristate "BLK: Block data window (aperture) device support"
-	default LIBNVDIMM
-	select ND_BTT if BTT
-	help
-	  Support NVDIMMs, or other devices, that implement a BLK-mode
-	  access capability.  BLK-mode access uses memory-mapped-i/o
-	  apertures to access persistent media.
-
-	  Say Y if your platform firmware emits an ACPI.NFIT table
-	  (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode
-	  capabilities.
-
 config ND_CLAIM
 	bool
 
@@ -67,9 +51,8 @@ config BTT
 	  applications that rely on sector writes not being torn (a
 	  guarantee that typical disks provide) can continue to do so.
 	  The BTT manifests itself as an alternate personality for an
-	  NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX,
-	  ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys,
-	  etc...).
+	  NVDIMM namespace, i.e. a namespace can be in raw mode pmemX,
+	  or 'sectored' mode.
 
 	  Select Y if unsure
 
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 25dba6095612..3fb806748716 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -2,7 +2,6 @@
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
-obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_OF_PMEM) += of_pmem.o
 obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
@@ -11,8 +10,6 @@ nd_pmem-y := pmem.o
 
 nd_btt-y := btt.o
 
-nd_blk-y := blk.o
-
 nd_e820-y := e820.o
 
 libnvdimm-y := core.o
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
deleted file mode 100644
index 228c33b8d1d6..000000000000
--- a/drivers/nvdimm/blk.c
+++ /dev/null
@@ -1,335 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * NVDIMM Block Window Driver
- * Copyright (c) 2014, Intel Corporation.
- */
-
-#include <linux/blkdev.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/nd.h>
-#include <linux/sizes.h>
-#include "nd.h"
-
-static u32 nsblk_meta_size(struct nd_namespace_blk *nsblk)
-{
-	return nsblk->lbasize - ((nsblk->lbasize >= 4096) ? 4096 : 512);
-}
-
-static u32 nsblk_internal_lbasize(struct nd_namespace_blk *nsblk)
-{
-	return roundup(nsblk->lbasize, INT_LBASIZE_ALIGNMENT);
-}
-
-static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk)
-{
-	return nsblk->lbasize - nsblk_meta_size(nsblk);
-}
-
-static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
-				resource_size_t ns_offset, unsigned int len)
-{
-	int i;
-
-	for (i = 0; i < nsblk->num_resources; i++) {
-		if (ns_offset < resource_size(nsblk->res[i])) {
-			if (ns_offset + len > resource_size(nsblk->res[i])) {
-				dev_WARN_ONCE(&nsblk->common.dev, 1,
-					"illegal request\n");
-				return SIZE_MAX;
-			}
-			return nsblk->res[i]->start + ns_offset;
-		}
-		ns_offset -= resource_size(nsblk->res[i]);
-	}
-
-	dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n");
-	return SIZE_MAX;
-}
-
-static struct nd_blk_region *to_ndbr(struct nd_namespace_blk *nsblk)
-{
-	struct nd_region *nd_region;
-	struct device *parent;
-
-	parent = nsblk->common.dev.parent;
-	nd_region = container_of(parent, struct nd_region, dev);
-	return container_of(nd_region, struct nd_blk_region, nd_region);
-}
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
-		struct bio_integrity_payload *bip, u64 lba, int rw)
-{
-	struct nd_blk_region *ndbr = to_ndbr(nsblk);
-	unsigned int len = nsblk_meta_size(nsblk);
-	resource_size_t	dev_offset, ns_offset;
-	u32 internal_lbasize, sector_size;
-	int err = 0;
-
-	internal_lbasize = nsblk_internal_lbasize(nsblk);
-	sector_size = nsblk_sector_size(nsblk);
-	ns_offset = lba * internal_lbasize + sector_size;
-	dev_offset = to_dev_offset(nsblk, ns_offset, len);
-	if (dev_offset == SIZE_MAX)
-		return -EIO;
-
-	while (len) {
-		unsigned int cur_len;
-		struct bio_vec bv;
-		void *iobuf;
-
-		bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
-		/*
-		 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
-		 * .bv_offset already adjusted for iter->bi_bvec_done, and we
-		 * can use those directly
-		 */
-
-		cur_len = min(len, bv.bv_len);
-		iobuf = kmap_atomic(bv.bv_page);
-		err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset,
-				cur_len, rw);
-		kunmap_atomic(iobuf);
-		if (err)
-			return err;
-
-		len -= cur_len;
-		dev_offset += cur_len;
-		if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
-			return -EIO;
-	}
-
-	return err;
-}
-
-#else /* CONFIG_BLK_DEV_INTEGRITY */
-static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
-		struct bio_integrity_payload *bip, u64 lba, int rw)
-{
-	return 0;
-}
-#endif
-
-static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
-		struct bio_integrity_payload *bip, struct page *page,
-		unsigned int len, unsigned int off, int rw, sector_t sector)
-{
-	struct nd_blk_region *ndbr = to_ndbr(nsblk);
-	resource_size_t	dev_offset, ns_offset;
-	u32 internal_lbasize, sector_size;
-	int err = 0;
-	void *iobuf;
-	u64 lba;
-
-	internal_lbasize = nsblk_internal_lbasize(nsblk);
-	sector_size = nsblk_sector_size(nsblk);
-	while (len) {
-		unsigned int cur_len;
-
-		/*
-		 * If we don't have an integrity payload, we don't have to
-		 * split the bvec into sectors, as this would cause unnecessary
-		 * Block Window setup/move steps. the do_io routine is capable
-		 * of handling len <= PAGE_SIZE.
-		 */
-		cur_len = bip ? min(len, sector_size) : len;
-
-		lba = div_u64(sector << SECTOR_SHIFT, sector_size);
-		ns_offset = lba * internal_lbasize;
-		dev_offset = to_dev_offset(nsblk, ns_offset, cur_len);
-		if (dev_offset == SIZE_MAX)
-			return -EIO;
-
-		iobuf = kmap_atomic(page);
-		err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw);
-		kunmap_atomic(iobuf);
-		if (err)
-			return err;
-
-		if (bip) {
-			err = nd_blk_rw_integrity(nsblk, bip, lba, rw);
-			if (err)
-				return err;
-		}
-		len -= cur_len;
-		off += cur_len;
-		sector += sector_size >> SECTOR_SHIFT;
-	}
-
-	return err;
-}
-
-static void nd_blk_submit_bio(struct bio *bio)
-{
-	struct bio_integrity_payload *bip;
-	struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data;
-	struct bvec_iter iter;
-	unsigned long start;
-	struct bio_vec bvec;
-	int err = 0, rw;
-	bool do_acct;
-
-	if (!bio_integrity_prep(bio))
-		return;
-
-	bip = bio_integrity(bio);
-	rw = bio_data_dir(bio);
-	do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
-	if (do_acct)
-		start = bio_start_io_acct(bio);
-	bio_for_each_segment(bvec, bio, iter) {
-		unsigned int len = bvec.bv_len;
-
-		BUG_ON(len > PAGE_SIZE);
-		err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len,
-				bvec.bv_offset, rw, iter.bi_sector);
-		if (err) {
-			dev_dbg(&nsblk->common.dev,
-					"io error in %s sector %lld, len %d,\n",
-					(rw == READ) ? "READ" : "WRITE",
-					(unsigned long long) iter.bi_sector, len);
-			bio->bi_status = errno_to_blk_status(err);
-			break;
-		}
-	}
-	if (do_acct)
-		bio_end_io_acct(bio, start);
-
-	bio_endio(bio);
-}
-
-static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
-		resource_size_t offset, void *iobuf, size_t n, int rw,
-		unsigned long flags)
-{
-	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
-	struct nd_blk_region *ndbr = to_ndbr(nsblk);
-	resource_size_t	dev_offset;
-
-	dev_offset = to_dev_offset(nsblk, offset, n);
-
-	if (unlikely(offset + n > nsblk->size)) {
-		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
-		return -EFAULT;
-	}
-
-	if (dev_offset == SIZE_MAX)
-		return -EIO;
-
-	return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw);
-}
-
-static const struct block_device_operations nd_blk_fops = {
-	.owner = THIS_MODULE,
-	.submit_bio =  nd_blk_submit_bio,
-};
-
-static void nd_blk_release_disk(void *disk)
-{
-	del_gendisk(disk);
-	blk_cleanup_disk(disk);
-}
-
-static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
-{
-	struct device *dev = &nsblk->common.dev;
-	resource_size_t available_disk_size;
-	struct gendisk *disk;
-	u64 internal_nlba;
-	int rc;
-
-	internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
-	available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
-
-	disk = blk_alloc_disk(NUMA_NO_NODE);
-	if (!disk)
-		return -ENOMEM;
-
-	disk->fops		= &nd_blk_fops;
-	disk->private_data	= nsblk;
-	nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
-
-	blk_queue_max_hw_sectors(disk->queue, UINT_MAX);
-	blk_queue_logical_block_size(disk->queue, nsblk_sector_size(nsblk));
-	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
-
-	if (nsblk_meta_size(nsblk)) {
-		rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
-
-		if (rc)
-			goto out_before_devm_err;
-	}
-
-	set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
-	rc = device_add_disk(dev, disk, NULL);
-	if (rc)
-		goto out_before_devm_err;
-
-	/* nd_blk_release_disk() is called if this fails */
-	if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
-		return -ENOMEM;
-
-	nvdimm_check_and_set_ro(disk);
-	return 0;
-
-out_before_devm_err:
-	blk_cleanup_disk(disk);
-	return rc;
-}
-
-static int nd_blk_probe(struct device *dev)
-{
-	struct nd_namespace_common *ndns;
-	struct nd_namespace_blk *nsblk;
-
-	ndns = nvdimm_namespace_common_probe(dev);
-	if (IS_ERR(ndns))
-		return PTR_ERR(ndns);
-
-	nsblk = to_nd_namespace_blk(&ndns->dev);
-	nsblk->size = nvdimm_namespace_capacity(ndns);
-	dev_set_drvdata(dev, nsblk);
-
-	ndns->rw_bytes = nsblk_rw_bytes;
-	if (is_nd_btt(dev))
-		return nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(dev, ndns) == 0) {
-		/* we'll come back as btt-blk */
-		return -ENXIO;
-	} else
-		return nsblk_attach_disk(nsblk);
-}
-
-static void nd_blk_remove(struct device *dev)
-{
-	if (is_nd_btt(dev))
-		nvdimm_namespace_detach_btt(to_nd_btt(dev));
-}
-
-static struct nd_device_driver nd_blk_driver = {
-	.probe = nd_blk_probe,
-	.remove = nd_blk_remove,
-	.drv = {
-		.name = "nd_blk",
-	},
-	.type = ND_DRIVER_NAMESPACE_BLK,
-};
-
-static int __init nd_blk_init(void)
-{
-	return nd_driver_register(&nd_blk_driver);
-}
-
-static void __exit nd_blk_exit(void)
-{
-	driver_unregister(&nd_blk_driver.drv);
-}
-
-MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK);
-module_init(nd_blk_init);
-module_exit(nd_blk_exit);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index c57d9e9d4480..5eb5c23b062f 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -27,7 +27,6 @@ ccflags-y += -I$(srctree)/drivers/acpi/nfit/
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
 obj-$(CONFIG_ND_BTT) += nd_btt.o
-obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
 ifeq ($(CONFIG_DAX),m)
@@ -50,9 +49,6 @@ nd_pmem-y += config_check.o
 nd_btt-y := $(NVDIMM_SRC)/btt.o
 nd_btt-y += config_check.o
 
-nd_blk-y := $(NVDIMM_SRC)/blk.o
-nd_blk-y += config_check.o
-
 nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
index 3e3a5f518864..baed75e2ccbc 100644
--- a/tools/testing/nvdimm/config_check.c
+++ b/tools/testing/nvdimm/config_check.c
@@ -11,7 +11,6 @@ void check(void)
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_BLK_DEV_PMEM));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_PFN));
-	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
 	if (IS_ENABLED(CONFIG_ACPI_NFIT))
 		BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX));

From fadc38a6672a72bfb2937e85bc5a967f7ab581f8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:32 -0800
Subject: [PATCH 114/229] nvdimm/namespace: Delete blk namespace consideration
 in shared paths

Given is_namespace_blk() is never true outside of the NVDIMM unit tests
delete the support from namespace device management.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688417214.2879318.4698377272678028573.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/namespace_devs.c | 152 ++------------------------------
 1 file changed, 9 insertions(+), 143 deletions(-)

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index b57a2d36c517..5c76547c9b84 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -46,7 +46,6 @@ static void namespace_blk_release(struct device *dev)
 }
 
 static bool is_namespace_pmem(const struct device *dev);
-static bool is_namespace_blk(const struct device *dev);
 static bool is_namespace_io(const struct device *dev);
 
 static int is_uuid_busy(struct device *dev, void *data)
@@ -57,10 +56,6 @@ static int is_uuid_busy(struct device *dev, void *data)
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		uuid2 = nspm->uuid;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		uuid2 = nsblk->uuid;
 	} else if (is_nd_btt(dev)) {
 		struct nd_btt *nd_btt = to_nd_btt(dev);
 
@@ -178,12 +173,6 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		else
 			sprintf(name, "pmem%d%s", nd_region->id,
 					suffix ? suffix : "");
-	} else if (is_namespace_blk(&ndns->dev)) {
-		struct nd_namespace_blk *nsblk;
-
-		nsblk = to_nd_namespace_blk(&ndns->dev);
-		sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id,
-				suffix ? suffix : "");
 	} else {
 		return NULL;
 	}
@@ -201,10 +190,6 @@ const uuid_t *nd_dev_to_uuid(struct device *dev)
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		return nspm->uuid;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		return nsblk->uuid;
 	} else
 		return &uuid_null;
 }
@@ -229,10 +214,6 @@ static ssize_t __alt_name_store(struct device *dev, const char *buf,
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		ns_altname = &nspm->alt_name;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		ns_altname = &nsblk->alt_name;
 	} else
 		return -ENXIO;
 
@@ -264,24 +245,6 @@ out:
 	return rc;
 }
 
-static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
-{
-	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	struct nd_label_id label_id;
-	resource_size_t size = 0;
-	struct resource *res;
-
-	if (!nsblk->uuid)
-		return 0;
-	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
-	for_each_dpa_resource(ndd, res)
-		if (strcmp(res->name, label_id.id) == 0)
-			size += resource_size(res);
-	return size;
-}
-
 static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
 {
 	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
@@ -363,16 +326,6 @@ static int nd_namespace_label_update(struct nd_region *nd_region,
 			return 0;
 
 		return nd_pmem_namespace_label_update(nd_region, nspm, size);
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-		resource_size_t size = nd_namespace_blk_size(nsblk);
-
-		if (size == 0 && nsblk->uuid)
-			/* delete allocation */;
-		else if (!nsblk->uuid || !nsblk->lbasize)
-			return 0;
-
-		return nd_blk_namespace_label_update(nd_region, nsblk, size);
 	} else
 		return -ENXIO;
 }
@@ -405,10 +358,6 @@ static ssize_t alt_name_show(struct device *dev,
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		ns_altname = nspm->alt_name;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		ns_altname = nsblk->alt_name;
 	} else
 		return -ENXIO;
 
@@ -966,12 +915,6 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 
 		uuid = nspm->uuid;
 		id = nspm->id;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		uuid = nsblk->uuid;
-		flags = NSLABEL_FLAG_LOCAL;
-		id = nsblk->id;
 	}
 
 	/*
@@ -1067,10 +1010,6 @@ static ssize_t size_store(struct device *dev,
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		uuid = &nspm->uuid;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		uuid = &nsblk->uuid;
 	}
 
 	if (rc == 0 && val == 0 && uuid) {
@@ -1095,8 +1034,6 @@ resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns)
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		return resource_size(&nspm->nsio.res);
-	} else if (is_namespace_blk(dev)) {
-		return nd_namespace_blk_size(to_nd_namespace_blk(dev));
 	} else if (is_namespace_io(dev)) {
 		struct nd_namespace_io *nsio = to_nd_namespace_io(dev);
 
@@ -1152,12 +1089,8 @@ static uuid_t *namespace_to_uuid(struct device *dev)
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		return nspm->uuid;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		return nsblk->uuid;
-	} else
-		return ERR_PTR(-ENXIO);
+	}
+	return ERR_PTR(-ENXIO);
 }
 
 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
@@ -1183,7 +1116,6 @@ static int namespace_update_uuid(struct nd_region *nd_region,
 				 struct device *dev, uuid_t *new_uuid,
 				 uuid_t **old_uuid)
 {
-	u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0;
 	struct nd_label_id old_label_id;
 	struct nd_label_id new_label_id;
 	int i;
@@ -1214,8 +1146,8 @@ static int namespace_update_uuid(struct nd_region *nd_region,
 			return -EBUSY;
 	}
 
-	nd_label_gen_id(&old_label_id, *old_uuid, flags);
-	nd_label_gen_id(&new_label_id, new_uuid, flags);
+	nd_label_gen_id(&old_label_id, *old_uuid, 0);
+	nd_label_gen_id(&new_label_id, new_uuid, 0);
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
@@ -1261,10 +1193,6 @@ static ssize_t uuid_store(struct device *dev,
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		ns_uuid = &nspm->uuid;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		ns_uuid = &nsblk->uuid;
 	} else
 		return -ENXIO;
 
@@ -1321,13 +1249,6 @@ static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 };
 static ssize_t sector_size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		return nd_size_select_show(nsblk->lbasize,
-				blk_lbasize_supported, buf);
-	}
-
 	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
@@ -1345,12 +1266,7 @@ static ssize_t sector_size_store(struct device *dev,
 	unsigned long *lbasize;
 	ssize_t rc = 0;
 
-	if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		lbasize = &nsblk->lbasize;
-		supported = blk_lbasize_supported;
-	} else if (is_namespace_pmem(dev)) {
+	if (is_namespace_pmem(dev)) {
 		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
 
 		lbasize = &nspm->lbasize;
@@ -1390,11 +1306,6 @@ static ssize_t dpa_extents_show(struct device *dev,
 
 		uuid = nspm->uuid;
 		flags = 0;
-	} else if (is_namespace_blk(dev)) {
-		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-
-		uuid = nsblk->uuid;
-		flags = NSLABEL_FLAG_LOCAL;
 	}
 
 	if (!uuid)
@@ -1627,10 +1538,7 @@ static umode_t namespace_visible(struct kobject *kobj,
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 
-	if (a == &dev_attr_resource.attr && is_namespace_blk(dev))
-		return 0;
-
-	if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
+	if (is_namespace_pmem(dev)) {
 		if (a == &dev_attr_size.attr)
 			return 0644;
 
@@ -1682,11 +1590,6 @@ static bool is_namespace_pmem(const struct device *dev)
 	return dev ? dev->type == &namespace_pmem_device_type : false;
 }
 
-static bool is_namespace_blk(const struct device *dev)
-{
-	return dev ? dev->type == &namespace_blk_device_type : false;
-}
-
 static bool is_namespace_io(const struct device *dev)
 {
 	return dev ? dev->type == &namespace_io_device_type : false;
@@ -1769,18 +1672,6 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 		nspm = to_nd_namespace_pmem(&ndns->dev);
 		if (uuid_not_set(nspm->uuid, &ndns->dev, __func__))
 			return ERR_PTR(-ENODEV);
-	} else if (is_namespace_blk(&ndns->dev)) {
-		struct nd_namespace_blk *nsblk;
-
-		nsblk = to_nd_namespace_blk(&ndns->dev);
-		if (uuid_not_set(nsblk->uuid, &ndns->dev, __func__))
-			return ERR_PTR(-ENODEV);
-		if (!nsblk->lbasize) {
-			dev_dbg(&ndns->dev, "sector size not set\n");
-			return ERR_PTR(-ENODEV);
-		}
-		if (!nd_namespace_blk_validate(nsblk))
-			return ERR_PTR(-ENODEV);
 	}
 
 	return ndns;
@@ -1790,16 +1681,12 @@ EXPORT_SYMBOL(nvdimm_namespace_common_probe);
 int devm_namespace_enable(struct device *dev, struct nd_namespace_common *ndns,
 		resource_size_t size)
 {
-	if (is_namespace_blk(&ndns->dev))
-		return 0;
 	return devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev), size);
 }
 EXPORT_SYMBOL_GPL(devm_namespace_enable);
 
 void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns)
 {
-	if (is_namespace_blk(&ndns->dev))
-		return;
 	devm_nsio_disable(dev, to_nd_namespace_io(&ndns->dev));
 }
 EXPORT_SYMBOL_GPL(devm_namespace_disable);
@@ -2225,7 +2112,6 @@ static int add_namespace_resource(struct nd_region *nd_region,
 
 	for (i = 0; i < count; i++) {
 		uuid_t *uuid = namespace_to_uuid(devs[i]);
-		struct resource *res;
 
 		if (IS_ERR(uuid)) {
 			WARN_ON(1);
@@ -2234,20 +2120,9 @@ static int add_namespace_resource(struct nd_region *nd_region,
 
 		if (!nsl_uuid_equal(ndd, nd_label, uuid))
 			continue;
-		if (is_namespace_blk(devs[i])) {
-			res = nsblk_add_resource(nd_region, ndd,
-					to_nd_namespace_blk(devs[i]),
-					nsl_get_dpa(ndd, nd_label));
-			if (!res)
-				return -ENXIO;
-			nd_dbg_dpa(nd_region, ndd, res, "%d assign\n", count);
-		} else {
-			dev_err(&nd_region->dev,
-				"error: conflicting extents for uuid: %pUb\n",
-				uuid);
-			return -ENXIO;
-		}
-		break;
+		dev_err(&nd_region->dev,
+			"error: conflicting extents for uuid: %pUb\n", uuid);
+		return -ENXIO;
 	}
 
 	return i;
@@ -2305,20 +2180,11 @@ static int cmp_dpa(const void *a, const void *b)
 {
 	const struct device *dev_a = *(const struct device **) a;
 	const struct device *dev_b = *(const struct device **) b;
-	struct nd_namespace_blk *nsblk_a, *nsblk_b;
 	struct nd_namespace_pmem *nspm_a, *nspm_b;
 
 	if (is_namespace_io(dev_a))
 		return 0;
 
-	if (is_namespace_blk(dev_a)) {
-		nsblk_a = to_nd_namespace_blk(dev_a);
-		nsblk_b = to_nd_namespace_blk(dev_b);
-
-		return memcmp(&nsblk_a->res[0]->start, &nsblk_b->res[0]->start,
-				sizeof(resource_size_t));
-	}
-
 	nspm_a = to_nd_namespace_pmem(dev_a);
 	nspm_b = to_nd_namespace_pmem(dev_b);
 

From 84bd3690bf54a2f2f3b8449afa022aac1957ba17 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:37 -0800
Subject: [PATCH 115/229] nvdimm/namespace: Delete nd_namespace_blk

Now that none of the configuration paths consider BLK namespaces, delete
the BLK namespace data and supporting code.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688417727.2879318.11691110761800109662.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/label.c          | 340 --------------------------------
 drivers/nvdimm/label.h          |   3 -
 drivers/nvdimm/namespace_devs.c | 227 ++-------------------
 drivers/nvdimm/nd-core.h        |   3 -
 drivers/nvdimm/nd.h             |   1 -
 include/linux/nd.h              |  26 ---
 6 files changed, 13 insertions(+), 587 deletions(-)

diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 5ec9a4023df9..8c972bcb2ac3 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -968,326 +968,6 @@ static int __pmem_label_update(struct nd_region *nd_region,
 	return rc;
 }
 
-static bool is_old_resource(struct resource *res, struct resource **list, int n)
-{
-	int i;
-
-	if (res->flags & DPA_RESOURCE_ADJUSTED)
-		return false;
-	for (i = 0; i < n; i++)
-		if (res == list[i])
-			return true;
-	return false;
-}
-
-static struct resource *to_resource(struct nvdimm_drvdata *ndd,
-		struct nd_namespace_label *nd_label)
-{
-	struct resource *res;
-
-	for_each_dpa_resource(ndd, res) {
-		if (res->start != nsl_get_dpa(ndd, nd_label))
-			continue;
-		if (resource_size(res) != nsl_get_rawsize(ndd, nd_label))
-			continue;
-		return res;
-	}
-
-	return NULL;
-}
-
-/*
- * Use the presence of the type_guid as a flag to determine isetcookie
- * usage and nlabel + position policy for blk-aperture namespaces.
- */
-static void nsl_set_blk_isetcookie(struct nvdimm_drvdata *ndd,
-				   struct nd_namespace_label *nd_label,
-				   u64 isetcookie)
-{
-	if (efi_namespace_label_has(ndd, type_guid)) {
-		nsl_set_isetcookie(ndd, nd_label, isetcookie);
-		return;
-	}
-	nsl_set_isetcookie(ndd, nd_label, 0); /* N/A */
-}
-
-bool nsl_validate_blk_isetcookie(struct nvdimm_drvdata *ndd,
-				 struct nd_namespace_label *nd_label,
-				 u64 isetcookie)
-{
-	if (!efi_namespace_label_has(ndd, type_guid))
-		return true;
-
-	if (nsl_get_isetcookie(ndd, nd_label) != isetcookie) {
-		dev_dbg(ndd->dev, "expect cookie %#llx got %#llx\n", isetcookie,
-			nsl_get_isetcookie(ndd, nd_label));
-		return false;
-	}
-
-	return true;
-}
-
-static void nsl_set_blk_nlabel(struct nvdimm_drvdata *ndd,
-			       struct nd_namespace_label *nd_label, int nlabel,
-			       bool first)
-{
-	if (!efi_namespace_label_has(ndd, type_guid)) {
-		nsl_set_nlabel(ndd, nd_label, 0); /* N/A */
-		return;
-	}
-	nsl_set_nlabel(ndd, nd_label, first ? nlabel : 0xffff);
-}
-
-static void nsl_set_blk_position(struct nvdimm_drvdata *ndd,
-				 struct nd_namespace_label *nd_label,
-				 bool first)
-{
-	if (!efi_namespace_label_has(ndd, type_guid)) {
-		nsl_set_position(ndd, nd_label, 0);
-		return;
-	}
-	nsl_set_position(ndd, nd_label, first ? 0 : 0xffff);
-}
-
-/*
- * 1/ Account all the labels that can be freed after this update
- * 2/ Allocate and write the label to the staging (next) index
- * 3/ Record the resources in the namespace device
- */
-static int __blk_label_update(struct nd_region *nd_region,
-		struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk,
-		int num_labels)
-{
-	int i, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO;
-	struct nd_interleave_set *nd_set = nd_region->nd_set;
-	struct nd_namespace_common *ndns = &nsblk->common;
-	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	struct nd_namespace_label *nd_label;
-	struct nd_label_ent *label_ent, *e;
-	struct nd_namespace_index *nsindex;
-	unsigned long *free, *victim_map = NULL;
-	struct resource *res, **old_res_list;
-	struct nd_label_id label_id;
-	int min_dpa_idx = 0;
-	LIST_HEAD(list);
-	u32 nslot, slot;
-
-	if (!preamble_next(ndd, &nsindex, &free, &nslot))
-		return -ENXIO;
-
-	old_res_list = nsblk->res;
-	nfree = nd_label_nfree(ndd);
-	old_num_resources = nsblk->num_resources;
-	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
-
-	/*
-	 * We need to loop over the old resources a few times, which seems a
-	 * bit inefficient, but we need to know that we have the label
-	 * space before we start mutating the tracking structures.
-	 * Otherwise the recovery method of last resort for userspace is
-	 * disable and re-enable the parent region.
-	 */
-	alloc = 0;
-	for_each_dpa_resource(ndd, res) {
-		if (strcmp(res->name, label_id.id) != 0)
-			continue;
-		if (!is_old_resource(res, old_res_list, old_num_resources))
-			alloc++;
-	}
-
-	victims = 0;
-	if (old_num_resources) {
-		/* convert old local-label-map to dimm-slot victim-map */
-		victim_map = bitmap_zalloc(nslot, GFP_KERNEL);
-		if (!victim_map)
-			return -ENOMEM;
-
-		/* mark unused labels for garbage collection */
-		for_each_clear_bit_le(slot, free, nslot) {
-			nd_label = to_label(ndd, slot);
-			if (!nsl_uuid_equal(ndd, nd_label, nsblk->uuid))
-				continue;
-			res = to_resource(ndd, nd_label);
-			if (res && is_old_resource(res, old_res_list,
-						old_num_resources))
-				continue;
-			slot = to_slot(ndd, nd_label);
-			set_bit(slot, victim_map);
-			victims++;
-		}
-	}
-
-	/* don't allow updates that consume the last label */
-	if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
-		dev_info(&nsblk->common.dev, "insufficient label space\n");
-		bitmap_free(victim_map);
-		return -ENOSPC;
-	}
-	/* from here on we need to abort on error */
-
-
-	/* assign all resources to the namespace before writing the labels */
-	nsblk->res = NULL;
-	nsblk->num_resources = 0;
-	for_each_dpa_resource(ndd, res) {
-		if (strcmp(res->name, label_id.id) != 0)
-			continue;
-		if (!nsblk_add_resource(nd_region, ndd, nsblk, res->start)) {
-			rc = -ENOMEM;
-			goto abort;
-		}
-	}
-
-	/* release slots associated with any invalidated UUIDs */
-	mutex_lock(&nd_mapping->lock);
-	list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list)
-		if (test_and_clear_bit(ND_LABEL_REAP, &label_ent->flags)) {
-			reap_victim(nd_mapping, label_ent);
-			list_move(&label_ent->list, &list);
-		}
-	mutex_unlock(&nd_mapping->lock);
-
-	/*
-	 * Find the resource associated with the first label in the set
-	 * per the v1.2 namespace specification.
-	 */
-	for (i = 0; i < nsblk->num_resources; i++) {
-		struct resource *min = nsblk->res[min_dpa_idx];
-
-		res = nsblk->res[i];
-		if (res->start < min->start)
-			min_dpa_idx = i;
-	}
-
-	for (i = 0; i < nsblk->num_resources; i++) {
-		size_t offset;
-
-		res = nsblk->res[i];
-		if (is_old_resource(res, old_res_list, old_num_resources))
-			continue; /* carry-over */
-		slot = nd_label_alloc_slot(ndd);
-		if (slot == UINT_MAX) {
-			rc = -ENXIO;
-			goto abort;
-		}
-		dev_dbg(ndd->dev, "allocated: %d\n", slot);
-
-		nd_label = to_label(ndd, slot);
-		memset(nd_label, 0, sizeof_namespace_label(ndd));
-		nsl_set_uuid(ndd, nd_label, nsblk->uuid);
-		nsl_set_name(ndd, nd_label, nsblk->alt_name);
-		nsl_set_flags(ndd, nd_label, NSLABEL_FLAG_LOCAL);
-
-		nsl_set_blk_nlabel(ndd, nd_label, nsblk->num_resources,
-				   i == min_dpa_idx);
-		nsl_set_blk_position(ndd, nd_label, i == min_dpa_idx);
-		nsl_set_blk_isetcookie(ndd, nd_label, nd_set->cookie2);
-
-		nsl_set_dpa(ndd, nd_label, res->start);
-		nsl_set_rawsize(ndd, nd_label, resource_size(res));
-		nsl_set_lbasize(ndd, nd_label, nsblk->lbasize);
-		nsl_set_slot(ndd, nd_label, slot);
-		nsl_set_type_guid(ndd, nd_label, &nd_set->type_guid);
-		nsl_set_claim_class(ndd, nd_label, ndns->claim_class);
-		nsl_calculate_checksum(ndd, nd_label);
-
-		/* update label */
-		offset = nd_label_offset(ndd, nd_label);
-		rc = nvdimm_set_config_data(ndd, offset, nd_label,
-				sizeof_namespace_label(ndd));
-		if (rc < 0)
-			goto abort;
-	}
-
-	/* free up now unused slots in the new index */
-	for_each_set_bit(slot, victim_map, victim_map ? nslot : 0) {
-		dev_dbg(ndd->dev, "free: %d\n", slot);
-		nd_label_free_slot(ndd, slot);
-	}
-
-	/* update index */
-	rc = nd_label_write_index(ndd, ndd->ns_next,
-			nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
-	if (rc)
-		goto abort;
-
-	/*
-	 * Now that the on-dimm labels are up to date, fix up the tracking
-	 * entries in nd_mapping->labels
-	 */
-	nlabel = 0;
-	mutex_lock(&nd_mapping->lock);
-	list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
-		nd_label = label_ent->label;
-		if (!nd_label)
-			continue;
-		nlabel++;
-		if (!nsl_uuid_equal(ndd, nd_label, nsblk->uuid))
-			continue;
-		nlabel--;
-		list_move(&label_ent->list, &list);
-		label_ent->label = NULL;
-	}
-	list_splice_tail_init(&list, &nd_mapping->labels);
-	mutex_unlock(&nd_mapping->lock);
-
-	if (nlabel + nsblk->num_resources > num_labels) {
-		/*
-		 * Bug, we can't end up with more resources than
-		 * available labels
-		 */
-		WARN_ON_ONCE(1);
-		rc = -ENXIO;
-		goto out;
-	}
-
-	mutex_lock(&nd_mapping->lock);
-	label_ent = list_first_entry_or_null(&nd_mapping->labels,
-			typeof(*label_ent), list);
-	if (!label_ent) {
-		WARN_ON(1);
-		mutex_unlock(&nd_mapping->lock);
-		rc = -ENXIO;
-		goto out;
-	}
-	for_each_clear_bit_le(slot, free, nslot) {
-		nd_label = to_label(ndd, slot);
-		if (!nsl_uuid_equal(ndd, nd_label, nsblk->uuid))
-			continue;
-		res = to_resource(ndd, nd_label);
-		res->flags &= ~DPA_RESOURCE_ADJUSTED;
-		dev_vdbg(&nsblk->common.dev, "assign label slot: %d\n", slot);
-		list_for_each_entry_from(label_ent, &nd_mapping->labels, list) {
-			if (label_ent->label)
-				continue;
-			label_ent->label = nd_label;
-			nd_label = NULL;
-			break;
-		}
-		if (nd_label)
-			dev_WARN(&nsblk->common.dev,
-					"failed to track label slot%d\n", slot);
-	}
-	mutex_unlock(&nd_mapping->lock);
-
- out:
-	kfree(old_res_list);
-	bitmap_free(victim_map);
-	return rc;
-
- abort:
-	/*
-	 * 1/ repair the allocated label bitmap in the index
-	 * 2/ restore the resource list
-	 */
-	nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
-	kfree(nsblk->res);
-	nsblk->res = old_res_list;
-	nsblk->num_resources = old_num_resources;
-	old_res_list = NULL;
-	goto out;
-}
-
 static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
 {
 	int i, old_num_labels = 0;
@@ -1425,26 +1105,6 @@ int nd_pmem_namespace_label_update(struct nd_region *nd_region,
 	return 0;
 }
 
-int nd_blk_namespace_label_update(struct nd_region *nd_region,
-		struct nd_namespace_blk *nsblk, resource_size_t size)
-{
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-	struct resource *res;
-	int count = 0;
-
-	if (size == 0)
-		return del_labels(nd_mapping, nsblk->uuid);
-
-	for_each_dpa_resource(to_ndd(nd_mapping), res)
-		count++;
-
-	count = init_labels(nd_mapping, count);
-	if (count < 0)
-		return count;
-
-	return __blk_label_update(nd_region, nd_mapping, nsblk, count);
-}
-
 int __init nd_label_init(void)
 {
 	WARN_ON(guid_parse(NVDIMM_BTT_GUID, &nvdimm_btt_guid));
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
index 8ee248fc214f..198ef1df298b 100644
--- a/drivers/nvdimm/label.h
+++ b/drivers/nvdimm/label.h
@@ -221,9 +221,6 @@ bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot);
 u32 nd_label_nfree(struct nvdimm_drvdata *ndd);
 struct nd_region;
 struct nd_namespace_pmem;
-struct nd_namespace_blk;
 int nd_pmem_namespace_label_update(struct nd_region *nd_region,
 		struct nd_namespace_pmem *nspm, resource_size_t size);
-int nd_blk_namespace_label_update(struct nd_region *nd_region,
-		struct nd_namespace_blk *nsblk, resource_size_t size);
 #endif /* __LABEL_H__ */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 5c76547c9b84..d1c190b02657 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -32,19 +32,6 @@ static void namespace_pmem_release(struct device *dev)
 	kfree(nspm);
 }
 
-static void namespace_blk_release(struct device *dev)
-{
-	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
-	struct nd_region *nd_region = to_nd_region(dev->parent);
-
-	if (nsblk->id >= 0)
-		ida_simple_remove(&nd_region->ns_ida, nsblk->id);
-	kfree(nsblk->alt_name);
-	kfree(nsblk->uuid);
-	kfree(nsblk->res);
-	kfree(nsblk);
-}
-
 static bool is_namespace_pmem(const struct device *dev);
 static bool is_namespace_io(const struct device *dev);
 
@@ -245,65 +232,6 @@ out:
 	return rc;
 }
 
-static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
-{
-	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	struct nd_label_id label_id;
-	struct resource *res;
-	int count, i;
-
-	if (!nsblk->uuid || !nsblk->lbasize || !ndd)
-		return false;
-
-	count = 0;
-	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
-	for_each_dpa_resource(ndd, res) {
-		if (strcmp(res->name, label_id.id) != 0)
-			continue;
-		/*
-		 * Resources with unacknowledged adjustments indicate a
-		 * failure to update labels
-		 */
-		if (res->flags & DPA_RESOURCE_ADJUSTED)
-			return false;
-		count++;
-	}
-
-	/* These values match after a successful label update */
-	if (count != nsblk->num_resources)
-		return false;
-
-	for (i = 0; i < nsblk->num_resources; i++) {
-		struct resource *found = NULL;
-
-		for_each_dpa_resource(ndd, res)
-			if (res == nsblk->res[i]) {
-				found = res;
-				break;
-			}
-		/* stale resource */
-		if (!found)
-			return false;
-	}
-
-	return true;
-}
-
-resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
-{
-	resource_size_t size;
-
-	nvdimm_bus_lock(&nsblk->common.dev);
-	size = __nd_namespace_blk_validate(nsblk);
-	nvdimm_bus_unlock(&nsblk->common.dev);
-
-	return size;
-}
-EXPORT_SYMBOL(nd_namespace_blk_validate);
-
-
 static int nd_namespace_label_update(struct nd_region *nd_region,
 		struct device *dev)
 {
@@ -1579,12 +1507,6 @@ static const struct device_type namespace_pmem_device_type = {
 	.groups = nd_namespace_attribute_groups,
 };
 
-static const struct device_type namespace_blk_device_type = {
-	.name = "nd_namespace_blk",
-	.release = namespace_blk_release,
-	.groups = nd_namespace_attribute_groups,
-};
-
 static bool is_namespace_pmem(const struct device *dev)
 {
 	return dev ? dev->type == &namespace_pmem_device_type : false;
@@ -1964,54 +1886,6 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region,
 	return ERR_PTR(rc);
 }
 
-struct resource *nsblk_add_resource(struct nd_region *nd_region,
-		struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
-		resource_size_t start)
-{
-	struct nd_label_id label_id;
-	struct resource *res;
-
-	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
-	res = krealloc(nsblk->res,
-			sizeof(void *) * (nsblk->num_resources + 1),
-			GFP_KERNEL);
-	if (!res)
-		return NULL;
-	nsblk->res = (struct resource **) res;
-	for_each_dpa_resource(ndd, res)
-		if (strcmp(res->name, label_id.id) == 0
-				&& res->start == start) {
-			nsblk->res[nsblk->num_resources++] = res;
-			return res;
-		}
-	return NULL;
-}
-
-static struct device *nd_namespace_blk_create(struct nd_region *nd_region)
-{
-	struct nd_namespace_blk *nsblk;
-	struct device *dev;
-
-	if (!is_nd_blk(&nd_region->dev))
-		return NULL;
-
-	nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
-	if (!nsblk)
-		return NULL;
-
-	dev = &nsblk->common.dev;
-	dev->type = &namespace_blk_device_type;
-	nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL);
-	if (nsblk->id < 0) {
-		kfree(nsblk);
-		return NULL;
-	}
-	dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id);
-	dev->parent = &nd_region->dev;
-
-	return &nsblk->common.dev;
-}
-
 static struct device *nd_namespace_pmem_create(struct nd_region *nd_region)
 {
 	struct nd_namespace_pmem *nspm;
@@ -2050,10 +1924,7 @@ void nd_region_create_ns_seed(struct nd_region *nd_region)
 	if (nd_region_to_nstype(nd_region) == ND_DEVICE_NAMESPACE_IO)
 		return;
 
-	if (is_nd_blk(&nd_region->dev))
-		nd_region->ns_seed = nd_namespace_blk_create(nd_region);
-	else
-		nd_region->ns_seed = nd_namespace_pmem_create(nd_region);
+	nd_region->ns_seed = nd_namespace_pmem_create(nd_region);
 
 	/*
 	 * Seed creation failures are not fatal, provisioning is simply
@@ -2128,54 +1999,6 @@ static int add_namespace_resource(struct nd_region *nd_region,
 	return i;
 }
 
-static struct device *create_namespace_blk(struct nd_region *nd_region,
-		struct nd_namespace_label *nd_label, int count)
-{
-
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-	struct nd_interleave_set *nd_set = nd_region->nd_set;
-	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	struct nd_namespace_blk *nsblk;
-	char name[NSLABEL_NAME_LEN];
-	struct device *dev = NULL;
-	struct resource *res;
-	uuid_t uuid;
-
-	if (!nsl_validate_type_guid(ndd, nd_label, &nd_set->type_guid))
-		return ERR_PTR(-EAGAIN);
-	if (!nsl_validate_blk_isetcookie(ndd, nd_label, nd_set->cookie2))
-		return ERR_PTR(-EAGAIN);
-
-	nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
-	if (!nsblk)
-		return ERR_PTR(-ENOMEM);
-	dev = &nsblk->common.dev;
-	dev->type = &namespace_blk_device_type;
-	dev->parent = &nd_region->dev;
-	nsblk->id = -1;
-	nsblk->lbasize = nsl_get_lbasize(ndd, nd_label);
-	nsl_get_uuid(ndd, nd_label, &uuid);
-	nsblk->uuid = kmemdup(&uuid, sizeof(uuid_t), GFP_KERNEL);
-	nsblk->common.claim_class = nsl_get_claim_class(ndd, nd_label);
-	if (!nsblk->uuid)
-		goto blk_err;
-	nsl_get_name(ndd, nd_label, name);
-	if (name[0]) {
-		nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, GFP_KERNEL);
-		if (!nsblk->alt_name)
-			goto blk_err;
-	}
-	res = nsblk_add_resource(nd_region, ndd, nsblk,
-			nsl_get_dpa(ndd, nd_label));
-	if (!res)
-		goto blk_err;
-	nd_dbg_dpa(nd_region, ndd, res, "%d: assign\n", count);
-	return dev;
- blk_err:
-	namespace_blk_release(dev);
-	return ERR_PTR(-ENXIO);
-}
-
 static int cmp_dpa(const void *a, const void *b)
 {
 	const struct device *dev_a = *(const struct device **) a;
@@ -2233,12 +2056,7 @@ static struct device **scan_labels(struct nd_region *nd_region)
 		kfree(devs);
 		devs = __devs;
 
-		if (is_nd_blk(&nd_region->dev))
-			dev = create_namespace_blk(nd_region, nd_label, count);
-		else
-			dev = create_namespace_pmem(nd_region, nd_mapping,
-						    nd_label);
-
+		dev = create_namespace_pmem(nd_region, nd_mapping, nd_label);
 		if (IS_ERR(dev)) {
 			switch (PTR_ERR(dev)) {
 			case -EAGAIN:
@@ -2260,30 +2078,21 @@ static struct device **scan_labels(struct nd_region *nd_region)
 			? "blk" : "pmem", count == 1 ? "" : "s");
 
 	if (count == 0) {
+		struct nd_namespace_pmem *nspm;
+
 		/* Publish a zero-sized namespace for userspace to configure. */
 		nd_mapping_free_labels(nd_mapping);
 
 		devs = kcalloc(2, sizeof(dev), GFP_KERNEL);
 		if (!devs)
 			goto err;
-		if (is_nd_blk(&nd_region->dev)) {
-			struct nd_namespace_blk *nsblk;
 
-			nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL);
-			if (!nsblk)
-				goto err;
-			dev = &nsblk->common.dev;
-			dev->type = &namespace_blk_device_type;
-		} else {
-			struct nd_namespace_pmem *nspm;
-
-			nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
-			if (!nspm)
-				goto err;
-			dev = &nspm->nsio.common.dev;
-			dev->type = &namespace_pmem_device_type;
-			nd_namespace_pmem_set_resource(nd_region, nspm, 0);
-		}
+		nspm = kzalloc(sizeof(*nspm), GFP_KERNEL);
+		if (!nspm)
+			goto err;
+		dev = &nspm->nsio.common.dev;
+		dev->type = &namespace_pmem_device_type;
+		nd_namespace_pmem_set_resource(nd_region, nspm, 0);
 		dev->parent = &nd_region->dev;
 		devs[count++] = dev;
 	} else if (is_memory(&nd_region->dev)) {
@@ -2318,10 +2127,7 @@ static struct device **scan_labels(struct nd_region *nd_region)
  err:
 	if (devs) {
 		for (i = 0; devs[i]; i++)
-			if (is_nd_blk(&nd_region->dev))
-				namespace_blk_release(devs[i]);
-			else
-				namespace_pmem_release(devs[i]);
+			namespace_pmem_release(devs[i]);
 		kfree(devs);
 	}
 	return NULL;
@@ -2484,19 +2290,12 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
 		struct device *dev = devs[i];
 		int id;
 
-		if (type == ND_DEVICE_NAMESPACE_BLK) {
-			struct nd_namespace_blk *nsblk;
-
-			nsblk = to_nd_namespace_blk(dev);
-			id = ida_simple_get(&nd_region->ns_ida, 0, 0,
-					GFP_KERNEL);
-			nsblk->id = id;
-		} else if (type == ND_DEVICE_NAMESPACE_PMEM) {
+		if (type == ND_DEVICE_NAMESPACE_PMEM) {
 			struct nd_namespace_pmem *nspm;
 
 			nspm = to_nd_namespace_pmem(dev);
 			id = ida_simple_get(&nd_region->ns_ida, 0, 0,
-					GFP_KERNEL);
+					    GFP_KERNEL);
 			nspm->id = id;
 		} else
 			id = i;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index a11850dd475d..e4af0719cf33 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -150,9 +150,6 @@ int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
 resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
 		struct nd_label_id *label_id);
 int alias_dpa_busy(struct device *dev, void *data);
-struct resource *nsblk_add_resource(struct nd_region *nd_region,
-		struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk,
-		resource_size_t start);
 int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
 void get_ndd(struct nvdimm_drvdata *ndd);
 resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 6f8ce114032d..8391bf2729bc 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -687,7 +687,6 @@ static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
 
 	return false;
 }
-resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
 const uuid_t *nd_dev_to_uuid(struct device *dev);
 bool pmem_should_map_pages(struct device *dev);
 #endif /* __ND_H__ */
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 4813c7089e5c..7b2ccbdc1cbc 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -136,27 +136,6 @@ struct nd_namespace_pmem {
 	int id;
 };
 
-/**
- * struct nd_namespace_blk - namespace for dimm-bounded persistent memory
- * @alt_name: namespace name supplied in the dimm label
- * @uuid: namespace name supplied in the dimm label
- * @id: ida allocated id
- * @lbasize: blk namespaces have a native sector size when btt not present
- * @size: sum of all the resource ranges allocated to this namespace
- * @num_resources: number of dpa extents to claim
- * @res: discontiguous dpa extents for given dimm
- */
-struct nd_namespace_blk {
-	struct nd_namespace_common common;
-	char *alt_name;
-	uuid_t *uuid;
-	int id;
-	unsigned long lbasize;
-	resource_size_t size;
-	int num_resources;
-	struct resource **res;
-};
-
 static inline struct nd_namespace_io *to_nd_namespace_io(const struct device *dev)
 {
 	return container_of(dev, struct nd_namespace_io, common.dev);
@@ -169,11 +148,6 @@ static inline struct nd_namespace_pmem *to_nd_namespace_pmem(const struct device
 	return container_of(nsio, struct nd_namespace_pmem, nsio);
 }
 
-static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device *dev)
-{
-	return container_of(dev, struct nd_namespace_blk, common.dev);
-}
-
 /**
  * nvdimm_read_bytes() - synchronously read bytes from an nvdimm namespace
  * @ndns: device to read

From a4b96046a8823a796b28e713ecc8ec535f5e4607 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:42 -0800
Subject: [PATCH 116/229] ACPI: NFIT: Remove block aperture support

Delete the code to parse interleave-descriptor-tables and coordinate I/O
through a BLK aperture.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688418240.2879318.400185926874596938.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c         | 376 -------------------------------
 drivers/acpi/nfit/nfit.h         |   6 -
 tools/testing/nvdimm/test/nfit.c |  23 --
 3 files changed, 405 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index e5d7f2bda13f..bea6a219fddd 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -999,80 +999,6 @@ static void *add_table(struct acpi_nfit_desc *acpi_desc,
 	return table + hdr->length;
 }
 
-static void nfit_mem_find_spa_bdw(struct acpi_nfit_desc *acpi_desc,
-		struct nfit_mem *nfit_mem)
-{
-	u32 device_handle = __to_nfit_memdev(nfit_mem)->device_handle;
-	u16 dcr = nfit_mem->dcr->region_index;
-	struct nfit_spa *nfit_spa;
-
-	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		u16 range_index = nfit_spa->spa->range_index;
-		int type = nfit_spa_type(nfit_spa->spa);
-		struct nfit_memdev *nfit_memdev;
-
-		if (type != NFIT_SPA_BDW)
-			continue;
-
-		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
-			if (nfit_memdev->memdev->range_index != range_index)
-				continue;
-			if (nfit_memdev->memdev->device_handle != device_handle)
-				continue;
-			if (nfit_memdev->memdev->region_index != dcr)
-				continue;
-
-			nfit_mem->spa_bdw = nfit_spa->spa;
-			return;
-		}
-	}
-
-	dev_dbg(acpi_desc->dev, "SPA-BDW not found for SPA-DCR %d\n",
-			nfit_mem->spa_dcr->range_index);
-	nfit_mem->bdw = NULL;
-}
-
-static void nfit_mem_init_bdw(struct acpi_nfit_desc *acpi_desc,
-		struct nfit_mem *nfit_mem, struct acpi_nfit_system_address *spa)
-{
-	u16 dcr = __to_nfit_memdev(nfit_mem)->region_index;
-	struct nfit_memdev *nfit_memdev;
-	struct nfit_bdw *nfit_bdw;
-	struct nfit_idt *nfit_idt;
-	u16 idt_idx, range_index;
-
-	list_for_each_entry(nfit_bdw, &acpi_desc->bdws, list) {
-		if (nfit_bdw->bdw->region_index != dcr)
-			continue;
-		nfit_mem->bdw = nfit_bdw->bdw;
-		break;
-	}
-
-	if (!nfit_mem->bdw)
-		return;
-
-	nfit_mem_find_spa_bdw(acpi_desc, nfit_mem);
-
-	if (!nfit_mem->spa_bdw)
-		return;
-
-	range_index = nfit_mem->spa_bdw->range_index;
-	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
-		if (nfit_memdev->memdev->range_index != range_index ||
-				nfit_memdev->memdev->region_index != dcr)
-			continue;
-		nfit_mem->memdev_bdw = nfit_memdev->memdev;
-		idt_idx = nfit_memdev->memdev->interleave_index;
-		list_for_each_entry(nfit_idt, &acpi_desc->idts, list) {
-			if (nfit_idt->idt->interleave_index != idt_idx)
-				continue;
-			nfit_mem->idt_bdw = nfit_idt->idt;
-			break;
-		}
-		break;
-	}
-}
-
 static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc,
 		struct acpi_nfit_system_address *spa)
 {
@@ -1189,7 +1115,6 @@ static int __nfit_mem_init(struct acpi_nfit_desc *acpi_desc,
 				nfit_mem->idt_dcr = nfit_idt->idt;
 				break;
 			}
-			nfit_mem_init_bdw(acpi_desc, nfit_mem, spa);
 		} else if (type == NFIT_SPA_PM) {
 			/*
 			 * A single dimm may belong to multiple SPA-PM
@@ -1532,8 +1457,6 @@ static int num_nvdimm_formats(struct nvdimm *nvdimm)
 
 	if (nfit_mem->memdev_pmem)
 		formats++;
-	if (nfit_mem->memdev_bdw)
-		formats++;
 	return formats;
 }
 
@@ -2079,11 +2002,6 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			continue;
 		}
 
-		if (nfit_mem->bdw && nfit_mem->memdev_pmem) {
-			set_bit(NDD_ALIASING, &flags);
-			set_bit(NDD_LABELING, &flags);
-		}
-
 		/* collate flags across all memdevs for this dimm */
 		list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
 			struct acpi_nfit_memory_map *dimm_memdev;
@@ -2429,272 +2347,6 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
 	return 0;
 }
 
-static u64 to_interleave_offset(u64 offset, struct nfit_blk_mmio *mmio)
-{
-	struct acpi_nfit_interleave *idt = mmio->idt;
-	u32 sub_line_offset, line_index, line_offset;
-	u64 line_no, table_skip_count, table_offset;
-
-	line_no = div_u64_rem(offset, mmio->line_size, &sub_line_offset);
-	table_skip_count = div_u64_rem(line_no, mmio->num_lines, &line_index);
-	line_offset = idt->line_offset[line_index]
-		* mmio->line_size;
-	table_offset = table_skip_count * mmio->table_size;
-
-	return mmio->base_offset + line_offset + table_offset + sub_line_offset;
-}
-
-static u32 read_blk_stat(struct nfit_blk *nfit_blk, unsigned int bw)
-{
-	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
-	u64 offset = nfit_blk->stat_offset + mmio->size * bw;
-	const u32 STATUS_MASK = 0x80000037;
-
-	if (mmio->num_lines)
-		offset = to_interleave_offset(offset, mmio);
-
-	return readl(mmio->addr.base + offset) & STATUS_MASK;
-}
-
-static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
-		resource_size_t dpa, unsigned int len, unsigned int write)
-{
-	u64 cmd, offset;
-	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[DCR];
-
-	enum {
-		BCW_OFFSET_MASK = (1ULL << 48)-1,
-		BCW_LEN_SHIFT = 48,
-		BCW_LEN_MASK = (1ULL << 8) - 1,
-		BCW_CMD_SHIFT = 56,
-	};
-
-	cmd = (dpa >> L1_CACHE_SHIFT) & BCW_OFFSET_MASK;
-	len = len >> L1_CACHE_SHIFT;
-	cmd |= ((u64) len & BCW_LEN_MASK) << BCW_LEN_SHIFT;
-	cmd |= ((u64) write) << BCW_CMD_SHIFT;
-
-	offset = nfit_blk->cmd_offset + mmio->size * bw;
-	if (mmio->num_lines)
-		offset = to_interleave_offset(offset, mmio);
-
-	writeq(cmd, mmio->addr.base + offset);
-	nvdimm_flush(nfit_blk->nd_region, NULL);
-
-	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
-		readq(mmio->addr.base + offset);
-}
-
-static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
-		resource_size_t dpa, void *iobuf, size_t len, int rw,
-		unsigned int lane)
-{
-	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
-	unsigned int copied = 0;
-	u64 base_offset;
-	int rc;
-
-	base_offset = nfit_blk->bdw_offset + dpa % L1_CACHE_BYTES
-		+ lane * mmio->size;
-	write_blk_ctl(nfit_blk, lane, dpa, len, rw);
-	while (len) {
-		unsigned int c;
-		u64 offset;
-
-		if (mmio->num_lines) {
-			u32 line_offset;
-
-			offset = to_interleave_offset(base_offset + copied,
-					mmio);
-			div_u64_rem(offset, mmio->line_size, &line_offset);
-			c = min_t(size_t, len, mmio->line_size - line_offset);
-		} else {
-			offset = base_offset + nfit_blk->bdw_offset;
-			c = len;
-		}
-
-		if (rw)
-			memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
-		else {
-			if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
-				arch_invalidate_pmem((void __force *)
-					mmio->addr.aperture + offset, c);
-
-			memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
-		}
-
-		copied += c;
-		len -= c;
-	}
-
-	if (rw)
-		nvdimm_flush(nfit_blk->nd_region, NULL);
-
-	rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
-	return rc;
-}
-
-static int acpi_nfit_blk_region_do_io(struct nd_blk_region *ndbr,
-		resource_size_t dpa, void *iobuf, u64 len, int rw)
-{
-	struct nfit_blk *nfit_blk = nd_blk_region_provider_data(ndbr);
-	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
-	struct nd_region *nd_region = nfit_blk->nd_region;
-	unsigned int lane, copied = 0;
-	int rc = 0;
-
-	lane = nd_region_acquire_lane(nd_region);
-	while (len) {
-		u64 c = min(len, mmio->size);
-
-		rc = acpi_nfit_blk_single_io(nfit_blk, dpa + copied,
-				iobuf + copied, c, rw, lane);
-		if (rc)
-			break;
-
-		copied += c;
-		len -= c;
-	}
-	nd_region_release_lane(nd_region, lane);
-
-	return rc;
-}
-
-static int nfit_blk_init_interleave(struct nfit_blk_mmio *mmio,
-		struct acpi_nfit_interleave *idt, u16 interleave_ways)
-{
-	if (idt) {
-		mmio->num_lines = idt->line_count;
-		mmio->line_size = idt->line_size;
-		if (interleave_ways == 0)
-			return -ENXIO;
-		mmio->table_size = mmio->num_lines * interleave_ways
-			* mmio->line_size;
-	}
-
-	return 0;
-}
-
-static int acpi_nfit_blk_get_flags(struct nvdimm_bus_descriptor *nd_desc,
-		struct nvdimm *nvdimm, struct nfit_blk *nfit_blk)
-{
-	struct nd_cmd_dimm_flags flags;
-	int rc;
-
-	memset(&flags, 0, sizeof(flags));
-	rc = nd_desc->ndctl(nd_desc, nvdimm, ND_CMD_DIMM_FLAGS, &flags,
-			sizeof(flags), NULL);
-
-	if (rc >= 0 && flags.status == 0)
-		nfit_blk->dimm_flags = flags.flags;
-	else if (rc == -ENOTTY) {
-		/* fall back to a conservative default */
-		nfit_blk->dimm_flags = NFIT_BLK_DCR_LATCH | NFIT_BLK_READ_FLUSH;
-		rc = 0;
-	} else
-		rc = -ENXIO;
-
-	return rc;
-}
-
-static int acpi_nfit_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
-		struct device *dev)
-{
-	struct nvdimm_bus_descriptor *nd_desc = to_nd_desc(nvdimm_bus);
-	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
-	struct nfit_blk_mmio *mmio;
-	struct nfit_blk *nfit_blk;
-	struct nfit_mem *nfit_mem;
-	struct nvdimm *nvdimm;
-	int rc;
-
-	nvdimm = nd_blk_region_to_dimm(ndbr);
-	nfit_mem = nvdimm_provider_data(nvdimm);
-	if (!nfit_mem || !nfit_mem->dcr || !nfit_mem->bdw) {
-		dev_dbg(dev, "missing%s%s%s\n",
-				nfit_mem ? "" : " nfit_mem",
-				(nfit_mem && nfit_mem->dcr) ? "" : " dcr",
-				(nfit_mem && nfit_mem->bdw) ? "" : " bdw");
-		return -ENXIO;
-	}
-
-	nfit_blk = devm_kzalloc(dev, sizeof(*nfit_blk), GFP_KERNEL);
-	if (!nfit_blk)
-		return -ENOMEM;
-	nd_blk_region_set_provider_data(ndbr, nfit_blk);
-	nfit_blk->nd_region = to_nd_region(dev);
-
-	/* map block aperture memory */
-	nfit_blk->bdw_offset = nfit_mem->bdw->offset;
-	mmio = &nfit_blk->mmio[BDW];
-	mmio->addr.base = devm_nvdimm_memremap(dev, nfit_mem->spa_bdw->address,
-			nfit_mem->spa_bdw->length, nd_blk_memremap_flags(ndbr));
-	if (!mmio->addr.base) {
-		dev_dbg(dev, "%s failed to map bdw\n",
-				nvdimm_name(nvdimm));
-		return -ENOMEM;
-	}
-	mmio->size = nfit_mem->bdw->size;
-	mmio->base_offset = nfit_mem->memdev_bdw->region_offset;
-	mmio->idt = nfit_mem->idt_bdw;
-	mmio->spa = nfit_mem->spa_bdw;
-	rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_bdw,
-			nfit_mem->memdev_bdw->interleave_ways);
-	if (rc) {
-		dev_dbg(dev, "%s failed to init bdw interleave\n",
-				nvdimm_name(nvdimm));
-		return rc;
-	}
-
-	/* map block control memory */
-	nfit_blk->cmd_offset = nfit_mem->dcr->command_offset;
-	nfit_blk->stat_offset = nfit_mem->dcr->status_offset;
-	mmio = &nfit_blk->mmio[DCR];
-	mmio->addr.base = devm_nvdimm_ioremap(dev, nfit_mem->spa_dcr->address,
-			nfit_mem->spa_dcr->length);
-	if (!mmio->addr.base) {
-		dev_dbg(dev, "%s failed to map dcr\n",
-				nvdimm_name(nvdimm));
-		return -ENOMEM;
-	}
-	mmio->size = nfit_mem->dcr->window_size;
-	mmio->base_offset = nfit_mem->memdev_dcr->region_offset;
-	mmio->idt = nfit_mem->idt_dcr;
-	mmio->spa = nfit_mem->spa_dcr;
-	rc = nfit_blk_init_interleave(mmio, nfit_mem->idt_dcr,
-			nfit_mem->memdev_dcr->interleave_ways);
-	if (rc) {
-		dev_dbg(dev, "%s failed to init dcr interleave\n",
-				nvdimm_name(nvdimm));
-		return rc;
-	}
-
-	rc = acpi_nfit_blk_get_flags(nd_desc, nvdimm, nfit_blk);
-	if (rc < 0) {
-		dev_dbg(dev, "%s failed get DIMM flags\n",
-				nvdimm_name(nvdimm));
-		return rc;
-	}
-
-	if (nvdimm_has_flush(nfit_blk->nd_region) < 0)
-		dev_warn(dev, "unable to guarantee persistence of writes\n");
-
-	if (mmio->line_size == 0)
-		return 0;
-
-	if ((u32) nfit_blk->cmd_offset % mmio->line_size
-			+ 8 > mmio->line_size) {
-		dev_dbg(dev, "cmd_offset crosses interleave boundary\n");
-		return -ENXIO;
-	} else if ((u32) nfit_blk->stat_offset % mmio->line_size
-			+ 8 > mmio->line_size) {
-		dev_dbg(dev, "stat_offset crosses interleave boundary\n");
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
 static int ars_get_cap(struct acpi_nfit_desc *acpi_desc,
 		struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa)
 {
@@ -2911,9 +2563,6 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 	struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
 			memdev->device_handle);
 	struct acpi_nfit_system_address *spa = nfit_spa->spa;
-	struct nd_blk_region_desc *ndbr_desc;
-	struct nfit_mem *nfit_mem;
-	int rc;
 
 	if (!nvdimm) {
 		dev_err(acpi_desc->dev, "spa%d dimm: %#x not found\n",
@@ -2928,30 +2577,6 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		mapping->start = memdev->address;
 		mapping->size = memdev->region_size;
 		break;
-	case NFIT_SPA_DCR:
-		nfit_mem = nvdimm_provider_data(nvdimm);
-		if (!nfit_mem || !nfit_mem->bdw) {
-			dev_dbg(acpi_desc->dev, "spa%d %s missing bdw\n",
-					spa->range_index, nvdimm_name(nvdimm));
-			break;
-		}
-
-		mapping->size = nfit_mem->bdw->capacity;
-		mapping->start = nfit_mem->bdw->start_address;
-		ndr_desc->num_lanes = nfit_mem->bdw->windows;
-		ndr_desc->mapping = mapping;
-		ndr_desc->num_mappings = 1;
-		ndbr_desc = to_blk_region_desc(ndr_desc);
-		ndbr_desc->enable = acpi_nfit_blk_region_enable;
-		ndbr_desc->do_io = acpi_desc->blk_do_io;
-		rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
-		if (rc)
-			return rc;
-		nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus,
-				ndr_desc);
-		if (!nfit_spa->nd_region)
-			return -ENOMEM;
-		break;
 	}
 
 	return 0;
@@ -3635,7 +3260,6 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 
 	dev_set_drvdata(dev, acpi_desc);
 	acpi_desc->dev = dev;
-	acpi_desc->blk_do_io = acpi_nfit_blk_region_do_io;
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = "ACPI.NFIT";
 	nd_desc->module = THIS_MODULE;
diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h
index c674f3df9be7..50882bdbeb96 100644
--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -208,13 +208,9 @@ struct nfit_mem {
 	struct nvdimm *nvdimm;
 	struct acpi_nfit_memory_map *memdev_dcr;
 	struct acpi_nfit_memory_map *memdev_pmem;
-	struct acpi_nfit_memory_map *memdev_bdw;
 	struct acpi_nfit_control_region *dcr;
-	struct acpi_nfit_data_region *bdw;
 	struct acpi_nfit_system_address *spa_dcr;
-	struct acpi_nfit_system_address *spa_bdw;
 	struct acpi_nfit_interleave *idt_dcr;
-	struct acpi_nfit_interleave *idt_bdw;
 	struct kernfs_node *flags_attr;
 	struct nfit_flush *nfit_flush;
 	struct list_head list;
@@ -266,8 +262,6 @@ struct acpi_nfit_desc {
 	unsigned long family_dsm_mask[NVDIMM_BUS_FAMILY_MAX + 1];
 	unsigned int platform_cap;
 	unsigned int scrub_tmo;
-	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
-			void *iobuf, u64 len, int rw);
 	enum nvdimm_fwa_state fwa_state;
 	enum nvdimm_fwa_capability fwa_cap;
 	int fwa_count;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 0bc91ffee257..65dbdda3a054 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -2842,28 +2842,6 @@ static void nfit_test1_setup(struct nfit_test *t)
 	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
 }
 
-static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-		void *iobuf, u64 len, int rw)
-{
-	struct nfit_blk *nfit_blk = ndbr->blk_provider_data;
-	struct nfit_blk_mmio *mmio = &nfit_blk->mmio[BDW];
-	struct nd_region *nd_region = &ndbr->nd_region;
-	unsigned int lane;
-
-	lane = nd_region_acquire_lane(nd_region);
-	if (rw)
-		memcpy(mmio->addr.base + dpa, iobuf, len);
-	else {
-		memcpy(iobuf, mmio->addr.base + dpa, len);
-
-		/* give us some some coverage of the arch_invalidate_pmem() API */
-		arch_invalidate_pmem(mmio->addr.base + dpa, len);
-	}
-	nd_region_release_lane(nd_region, lane);
-
-	return 0;
-}
-
 static unsigned long nfit_ctl_handle;
 
 union acpi_object *result;
@@ -3219,7 +3197,6 @@ static int nfit_test_probe(struct platform_device *pdev)
 	nfit_test->setup(nfit_test);
 	acpi_desc = &nfit_test->acpi_desc;
 	acpi_nfit_desc_init(acpi_desc, &pdev->dev);
-	acpi_desc->blk_do_io = nfit_test_blk_do_io;
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = NULL;
 	nd_desc->module = THIS_MODULE;

From 3b6c6c039707f6bb7c64af2aa82a437fabb93aee Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 9 Mar 2022 19:49:48 -0800
Subject: [PATCH 117/229] nvdimm/region: Delete nd_blk_region infrastructure

Now that the nd_namespace_blk infrastructure is removed, delete all the
region machinery to coordinate provisioning aliased capacity between
PMEM and BLK.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/164688418803.2879318.1302315202397235855.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit/core.c           |  11 +-
 drivers/nvdimm/bus.c               |   2 -
 drivers/nvdimm/dimm_devs.c         | 204 ++---------------------------
 drivers/nvdimm/label.c             |   6 +-
 drivers/nvdimm/label.h             |   2 +-
 drivers/nvdimm/namespace_devs.c    | 127 ++----------------
 drivers/nvdimm/nd-core.h           |  24 +---
 drivers/nvdimm/nd.h                |  12 --
 drivers/nvdimm/region.c            |  31 ++---
 drivers/nvdimm/region_devs.c       | 158 +++-------------------
 include/linux/libnvdimm.h          |  24 ----
 include/uapi/linux/ndctl.h         |   2 -
 tools/testing/nvdimm/test/ndtest.c |  67 +---------
 13 files changed, 66 insertions(+), 604 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index bea6a219fddd..fe61f617a943 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2036,10 +2036,6 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
 		}
 
-		/* Quirk to ignore LOCAL for labels on HYPERV DIMMs */
-		if (nfit_mem->family == NVDIMM_FAMILY_HYPERV)
-			set_bit(NDD_NOBLK, &flags);
-
 		if (test_bit(NFIT_MEM_LSR, &nfit_mem->flags)) {
 			set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
 			set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
@@ -2602,8 +2598,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 {
 	static struct nd_mapping_desc mappings[ND_MAX_MAPPINGS];
 	struct acpi_nfit_system_address *spa = nfit_spa->spa;
-	struct nd_blk_region_desc ndbr_desc;
-	struct nd_region_desc *ndr_desc;
+	struct nd_region_desc *ndr_desc, _ndr_desc;
 	struct nfit_memdev *nfit_memdev;
 	struct nvdimm_bus *nvdimm_bus;
 	struct resource res;
@@ -2619,10 +2614,10 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 
 	memset(&res, 0, sizeof(res));
 	memset(&mappings, 0, sizeof(mappings));
-	memset(&ndbr_desc, 0, sizeof(ndbr_desc));
+	memset(&_ndr_desc, 0, sizeof(_ndr_desc));
 	res.start = spa->address;
 	res.end = res.start + spa->length - 1;
-	ndr_desc = &ndbr_desc.ndr_desc;
+	ndr_desc = &_ndr_desc;
 	ndr_desc->res = &res;
 	ndr_desc->provider_data = nfit_spa;
 	ndr_desc->attr_groups = acpi_nfit_region_attribute_groups;
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 9dc7f3edd42b..a4b5f637e599 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -35,8 +35,6 @@ static int to_nd_device_type(struct device *dev)
 		return ND_DEVICE_DIMM;
 	else if (is_memory(dev))
 		return ND_DEVICE_REGION_PMEM;
-	else if (is_nd_blk(dev))
-		return ND_DEVICE_REGION_BLK;
 	else if (is_nd_dax(dev))
 		return ND_DEVICE_DAX_PMEM;
 	else if (is_nd_region(dev->parent))
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index dc7449a40003..ee507eed42b5 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -18,10 +18,6 @@
 
 static DEFINE_IDA(dimm_ida);
 
-static bool noblk;
-module_param(noblk, bool, 0444);
-MODULE_PARM_DESC(noblk, "force disable BLK / local alias support");
-
 /*
  * Retrieve bus and dimm handle and return if this bus supports
  * get_config_data commands
@@ -211,22 +207,6 @@ struct nvdimm *to_nvdimm(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(to_nvdimm);
 
-struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr)
-{
-	struct nd_region *nd_region = &ndbr->nd_region;
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-
-	return nd_mapping->nvdimm;
-}
-EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm);
-
-unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr)
-{
-	/* pmem mapping properties are private to libnvdimm */
-	return ARCH_MEMREMAP_PMEM;
-}
-EXPORT_SYMBOL_GPL(nd_blk_memremap_flags);
-
 struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping)
 {
 	struct nvdimm *nvdimm = nd_mapping->nvdimm;
@@ -312,8 +292,7 @@ static ssize_t flags_show(struct device *dev,
 {
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 
-	return sprintf(buf, "%s%s%s\n",
-			test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+	return sprintf(buf, "%s%s\n",
 			test_bit(NDD_LABELING, &nvdimm->flags) ? "label " : "",
 			test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
 }
@@ -612,8 +591,6 @@ struct nvdimm *__nvdimm_create(struct nvdimm_bus *nvdimm_bus,
 
 	nvdimm->dimm_id = dimm_id;
 	nvdimm->provider_data = provider_data;
-	if (noblk)
-		flags |= 1 << NDD_NOBLK;
 	nvdimm->flags = flags;
 	nvdimm->cmd_mask = cmd_mask;
 	nvdimm->num_flush = num_flush;
@@ -726,133 +703,6 @@ static unsigned long dpa_align(struct nd_region *nd_region)
 	return nd_region->align / nd_region->ndr_mappings;
 }
 
-int alias_dpa_busy(struct device *dev, void *data)
-{
-	resource_size_t map_end, blk_start, new;
-	struct blk_alloc_info *info = data;
-	struct nd_mapping *nd_mapping;
-	struct nd_region *nd_region;
-	struct nvdimm_drvdata *ndd;
-	struct resource *res;
-	unsigned long align;
-	int i;
-
-	if (!is_memory(dev))
-		return 0;
-
-	nd_region = to_nd_region(dev);
-	for (i = 0; i < nd_region->ndr_mappings; i++) {
-		nd_mapping  = &nd_region->mapping[i];
-		if (nd_mapping->nvdimm == info->nd_mapping->nvdimm)
-			break;
-	}
-
-	if (i >= nd_region->ndr_mappings)
-		return 0;
-
-	ndd = to_ndd(nd_mapping);
-	map_end = nd_mapping->start + nd_mapping->size - 1;
-	blk_start = nd_mapping->start;
-
-	/*
-	 * In the allocation case ->res is set to free space that we are
-	 * looking to validate against PMEM aliasing collision rules
-	 * (i.e. BLK is allocated after all aliased PMEM).
-	 */
-	if (info->res) {
-		if (info->res->start >= nd_mapping->start
-				&& info->res->start < map_end)
-			/* pass */;
-		else
-			return 0;
-	}
-
- retry:
-	/*
-	 * Find the free dpa from the end of the last pmem allocation to
-	 * the end of the interleave-set mapping.
-	 */
-	align = dpa_align(nd_region);
-	if (!align)
-		return 0;
-
-	for_each_dpa_resource(ndd, res) {
-		resource_size_t start, end;
-
-		if (strncmp(res->name, "pmem", 4) != 0)
-			continue;
-
-		start = ALIGN_DOWN(res->start, align);
-		end = ALIGN(res->end + 1, align) - 1;
-		if ((start >= blk_start && start < map_end)
-				|| (end >= blk_start && end <= map_end)) {
-			new = max(blk_start, min(map_end, end) + 1);
-			if (new != blk_start) {
-				blk_start = new;
-				goto retry;
-			}
-		}
-	}
-
-	/* update the free space range with the probed blk_start */
-	if (info->res && blk_start > info->res->start) {
-		info->res->start = max(info->res->start, blk_start);
-		if (info->res->start > info->res->end)
-			info->res->end = info->res->start - 1;
-		return 1;
-	}
-
-	info->available -= blk_start - nd_mapping->start;
-
-	return 0;
-}
-
-/**
- * nd_blk_available_dpa - account the unused dpa of BLK region
- * @nd_mapping: container of dpa-resource-root + labels
- *
- * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges, but
- * we arrange for them to never start at an lower dpa than the last
- * PMEM allocation in an aliased region.
- */
-resource_size_t nd_blk_available_dpa(struct nd_region *nd_region)
-{
-	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	struct blk_alloc_info info = {
-		.nd_mapping = nd_mapping,
-		.available = nd_mapping->size,
-		.res = NULL,
-	};
-	struct resource *res;
-	unsigned long align;
-
-	if (!ndd)
-		return 0;
-
-	device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
-
-	/* now account for busy blk allocations in unaliased dpa */
-	align = dpa_align(nd_region);
-	if (!align)
-		return 0;
-	for_each_dpa_resource(ndd, res) {
-		resource_size_t start, end, size;
-
-		if (strncmp(res->name, "blk", 3) != 0)
-			continue;
-		start = ALIGN_DOWN(res->start, align);
-		end = ALIGN(res->end + 1, align) - 1;
-		size = end - start + 1;
-		if (size >= info.available)
-			return 0;
-		info.available -= size;
-	}
-
-	return info.available;
-}
-
 /**
  * nd_pmem_max_contiguous_dpa - For the given dimm+region, return the max
  *			   contiguous unallocated dpa range.
@@ -900,24 +750,16 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
  * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa
  * @nd_mapping: container of dpa-resource-root + labels
  * @nd_region: constrain available space check to this reference region
- * @overlap: calculate available space assuming this level of overlap
  *
  * Validate that a PMEM label, if present, aligns with the start of an
- * interleave set and truncate the available size at the lowest BLK
- * overlap point.
- *
- * The expectation is that this routine is called multiple times as it
- * probes for the largest BLK encroachment for any single member DIMM of
- * the interleave set.  Once that value is determined the PMEM-limit for
- * the set can be established.
+ * interleave set.
  */
 resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
-		struct nd_mapping *nd_mapping, resource_size_t *overlap)
+				      struct nd_mapping *nd_mapping)
 {
-	resource_size_t map_start, map_end, busy = 0, available, blk_start;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	resource_size_t map_start, map_end, busy = 0;
 	struct resource *res;
-	const char *reason;
 	unsigned long align;
 
 	if (!ndd)
@@ -929,46 +771,28 @@ resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
 
 	map_start = nd_mapping->start;
 	map_end = map_start + nd_mapping->size - 1;
-	blk_start = max(map_start, map_end + 1 - *overlap);
 	for_each_dpa_resource(ndd, res) {
 		resource_size_t start, end;
 
 		start = ALIGN_DOWN(res->start, align);
 		end = ALIGN(res->end + 1, align) - 1;
 		if (start >= map_start && start < map_end) {
-			if (strncmp(res->name, "blk", 3) == 0)
-				blk_start = min(blk_start,
-						max(map_start, start));
-			else if (end > map_end) {
-				reason = "misaligned to iset";
-				goto err;
-			} else
-				busy += end - start + 1;
+			if (end > map_end) {
+				nd_dbg_dpa(nd_region, ndd, res,
+					   "misaligned to iset\n");
+				return 0;
+			}
+			busy += end - start + 1;
 		} else if (end >= map_start && end <= map_end) {
-			if (strncmp(res->name, "blk", 3) == 0) {
-				/*
-				 * If a BLK allocation overlaps the start of
-				 * PMEM the entire interleave set may now only
-				 * be used for BLK.
-				 */
-				blk_start = map_start;
-			} else
-				busy += end - start + 1;
+			busy += end - start + 1;
 		} else if (map_start > start && map_start < end) {
 			/* total eclipse of the mapping */
 			busy += nd_mapping->size;
-			blk_start = map_start;
 		}
 	}
 
-	*overlap = map_end + 1 - blk_start;
-	available = blk_start - map_start;
-	if (busy < available)
-		return ALIGN_DOWN(available - busy, align);
-	return 0;
-
- err:
-	nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason);
+	if (busy < nd_mapping->size)
+		return ALIGN_DOWN(nd_mapping->size - busy, align);
 	return 0;
 }
 
@@ -999,7 +823,7 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
 /**
  * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id
  * @nvdimm: container of dpa-resource-root + labels
- * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid>
+ * @label_id: dpa resource name of the form pmem-<human readable uuid>
  */
 resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
 		struct nd_label_id *label_id)
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 8c972bcb2ac3..082253a3a956 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -334,8 +334,7 @@ char *nd_label_gen_id(struct nd_label_id *label_id, const uuid_t *uuid,
 {
 	if (!label_id || !uuid)
 		return NULL;
-	snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb",
-			flags & NSLABEL_FLAG_LOCAL ? "blk" : "pmem", uuid);
+	snprintf(label_id->id, ND_LABEL_ID_SIZE, "pmem-%pUb", uuid);
 	return label_id->id;
 }
 
@@ -406,7 +405,6 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 		return 0; /* no label, nothing to reserve */
 
 	for_each_clear_bit_le(slot, free, nslot) {
-		struct nvdimm *nvdimm = to_nvdimm(ndd->dev);
 		struct nd_namespace_label *nd_label;
 		struct nd_region *nd_region = NULL;
 		struct nd_label_id label_id;
@@ -421,8 +419,6 @@ int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd)
 
 		nsl_get_uuid(ndd, nd_label, &label_uuid);
 		flags = nsl_get_flags(ndd, nd_label);
-		if (test_bit(NDD_NOBLK, &nvdimm->flags))
-			flags &= ~NSLABEL_FLAG_LOCAL;
 		nd_label_gen_id(&label_id, &label_uuid, flags);
 		res = nvdimm_allocate_dpa(ndd, &label_id,
 					  nsl_get_dpa(ndd, nd_label),
diff --git a/drivers/nvdimm/label.h b/drivers/nvdimm/label.h
index 198ef1df298b..0650fb4b9821 100644
--- a/drivers/nvdimm/label.h
+++ b/drivers/nvdimm/label.h
@@ -193,7 +193,7 @@ struct nd_namespace_label {
 
 /**
  * struct nd_label_id - identifier string for dpa allocation
- * @id: "{blk|pmem}-<namespace uuid>"
+ * @id: "pmem-<namespace uuid>"
  */
 struct nd_label_id {
 	char id[ND_LABEL_ID_SIZE];
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index d1c190b02657..62b83b2e26e3 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -297,13 +297,11 @@ static int scan_free(struct nd_region *nd_region,
 		struct nd_mapping *nd_mapping, struct nd_label_id *label_id,
 		resource_size_t n)
 {
-	bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	int rc = 0;
 
 	while (n) {
 		struct resource *res, *last;
-		resource_size_t new_start;
 
 		last = NULL;
 		for_each_dpa_resource(ndd, res)
@@ -321,16 +319,7 @@ static int scan_free(struct nd_region *nd_region,
 			continue;
 		}
 
-		/*
-		 * Keep BLK allocations relegated to high DPA as much as
-		 * possible
-		 */
-		if (is_blk)
-			new_start = res->start + n;
-		else
-			new_start = res->start;
-
-		rc = adjust_resource(res, new_start, resource_size(res) - n);
+		rc = adjust_resource(res, res->start, resource_size(res) - n);
 		if (rc == 0)
 			res->flags |= DPA_RESOURCE_ADJUSTED;
 		nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc);
@@ -372,20 +361,12 @@ static resource_size_t init_dpa_allocation(struct nd_label_id *label_id,
 		struct nd_region *nd_region, struct nd_mapping *nd_mapping,
 		resource_size_t n)
 {
-	bool is_blk = strncmp(label_id->id, "blk", 3) == 0;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
-	resource_size_t first_dpa;
 	struct resource *res;
 	int rc = 0;
 
-	/* allocate blk from highest dpa first */
-	if (is_blk)
-		first_dpa = nd_mapping->start + nd_mapping->size - n;
-	else
-		first_dpa = nd_mapping->start;
-
 	/* first resource allocation for this label-id or dimm */
-	res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n);
+	res = nvdimm_allocate_dpa(ndd, label_id, nd_mapping->start, n);
 	if (!res)
 		rc = -EBUSY;
 
@@ -416,7 +397,6 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
 		resource_size_t n, struct resource *valid)
 {
 	bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0;
-	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
 	unsigned long align;
 
 	align = nd_region->align / nd_region->ndr_mappings;
@@ -429,21 +409,6 @@ static void space_valid(struct nd_region *nd_region, struct nvdimm_drvdata *ndd,
 	if (is_reserve)
 		return;
 
-	if (!is_pmem) {
-		struct nd_mapping *nd_mapping = &nd_region->mapping[0];
-		struct nvdimm_bus *nvdimm_bus;
-		struct blk_alloc_info info = {
-			.nd_mapping = nd_mapping,
-			.available = nd_mapping->size,
-			.res = valid,
-		};
-
-		WARN_ON(!is_nd_blk(&nd_region->dev));
-		nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-		device_for_each_child(&nvdimm_bus->dev, &info, alias_dpa_busy);
-		return;
-	}
-
 	/* allocation needs to be contiguous, so this is all or nothing */
 	if (resource_size(valid) < n)
 		goto invalid;
@@ -471,7 +436,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		resource_size_t n)
 {
 	resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1;
-	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
 	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
 	struct resource *res, *exist = NULL, valid;
 	const resource_size_t to_allocate = n;
@@ -569,10 +533,6 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 		}
 
 		if (strcmp(action, "allocate") == 0) {
-			/* BLK allocate bottom up */
-			if (!is_pmem)
-				valid.start += available - allocate;
-
 			new_res = nvdimm_allocate_dpa(ndd, label_id,
 					valid.start, allocate);
 			if (!new_res)
@@ -608,12 +568,7 @@ static resource_size_t scan_allocate(struct nd_region *nd_region,
 			return 0;
 	}
 
-	/*
-	 * If we allocated nothing in the BLK case it may be because we are in
-	 * an initial "pmem-reserve pass".  Only do an initial BLK allocation
-	 * when none of the DPA space is reserved.
-	 */
-	if ((is_pmem || !ndd->dpa.child) && n == to_allocate)
+	if (n == to_allocate)
 		return init_dpa_allocation(label_id, nd_region, nd_mapping, n);
 	return n;
 }
@@ -672,7 +627,7 @@ int __reserve_free_pmem(struct device *dev, void *data)
 		if (nd_mapping->nvdimm != nvdimm)
 			continue;
 
-		n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem);
+		n = nd_pmem_available_dpa(nd_region, nd_mapping);
 		if (n == 0)
 			return 0;
 		rem = scan_allocate(nd_region, nd_mapping, &label_id, n);
@@ -697,19 +652,6 @@ void release_free_pmem(struct nvdimm_bus *nvdimm_bus,
 			nvdimm_free_dpa(ndd, res);
 }
 
-static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
-		struct nd_mapping *nd_mapping)
-{
-	struct nvdimm *nvdimm = nd_mapping->nvdimm;
-	int rc;
-
-	rc = device_for_each_child(&nvdimm_bus->dev, nvdimm,
-			__reserve_free_pmem);
-	if (rc)
-		release_free_pmem(nvdimm_bus, nd_mapping);
-	return rc;
-}
-
 /**
  * grow_dpa_allocation - for each dimm allocate n bytes for @label_id
  * @nd_region: the set of dimms to allocate @n more bytes from
@@ -726,37 +668,14 @@ static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus,
 static int grow_dpa_allocation(struct nd_region *nd_region,
 		struct nd_label_id *label_id, resource_size_t n)
 {
-	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-	bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0;
 	int i;
 
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 		resource_size_t rem = n;
-		int rc, j;
-
-		/*
-		 * In the BLK case try once with all unallocated PMEM
-		 * reserved, and once without
-		 */
-		for (j = is_pmem; j < 2; j++) {
-			bool blk_only = j == 0;
-
-			if (blk_only) {
-				rc = reserve_free_pmem(nvdimm_bus, nd_mapping);
-				if (rc)
-					return rc;
-			}
-			rem = scan_allocate(nd_region, nd_mapping,
-					label_id, rem);
-			if (blk_only)
-				release_free_pmem(nvdimm_bus, nd_mapping);
-
-			/* try again and allow encroachments into PMEM */
-			if (rem == 0)
-				break;
-		}
+		int rc;
 
+		rem = scan_allocate(nd_region, nd_mapping, label_id, rem);
 		dev_WARN_ONCE(&nd_region->dev, rem,
 				"allocation underrun: %#llx of %#llx bytes\n",
 				(unsigned long long) n - rem,
@@ -869,8 +788,8 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
 		ndd = to_ndd(nd_mapping);
 
 		/*
-		 * All dimms in an interleave set, or the base dimm for a blk
-		 * region, need to be enabled for the size to be changed.
+		 * All dimms in an interleave set, need to be enabled
+		 * for the size to be changed.
 		 */
 		if (!ndd)
 			return -ENXIO;
@@ -1169,9 +1088,6 @@ static ssize_t resource_show(struct device *dev,
 }
 static DEVICE_ATTR_ADMIN_RO(resource);
 
-static const unsigned long blk_lbasize_supported[] = { 512, 520, 528,
-	4096, 4104, 4160, 4224, 0 };
-
 static const unsigned long pmem_lbasize_supported[] = { 512, 4096, 0 };
 
 static ssize_t sector_size_show(struct device *dev,
@@ -1823,10 +1739,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region,
 	/*
 	 * Fix up each mapping's 'labels' to have the validated pmem label for
 	 * that position at labels[0], and NULL at labels[1].  In the process,
-	 * check that the namespace aligns with interleave-set.  We know
-	 * that it does not overlap with any blk namespaces by virtue of
-	 * the dimm being enabled (i.e. nd_label_reserve_dpa()
-	 * succeeded).
+	 * check that the namespace aligns with interleave-set.
 	 */
 	nsl_get_uuid(ndd, nd_label, &uuid);
 	rc = select_pmem_id(nd_region, &uuid);
@@ -1931,8 +1844,7 @@ void nd_region_create_ns_seed(struct nd_region *nd_region)
 	 * disabled until memory becomes available
 	 */
 	if (!nd_region->ns_seed)
-		dev_err(&nd_region->dev, "failed to create %s namespace\n",
-				is_nd_blk(&nd_region->dev) ? "blk" : "pmem");
+		dev_err(&nd_region->dev, "failed to create namespace\n");
 	else
 		nd_device_register(nd_region->ns_seed);
 }
@@ -2028,16 +1940,9 @@ static struct device **scan_labels(struct nd_region *nd_region)
 	list_for_each_entry_safe(label_ent, e, &nd_mapping->labels, list) {
 		struct nd_namespace_label *nd_label = label_ent->label;
 		struct device **__devs;
-		u32 flags;
 
 		if (!nd_label)
 			continue;
-		flags = nsl_get_flags(ndd, nd_label);
-		if (is_nd_blk(&nd_region->dev)
-				== !!(flags & NSLABEL_FLAG_LOCAL))
-			/* pass, region matches label type */;
-		else
-			continue;
 
 		/* skip labels that describe extents outside of the region */
 		if (nsl_get_dpa(ndd, nd_label) < nd_mapping->start ||
@@ -2073,9 +1978,8 @@ static struct device **scan_labels(struct nd_region *nd_region)
 
 	}
 
-	dev_dbg(&nd_region->dev, "discovered %d %s namespace%s\n",
-			count, is_nd_blk(&nd_region->dev)
-			? "blk" : "pmem", count == 1 ? "" : "s");
+	dev_dbg(&nd_region->dev, "discovered %d namespace%s\n", count,
+		count == 1 ? "" : "s");
 
 	if (count == 0) {
 		struct nd_namespace_pmem *nspm;
@@ -2226,12 +2130,6 @@ static int init_active_labels(struct nd_region *nd_region)
 			if (!label_ent)
 				break;
 			label = nd_label_active(ndd, j);
-			if (test_bit(NDD_NOBLK, &nvdimm->flags)) {
-				u32 flags = nsl_get_flags(ndd, label);
-
-				flags &= ~NSLABEL_FLAG_LOCAL;
-				nsl_set_flags(ndd, label, flags);
-			}
 			label_ent->label = label;
 
 			mutex_lock(&nd_mapping->lock);
@@ -2275,7 +2173,6 @@ int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
 		devs = create_namespace_io(nd_region);
 		break;
 	case ND_DEVICE_NAMESPACE_PMEM:
-	case ND_DEVICE_NAMESPACE_BLK:
 		devs = create_namespaces(nd_region);
 		break;
 	default:
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index e4af0719cf33..17febf9ac911 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -82,30 +82,12 @@ static inline void nvdimm_security_overwrite_query(struct work_struct *work)
 }
 #endif
 
-/**
- * struct blk_alloc_info - tracking info for BLK dpa scanning
- * @nd_mapping: blk region mapping boundaries
- * @available: decremented in alias_dpa_busy as aliased PMEM is scanned
- * @busy: decremented in blk_dpa_busy to account for ranges already
- * 	  handled by alias_dpa_busy
- * @res: alias_dpa_busy interprets this a free space range that needs to
- * 	 be truncated to the valid BLK allocation starting DPA, blk_dpa_busy
- * 	 treats it as a busy range that needs the aliased PMEM ranges
- * 	 truncated.
- */
-struct blk_alloc_info {
-	struct nd_mapping *nd_mapping;
-	resource_size_t available, busy;
-	struct resource *res;
-};
-
 bool is_nvdimm(struct device *dev);
 bool is_nd_pmem(struct device *dev);
 bool is_nd_volatile(struct device *dev);
-bool is_nd_blk(struct device *dev);
 static inline bool is_nd_region(struct device *dev)
 {
-	return is_nd_pmem(dev) || is_nd_blk(dev) || is_nd_volatile(dev);
+	return is_nd_pmem(dev) || is_nd_volatile(dev);
 }
 static inline bool is_memory(struct device *dev)
 {
@@ -142,14 +124,12 @@ resource_size_t nd_pmem_max_contiguous_dpa(struct nd_region *nd_region,
 					   struct nd_mapping *nd_mapping);
 resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region);
 resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region,
-		struct nd_mapping *nd_mapping, resource_size_t *overlap);
-resource_size_t nd_blk_available_dpa(struct nd_region *nd_region);
+				      struct nd_mapping *nd_mapping);
 resource_size_t nd_region_available_dpa(struct nd_region *nd_region);
 int nd_region_conflict(struct nd_region *nd_region, resource_size_t start,
 		resource_size_t size);
 resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd,
 		struct nd_label_id *label_id);
-int alias_dpa_busy(struct device *dev, void *data);
 int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd);
 void get_ndd(struct nvdimm_drvdata *ndd);
 resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 8391bf2729bc..ec5219680092 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -295,9 +295,6 @@ static inline const u8 *nsl_uuid_raw(struct nvdimm_drvdata *ndd,
 	return nd_label->efi.uuid;
 }
 
-bool nsl_validate_blk_isetcookie(struct nvdimm_drvdata *ndd,
-				 struct nd_namespace_label *nd_label,
-				 u64 isetcookie);
 bool nsl_validate_type_guid(struct nvdimm_drvdata *ndd,
 			    struct nd_namespace_label *nd_label, guid_t *guid);
 enum nvdimm_claim_class nsl_get_claim_class(struct nvdimm_drvdata *ndd,
@@ -437,14 +434,6 @@ static inline bool nsl_validate_nlabel(struct nd_region *nd_region,
 	return nsl_get_nlabel(ndd, nd_label) == nd_region->ndr_mappings;
 }
 
-struct nd_blk_region {
-	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
-	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
-			void *iobuf, u64 len, int rw);
-	void *blk_provider_data;
-	struct nd_region nd_region;
-};
-
 /*
  * Lookup next in the repeating sequence of 01, 10, and 11.
  */
@@ -672,7 +661,6 @@ static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
 	return -ENXIO;
 }
 #endif
-int nd_blk_region_init(struct nd_region *nd_region);
 int nd_region_activate(struct nd_region *nd_region);
 static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
 		unsigned int len)
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index e0c34120df37..188560b1c110 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -15,6 +15,10 @@ static int nd_region_probe(struct device *dev)
 	static unsigned long once;
 	struct nd_region_data *ndrd;
 	struct nd_region *nd_region = to_nd_region(dev);
+	struct range range = {
+		.start = nd_region->ndr_start,
+		.end = nd_region->ndr_start + nd_region->ndr_size - 1,
+	};
 
 	if (nd_region->num_lanes > num_online_cpus()
 			&& nd_region->num_lanes < num_possible_cpus()
@@ -30,25 +34,13 @@ static int nd_region_probe(struct device *dev)
 	if (rc)
 		return rc;
 
-	rc = nd_blk_region_init(nd_region);
-	if (rc)
-		return rc;
-
-	if (is_memory(&nd_region->dev)) {
-		struct range range = {
-			.start = nd_region->ndr_start,
-			.end = nd_region->ndr_start + nd_region->ndr_size - 1,
-		};
-
-		if (devm_init_badblocks(dev, &nd_region->bb))
-			return -ENODEV;
-		nd_region->bb_state = sysfs_get_dirent(nd_region->dev.kobj.sd,
-						       "badblocks");
-		if (!nd_region->bb_state)
-			dev_warn(&nd_region->dev,
-					"'badblocks' notification disabled\n");
-		nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range);
-	}
+	if (devm_init_badblocks(dev, &nd_region->bb))
+		return -ENODEV;
+	nd_region->bb_state =
+		sysfs_get_dirent(nd_region->dev.kobj.sd, "badblocks");
+	if (!nd_region->bb_state)
+		dev_warn(dev, "'badblocks' notification disabled\n");
+	nvdimm_badblocks_populate(nd_region, &nd_region->bb, &range);
 
 	rc = nd_region_register_namespaces(nd_region, &err);
 	if (rc < 0)
@@ -158,4 +150,3 @@ void nd_region_exit(void)
 }
 
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM);
-MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 70ad891a76ba..0cb274c2b508 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -134,10 +134,7 @@ static void nd_region_release(struct device *dev)
 	}
 	free_percpu(nd_region->lane);
 	memregion_free(nd_region->id);
-	if (is_nd_blk(dev))
-		kfree(to_nd_blk_region(dev));
-	else
-		kfree(nd_region);
+	kfree(nd_region);
 }
 
 struct nd_region *to_nd_region(struct device *dev)
@@ -157,33 +154,12 @@ struct device *nd_region_dev(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nd_region_dev);
 
-struct nd_blk_region *to_nd_blk_region(struct device *dev)
-{
-	struct nd_region *nd_region = to_nd_region(dev);
-
-	WARN_ON(!is_nd_blk(dev));
-	return container_of(nd_region, struct nd_blk_region, nd_region);
-}
-EXPORT_SYMBOL_GPL(to_nd_blk_region);
-
 void *nd_region_provider_data(struct nd_region *nd_region)
 {
 	return nd_region->provider_data;
 }
 EXPORT_SYMBOL_GPL(nd_region_provider_data);
 
-void *nd_blk_region_provider_data(struct nd_blk_region *ndbr)
-{
-	return ndbr->blk_provider_data;
-}
-EXPORT_SYMBOL_GPL(nd_blk_region_provider_data);
-
-void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data)
-{
-	ndbr->blk_provider_data = data;
-}
-EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data);
-
 /**
  * nd_region_to_nstype() - region to an integer namespace type
  * @nd_region: region-device to interrogate
@@ -208,8 +184,6 @@ int nd_region_to_nstype(struct nd_region *nd_region)
 			return ND_DEVICE_NAMESPACE_PMEM;
 		else
 			return ND_DEVICE_NAMESPACE_IO;
-	} else if (is_nd_blk(&nd_region->dev)) {
-		return ND_DEVICE_NAMESPACE_BLK;
 	}
 
 	return 0;
@@ -332,14 +306,12 @@ static DEVICE_ATTR_RO(set_cookie);
 
 resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 {
-	resource_size_t blk_max_overlap = 0, available, overlap;
+	resource_size_t available;
 	int i;
 
 	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
 
- retry:
 	available = 0;
-	overlap = blk_max_overlap;
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 		struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
@@ -348,15 +320,7 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 		if (!ndd)
 			return 0;
 
-		if (is_memory(&nd_region->dev)) {
-			available += nd_pmem_available_dpa(nd_region,
-					nd_mapping, &overlap);
-			if (overlap > blk_max_overlap) {
-				blk_max_overlap = overlap;
-				goto retry;
-			}
-		} else if (is_nd_blk(&nd_region->dev))
-			available += nd_blk_available_dpa(nd_region);
+		available += nd_pmem_available_dpa(nd_region, nd_mapping);
 	}
 
 	return available;
@@ -364,26 +328,17 @@ resource_size_t nd_region_available_dpa(struct nd_region *nd_region)
 
 resource_size_t nd_region_allocatable_dpa(struct nd_region *nd_region)
 {
-	resource_size_t available = 0;
+	resource_size_t avail = 0;
 	int i;
 
-	if (is_memory(&nd_region->dev))
-		available = PHYS_ADDR_MAX;
-
 	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
 	for (i = 0; i < nd_region->ndr_mappings; i++) {
 		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
 
-		if (is_memory(&nd_region->dev))
-			available = min(available,
-					nd_pmem_max_contiguous_dpa(nd_region,
-								   nd_mapping));
-		else if (is_nd_blk(&nd_region->dev))
-			available += nd_blk_available_dpa(nd_region);
+		avail = min_not_zero(avail, nd_pmem_max_contiguous_dpa(
+						    nd_region, nd_mapping));
 	}
-	if (is_memory(&nd_region->dev))
-		return available * nd_region->ndr_mappings;
-	return available;
+	return avail * nd_region->ndr_mappings;
 }
 
 static ssize_t available_size_show(struct device *dev,
@@ -693,9 +648,8 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 			&& a != &dev_attr_available_size.attr)
 		return a->mode;
 
-	if ((type == ND_DEVICE_NAMESPACE_PMEM
-				|| type == ND_DEVICE_NAMESPACE_BLK)
-			&& a == &dev_attr_available_size.attr)
+	if (type == ND_DEVICE_NAMESPACE_PMEM &&
+	    a == &dev_attr_available_size.attr)
 		return a->mode;
 	else if (is_memory(dev) && nd_set)
 		return a->mode;
@@ -828,12 +782,6 @@ static const struct attribute_group *nd_region_attribute_groups[] = {
 	NULL,
 };
 
-static const struct device_type nd_blk_device_type = {
-	.name = "nd_blk",
-	.release = nd_region_release,
-	.groups = nd_region_attribute_groups,
-};
-
 static const struct device_type nd_pmem_device_type = {
 	.name = "nd_pmem",
 	.release = nd_region_release,
@@ -851,11 +799,6 @@ bool is_nd_pmem(struct device *dev)
 	return dev ? dev->type == &nd_pmem_device_type : false;
 }
 
-bool is_nd_blk(struct device *dev)
-{
-	return dev ? dev->type == &nd_blk_device_type : false;
-}
-
 bool is_nd_volatile(struct device *dev)
 {
 	return dev ? dev->type == &nd_volatile_device_type : false;
@@ -929,22 +872,6 @@ void nd_region_advance_seeds(struct nd_region *nd_region, struct device *dev)
 	nvdimm_bus_unlock(dev);
 }
 
-int nd_blk_region_init(struct nd_region *nd_region)
-{
-	struct device *dev = &nd_region->dev;
-	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
-
-	if (!is_nd_blk(dev))
-		return 0;
-
-	if (nd_region->ndr_mappings < 1) {
-		dev_dbg(dev, "invalid BLK region\n");
-		return -ENXIO;
-	}
-
-	return to_nd_blk_region(dev)->enable(nvdimm_bus, dev);
-}
-
 /**
  * nd_region_acquire_lane - allocate and lock a lane
  * @nd_region: region id and number of lanes possible
@@ -1007,24 +934,10 @@ EXPORT_SYMBOL(nd_region_release_lane);
 static unsigned long default_align(struct nd_region *nd_region)
 {
 	unsigned long align;
-	int i, mappings;
 	u32 remainder;
+	int mappings;
 
-	if (is_nd_blk(&nd_region->dev))
-		align = PAGE_SIZE;
-	else
-		align = MEMREMAP_COMPAT_ALIGN_MAX;
-
-	for (i = 0; i < nd_region->ndr_mappings; i++) {
-		struct nd_mapping *nd_mapping = &nd_region->mapping[i];
-		struct nvdimm *nvdimm = nd_mapping->nvdimm;
-
-		if (test_bit(NDD_ALIASING, &nvdimm->flags)) {
-			align = MEMREMAP_COMPAT_ALIGN_MAX;
-			break;
-		}
-	}
-
+	align = MEMREMAP_COMPAT_ALIGN_MAX;
 	if (nd_region->ndr_size < MEMREMAP_COMPAT_ALIGN_MAX)
 		align = PAGE_SIZE;
 
@@ -1042,7 +955,6 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 {
 	struct nd_region *nd_region;
 	struct device *dev;
-	void *region_buf;
 	unsigned int i;
 	int ro = 0;
 
@@ -1060,36 +972,13 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 		if (test_bit(NDD_UNARMED, &nvdimm->flags))
 			ro = 1;
 
-		if (test_bit(NDD_NOBLK, &nvdimm->flags)
-				&& dev_type == &nd_blk_device_type) {
-			dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not BLK capable\n",
-					caller, dev_name(&nvdimm->dev), i);
-			return NULL;
-		}
 	}
 
-	if (dev_type == &nd_blk_device_type) {
-		struct nd_blk_region_desc *ndbr_desc;
-		struct nd_blk_region *ndbr;
+	nd_region =
+		kzalloc(struct_size(nd_region, mapping, ndr_desc->num_mappings),
+			GFP_KERNEL);
 
-		ndbr_desc = to_blk_region_desc(ndr_desc);
-		ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping)
-				* ndr_desc->num_mappings,
-				GFP_KERNEL);
-		if (ndbr) {
-			nd_region = &ndbr->nd_region;
-			ndbr->enable = ndbr_desc->enable;
-			ndbr->do_io = ndbr_desc->do_io;
-		}
-		region_buf = ndbr;
-	} else {
-		nd_region = kzalloc(struct_size(nd_region, mapping,
-						ndr_desc->num_mappings),
-				    GFP_KERNEL);
-		region_buf = nd_region;
-	}
-
-	if (!region_buf)
+	if (!nd_region)
 		return NULL;
 	nd_region->id = memregion_alloc(GFP_KERNEL);
 	if (nd_region->id < 0)
@@ -1153,7 +1042,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
  err_percpu:
 	memregion_free(nd_region->id);
  err_id:
-	kfree(region_buf);
+	kfree(nd_region);
 	return NULL;
 }
 
@@ -1166,17 +1055,6 @@ struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
 }
 EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create);
 
-struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
-		struct nd_region_desc *ndr_desc)
-{
-	if (ndr_desc->num_mappings > 1)
-		return NULL;
-	ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES);
-	return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type,
-			__func__);
-}
-EXPORT_SYMBOL_GPL(nvdimm_blk_region_create);
-
 struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc)
 {
@@ -1201,7 +1079,7 @@ int nvdimm_flush(struct nd_region *nd_region, struct bio *bio)
 }
 /**
  * nvdimm_flush - flush any posted write queues between the cpu and pmem media
- * @nd_region: blk or interleaved pmem region
+ * @nd_region: interleaved pmem region
  */
 int generic_nvdimm_flush(struct nd_region *nd_region)
 {
@@ -1234,7 +1112,7 @@ EXPORT_SYMBOL_GPL(nvdimm_flush);
 
 /**
  * nvdimm_has_flush - determine write flushing requirements
- * @nd_region: blk or interleaved pmem region
+ * @nd_region: interleaved pmem region
  *
  * Returns 1 if writes require flushing
  * Returns 0 if writes do not require flushing
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 7074aa9af525..0d61e07b6827 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -25,8 +25,6 @@ struct badrange {
 };
 
 enum {
-	/* when a dimm supports both PMEM and BLK access a label is required */
-	NDD_ALIASING = 0,
 	/* unarmed memory devices may not persist writes */
 	NDD_UNARMED = 1,
 	/* locked memory devices should not be accessed */
@@ -35,8 +33,6 @@ enum {
 	NDD_SECURITY_OVERWRITE = 3,
 	/*  tracking whether or not there is a pending device reference */
 	NDD_WORK_PENDING = 4,
-	/* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */
-	NDD_NOBLK = 5,
 	/* dimm supports namespace labels */
 	NDD_LABELING = 6,
 
@@ -140,21 +136,6 @@ static inline void __iomem *devm_nvdimm_ioremap(struct device *dev,
 }
 
 struct nvdimm_bus;
-struct module;
-struct nd_blk_region;
-struct nd_blk_region_desc {
-	int (*enable)(struct nvdimm_bus *nvdimm_bus, struct device *dev);
-	int (*do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
-			void *iobuf, u64 len, int rw);
-	struct nd_region_desc ndr_desc;
-};
-
-static inline struct nd_blk_region_desc *to_blk_region_desc(
-		struct nd_region_desc *ndr_desc)
-{
-	return container_of(ndr_desc, struct nd_blk_region_desc, ndr_desc);
-
-}
 
 /*
  * Note that separate bits for locked + unlocked are defined so that
@@ -257,7 +238,6 @@ struct nvdimm_bus *nvdimm_to_bus(struct nvdimm *nvdimm);
 struct nvdimm *to_nvdimm(struct device *dev);
 struct nd_region *to_nd_region(struct device *dev);
 struct device *nd_region_dev(struct nd_region *nd_region);
-struct nd_blk_region *to_nd_blk_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 struct device *to_nvdimm_bus_dev(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
@@ -295,10 +275,6 @@ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
 struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 		struct nd_region_desc *ndr_desc);
 void *nd_region_provider_data(struct nd_region *nd_region);
-void *nd_blk_region_provider_data(struct nd_blk_region *ndbr);
-void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data);
-struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr);
-unsigned long nd_blk_memremap_flags(struct nd_blk_region *ndbr);
 unsigned int nd_region_acquire_lane(struct nd_region *nd_region);
 void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane);
 u64 nd_fletcher64(void *addr, size_t len, bool le);
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 8cf1e4884fd5..17e02b64ea2e 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -189,7 +189,6 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 #define ND_DEVICE_REGION_BLK 3      /* nd_region: (parent of BLK namespaces) */
 #define ND_DEVICE_NAMESPACE_IO 4    /* legacy persistent memory */
 #define ND_DEVICE_NAMESPACE_PMEM 5  /* PMEM namespace (may alias with BLK) */
-#define ND_DEVICE_NAMESPACE_BLK 6   /* BLK namespace (may alias with PMEM) */
 #define ND_DEVICE_DAX_PMEM 7        /* Device DAX interface to pmem */
 
 enum nd_driver_flags {
@@ -198,7 +197,6 @@ enum nd_driver_flags {
 	ND_DRIVER_REGION_BLK      = 1 << ND_DEVICE_REGION_BLK,
 	ND_DRIVER_NAMESPACE_IO    = 1 << ND_DEVICE_NAMESPACE_IO,
 	ND_DRIVER_NAMESPACE_PMEM  = 1 << ND_DEVICE_NAMESPACE_PMEM,
-	ND_DRIVER_NAMESPACE_BLK   = 1 << ND_DEVICE_NAMESPACE_BLK,
 	ND_DRIVER_DAX_PMEM	  = 1 << ND_DEVICE_DAX_PMEM,
 };
 
diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index 3ca7c32e9362..4d1a947367f9 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -338,62 +338,6 @@ static int ndtest_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	return 0;
 }
 
-static int ndtest_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-		void *iobuf, u64 len, int rw)
-{
-	struct ndtest_dimm *dimm = ndbr->blk_provider_data;
-	struct ndtest_blk_mmio *mmio = dimm->mmio;
-	struct nd_region *nd_region = &ndbr->nd_region;
-	unsigned int lane;
-
-	if (!mmio)
-		return -ENOMEM;
-
-	lane = nd_region_acquire_lane(nd_region);
-	if (rw)
-		memcpy(mmio->base + dpa, iobuf, len);
-	else {
-		memcpy(iobuf, mmio->base + dpa, len);
-		arch_invalidate_pmem(mmio->base + dpa, len);
-	}
-
-	nd_region_release_lane(nd_region, lane);
-
-	return 0;
-}
-
-static int ndtest_blk_region_enable(struct nvdimm_bus *nvdimm_bus,
-				    struct device *dev)
-{
-	struct nd_blk_region *ndbr = to_nd_blk_region(dev);
-	struct nvdimm *nvdimm;
-	struct ndtest_dimm *dimm;
-	struct ndtest_blk_mmio *mmio;
-
-	nvdimm = nd_blk_region_to_dimm(ndbr);
-	dimm = nvdimm_provider_data(nvdimm);
-
-	nd_blk_region_set_provider_data(ndbr, dimm);
-	dimm->blk_region = to_nd_region(dev);
-
-	mmio = devm_kzalloc(dev, sizeof(struct ndtest_blk_mmio), GFP_KERNEL);
-	if (!mmio)
-		return -ENOMEM;
-
-	mmio->base = (void __iomem *) devm_nvdimm_memremap(
-		dev, dimm->address, 12, nd_blk_memremap_flags(ndbr));
-	if (!mmio->base) {
-		dev_err(dev, "%s failed to map blk dimm\n", nvdimm_name(nvdimm));
-		return -ENOMEM;
-	}
-	mmio->size = dimm->size;
-	mmio->base_offset = 0;
-
-	dimm->mmio = mmio;
-
-	return 0;
-}
-
 static struct nfit_test_resource *ndtest_resource_lookup(resource_size_t addr)
 {
 	int i;
@@ -523,17 +467,16 @@ static int ndtest_create_region(struct ndtest_priv *p,
 				struct ndtest_region *region)
 {
 	struct nd_mapping_desc mappings[NDTEST_MAX_MAPPING];
-	struct nd_blk_region_desc ndbr_desc;
+	struct nd_region_desc *ndr_desc, _ndr_desc;
 	struct nd_interleave_set *nd_set;
-	struct nd_region_desc *ndr_desc;
 	struct resource res;
 	int i, ndimm = region->mapping[0].dimm;
 	u64 uuid[2];
 
 	memset(&res, 0, sizeof(res));
 	memset(&mappings, 0, sizeof(mappings));
-	memset(&ndbr_desc, 0, sizeof(ndbr_desc));
-	ndr_desc = &ndbr_desc.ndr_desc;
+	memset(&_ndr_desc, 0, sizeof(_ndr_desc));
+	ndr_desc = &_ndr_desc;
 
 	if (!ndtest_alloc_resource(p, region->size, &res.start))
 		return -ENOMEM;
@@ -857,10 +800,8 @@ static int ndtest_dimm_register(struct ndtest_priv *priv,
 	struct device *dev = &priv->pdev.dev;
 	unsigned long dimm_flags = dimm->flags;
 
-	if (dimm->num_formats > 1) {
-		set_bit(NDD_ALIASING, &dimm_flags);
+	if (dimm->num_formats > 1)
 		set_bit(NDD_LABELING, &dimm_flags);
-	}
 
 	if (dimm->flags & PAPR_PMEM_UNARMED_MASK)
 		set_bit(NDD_UNARMED, &dimm_flags);

From cb8fac6d2727f79f211e745b16c9abbf4d8be652 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov <khoroshilov@ispras.ru>
Date: Tue, 15 Feb 2022 13:17:04 +0300
Subject: [PATCH 118/229] NFS: remove unneeded check in
 decode_devicenotify_args()

[You don't often get email from khoroshilov@ispras.ru. Learn why this is important at http://aka.ms/LearnAboutSenderIdentification.]

Overflow check in not needed anymore after we switch to kmalloc_array().

Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Fixes: a4f743a6bb20 ("NFSv4.1: Convert open-coded array allocation calls to kmalloc_array()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/callback_xdr.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f90de8043b0f..8dcb08e1a885 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -271,10 +271,6 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
 	n = ntohl(*p++);
 	if (n == 0)
 		goto out;
-	if (n > ULONG_MAX / sizeof(*args->devs)) {
-		status = htonl(NFS4ERR_BADXDR);
-		goto out;
-	}
 
 	args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
 	if (!args->devs) {

From b4be2c598b767eb72507e4dc56d75c3fe2231cee Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 15 Feb 2022 13:26:41 -0500
Subject: [PATCH 119/229] NFSv4.1 restrict GETATTR fs_location query to the
 main transport

In the presence of trunking transports, it's helpful to make sure
that during the migration event, the GETATTR for fs_location attribute
happens on the main transport.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8b875355824b..fd8eece12e94 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8008,6 +8008,18 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
 		.rpc_resp	= &res,
 		.rpc_cred	= cred,
 	};
+	struct nfs4_call_sync_data data = {
+		.seq_server = server,
+		.seq_args = &args.seq_args,
+		.seq_res = &res.seq_res,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clnt,
+		.rpc_message = &msg,
+		.callback_ops = server->nfs_client->cl_mvops->call_sync_ops,
+		.callback_data = &data,
+		.flags = RPC_TASK_NO_ROUND_ROBIN,
+	};
 	int status;
 
 	nfs_fattr_init(&locations->fattr);
@@ -8015,8 +8027,7 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
 	locations->nlocations = 0;
 
 	nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 1);
-	status = nfs4_call_sync_sequence(clnt, server, &msg,
-					&args.seq_args, &res.seq_res);
+	status = nfs4_call_sync_custom(&task_setup_data);
 	if (status == NFS4_OK &&
 	    res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
 		status = -NFS4ERR_LEASE_MOVED;

From 45f3a70ba68e1fc7fe0edde731b08d85435da30d Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Tue, 1 Mar 2022 14:37:24 -0500
Subject: [PATCH 120/229] NFS: Cleanup usage of nfs_inode in fscache interface

A number of places in the fscache interface used nfs_inode when inode could
be used, simplifying the code.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fscache.c | 10 ++++------
 fs/nfs/fscache.h | 18 +++++++++---------
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index cfe901650ab0..81bd2770e640 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -173,7 +173,7 @@ void nfs_fscache_init_inode(struct inode *inode)
 	if (!(nfss->fscache && S_ISREG(inode->i_mode)))
 		return;
 
-	nfs_fscache_update_auxdata(&auxdata, nfsi);
+	nfs_fscache_update_auxdata(&auxdata, inode);
 
 	nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
 					       0,
@@ -181,7 +181,7 @@ void nfs_fscache_init_inode(struct inode *inode)
 					       nfsi->fh.size,
 					       &auxdata,      /* aux_data */
 					       sizeof(auxdata),
-					       i_size_read(&nfsi->vfs_inode));
+					       i_size_read(inode));
 }
 
 /*
@@ -220,7 +220,6 @@ void nfs_fscache_clear_inode(struct inode *inode)
 void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 	bool open_for_write = inode_is_open_for_write(inode);
 
@@ -230,7 +229,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 	fscache_use_cookie(cookie, open_for_write);
 	if (open_for_write) {
 		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
-		nfs_fscache_update_auxdata(&auxdata, nfsi);
+		nfs_fscache_update_auxdata(&auxdata, inode);
 		fscache_invalidate(cookie, &auxdata, i_size_read(inode),
 				   FSCACHE_INVAL_DIO_WRITE);
 	}
@@ -240,11 +239,10 @@ EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
 void nfs_fscache_release_file(struct inode *inode, struct file *filp)
 {
 	struct nfs_fscache_inode_auxdata auxdata;
-	struct nfs_inode *nfsi = NFS_I(inode);
 	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
 	if (fscache_cookie_valid(cookie)) {
-		nfs_fscache_update_auxdata(&auxdata, nfsi);
+		nfs_fscache_update_auxdata(&auxdata, inode);
 		fscache_unuse_cookie(cookie, &auxdata, NULL);
 	}
 }
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 25a5c0f82392..4c7afaabbf9f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -86,16 +86,16 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
 }
 
 static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
-					      struct nfs_inode *nfsi)
+					      struct inode *inode)
 {
 	memset(auxdata, 0, sizeof(*auxdata));
-	auxdata->mtime_sec  = nfsi->vfs_inode.i_mtime.tv_sec;
-	auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
-	auxdata->ctime_sec  = nfsi->vfs_inode.i_ctime.tv_sec;
-	auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+	auxdata->mtime_sec  = inode->i_mtime.tv_sec;
+	auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
+	auxdata->ctime_sec  = inode->i_ctime.tv_sec;
+	auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
 
-	if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
-		auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+	if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
+		auxdata->change_attr = inode_peek_iversion_raw(inode);
 }
 
 /*
@@ -107,9 +107,9 @@ static inline void nfs_fscache_invalidate(struct inode *inode, int flags)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	if (nfsi->fscache) {
-		nfs_fscache_update_auxdata(&auxdata, nfsi);
+		nfs_fscache_update_auxdata(&auxdata, inode);
 		fscache_invalidate(nfsi->fscache, &auxdata,
-				   i_size_read(&nfsi->vfs_inode), flags);
+				   i_size_read(inode), flags);
 	}
 }
 

From fc1c5abfca7e1059df46623e64aecf840cdbb9dc Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Tue, 1 Mar 2022 14:37:25 -0500
Subject: [PATCH 121/229] NFS: Rename fscache read and write pages functions

Rename NFS fscache functions in a more consistent fashion
to better reflect when we read from and write to fscache.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fscache.c |  6 +++---
 fs/nfs/fscache.h | 27 ++++++++++-----------------
 fs/nfs/read.c    |  4 ++--
 3 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 81bd2770e640..62fbce28fe85 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -317,7 +317,7 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page,
 /*
  * Retrieve a page from fscache
  */
-int __nfs_readpage_from_fscache(struct inode *inode, struct page *page)
+int __nfs_fscache_read_page(struct inode *inode, struct page *page)
 {
 	int ret;
 
@@ -351,7 +351,7 @@ int __nfs_readpage_from_fscache(struct inode *inode, struct page *page)
  * Store a newly fetched page in fscache.  We can be certain there's no page
  * stored in the cache as yet otherwise we would've read it from there.
  */
-void __nfs_readpage_to_fscache(struct inode *inode, struct page *page)
+void __nfs_fscache_write_page(struct inode *inode, struct page *page)
 {
 	int ret;
 
@@ -362,7 +362,7 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page)
 	ret = fscache_fallback_write_page(inode, page, true);
 
 	dfprintk(FSCACHE,
-		 "NFS:     readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
+		 "NFS:     nfs_fscache_write_page: p:%p(i:%lu f:%lx) ret %d\n",
 		 page, page->index, page->flags, ret);
 
 	if (ret != 0) {
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4c7afaabbf9f..4e980cc04779 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -45,10 +45,8 @@ extern void nfs_fscache_clear_inode(struct inode *);
 extern void nfs_fscache_open_file(struct inode *, struct file *);
 extern void nfs_fscache_release_file(struct inode *, struct file *);
 
-extern int __nfs_readpage_from_fscache(struct inode *, struct page *);
-extern void __nfs_read_completion_to_fscache(struct nfs_pgio_header *hdr,
-					     unsigned long bytes);
-extern void __nfs_readpage_to_fscache(struct inode *, struct page *);
+extern int __nfs_fscache_read_page(struct inode *, struct page *);
+extern void __nfs_fscache_write_page(struct inode *, struct page *);
 
 static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
@@ -66,11 +64,10 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 /*
  * Retrieve a page from an inode data storage object.
  */
-static inline int nfs_readpage_from_fscache(struct inode *inode,
-					    struct page *page)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
 {
-	if (NFS_I(inode)->fscache)
-		return __nfs_readpage_from_fscache(inode, page);
+	if (nfs_i_fscache(inode))
+		return __nfs_fscache_read_page(inode, page);
 	return -ENOBUFS;
 }
 
@@ -78,11 +75,11 @@ static inline int nfs_readpage_from_fscache(struct inode *inode,
  * Store a page newly fetched from the server in an inode data storage object
  * in the cache.
  */
-static inline void nfs_readpage_to_fscache(struct inode *inode,
+static inline void nfs_fscache_write_page(struct inode *inode,
 					   struct page *page)
 {
-	if (NFS_I(inode)->fscache)
-		__nfs_readpage_to_fscache(inode, page);
+	if (nfs_i_fscache(inode))
+		__nfs_fscache_write_page(inode, page);
 }
 
 static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
@@ -136,15 +133,11 @@ static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
 {
 	return 1; /* True: may release page */
 }
-static inline int nfs_readpage_from_fscache(struct inode *inode,
-					    struct page *page)
+static inline int nfs_fscache_read_page(struct inode *inode, struct page *page)
 {
 	return -ENOBUFS;
 }
-static inline void nfs_readpage_to_fscache(struct inode *inode,
-					   struct page *page) {}
-
-
+static inline void nfs_fscache_write_page(struct inode *inode, struct page *page) {}
 static inline void nfs_fscache_invalidate(struct inode *inode, int flags) {}
 
 static inline const char *nfs_server_fscache_state(struct nfs_server *server)
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2472f962a9a2..e4c1a49b0126 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -123,7 +123,7 @@ static void nfs_readpage_release(struct nfs_page *req, int error)
 		struct address_space *mapping = page_file_mapping(page);
 
 		if (PageUptodate(page))
-			nfs_readpage_to_fscache(inode, page);
+			nfs_fscache_write_page(inode, page);
 		else if (!PageError(page) && !PagePrivate(page))
 			generic_error_remove_page(mapping, page);
 		unlock_page(page);
@@ -305,7 +305,7 @@ readpage_async_filler(struct nfs_readdesc *desc, struct page *page)
 	aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE);
 
 	if (!IS_SYNC(page->mapping->host)) {
-		error = nfs_readpage_from_fscache(page->mapping->host, page);
+		error = nfs_fscache_read_page(page->mapping->host, page);
 		if (error == 0)
 			goto out_unlock;
 	}

From e3f0a7fe698ff0d3ef428f72ba253fd1f377c193 Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Tue, 1 Mar 2022 14:37:26 -0500
Subject: [PATCH 122/229] NFS: Replace dfprintks with tracepoints in fscache
 read and write page functions

Most of fscache and other NFS IO paths are now using tracepoints.
Remove the dfprintks in the NFS fscache read/write page functions
and replace with tracepoints at the begin and end of the functions.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fscache.c  | 29 ++++++---------
 fs/nfs/nfstrace.h | 91 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 62fbce28fe85..841b69aef189 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -19,6 +19,7 @@
 #include "internal.h"
 #include "iostat.h"
 #include "fscache.h"
+#include "nfstrace.h"
 
 #define NFSDBG_FACILITY		NFSDBG_FSCACHE
 
@@ -321,30 +322,27 @@ int __nfs_fscache_read_page(struct inode *inode, struct page *page)
 {
 	int ret;
 
-	dfprintk(FSCACHE,
-		 "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
-		 nfs_i_fscache(inode), page, page->index, page->flags, inode);
-
+	trace_nfs_fscache_read_page(inode, page);
 	if (PageChecked(page)) {
-		dfprintk(FSCACHE, "NFS:    readpage_from_fscache: PageChecked\n");
 		ClearPageChecked(page);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	ret = fscache_fallback_read_page(inode, page);
 	if (ret < 0) {
 		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL);
-		dfprintk(FSCACHE,
-			 "NFS:    readpage_from_fscache failed %d\n", ret);
 		SetPageChecked(page);
-		return ret;
+		goto out;
 	}
 
 	/* Read completed synchronously */
-	dfprintk(FSCACHE, "NFS:    readpage_from_fscache: read successful\n");
 	nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK);
 	SetPageUptodate(page);
-	return 0;
+	ret = 0;
+out:
+	trace_nfs_fscache_read_page_exit(inode, page, ret);
+	return ret;
 }
 
 /*
@@ -355,20 +353,15 @@ void __nfs_fscache_write_page(struct inode *inode, struct page *page)
 {
 	int ret;
 
-	dfprintk(FSCACHE,
-		 "NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx))\n",
-		 nfs_i_fscache(inode), page, page->index, page->flags);
+	trace_nfs_fscache_write_page(inode, page);
 
 	ret = fscache_fallback_write_page(inode, page, true);
 
-	dfprintk(FSCACHE,
-		 "NFS:     nfs_fscache_write_page: p:%p(i:%lu f:%lx) ret %d\n",
-		 page, page->index, page->flags, ret);
-
 	if (ret != 0) {
 		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL);
 		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED);
 	} else {
 		nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_WRITTEN_OK);
 	}
+	trace_nfs_fscache_write_page_exit(inode, page, ret);
 }
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f4ca803fd0..012bd7339862 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -1215,6 +1215,97 @@ TRACE_EVENT(nfs_readpage_short,
 		)
 );
 
+DECLARE_EVENT_CLASS(nfs_fscache_page_event,
+		TP_PROTO(
+			const struct inode *inode,
+			struct page *page
+		),
+
+		TP_ARGS(inode, page),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+		),
+
+		TP_fast_assign(
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			const struct nfs_fh *fh = &nfsi->fh;
+
+			__entry->offset = page_index(page) << PAGE_SHIFT;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(fh);
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(long long)__entry->offset
+		)
+);
+DECLARE_EVENT_CLASS(nfs_fscache_page_event_done,
+		TP_PROTO(
+			const struct inode *inode,
+			struct page *page,
+			int error
+		),
+
+		TP_ARGS(inode, page, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(loff_t, offset)
+		),
+
+		TP_fast_assign(
+			const struct nfs_inode *nfsi = NFS_I(inode);
+			const struct nfs_fh *fh = &nfsi->fh;
+
+			__entry->offset = page_index(page) << PAGE_SHIFT;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = nfsi->fileid;
+			__entry->fhandle = nfs_fhandle_hash(fh);
+			__entry->error = error;
+		),
+
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"offset=%lld error=%d",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			(long long)__entry->offset, __entry->error
+		)
+);
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT(name) \
+	DEFINE_EVENT(nfs_fscache_page_event, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				struct page *page \
+			), \
+			TP_ARGS(inode, page))
+#define DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(name) \
+	DEFINE_EVENT(nfs_fscache_page_event_done, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				struct page *page, \
+				int error \
+			), \
+			TP_ARGS(inode, page, error))
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_read_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_read_page_exit);
+DEFINE_NFS_FSCACHE_PAGE_EVENT(nfs_fscache_write_page);
+DEFINE_NFS_FSCACHE_PAGE_EVENT_DONE(nfs_fscache_write_page_exit);
+
 TRACE_EVENT(nfs_pgio_error,
 	TP_PROTO(
 		const struct nfs_pgio_header *hdr,

From b5fdf66f6eb2560784c6f60131dc567de06267dc Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Tue, 1 Mar 2022 14:37:27 -0500
Subject: [PATCH 123/229] NFS: Remove remaining dfprintks related to fscache
 and remove NFSDBG_FSCACHE

The fscache cookie APIs including fscache_acquire_cookie() and
fscache_relinquish_cookie() now have very good tracing.  Thus,
there is no real need for dfprintks in the NFS fscache interface.

The NFS fscache interface has removed all dfprintks so remove the
NFSDBG_FSCACHE defines.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fscache.c            | 10 ----------
 include/uapi/linux/nfs_fs.h |  2 +-
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 841b69aef189..4dee53ceb941 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -21,8 +21,6 @@
 #include "fscache.h"
 #include "nfstrace.h"
 
-#define NFSDBG_FACILITY		NFSDBG_FSCACHE
-
 #define NFS_MAX_KEY_LEN 1000
 
 static bool nfs_append_int(char *key, int *_len, unsigned long long x)
@@ -129,8 +127,6 @@ int nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int u
 	vcookie = fscache_acquire_volume(key,
 					 NULL, /* preferred_cache */
 					 NULL, 0 /* coherency_data */);
-	dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
-		 nfss, vcookie);
 	if (IS_ERR(vcookie)) {
 		if (vcookie != ERR_PTR(-EBUSY)) {
 			kfree(key);
@@ -153,9 +149,6 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
 {
 	struct nfs_server *nfss = NFS_SB(sb);
 
-	dfprintk(FSCACHE, "NFS: releasing superblock cookie (0x%p/0x%p)\n",
-		 nfss, nfss->fscache);
-
 	fscache_relinquish_volume(nfss->fscache, NULL, false);
 	nfss->fscache = NULL;
 	kfree(nfss->fscache_uniq);
@@ -193,8 +186,6 @@ void nfs_fscache_clear_inode(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct fscache_cookie *cookie = nfs_i_fscache(inode);
 
-	dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
-
 	fscache_relinquish_cookie(cookie, false);
 	nfsi->fscache = NULL;
 }
@@ -229,7 +220,6 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
 
 	fscache_use_cookie(cookie, open_for_write);
 	if (open_for_write) {
-		dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
 		nfs_fscache_update_auxdata(&auxdata, inode);
 		fscache_invalidate(cookie, &auxdata, i_size_read(inode),
 				   FSCACHE_INVAL_DIO_WRITE);
diff --git a/include/uapi/linux/nfs_fs.h b/include/uapi/linux/nfs_fs.h
index 3afe3767c55d..ae0de165c014 100644
--- a/include/uapi/linux/nfs_fs.h
+++ b/include/uapi/linux/nfs_fs.h
@@ -52,7 +52,7 @@
 #define NFSDBG_CALLBACK		0x0100
 #define NFSDBG_CLIENT		0x0200
 #define NFSDBG_MOUNT		0x0400
-#define NFSDBG_FSCACHE		0x0800
+#define NFSDBG_FSCACHE		0x0800 /* unused */
 #define NFSDBG_PNFS		0x1000
 #define NFSDBG_PNFS_LD		0x2000
 #define NFSDBG_STATE		0x4000

From 944d95f766c6fe97fa358c661281a741758cee7e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 124/229] NFS: remove IS_SWAPFILE hack

This code is pointless as IS_SWAPFILE is always defined.
So remove it.

Suggested-by: Mark Hemment <markhemm@googlemail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/file.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 4d681683d13c..93c01aaa0a8d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -44,11 +44,6 @@
 
 static const struct vm_operations_struct nfs_file_vm_ops;
 
-/* Hack for future NFS swap support */
-#ifndef IS_SWAPFILE
-# define IS_SWAPFILE(inode)	(0)
-#endif
-
 int nfs_check_flags(int flags)
 {
 	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))

From c487216bec83b0c5a8803e5c61433d33ad7b104d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 125/229] SUNRPC/call_alloc: async tasks mustn't block waiting
 for memory

When memory is short, new worker threads cannot be created and we depend
on the minimum one rpciod thread to be able to handle everything.
So it must not block waiting for memory.

mempools are particularly a problem as memory can only be released back
to the mempool by an async rpc task running.  If all available
workqueue threads are waiting on the mempool, no thread is available to
return anything.

rpc_malloc() can block, and this might cause deadlocks.
So check RPC_IS_ASYNC(), rather than RPC_IS_SWAPPER() to determine if
blocking is acceptable.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/sched.c              | 4 +++-
 net/sunrpc/xprtrdma/transport.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 52769b883c0a..e5b07562ba45 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1023,8 +1023,10 @@ int rpc_malloc(struct rpc_task *task)
 	struct rpc_buffer *buf;
 	gfp_t gfp = GFP_KERNEL;
 
+	if (RPC_IS_ASYNC(task))
+		gfp = GFP_NOWAIT | __GFP_NOWARN;
 	if (RPC_IS_SWAPPER(task))
-		gfp = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+		gfp |= __GFP_MEMALLOC;
 
 	size += sizeof(struct rpc_buffer);
 	if (size <= RPC_BUFFER_MAXSIZE)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 42e375dbdadb..5714bf880e95 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -570,8 +570,10 @@ xprt_rdma_allocate(struct rpc_task *task)
 	gfp_t flags;
 
 	flags = RPCRDMA_DEF_GFP;
+	if (RPC_IS_ASYNC(task))
+		flags = GFP_NOWAIT | __GFP_NOWARN;
 	if (RPC_IS_SWAPPER(task))
-		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
+		flags |= __GFP_MEMALLOC;
 
 	if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
 				  flags))

From a41b05edfedb939440e83666f23de3ef9af33acf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 126/229] SUNRPC/auth: async tasks mustn't block waiting for
 memory

When memory is short, new worker threads cannot be created and we depend
on the minimum one rpciod thread to be able to handle everything.  So it
must not block waiting for memory.

mempools are particularly a problem as memory can only be released back
to the mempool by an async rpc task running.  If all available workqueue
threads are waiting on the mempool, no thread is available to return
anything.

lookup_cred() can block on a mempool or kmalloc - and this can cause
deadlocks.  So add a new RPCAUTH_LOOKUP flag for async lookups and don't
block on memory.  If the -ENOMEM gets back to call_refreshresult(), wait
a short while and try again.  HZ>>4 is chosen as it is used elsewhere
for -ENOMEM retries.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/auth.h    | 1 +
 net/sunrpc/auth.c              | 6 +++++-
 net/sunrpc/auth_gss/auth_gss.c | 6 +++++-
 net/sunrpc/auth_unix.c         | 8 +++++++-
 net/sunrpc/clnt.c              | 3 +++
 5 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 98da816b5fc2..3e6ce288a7fc 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -99,6 +99,7 @@ struct rpc_auth_create_args {
 
 /* Flags for rpcauth_lookupcred() */
 #define RPCAUTH_LOOKUP_NEW		0x01	/* Accept an uninitialised cred */
+#define RPCAUTH_LOOKUP_ASYNC		0x02	/* Don't block waiting for memory */
 
 /*
  * Client authentication ops
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a9f0d17fdb0d..6bfa19f9fa6a 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -615,6 +615,8 @@ rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags)
 	};
 	struct rpc_cred *ret;
 
+	if (RPC_IS_ASYNC(task))
+		lookupflags |= RPCAUTH_LOOKUP_ASYNC;
 	ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 	put_cred(acred.cred);
 	return ret;
@@ -631,6 +633,8 @@ rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags)
 
 	if (!acred.principal)
 		return NULL;
+	if (RPC_IS_ASYNC(task))
+		lookupflags |= RPCAUTH_LOOKUP_ASYNC;
 	return auth->au_ops->lookup_cred(auth, &acred, lookupflags);
 }
 
@@ -654,7 +658,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
 	};
 
 	if (flags & RPC_TASK_ASYNC)
-		lookupflags |= RPCAUTH_LOOKUP_NEW;
+		lookupflags |= RPCAUTH_LOOKUP_NEW | RPCAUTH_LOOKUP_ASYNC;
 	if (task->tk_op_cred)
 		/* Task must use exactly this rpc_cred */
 		new = get_rpccred(task->tk_op_cred);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index affd64a54f02..ac0828108204 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1341,7 +1341,11 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
 static struct rpc_cred *
 gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	return rpcauth_lookup_credcache(auth, acred, flags, GFP_KERNEL);
+	gfp_t gfp = GFP_KERNEL;
+
+	if (flags & RPCAUTH_LOOKUP_ASYNC)
+		gfp = GFP_NOWAIT | __GFP_NOWARN;
+	return rpcauth_lookup_credcache(auth, acred, flags, gfp);
 }
 
 static struct rpc_cred *
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 3600d8641644..c629d366030e 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -43,8 +43,14 @@ unx_destroy(struct rpc_auth *auth)
 static struct rpc_cred *
 unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 {
-	struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_KERNEL);
+	gfp_t gfp = GFP_KERNEL;
+	struct rpc_cred *ret;
 
+	if (flags & RPCAUTH_LOOKUP_ASYNC)
+		gfp = GFP_NOWAIT | __GFP_NOWARN;
+	ret = mempool_alloc(unix_pool, gfp);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
 	rpcauth_init_cred(ret, acred, auth, &unix_credops);
 	ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
 	return ret;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 97165a545cb3..9556eb7b065b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1745,6 +1745,9 @@ call_refreshresult(struct rpc_task *task)
 		task->tk_cred_retry--;
 		trace_rpc_retry_refresh_status(task);
 		return;
+	case -ENOMEM:
+		rpc_delay(task, HZ >> 4);
+		return;
 	}
 	trace_rpc_refresh_status(task);
 	rpc_call_rpcerror(task, status);

From a721035477fb5fb8abc738fbe410b07c12af3dc5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 127/229] SUNRPC/xprt: async tasks mustn't block waiting for
 memory

When memory is short, new worker threads cannot be created and we depend
on the minimum one rpciod thread to be able to handle everything.  So it
must not block waiting for memory.

xprt_dynamic_alloc_slot can block indefinitely.  This can tie up all
workqueue threads and NFS can deadlock.  So when called from a
workqueue, set __GFP_NORETRY.

The rdma alloc_slot already does not block.  However it sets the error
to -EAGAIN suggesting this will trigger a sleep.  It does not.  As we
can see in call_reserveresult(), only -ENOMEM causes a sleep.  -EAGAIN
causes immediate retry.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprt.c               | 5 ++++-
 net/sunrpc/xprtrdma/transport.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9f0025e0742c..2d1f84aea516 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1687,12 +1687,15 @@ out:
 static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
 {
 	struct rpc_rqst *req = ERR_PTR(-EAGAIN);
+	gfp_t gfp_mask = GFP_KERNEL;
 
 	if (xprt->num_reqs >= xprt->max_reqs)
 		goto out;
 	++xprt->num_reqs;
 	spin_unlock(&xprt->reserve_lock);
-	req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
+	if (current->flags & PF_WQ_WORKER)
+		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
+	req = kzalloc(sizeof(*req), gfp_mask);
 	spin_lock(&xprt->reserve_lock);
 	if (req != NULL)
 		goto out;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 5714bf880e95..923e4b512ee9 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -517,7 +517,7 @@ xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
 	return;
 
 out_sleep:
-	task->tk_status = -EAGAIN;
+	task->tk_status = -ENOMEM;
 	xprt_add_backlog(xprt, task);
 }
 

From a80a8461868905823609be97f91776a26befe839 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 128/229] SUNRPC: remove scheduling boost for "SWAPPER" tasks.

Currently, tasks marked as "swapper" tasks get put to the front of
non-priority rpc_queues, and are sorted earlier than non-swapper tasks on
the transport's ->xmit_queue.

This is pointless as currently *all* tasks for a mount that has swap
enabled on *any* file are marked as "swapper" tasks.  So the net result
is that the non-priority rpc_queues are reverse-ordered (LIFO).

This scheduling boost is not necessary to avoid deadlocks, and hurts
fairness, so remove it.  If there were a need to expedite some requests,
the tk_priority mechanism is a more appropriate tool.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/sched.c |  7 -------
 net/sunrpc/xprt.c  | 11 -----------
 2 files changed, 18 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index e5b07562ba45..690bd3401820 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -186,11 +186,6 @@ static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue,
 
 /*
  * Add new request to wait queue.
- *
- * Swapper tasks always get inserted at the head of the queue.
- * This should avoid many nasty memory deadlocks and hopefully
- * improve overall performance.
- * Everyone else gets appended to the queue to ensure proper FIFO behavior.
  */
 static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
 		struct rpc_task *task,
@@ -199,8 +194,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
 	INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
 	if (RPC_IS_PRIORITY(queue))
 		__rpc_add_wait_queue_priority(queue, task, queue_priority);
-	else if (RPC_IS_SWAPPER(task))
-		list_add(&task->u.tk_wait.list, &queue->tasks[0]);
 	else
 		list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]);
 	task->tk_waitqueue = queue;
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2d1f84aea516..2f165634df54 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1354,17 +1354,6 @@ xprt_request_enqueue_transmit(struct rpc_task *task)
 				INIT_LIST_HEAD(&req->rq_xmit2);
 				goto out;
 			}
-		} else if (RPC_IS_SWAPPER(task)) {
-			list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
-				if (pos->rq_cong || pos->rq_bytes_sent)
-					continue;
-				if (RPC_IS_SWAPPER(pos->rq_task))
-					continue;
-				/* Note: req is added _before_ pos */
-				list_add_tail(&req->rq_xmit, &pos->rq_xmit);
-				INIT_LIST_HEAD(&req->rq_xmit2);
-				goto out;
-			}
 		} else if (!req->rq_seqno) {
 			list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) {
 				if (pos->rq_task->tk_owner != task->tk_owner)

From 89c2be8a951654758dffeaaa6272328d9c8f29be Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 129/229] NFS: discard NFS_RPC_SWAPFLAGS and RPC_TASK_ROOTCREDS

NFS_RPC_SWAPFLAGS is only used for READ requests.
It sets RPC_TASK_SWAPPER which gives some memory-allocation priority to
requests.  This is not needed for swap READ - though it is for writes
where it is set via a different mechanism.

RPC_TASK_ROOTCREDS causes the 'machine' credential to be used.
This is not needed as the root credential is saved when the swap file is
opened, and this is used for all IO.

So NFS_RPC_SWAPFLAGS isn't needed, and as it is the only user of
RPC_TASK_ROOTCREDS, that isn't needed either.

Remove both.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/read.c                 | 4 ----
 include/linux/nfs_fs.h        | 5 -----
 include/linux/sunrpc/sched.h  | 1 -
 include/trace/events/sunrpc.h | 1 -
 net/sunrpc/auth.c             | 2 +-
 5 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e4c1a49b0126..5e7657374bc3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -194,10 +194,6 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr,
 			      const struct nfs_rpc_ops *rpc_ops,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = hdr->inode;
-	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
-
-	task_setup_data->flags |= swap_flags;
 	rpc_ops->read_setup(hdr, msg);
 	trace_nfs_initiate_read(hdr);
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3893386ceaed..9074ed0b65aa 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -45,11 +45,6 @@
  */
 #define NFS_MAX_TRANSPORTS 16
 
-/*
- * These are the default flags for swap requests
- */
-#define NFS_RPC_SWAPFLAGS		(RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS)
-
 /*
  * Size of the NFS directory verifier
  */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index db964bb63912..56710f8056d3 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -124,7 +124,6 @@ struct rpc_task_setup {
 #define RPC_TASK_MOVEABLE	0x0004		/* nfs4.1+ rpc tasks */
 #define RPC_TASK_NULLCREDS	0x0010		/* Use AUTH_NULL credential */
 #define RPC_CALL_MAJORSEEN	0x0020		/* major timeout seen */
-#define RPC_TASK_ROOTCREDS	0x0040		/* force root creds */
 #define RPC_TASK_DYNAMIC	0x0080		/* task was kmalloc'ed */
 #define	RPC_TASK_NO_ROUND_ROBIN	0x0100		/* send requests on "main" xprt */
 #define RPC_TASK_SOFT		0x0200		/* Use soft timeouts */
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 29982d60b68a..ac33892da411 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -311,7 +311,6 @@ TRACE_EVENT(rpc_request,
 		{ RPC_TASK_MOVEABLE, "MOVEABLE" },			\
 		{ RPC_TASK_NULLCREDS, "NULLCREDS" },			\
 		{ RPC_CALL_MAJORSEEN, "MAJORSEEN" },			\
-		{ RPC_TASK_ROOTCREDS, "ROOTCREDS" },			\
 		{ RPC_TASK_DYNAMIC, "DYNAMIC" },			\
 		{ RPC_TASK_NO_ROUND_ROBIN, "NO_ROUND_ROBIN" },		\
 		{ RPC_TASK_SOFT, "SOFT" },				\
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 6bfa19f9fa6a..682fcd24bf43 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -670,7 +670,7 @@ rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags)
 	/* If machine cred couldn't be bound, try a root cred */
 	if (new)
 		;
-	else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS))
+	else if (cred == &machine_cred)
 		new = rpcauth_bind_root_cred(task, lookupflags);
 	else if (flags & RPC_TASK_NULLCREDS)
 		new = authnull_ops.lookup_cred(NULL, NULL, 0);

From 8db55a032ac7ac1ed7b98d6b1dc980e6378c652f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 130/229] SUNRPC: improve 'swap' handling: scheduling and
 PF_MEMALLOC

rpc tasks can be marked as RPC_TASK_SWAPPER.  This causes GFP_MEMALLOC
to be used for some allocations.  This is needed in some cases, but not
in all where it is currently provided, and in some where it isn't
provided.

Currently *all* tasks associated with a rpc_client on which swap is
enabled get the flag and hence some GFP_MEMALLOC support.

GFP_MEMALLOC is provided for ->buf_alloc() but only swap-writes need it.
However xdr_alloc_bvec does not get GFP_MEMALLOC - though it often does
need it.

xdr_alloc_bvec is called while the XPRT_LOCK is held.  If this blocks,
then it blocks all other queued tasks.  So this allocation needs
GFP_MEMALLOC for *all* requests, not just writes, when the xprt is used
for any swap writes.

Similarly, if the transport is not connected, that will block all
requests including swap writes, so memory allocations should get
GFP_MEMALLOC if swap writes are possible.

So with this patch:
 1/ we ONLY set RPC_TASK_SWAPPER for swap writes.
 2/ __rpc_execute() sets PF_MEMALLOC while handling any task
    with RPC_TASK_SWAPPER set, or when handling any task that
    holds the XPRT_LOCKED lock on an xprt used for swap.
    This removes the need for the RPC_IS_SWAPPER() test
    in ->buf_alloc handlers.
 3/ xprt_prepare_transmit() sets PF_MEMALLOC after locking
    any task to a swapper xprt.  __rpc_execute() will clear it.
 3/ PF_MEMALLOC is set for all the connect workers.

Reviewed-by: Chuck Lever <chuck.lever@oracle.com> (for xprtrdma parts)
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/write.c                  |  2 ++
 net/sunrpc/clnt.c               |  2 --
 net/sunrpc/sched.c              | 20 +++++++++++++++++---
 net/sunrpc/xprt.c               |  3 +++
 net/sunrpc/xprtrdma/transport.c |  6 ++++--
 net/sunrpc/xprtsock.c           |  8 ++++++++
 6 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 74d258781205..599a82406d38 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1412,6 +1412,8 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
 {
 	int priority = flush_task_priority(how);
 
+	if (IS_SWAPFILE(hdr->inode))
+		task_setup_data->flags |= RPC_TASK_SWAPPER;
 	task_setup_data->priority = priority;
 	rpc_ops->write_setup(hdr, msg, &task_setup_data->rpc_client);
 	trace_nfs_initiate_write(hdr);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 9556eb7b065b..4117ea4caa2e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1085,8 +1085,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 		task->tk_flags |= RPC_TASK_TIMEOUT;
 	if (clnt->cl_noretranstimeo)
 		task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
-	if (atomic_read(&clnt->cl_swapper))
-		task->tk_flags |= RPC_TASK_SWAPPER;
 	/* Add to the client's list of all tasks */
 	spin_lock(&clnt->cl_lock);
 	list_add_tail(&task->tk_task, &clnt->cl_tasks);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 690bd3401820..7c8f87ebdbc0 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -869,6 +869,15 @@ void rpc_release_calldata(const struct rpc_call_ops *ops, void *calldata)
 		ops->rpc_release(calldata);
 }
 
+static bool xprt_needs_memalloc(struct rpc_xprt *xprt, struct rpc_task *tk)
+{
+	if (!xprt)
+		return false;
+	if (!atomic_read(&xprt->swapper))
+		return false;
+	return test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == tk;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -877,6 +886,7 @@ static void __rpc_execute(struct rpc_task *task)
 	struct rpc_wait_queue *queue;
 	int task_is_async = RPC_IS_ASYNC(task);
 	int status = 0;
+	unsigned long pflags = current->flags;
 
 	WARN_ON_ONCE(RPC_IS_QUEUED(task));
 	if (RPC_IS_QUEUED(task))
@@ -899,6 +909,10 @@ static void __rpc_execute(struct rpc_task *task)
 		}
 		if (!do_action)
 			break;
+		if (RPC_IS_SWAPPER(task) ||
+		    xprt_needs_memalloc(task->tk_xprt, task))
+			current->flags |= PF_MEMALLOC;
+
 		trace_rpc_task_run_action(task, do_action);
 		do_action(task);
 
@@ -936,7 +950,7 @@ static void __rpc_execute(struct rpc_task *task)
 		rpc_clear_running(task);
 		spin_unlock(&queue->lock);
 		if (task_is_async)
-			return;
+			goto out;
 
 		/* sync task: sleep here */
 		trace_rpc_task_sync_sleep(task, task->tk_action);
@@ -960,6 +974,8 @@ static void __rpc_execute(struct rpc_task *task)
 
 	/* Release all resources associated with the task */
 	rpc_release_task(task);
+out:
+	current_restore_flags(pflags, PF_MEMALLOC);
 }
 
 /*
@@ -1018,8 +1034,6 @@ int rpc_malloc(struct rpc_task *task)
 
 	if (RPC_IS_ASYNC(task))
 		gfp = GFP_NOWAIT | __GFP_NOWARN;
-	if (RPC_IS_SWAPPER(task))
-		gfp |= __GFP_MEMALLOC;
 
 	size += sizeof(struct rpc_buffer);
 	if (size <= RPC_BUFFER_MAXSIZE)
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2f165634df54..bbe913121f43 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1492,6 +1492,9 @@ bool xprt_prepare_transmit(struct rpc_task *task)
 		return false;
 
 	}
+	if (atomic_read(&xprt->swapper))
+		/* This will be clear in __rpc_execute */
+		current->flags |= PF_MEMALLOC;
 	return true;
 }
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 923e4b512ee9..6b7e10e5a141 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -235,8 +235,11 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
 						   rx_connect_worker.work);
 	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+	unsigned int pflags = current->flags;
 	int rc;
 
+	if (atomic_read(&xprt->swapper))
+		current->flags |= PF_MEMALLOC;
 	rc = rpcrdma_xprt_connect(r_xprt);
 	xprt_clear_connecting(xprt);
 	if (!rc) {
@@ -250,6 +253,7 @@ xprt_rdma_connect_worker(struct work_struct *work)
 		rpcrdma_xprt_disconnect(r_xprt);
 	xprt_unlock_connect(xprt, r_xprt);
 	xprt_wake_pending_tasks(xprt, rc);
+	current_restore_flags(pflags, PF_MEMALLOC);
 }
 
 /**
@@ -572,8 +576,6 @@ xprt_rdma_allocate(struct rpc_task *task)
 	flags = RPCRDMA_DEF_GFP;
 	if (RPC_IS_ASYNC(task))
 		flags = GFP_NOWAIT | __GFP_NOWARN;
-	if (RPC_IS_SWAPPER(task))
-		flags |= __GFP_MEMALLOC;
 
 	if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
 				  flags))
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0f39e08ee580..61d3293f1d68 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2052,7 +2052,10 @@ static void xs_udp_setup_socket(struct work_struct *work)
 	struct rpc_xprt *xprt = &transport->xprt;
 	struct socket *sock;
 	int status = -EIO;
+	unsigned int pflags = current->flags;
 
+	if (atomic_read(&xprt->swapper))
+		current->flags |= PF_MEMALLOC;
 	sock = xs_create_sock(xprt, transport,
 			xs_addr(xprt)->sa_family, SOCK_DGRAM,
 			IPPROTO_UDP, false);
@@ -2072,6 +2075,7 @@ out:
 	xprt_clear_connecting(xprt);
 	xprt_unlock_connect(xprt, transport);
 	xprt_wake_pending_tasks(xprt, status);
+	current_restore_flags(pflags, PF_MEMALLOC);
 }
 
 /**
@@ -2231,7 +2235,10 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 	struct socket *sock = transport->sock;
 	struct rpc_xprt *xprt = &transport->xprt;
 	int status;
+	unsigned int pflags = current->flags;
 
+	if (atomic_read(&xprt->swapper))
+		current->flags |= PF_MEMALLOC;
 	if (!sock) {
 		sock = xs_create_sock(xprt, transport,
 				xs_addr(xprt)->sa_family, SOCK_STREAM,
@@ -2296,6 +2303,7 @@ out:
 	xprt_clear_connecting(xprt);
 out_unlock:
 	xprt_unlock_connect(xprt, transport);
+	current_restore_flags(pflags, PF_MEMALLOC);
 }
 
 /**

From 4dc73c679114a2f408567e2e44770ed934190db2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 131/229] NFSv4: keep state manager thread active if swap is
 enabled

If we are swapping over NFSv4, we may not be able to allocate memory to
start the state-manager thread at the time when we need it.
So keep it always running when swap is enabled, and just signal it to
start.

This requires updating and testing the cl_swapper count on the root
rpc_clnt after following all ->cl_parent links.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/file.c           | 15 ++++++++++++---
 fs/nfs/nfs4_fs.h        |  1 +
 fs/nfs/nfs4proc.c       | 20 ++++++++++++++++++++
 fs/nfs/nfs4state.c      | 40 +++++++++++++++++++++++++++++++++-------
 include/linux/nfs_xdr.h |  2 ++
 net/sunrpc/clnt.c       |  2 ++
 6 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 93c01aaa0a8d..d31bc430dce3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -483,8 +483,9 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 {
 	unsigned long blocks;
 	long long isize;
-	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = file_inode(file);
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
 
 	spin_lock(&inode->i_lock);
 	blocks = inode->i_blocks;
@@ -497,14 +498,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 
 	*span = sis->pages;
 
+
+	if (cl->rpc_ops->enable_swap)
+		cl->rpc_ops->enable_swap(inode);
+
 	return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
-	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct rpc_clnt *clnt = NFS_CLIENT(inode);
+	struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
 
 	rpc_clnt_swap_deactivate(clnt);
+	if (cl->rpc_ops->disable_swap)
+		cl->rpc_ops->disable_swap(file_inode(file));
 }
 
 const struct address_space_operations nfs_file_aops = {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 84f39b6f1b1e..79df6e83881b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,6 +42,7 @@ enum nfs4_client_state {
 	NFS4CLNT_LEASE_MOVED,
 	NFS4CLNT_DELEGATION_EXPIRED,
 	NFS4CLNT_RUN_MANAGER,
+	NFS4CLNT_MANAGER_AVAILABLE,
 	NFS4CLNT_RECALL_RUNNING,
 	NFS4CLNT_RECALL_ANY_LAYOUT_READ,
 	NFS4CLNT_RECALL_ANY_LAYOUT_RW,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fd8eece12e94..dd7a4c2a3f05 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -10468,6 +10468,24 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
 	return error + error2 + error3;
 }
 
+static void nfs4_enable_swap(struct inode *inode)
+{
+	/* The state manager thread must always be running.
+	 * It will notice the client is a swapper, and stay put.
+	 */
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+
+	nfs4_schedule_state_manager(clp);
+}
+
+static void nfs4_disable_swap(struct inode *inode)
+{
+	/* The state manager thread will now exit once it is
+	 * woken.
+	 */
+	wake_up_var(&NFS_SERVER(inode)->nfs_client->cl_state);
+}
+
 static const struct inode_operations nfs4_dir_inode_operations = {
 	.create		= nfs_create,
 	.lookup		= nfs_lookup,
@@ -10545,6 +10563,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 	.create_server	= nfs4_create_server,
 	.clone_server	= nfs_clone_server,
 	.discover_trunking = nfs4_discover_trunking,
+	.enable_swap	= nfs4_enable_swap,
+	.disable_swap	= nfs4_disable_swap,
 };
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 58054dfdf2b0..4b2ea239a537 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1207,10 +1207,17 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 {
 	struct task_struct *task;
 	char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+	struct rpc_clnt *cl = clp->cl_rpcclient;
+
+	while (cl != cl->cl_parent)
+		cl = cl->cl_parent;
 
 	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
-	if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+	if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
+		wake_up_var(&clp->cl_state);
 		return;
+	}
+	set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
 	__module_get(THIS_MODULE);
 	refcount_inc(&clp->cl_count);
 
@@ -1226,6 +1233,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 		printk(KERN_ERR "%s: kthread_run: %ld\n",
 			__func__, PTR_ERR(task));
 		nfs4_clear_state_manager_bit(clp);
+		clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
 		nfs_put_client(clp);
 		module_put(THIS_MODULE);
 	}
@@ -2680,12 +2688,8 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
 		}
 
-		/* Did we race with an attempt to give us more work? */
-		if (!test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
-			return;
-		if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
-			return;
-		memflags = memalloc_nofs_save();
+		return;
+
 	} while (refcount_read(&clp->cl_count) > 1 && !signalled());
 	goto out_drain;
 
@@ -2706,9 +2710,31 @@ out_drain:
 static int nfs4_run_state_manager(void *ptr)
 {
 	struct nfs_client *clp = ptr;
+	struct rpc_clnt *cl = clp->cl_rpcclient;
+
+	while (cl != cl->cl_parent)
+		cl = cl->cl_parent;
 
 	allow_signal(SIGKILL);
+again:
+	set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
 	nfs4_state_manager(clp);
+	if (atomic_read(&cl->cl_swapper)) {
+		wait_var_event_interruptible(&clp->cl_state,
+					     test_bit(NFS4CLNT_RUN_MANAGER,
+						      &clp->cl_state));
+		if (atomic_read(&cl->cl_swapper) &&
+		    test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+			goto again;
+		/* Either no longer a swapper, or were signalled */
+	}
+	clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+
+	if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+	    test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+	    !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+		goto again;
+
 	nfs_put_client(clp);
 	module_put_and_kthread_exit(0);
 	return 0;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 82f7c2730b9a..49ba486aea5f 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1797,6 +1797,8 @@ struct nfs_rpc_ops {
 	struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
 					   struct nfs_fattr *, rpc_authflavor_t);
 	int	(*discover_trunking)(struct nfs_server *, struct nfs_fh *);
+	void	(*enable_swap)(struct inode *inode);
+	void	(*disable_swap)(struct inode *inode);
 };
 
 /*
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 4117ea4caa2e..0f54a56d19d2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -3069,6 +3069,8 @@ rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt,
 int
 rpc_clnt_swap_activate(struct rpc_clnt *clnt)
 {
+	while (clnt != clnt->cl_parent)
+		clnt = clnt->cl_parent;
 	if (atomic_inc_return(&clnt->cl_swapper) == 1)
 		return rpc_clnt_iterate_for_each_xprt(clnt,
 				rpc_clnt_swap_activate_callback, NULL);

From 64158668ac8b31626a8ce48db4cad08496eb8340 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 132/229] NFS: swap IO handling is slightly different for
 O_DIRECT IO

1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding
   possible deadlocks with "fs_reclaim".  These deadlocks could, I believe,
   eventuate if a buffered read on the swapfile was attempted.

   We don't need coherence with the page cache for a swap file, and
   buffered writes are forbidden anyway.  There is no other need for
   i_rwsem during direct IO.  So never take it for swap_rw()

2/ generic_write_checks() explicitly forbids writes to swap, and
   performs checks that are not needed for swap.  So bypass it
   for swap_rw().

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c        | 42 ++++++++++++++++++++++++++++--------------
 fs/nfs/file.c          |  4 ++--
 include/linux/nfs_fs.h |  8 ++++----
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index eabfdab543c8..04aaf39a05cb 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -173,8 +173,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (iov_iter_rw(iter) == READ)
-		return nfs_file_direct_read(iocb, iter);
-	return nfs_file_direct_write(iocb, iter);
+		return nfs_file_direct_read(iocb, iter, true);
+	return nfs_file_direct_write(iocb, iter, true);
 }
 
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -425,6 +425,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
  *
  * We use this function for direct reads instead of calling
  * generic_file_aio_read() in order to avoid gfar's check to see if
@@ -440,7 +441,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+			     bool swap)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -482,12 +484,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 	if (iter_is_iovec(iter))
 		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
 
-	nfs_start_io_direct(inode);
+	if (!swap)
+		nfs_start_io_direct(inode);
 
 	NFS_I(inode)->read_io += count;
 	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
-	nfs_end_io_direct(inode);
+	if (!swap)
+		nfs_end_io_direct(inode);
 
 	if (requested > 0) {
 		result = nfs_direct_wait(dreq);
@@ -876,6 +880,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
  *
  * We use this function for direct writes instead of calling
  * generic_file_aio_write() in order to avoid taking the inode
@@ -892,7 +897,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * Note that O_APPEND is not supported for NFS direct writes, as there
  * is no atomic O_APPEND write facility in the NFS protocol.
  */
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+			      bool swap)
 {
 	ssize_t result, requested;
 	size_t count;
@@ -906,7 +912,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
 		file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
-	result = generic_write_checks(iocb, iter);
+	if (swap)
+		/* bypass generic checks */
+		result =  iov_iter_count(iter);
+	else
+		result = generic_write_checks(iocb, iter);
 	if (result <= 0)
 		return result;
 	count = result;
@@ -937,17 +947,21 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 		dreq->iocb = iocb;
 	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
 
-	nfs_start_io_direct(inode);
+	if (swap) {
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+	} else {
+		nfs_start_io_direct(inode);
 
-	requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
 
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_SHIFT, end);
+		if (mapping->nrpages) {
+			invalidate_inode_pages2_range(mapping,
+						      pos >> PAGE_SHIFT, end);
+		}
+
+		nfs_end_io_direct(inode);
 	}
 
-	nfs_end_io_direct(inode);
-
 	if (requested > 0) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d31bc430dce3..81c80548a5c6 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -157,7 +157,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_read(iocb, to);
+		return nfs_file_direct_read(iocb, to, false);
 
 	dprintk("NFS: read(%pD2, %zu@%lu)\n",
 		iocb->ki_filp,
@@ -623,7 +623,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
 		return result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_write(iocb, from);
+		return nfs_file_direct_write(iocb, from, false);
 
 	dprintk("NFS: write(%pD2, %zu@%Ld)\n",
 		file, iov_iter_count(from), (long long) iocb->ki_pos);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 9074ed0b65aa..c47c448befc8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -508,10 +508,10 @@ static inline const struct cred *nfs_file_cred(struct file *file)
  * linux/fs/nfs/direct.c
  */
 extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *);
-extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-			struct iov_iter *iter);
-extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
-			struct iov_iter *iter);
+ssize_t nfs_file_direct_read(struct kiocb *iocb,
+			     struct iov_iter *iter, bool swap);
+ssize_t nfs_file_direct_write(struct kiocb *iocb,
+			      struct iov_iter *iter, bool swap);
 
 /*
  * linux/fs/nfs/dir.c

From c265de257f558a05c1859ee9e3fed04883b9ec0e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:44 +1100
Subject: [PATCH 133/229] NFS: swap-out must always use STABLE writes.

The commit handling code is not safe against memory-pressure deadlocks
when writing to swap.  In particular, nfs_commitdata_alloc() blocks
indefinitely waiting for memory, and this can consume all available
workqueue threads.

swap-out most likely uses STABLE writes anyway as COND_STABLE indicates
that a stable write should be used if the write fits in a single
request, and it normally does.  However if we ever swap with a small
wsize, or gather unusually large numbers of pages for a single write,
this might change.

For safety, make it explicit in the code that direct writes used for swap
must always use FLUSH_STABLE.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/direct.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 04aaf39a05cb..11c566d8769f 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -794,7 +794,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
  */
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       struct iov_iter *iter,
-					       loff_t pos)
+					       loff_t pos, int ioflags)
 {
 	struct nfs_pageio_descriptor desc;
 	struct inode *inode = dreq->inode;
@@ -802,7 +802,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 	size_t requested_bytes = 0;
 	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
 
-	nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+	nfs_pageio_init_write(&desc, inode, ioflags, false,
 			      &nfs_direct_write_completion_ops);
 	desc.pg_dreq = dreq;
 	get_dreq(dreq);
@@ -948,11 +948,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
 	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
 
 	if (swap) {
-		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+							    FLUSH_STABLE);
 	} else {
 		nfs_start_io_direct(inode);
 
-		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+							    FLUSH_COND_STABLE);
 
 		if (mapping->nrpages) {
 			invalidate_inode_pages2_range(mapping,

From 693486d5f8951780a9bb31f7fe935171a80010e4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 7 Mar 2022 10:41:45 +1100
Subject: [PATCH 134/229] SUNRPC: change locking for xs_swap_enable/disable

It is not in general safe to wait for XPRT_LOCKED to clear.
A wakeup is only sent when
 - connection completes
 - sock close completes
so during normal operations, this can wait indefinitely.

The event we need to protect against is ->inet being set to NULL, and
that happens under the recv_mutex lock.

So drop the handlign of XPRT_LOCKED and use recv_mutex instead.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 61d3293f1d68..7e39f87cde2d 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1936,9 +1936,9 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 
 #if IS_ENABLED(CONFIG_SUNRPC_SWAP)
 /*
- * Note that this should be called with XPRT_LOCKED held (or when we otherwise
- * know that we have exclusive access to the socket), to guard against
- * races with xs_reset_transport.
+ * Note that this should be called with XPRT_LOCKED held, or recv_mutex
+ * held, or when we otherwise know that we have exclusive access to the
+ * socket, to guard against races with xs_reset_transport.
  */
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
@@ -1967,13 +1967,11 @@ xs_enable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
-	if (atomic_inc_return(&xprt->swapper) != 1)
-		return 0;
-	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
-		return -ERESTARTSYS;
-	if (xs->inet)
+	mutex_lock(&xs->recv_mutex);
+	if (atomic_inc_return(&xprt->swapper) == 1 &&
+	    xs->inet)
 		sk_set_memalloc(xs->inet);
-	xprt_release_xprt(xprt, NULL);
+	mutex_unlock(&xs->recv_mutex);
 	return 0;
 }
 
@@ -1989,13 +1987,11 @@ xs_disable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
-	if (!atomic_dec_and_test(&xprt->swapper))
-		return;
-	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
-		return;
-	if (xs->inet)
+	mutex_lock(&xs->recv_mutex);
+	if (atomic_dec_and_test(&xprt->swapper) &&
+	    xs->inet)
 		sk_clear_memalloc(xs->inet);
-	xprt_release_xprt(xprt, NULL);
+	mutex_unlock(&xs->recv_mutex);
 }
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)

From b0f212633b31ddca99c76aa38f812fe492e8410a Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 11 Mar 2022 11:34:28 +0100
Subject: [PATCH 135/229] xen/grant-table: remove gnttab_*transfer*() functions

All grant table operations related to the "transfer" functionality
are unused currently. There have been users in the old days of the
"Xen-o-Linux" kernel, but those didn't make it upstream.

So remove the "transfer" related functions.

Signed-off-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/r/20220311103429.12845-2-jgross@suse.com
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/grant-table.c | 113 +-------------------------------------
 include/xen/grant_table.h |   8 ---
 2 files changed, 2 insertions(+), 119 deletions(-)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 5c83d41766c8..8963af8ec764 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -109,7 +109,7 @@ struct gnttab_ops {
 	void (*unmap_frames)(void);
 	/*
 	 * Introducing a valid entry into the grant table, granting the frame of
-	 * this grant entry to domain for accessing or transfering. Ref
+	 * this grant entry to domain for accessing. Ref
 	 * parameter is reference of this introduced grant entry, domid is id of
 	 * granted domain, frame is the page frame to be granted, and flags is
 	 * status of the grant entry to be updated.
@@ -125,14 +125,6 @@ struct gnttab_ops {
 	 * access for this entry and return success(==1).
 	 */
 	int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
-	/*
-	 * Stop granting a grant entry to domain for transfer. Ref parameter is
-	 * reference of a grant entry whose grant transfer will be stopped. If
-	 * tranfer has not started, just reclaim the grant entry and return
-	 * failure(==0). Otherwise, wait for the transfer to complete and then
-	 * return the frame.
-	 */
-	unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref);
 	/*
 	 * Read the frame number related to a given grant reference.
 	 */
@@ -230,10 +222,7 @@ static void put_free_entry(grant_ref_t ref)
  * Following applies to gnttab_update_entry_v1 and gnttab_update_entry_v2.
  * Introducing a valid entry into the grant table:
  *  1. Write ent->domid.
- *  2. Write ent->frame:
- *      GTF_permit_access:   Frame to which access is permitted.
- *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
- *                           frame, or zero if none.
+ *  2. Write ent->frame: Frame to which access is permitted.
  *  3. Write memory barrier (WMB).
  *  4. Write ent->flags, inc. valid type.
  */
@@ -455,102 +444,6 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
 }
 EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
 
-int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn)
-{
-	int ref;
-
-	ref = get_free_entries(1);
-	if (unlikely(ref < 0))
-		return -ENOSPC;
-	gnttab_grant_foreign_transfer_ref(ref, domid, pfn);
-
-	return ref;
-}
-EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer);
-
-void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid,
-				       unsigned long pfn)
-{
-	gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer);
-}
-EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref);
-
-static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref)
-{
-	unsigned long frame;
-	u16           flags;
-	u16          *pflags;
-
-	pflags = &gnttab_shared.v1[ref].flags;
-
-	/*
-	 * If a transfer is not even yet started, try to reclaim the grant
-	 * reference and return failure (== 0).
-	 */
-	while (!((flags = *pflags) & GTF_transfer_committed)) {
-		if (sync_cmpxchg(pflags, flags, 0) == flags)
-			return 0;
-		cpu_relax();
-	}
-
-	/* If a transfer is in progress then wait until it is completed. */
-	while (!(flags & GTF_transfer_completed)) {
-		flags = *pflags;
-		cpu_relax();
-	}
-
-	rmb();	/* Read the frame number /after/ reading completion status. */
-	frame = gnttab_shared.v1[ref].frame;
-	BUG_ON(frame == 0);
-
-	return frame;
-}
-
-static unsigned long gnttab_end_foreign_transfer_ref_v2(grant_ref_t ref)
-{
-	unsigned long frame;
-	u16           flags;
-	u16          *pflags;
-
-	pflags = &gnttab_shared.v2[ref].hdr.flags;
-
-	/*
-	 * If a transfer is not even yet started, try to reclaim the grant
-	 * reference and return failure (== 0).
-	 */
-	while (!((flags = *pflags) & GTF_transfer_committed)) {
-		if (sync_cmpxchg(pflags, flags, 0) == flags)
-			return 0;
-		cpu_relax();
-	}
-
-	/* If a transfer is in progress then wait until it is completed. */
-	while (!(flags & GTF_transfer_completed)) {
-		flags = *pflags;
-		cpu_relax();
-	}
-
-	rmb();  /* Read the frame number /after/ reading completion status. */
-	frame = gnttab_shared.v2[ref].full_page.frame;
-	BUG_ON(frame == 0);
-
-	return frame;
-}
-
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref)
-{
-	return gnttab_interface->end_foreign_transfer_ref(ref);
-}
-EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref);
-
-unsigned long gnttab_end_foreign_transfer(grant_ref_t ref)
-{
-	unsigned long frame = gnttab_end_foreign_transfer_ref(ref);
-	put_free_entry(ref);
-	return frame;
-}
-EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer);
-
 void gnttab_free_grant_reference(grant_ref_t ref)
 {
 	put_free_entry(ref);
@@ -1423,7 +1316,6 @@ static const struct gnttab_ops gnttab_v1_ops = {
 	.unmap_frames			= gnttab_unmap_frames_v1,
 	.update_entry			= gnttab_update_entry_v1,
 	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v1,
-	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v1,
 	.read_frame			= gnttab_read_frame_v1,
 };
 
@@ -1435,7 +1327,6 @@ static const struct gnttab_ops gnttab_v2_ops = {
 	.unmap_frames			= gnttab_unmap_frames_v2,
 	.update_entry			= gnttab_update_entry_v2,
 	.end_foreign_access_ref		= gnttab_end_foreign_access_ref_v2,
-	.end_foreign_transfer_ref	= gnttab_end_foreign_transfer_ref_v2,
 	.read_frame			= gnttab_read_frame_v2,
 };
 
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index c9fea9389ebe..9f9b1a297f0d 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -125,11 +125,6 @@ void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
  */
 int gnttab_try_end_foreign_access(grant_ref_t ref);
 
-int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn);
-
-unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref);
-unsigned long gnttab_end_foreign_transfer(grant_ref_t ref);
-
 /*
  * operations on reserved batches of grant references
  */
@@ -162,9 +157,6 @@ static inline void gnttab_page_grant_foreign_access_ref_one(
 					readonly);
 }
 
-void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
-				       unsigned long pfn);
-
 static inline void
 gnttab_set_map_op(struct gnttab_map_grant_ref *map, phys_addr_t addr,
 		  uint32_t flags, grant_ref_t ref, domid_t domid)

From c94b731da21f10086a9e52d63c21c730e3f6c939 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 11 Mar 2022 11:34:29 +0100
Subject: [PATCH 136/229] xen/grant-table: remove readonly parameter from
 functions

The gnttab_end_foreign_access() family of functions is taking a
"readonly" parameter, which isn't used. Remove it from the function
parameters.

Signed-off-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/r/20220311103429.12845-3-jgross@suse.com
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Christian Schoenebeck <qemu_oss@crudebyte.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/block/xen-blkfront.c                |  8 ++---
 drivers/char/tpm/xen-tpmfront.c             |  2 +-
 drivers/gpu/drm/xen/xen_drm_front_evtchnl.c |  2 +-
 drivers/input/misc/xen-kbdfront.c           |  4 +--
 drivers/net/xen-netfront.c                  | 13 ++++---
 drivers/pci/xen-pcifront.c                  |  2 +-
 drivers/scsi/xen-scsifront.c                |  4 +--
 drivers/usb/host/xen-hcd.c                  |  4 +--
 drivers/xen/gntalloc.c                      |  2 +-
 drivers/xen/gntdev-dmabuf.c                 |  2 +-
 drivers/xen/grant-table.c                   | 38 +++++++++------------
 drivers/xen/pvcalls-front.c                 |  6 ++--
 drivers/xen/xen-front-pgdir-shbuf.c         |  3 +-
 include/xen/grant_table.h                   |  5 ++-
 net/9p/trans_xen.c                          |  8 ++---
 sound/xen/xen_snd_front_evtchnl.c           |  2 +-
 16 files changed, 48 insertions(+), 57 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 03b5fb341e58..aa996b637d0b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1223,7 +1223,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 			list_del(&persistent_gnt->node);
 			if (persistent_gnt->gref != GRANT_INVALID_REF) {
 				gnttab_end_foreign_access(persistent_gnt->gref,
-							  0, 0UL);
+							  0UL);
 				rinfo->persistent_gnts_c--;
 			}
 			if (info->feature_persistent)
@@ -1246,7 +1246,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 		       rinfo->shadow[i].req.u.rw.nr_segments;
 		for (j = 0; j < segs; j++) {
 			persistent_gnt = rinfo->shadow[i].grants_used[j];
-			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			gnttab_end_foreign_access(persistent_gnt->gref, 0UL);
 			if (info->feature_persistent)
 				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
@@ -1261,7 +1261,7 @@ static void blkif_free_ring(struct blkfront_ring_info *rinfo)
 
 		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
 			persistent_gnt = rinfo->shadow[i].indirect_grants[j];
-			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			gnttab_end_foreign_access(persistent_gnt->gref, 0UL);
 			__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
@@ -1284,7 +1284,7 @@ free_shadow:
 	/* Free resources associated with old device channel. */
 	for (i = 0; i < info->nr_ring_pages; i++) {
 		if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
-			gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
+			gnttab_end_foreign_access(rinfo->ring_ref[i], 0);
 			rinfo->ring_ref[i] = GRANT_INVALID_REF;
 		}
 	}
diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c
index da5b30771418..ad0675f23e6e 100644
--- a/drivers/char/tpm/xen-tpmfront.c
+++ b/drivers/char/tpm/xen-tpmfront.c
@@ -332,7 +332,7 @@ static void ring_free(struct tpm_private *priv)
 		return;
 
 	if (priv->ring_ref)
-		gnttab_end_foreign_access(priv->ring_ref, 0,
+		gnttab_end_foreign_access(priv->ring_ref,
 				(unsigned long)priv->shr);
 	else
 		free_page((unsigned long)priv->shr);
diff --git a/drivers/gpu/drm/xen/xen_drm_front_evtchnl.c b/drivers/gpu/drm/xen/xen_drm_front_evtchnl.c
index e10d95dddb99..08b526eeec16 100644
--- a/drivers/gpu/drm/xen/xen_drm_front_evtchnl.c
+++ b/drivers/gpu/drm/xen/xen_drm_front_evtchnl.c
@@ -148,7 +148,7 @@ static void evtchnl_free(struct xen_drm_front_info *front_info,
 
 	/* end access and free the page */
 	if (evtchnl->gref != GRANT_INVALID_REF)
-		gnttab_end_foreign_access(evtchnl->gref, 0, page);
+		gnttab_end_foreign_access(evtchnl->gref, page);
 
 	memset(evtchnl, 0, sizeof(*evtchnl));
 }
diff --git a/drivers/input/misc/xen-kbdfront.c b/drivers/input/misc/xen-kbdfront.c
index 3d17a0b3fe51..1fc9b3e7007f 100644
--- a/drivers/input/misc/xen-kbdfront.c
+++ b/drivers/input/misc/xen-kbdfront.c
@@ -481,7 +481,7 @@ static int xenkbd_connect_backend(struct xenbus_device *dev,
  error_evtchan:
 	xenbus_free_evtchn(dev, evtchn);
  error_grant:
-	gnttab_end_foreign_access(info->gref, 0, 0UL);
+	gnttab_end_foreign_access(info->gref, 0UL);
 	info->gref = -1;
 	return ret;
 }
@@ -492,7 +492,7 @@ static void xenkbd_disconnect_backend(struct xenkbd_info *info)
 		unbind_from_irqhandler(info->irq, info);
 	info->irq = -1;
 	if (info->gref >= 0)
-		gnttab_end_foreign_access(info->gref, 0, 0UL);
+		gnttab_end_foreign_access(info->gref, 0UL);
 	info->gref = -1;
 }
 
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index daa4e6106aac..e2b4a1893a13 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -425,7 +425,7 @@ static bool xennet_tx_buf_gc(struct netfront_queue *queue)
 			skb = queue->tx_skbs[id];
 			queue->tx_skbs[id] = NULL;
 			if (unlikely(!gnttab_end_foreign_access_ref(
-				queue->grant_tx_ref[id], GNTMAP_readonly))) {
+				queue->grant_tx_ref[id]))) {
 				dev_alert(dev,
 					  "Grant still in use by backend domain\n");
 				goto err;
@@ -1029,7 +1029,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 			goto next;
 		}
 
-		if (!gnttab_end_foreign_access_ref(ref, 0)) {
+		if (!gnttab_end_foreign_access_ref(ref)) {
 			dev_alert(dev,
 				  "Grant still in use by backend domain\n");
 			queue->info->broken = true;
@@ -1388,7 +1388,6 @@ static void xennet_release_tx_bufs(struct netfront_queue *queue)
 		queue->tx_skbs[i] = NULL;
 		get_page(queue->grant_tx_page[i]);
 		gnttab_end_foreign_access(queue->grant_tx_ref[i],
-					  GNTMAP_readonly,
 					  (unsigned long)page_address(queue->grant_tx_page[i]));
 		queue->grant_tx_page[i] = NULL;
 		queue->grant_tx_ref[i] = GRANT_INVALID_REF;
@@ -1421,7 +1420,7 @@ static void xennet_release_rx_bufs(struct netfront_queue *queue)
 		 * foreign access is ended (which may be deferred).
 		 */
 		get_page(page);
-		gnttab_end_foreign_access(ref, 0,
+		gnttab_end_foreign_access(ref,
 					  (unsigned long)page_address(page));
 		queue->grant_rx_ref[id] = GRANT_INVALID_REF;
 
@@ -1763,7 +1762,7 @@ static void xennet_end_access(int ref, void *page)
 {
 	/* This frees the page as a side-effect */
 	if (ref != GRANT_INVALID_REF)
-		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
+		gnttab_end_foreign_access(ref, (unsigned long)page);
 }
 
 static void xennet_disconnect_backend(struct netfront_info *info)
@@ -1980,14 +1979,14 @@ static int setup_netfront(struct xenbus_device *dev,
 	 */
  fail:
 	if (queue->rx_ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(queue->rx_ring_ref, 0,
+		gnttab_end_foreign_access(queue->rx_ring_ref,
 					  (unsigned long)rxs);
 		queue->rx_ring_ref = GRANT_INVALID_REF;
 	} else {
 		free_page((unsigned long)rxs);
 	}
 	if (queue->tx_ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(queue->tx_ring_ref, 0,
+		gnttab_end_foreign_access(queue->tx_ring_ref,
 					  (unsigned long)txs);
 		queue->tx_ring_ref = GRANT_INVALID_REF;
 	} else {
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index d2a7b9fd678b..3edc1565a27c 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -755,7 +755,7 @@ static void free_pdev(struct pcifront_device *pdev)
 		xenbus_free_evtchn(pdev->xdev, pdev->evtchn);
 
 	if (pdev->gnt_ref != INVALID_GRANT_REF)
-		gnttab_end_foreign_access(pdev->gnt_ref, 0 /* r/w page */,
+		gnttab_end_foreign_access(pdev->gnt_ref,
 					  (unsigned long)pdev->sh_info);
 	else
 		free_page((unsigned long)pdev->sh_info);
diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c
index 7f421600cb66..12109e4c73d4 100644
--- a/drivers/scsi/xen-scsifront.c
+++ b/drivers/scsi/xen-scsifront.c
@@ -757,7 +757,7 @@ static int scsifront_alloc_ring(struct vscsifrnt_info *info)
 free_irq:
 	unbind_from_irqhandler(info->irq, info);
 free_gnttab:
-	gnttab_end_foreign_access(info->ring_ref, 0,
+	gnttab_end_foreign_access(info->ring_ref,
 				  (unsigned long)info->ring.sring);
 
 	return err;
@@ -766,7 +766,7 @@ free_gnttab:
 static void scsifront_free_ring(struct vscsifrnt_info *info)
 {
 	unbind_from_irqhandler(info->irq, info);
-	gnttab_end_foreign_access(info->ring_ref, 0,
+	gnttab_end_foreign_access(info->ring_ref,
 				  (unsigned long)info->ring.sring);
 }
 
diff --git a/drivers/usb/host/xen-hcd.c b/drivers/usb/host/xen-hcd.c
index 19b8c7ed74cb..5f4a00df4f1c 100644
--- a/drivers/usb/host/xen-hcd.c
+++ b/drivers/usb/host/xen-hcd.c
@@ -1075,14 +1075,14 @@ static void xenhcd_destroy_rings(struct xenhcd_info *info)
 	info->irq = 0;
 
 	if (info->urb_ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(info->urb_ring_ref, 0,
+		gnttab_end_foreign_access(info->urb_ring_ref,
 					  (unsigned long)info->urb_ring.sring);
 		info->urb_ring_ref = GRANT_INVALID_REF;
 	}
 	info->urb_ring.sring = NULL;
 
 	if (info->conn_ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(info->conn_ring_ref, 0,
+		gnttab_end_foreign_access(info->conn_ring_ref,
 					  (unsigned long)info->conn_ring.sring);
 		info->conn_ring_ref = GRANT_INVALID_REF;
 	}
diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index edb0acd0b832..4849f94372a4 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -192,7 +192,7 @@ static void __del_gref(struct gntalloc_gref *gref)
 	if (gref->gref_id) {
 		if (gref->page) {
 			addr = (unsigned long)page_to_virt(gref->page);
-			gnttab_end_foreign_access(gref->gref_id, 0, addr);
+			gnttab_end_foreign_access(gref->gref_id, addr);
 		} else
 			gnttab_free_grant_reference(gref->gref_id);
 	}
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
index 12e380db7f55..d5bfd7b867fc 100644
--- a/drivers/xen/gntdev-dmabuf.c
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -533,7 +533,7 @@ static void dmabuf_imp_end_foreign_access(u32 *refs, int count)
 
 	for (i = 0; i < count; i++)
 		if (refs[i] != GRANT_INVALID_REF)
-			gnttab_end_foreign_access(refs[i], 0, 0UL);
+			gnttab_end_foreign_access(refs[i], 0UL);
 }
 
 static void dmabuf_imp_free_storage(struct gntdev_dmabuf *gntdev_dmabuf)
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 8963af8ec764..8ccccace2a4f 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -118,13 +118,12 @@ struct gnttab_ops {
 			     unsigned long frame, unsigned flags);
 	/*
 	 * Stop granting a grant entry to domain for accessing. Ref parameter is
-	 * reference of a grant entry whose grant access will be stopped,
-	 * readonly is not in use in this function. If the grant entry is
-	 * currently mapped for reading or writing, just return failure(==0)
-	 * directly and don't tear down the grant access. Otherwise, stop grant
-	 * access for this entry and return success(==1).
+	 * reference of a grant entry whose grant access will be stopped.
+	 * If the grant entry is currently mapped for reading or writing, just
+	 * return failure(==0) directly and don't tear down the grant access.
+	 * Otherwise, stop grant access for this entry and return success(==1).
 	 */
-	int (*end_foreign_access_ref)(grant_ref_t ref, int readonly);
+	int (*end_foreign_access_ref)(grant_ref_t ref);
 	/*
 	 * Read the frame number related to a given grant reference.
 	 */
@@ -270,7 +269,7 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
 }
 EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access);
 
-static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
+static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref)
 {
 	u16 flags, nflags;
 	u16 *pflags;
@@ -286,7 +285,7 @@ static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly)
 	return 1;
 }
 
-static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
+static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref)
 {
 	gnttab_shared.v2[ref].hdr.flags = 0;
 	mb();	/* Concurrent access by hypervisor. */
@@ -309,14 +308,14 @@ static int gnttab_end_foreign_access_ref_v2(grant_ref_t ref, int readonly)
 	return 1;
 }
 
-static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref)
 {
-	return gnttab_interface->end_foreign_access_ref(ref, readonly);
+	return gnttab_interface->end_foreign_access_ref(ref);
 }
 
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly)
+int gnttab_end_foreign_access_ref(grant_ref_t ref)
 {
-	if (_gnttab_end_foreign_access_ref(ref, readonly))
+	if (_gnttab_end_foreign_access_ref(ref))
 		return 1;
 	pr_warn("WARNING: g.e. %#x still in use!\n", ref);
 	return 0;
@@ -336,7 +335,6 @@ static unsigned long gnttab_read_frame_v2(grant_ref_t ref)
 struct deferred_entry {
 	struct list_head list;
 	grant_ref_t ref;
-	bool ro;
 	uint16_t warn_delay;
 	struct page *page;
 };
@@ -360,7 +358,7 @@ static void gnttab_handle_deferred(struct timer_list *unused)
 			break;
 		list_del(&entry->list);
 		spin_unlock_irqrestore(&gnttab_list_lock, flags);
-		if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) {
+		if (_gnttab_end_foreign_access_ref(entry->ref)) {
 			put_free_entry(entry->ref);
 			pr_debug("freeing g.e. %#x (pfn %#lx)\n",
 				 entry->ref, page_to_pfn(entry->page));
@@ -386,8 +384,7 @@ static void gnttab_handle_deferred(struct timer_list *unused)
 	spin_unlock_irqrestore(&gnttab_list_lock, flags);
 }
 
-static void gnttab_add_deferred(grant_ref_t ref, bool readonly,
-				struct page *page)
+static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
 {
 	struct deferred_entry *entry;
 	gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
@@ -405,7 +402,6 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly,
 		unsigned long flags;
 
 		entry->ref = ref;
-		entry->ro = readonly;
 		entry->page = page;
 		entry->warn_delay = 60;
 		spin_lock_irqsave(&gnttab_list_lock, flags);
@@ -423,7 +419,7 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly,
 
 int gnttab_try_end_foreign_access(grant_ref_t ref)
 {
-	int ret = _gnttab_end_foreign_access_ref(ref, 0);
+	int ret = _gnttab_end_foreign_access_ref(ref);
 
 	if (ret)
 		put_free_entry(ref);
@@ -432,15 +428,13 @@ int gnttab_try_end_foreign_access(grant_ref_t ref)
 }
 EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access);
 
-void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
-			       unsigned long page)
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page)
 {
 	if (gnttab_try_end_foreign_access(ref)) {
 		if (page != 0)
 			put_page(virt_to_page(page));
 	} else
-		gnttab_add_deferred(ref, readonly,
-				    page ? virt_to_page(page) : NULL);
+		gnttab_add_deferred(ref, page ? virt_to_page(page) : NULL);
 }
 EXPORT_SYMBOL_GPL(gnttab_end_foreign_access);
 
diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c
index 0ca351f30a6d..e254ed19488f 100644
--- a/drivers/xen/pvcalls-front.c
+++ b/drivers/xen/pvcalls-front.c
@@ -238,8 +238,8 @@ static void pvcalls_front_free_map(struct pvcalls_bedata *bedata,
 	spin_unlock(&bedata->socket_lock);
 
 	for (i = 0; i < (1 << PVCALLS_RING_ORDER); i++)
-		gnttab_end_foreign_access(map->active.ring->ref[i], 0, 0);
-	gnttab_end_foreign_access(map->active.ref, 0, 0);
+		gnttab_end_foreign_access(map->active.ring->ref[i], 0);
+	gnttab_end_foreign_access(map->active.ref, 0);
 	free_page((unsigned long)map->active.ring);
 
 	kfree(map);
@@ -1117,7 +1117,7 @@ static int pvcalls_front_remove(struct xenbus_device *dev)
 		}
 	}
 	if (bedata->ref != -1)
-		gnttab_end_foreign_access(bedata->ref, 0, 0);
+		gnttab_end_foreign_access(bedata->ref, 0);
 	kfree(bedata->ring.sring);
 	kfree(bedata);
 	xenbus_switch_state(dev, XenbusStateClosed);
diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c
index 81b6e13fa5ec..a959dee21134 100644
--- a/drivers/xen/xen-front-pgdir-shbuf.c
+++ b/drivers/xen/xen-front-pgdir-shbuf.c
@@ -143,8 +143,7 @@ void xen_front_pgdir_shbuf_free(struct xen_front_pgdir_shbuf *buf)
 
 		for (i = 0; i < buf->num_grefs; i++)
 			if (buf->grefs[i] != GRANT_INVALID_REF)
-				gnttab_end_foreign_access(buf->grefs[i],
-							  0, 0UL);
+				gnttab_end_foreign_access(buf->grefs[i], 0UL);
 	}
 	kfree(buf->grefs);
 	kfree(buf->directory);
diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h
index 9f9b1a297f0d..dfd5bf31cfb9 100644
--- a/include/xen/grant_table.h
+++ b/include/xen/grant_table.h
@@ -97,7 +97,7 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame,
  * longer in use.  Return 1 if the grant entry was freed, 0 if it is still in
  * use.
  */
-int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
+int gnttab_end_foreign_access_ref(grant_ref_t ref);
 
 /*
  * Eventually end access through the given grant reference, and once that
@@ -114,8 +114,7 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly);
  * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing
  * via free_pages_exact()) in order to avoid high order pages.
  */
-void gnttab_end_foreign_access(grant_ref_t ref, int readonly,
-			       unsigned long page);
+void gnttab_end_foreign_access(grant_ref_t ref, unsigned long page);
 
 /*
  * End access through the given grant reference, iff the grant entry is
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index 01f8067994d6..77883b6788cd 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -279,13 +279,13 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
 				grant_ref_t ref;
 
 				ref = priv->rings[i].intf->ref[j];
-				gnttab_end_foreign_access(ref, 0, 0);
+				gnttab_end_foreign_access(ref, 0);
 			}
 			free_pages_exact(priv->rings[i].data.in,
 				   1UL << (priv->rings[i].intf->ring_order +
 					   XEN_PAGE_SHIFT));
 		}
-		gnttab_end_foreign_access(priv->rings[i].ref, 0, 0);
+		gnttab_end_foreign_access(priv->rings[i].ref, 0);
 		free_page((unsigned long)priv->rings[i].intf);
 	}
 	kfree(priv->rings);
@@ -353,10 +353,10 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
 out:
 	if (bytes) {
 		for (i--; i >= 0; i--)
-			gnttab_end_foreign_access(ring->intf->ref[i], 0, 0);
+			gnttab_end_foreign_access(ring->intf->ref[i], 0);
 		free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT));
 	}
-	gnttab_end_foreign_access(ring->ref, 0, 0);
+	gnttab_end_foreign_access(ring->ref, 0);
 	free_page((unsigned long)ring->intf);
 	return ret;
 }
diff --git a/sound/xen/xen_snd_front_evtchnl.c b/sound/xen/xen_snd_front_evtchnl.c
index 29e0f0ea67eb..ecbc294fc59a 100644
--- a/sound/xen/xen_snd_front_evtchnl.c
+++ b/sound/xen/xen_snd_front_evtchnl.c
@@ -168,7 +168,7 @@ static void evtchnl_free(struct xen_snd_front_info *front_info,
 
 	/* End access and free the page. */
 	if (channel->gref != GRANT_INVALID_REF)
-		gnttab_end_foreign_access(channel->gref, 0, page);
+		gnttab_end_foreign_access(channel->gref, page);
 	else
 		free_page(page);
 

From 309b517276f21dc7e6315c6637792f8bbfdf7ec4 Mon Sep 17 00:00:00 2001
From: jianchunfu <jianchunfu@cmss.chinamobile.com>
Date: Mon, 14 Mar 2022 15:05:14 +0800
Subject: [PATCH 137/229] arch:x86:xen: Remove unnecessary assignment in
 xen_apic_read()

In the function xen_apic_read(), the initialized value of 'ret' is unused
because it will be assigned by the function HYPERVISOR_platform_op(),
thus remove it.

Signed-off-by: jianchunfu <jianchunfu@cmss.chinamobile.com>
Link: https://lore.kernel.org/r/20220314070514.2602-1-jianchunfu@cmss.chinamobile.com
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 0d46cc283cf5..62d34b6611c5 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -51,7 +51,7 @@ static u32 xen_apic_read(u32 reg)
 		.interface_version = XENPF_INTERFACE_VERSION,
 		.u.pcpu_info.xen_cpuid = 0,
 	};
-	int ret = 0;
+	int ret;
 
 	/* Shouldn't need this as APIC is turned off for PV, and we only
 	 * get called on the bootup processor. But just in case. */

From 2957308343fa7c621df9f342fab88cb970b8d5f3 Mon Sep 17 00:00:00 2001
From: Chengming Zhou <zhouchengming@bytedance.com>
Date: Sat, 12 Mar 2022 23:22:20 +0800
Subject: [PATCH 138/229] livepatch: Don't block removal of patches that are
 safe to unload

module_put() is not called for a patch with "forced" flag. It should
block the removal of the livepatch module when the code might still
be in use after forced transition.

klp_force_transition() currently sets "forced" flag for all patches on
the list.

In fact, any patch can be safely unloaded when it passed through
the consistency model in KLP_UNPATCHED transition.

In other words, the "forced" flag must be set only for livepatches
that are being removed. In particular, set the "forced" flag:

  + only for klp_transition_patch when the transition to KLP_UNPATCHED
    state was forced.

  + all replaced patches when the transition to KLP_PATCHED state was
    forced and the patch was replacing the existing patches.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
[mbenes@suse.cz: wording improvements]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220312152220.88127-1-zhouchengming@bytedance.com
---
 kernel/livepatch/transition.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..77ef45a1e0a3 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -641,6 +641,13 @@ void klp_force_transition(void)
 	for_each_possible_cpu(cpu)
 		klp_update_patch_state(idle_task(cpu));
 
-	klp_for_each_patch(patch)
-		patch->forced = true;
+	/* Set forced flag for patches being removed. */
+	if (klp_target_state == KLP_UNPATCHED)
+		klp_transition_patch->forced = true;
+	else if (klp_transition_patch->replace) {
+		klp_for_each_patch(patch) {
+			if (patch != klp_transition_patch)
+				patch->forced = true;
+		}
+	}
 }

From 336d4b814bf078fa698488632c19beca47308896 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:15:32 -0600
Subject: [PATCH 139/229] ptrace: Move setting/clearing ptrace_message into
 ptrace_stop

Today ptrace_message is easy to overlook as it not a core part of
ptrace_stop.  It has been overlooked so much that there are places
that set ptrace_message and don't clear it, and places that never set
it.  So if you get an unlucky sequence of events the ptracer may be
able to read a ptrace_message that does not apply to the current
ptrace stop.

Move setting of ptrace_message into ptrace_stop so that it always gets
set before the stop, and always gets cleared after the stop.  This
prevents non-sense from being reported to userspace and makes
ptrace_message more visible in the ptrace helper functions so that
kernel developers can see it.

Link: https://lkml.kernel.org/r/87bky67qfv.fsf_-_@email.froward.int.ebiederm.org
Acked-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ptrace.h      |  9 +++------
 include/uapi/linux/ptrace.h |  2 +-
 kernel/signal.c             | 21 ++++++++++++---------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 5310f43e4762..3e6b46e2b7be 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -60,7 +60,7 @@ extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_request(struct task_struct *child, long request,
 			  unsigned long addr, unsigned long data);
-extern void ptrace_notify(int exit_code);
+extern void ptrace_notify(int exit_code, unsigned long message);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent,
 			  const struct cred *ptracer_cred);
@@ -155,8 +155,7 @@ static inline bool ptrace_event_enabled(struct task_struct *task, int event)
 static inline void ptrace_event(int event, unsigned long message)
 {
 	if (unlikely(ptrace_event_enabled(current, event))) {
-		current->ptrace_message = message;
-		ptrace_notify((event << 8) | SIGTRAP);
+		ptrace_notify((event << 8) | SIGTRAP, message);
 	} else if (event == PTRACE_EVENT_EXEC) {
 		/* legacy EXEC report via SIGTRAP */
 		if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
@@ -424,8 +423,7 @@ static inline int ptrace_report_syscall(unsigned long message)
 	if (!(ptrace & PT_PTRACED))
 		return 0;
 
-	current->ptrace_message = message;
-	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
+	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0), message);
 
 	/*
 	 * this isn't the same as continuing with a signal, but it will do
@@ -437,7 +435,6 @@ static inline int ptrace_report_syscall(unsigned long message)
 		current->exit_code = 0;
 	}
 
-	current->ptrace_message = 0;
 	return fatal_signal_pending(current);
 }
 
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index b7af92e07d1f..195ae64a8c87 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -114,7 +114,7 @@ struct ptrace_rseq_configuration {
 
 /*
  * These values are stored in task->ptrace_message
- * by ptrace_report_syscall_* to describe the current syscall-stop.
+ * by ptrace_stop to describe the current syscall-stop.
  */
 #define PTRACE_EVENTMSG_SYSCALL_ENTRY	1
 #define PTRACE_EVENTMSG_SYSCALL_EXIT	2
diff --git a/kernel/signal.c b/kernel/signal.c
index c2dee5420567..a49ac7149256 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2191,7 +2191,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
  * If we actually decide not to stop at all because the tracer
  * is gone, we keep current->exit_code unless clear_code.
  */
-static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
+static void ptrace_stop(int exit_code, int why, int clear_code,
+			unsigned long message, kernel_siginfo_t *info)
 	__releases(&current->sighand->siglock)
 	__acquires(&current->sighand->siglock)
 {
@@ -2237,6 +2238,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	 */
 	smp_wmb();
 
+	current->ptrace_message = message;
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
 
@@ -2315,6 +2317,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	 */
 	spin_lock_irq(&current->sighand->siglock);
 	current->last_siginfo = NULL;
+	current->ptrace_message = 0;
 
 	/* LISTENING can be set only during STOP traps, clear it */
 	current->jobctl &= ~JOBCTL_LISTENING;
@@ -2327,7 +2330,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
 	recalc_sigpending_tsk(current);
 }
 
-static void ptrace_do_notify(int signr, int exit_code, int why)
+static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
 {
 	kernel_siginfo_t info;
 
@@ -2338,17 +2341,17 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
 	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
-	ptrace_stop(exit_code, why, 1, &info);
+	ptrace_stop(exit_code, why, 1, message, &info);
 }
 
-void ptrace_notify(int exit_code)
+void ptrace_notify(int exit_code, unsigned long message)
 {
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
 	spin_unlock_irq(&current->sighand->siglock);
 }
 
@@ -2504,10 +2507,10 @@ static void do_jobctl_trap(void)
 			signr = SIGTRAP;
 		WARN_ON_ONCE(!signr);
 		ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
-				 CLD_STOPPED);
+				 CLD_STOPPED, 0);
 	} else {
 		WARN_ON_ONCE(!signr);
-		ptrace_stop(signr, CLD_STOPPED, 0, NULL);
+		ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
 		current->exit_code = 0;
 	}
 }
@@ -2561,7 +2564,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	 * comment in dequeue_signal().
 	 */
 	current->jobctl |= JOBCTL_STOP_DEQUEUED;
-	ptrace_stop(signr, CLD_TRAPPED, 0, info);
+	ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
 	signr = current->exit_code;
@@ -2891,7 +2894,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
 	if (current->sas_ss_flags & SS_AUTODISARM)
 		sas_ss_reset(current);
 	if (stepping)
-		ptrace_notify(SIGTRAP);
+		ptrace_notify(SIGTRAP, 0);
 }
 
 void signal_setup_done(int failed, struct ksignal *ksig, int stepping)

From 6487d1dab837214ec2fd3f0ddd5f787e63be7c20 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Jan 2022 12:19:13 -0600
Subject: [PATCH 140/229] ptrace: Return the signal to continue with from
 ptrace_stop

The signal a task should continue with after a ptrace stop is
inconsistently read, cleared, and sent.  Solve this by reading and
clearing the signal to be sent in ptrace_stop.

In an ideal world everything except ptrace_signal would share a common
implementation of continuing with the signal, so ptracers could count
on the signal they ask to continue with actually being delivered.  For
now retain bug compatibility and just return with the signal number
the ptracer requested the code continue with.

Link: https://lkml.kernel.org/r/875yoe7qdp.fsf_-_@email.froward.int.ebiederm.org
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 include/linux/ptrace.h | 12 ++++++------
 kernel/signal.c        | 32 +++++++++++++++++++-------------
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 3e6b46e2b7be..15b3d176b6b4 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -60,7 +60,7 @@ extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_request(struct task_struct *child, long request,
 			  unsigned long addr, unsigned long data);
-extern void ptrace_notify(int exit_code, unsigned long message);
+extern int ptrace_notify(int exit_code, unsigned long message);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent,
 			  const struct cred *ptracer_cred);
@@ -419,21 +419,21 @@ extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oa
 static inline int ptrace_report_syscall(unsigned long message)
 {
 	int ptrace = current->ptrace;
+	int signr;
 
 	if (!(ptrace & PT_PTRACED))
 		return 0;
 
-	ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0), message);
+	signr = ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0),
+			      message);
 
 	/*
 	 * this isn't the same as continuing with a signal, but it will do
 	 * for normal use.  strace only continues with a signal if the
 	 * stopping signal is not SIGTRAP.  -brl
 	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+	if (signr)
+		send_sig(signr, current, 1);
 
 	return fatal_signal_pending(current);
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index a49ac7149256..f1c4ab85833c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2188,15 +2188,17 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
  * That makes it a way to test a stopped process for
  * being ptrace-stopped vs being job-control-stopped.
  *
- * If we actually decide not to stop at all because the tracer
- * is gone, we keep current->exit_code unless clear_code.
+ * Returns the signal the ptracer requested the code resume
+ * with.  If the code did not stop because the tracer is gone,
+ * the stop signal remains unchanged unless clear_code.
  */
-static void ptrace_stop(int exit_code, int why, int clear_code,
+static int ptrace_stop(int exit_code, int why, int clear_code,
 			unsigned long message, kernel_siginfo_t *info)
 	__releases(&current->sighand->siglock)
 	__acquires(&current->sighand->siglock)
 {
 	bool gstop_done = false;
+	bool read_code = true;
 
 	if (arch_ptrace_stop_needed()) {
 		/*
@@ -2305,8 +2307,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 
 		/* tasklist protects us from ptrace_freeze_traced() */
 		__set_current_state(TASK_RUNNING);
+		read_code = false;
 		if (clear_code)
-			current->exit_code = 0;
+			exit_code = 0;
 		read_unlock(&tasklist_lock);
 	}
 
@@ -2316,8 +2319,11 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 	 * any signal-sending on another CPU that wants to examine it.
 	 */
 	spin_lock_irq(&current->sighand->siglock);
+	if (read_code)
+		exit_code = current->exit_code;
 	current->last_siginfo = NULL;
 	current->ptrace_message = 0;
+	current->exit_code = 0;
 
 	/* LISTENING can be set only during STOP traps, clear it */
 	current->jobctl &= ~JOBCTL_LISTENING;
@@ -2328,9 +2334,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code,
 	 * This sets TIF_SIGPENDING, but never clears it.
 	 */
 	recalc_sigpending_tsk(current);
+	return exit_code;
 }
 
-static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
+static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
 {
 	kernel_siginfo_t info;
 
@@ -2341,18 +2348,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why, unsigned long me
 	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
 
 	/* Let the debugger run.  */
-	ptrace_stop(exit_code, why, 1, message, &info);
+	return ptrace_stop(exit_code, why, 1, message, &info);
 }
 
-void ptrace_notify(int exit_code, unsigned long message)
+int ptrace_notify(int exit_code, unsigned long message)
 {
+	int signr;
+
 	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 
 	spin_lock_irq(&current->sighand->siglock);
-	ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
+	signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
 	spin_unlock_irq(&current->sighand->siglock);
+	return signr;
 }
 
 /**
@@ -2511,7 +2521,6 @@ static void do_jobctl_trap(void)
 	} else {
 		WARN_ON_ONCE(!signr);
 		ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
-		current->exit_code = 0;
 	}
 }
 
@@ -2564,15 +2573,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
 	 * comment in dequeue_signal().
 	 */
 	current->jobctl |= JOBCTL_STOP_DEQUEUED;
-	ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
+	signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
 
 	/* We're back.  Did the debugger cancel the sig?  */
-	signr = current->exit_code;
 	if (signr == 0)
 		return signr;
 
-	current->exit_code = 0;
-
 	/*
 	 * Update the siginfo structure if the signal has
 	 * changed.  If the debugger wanted something

From a43bf604446414103b7535f38e739b65601c4fb2 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Wed, 16 Mar 2022 18:24:26 -0400
Subject: [PATCH 141/229] NFSv4.1 provide mount option to toggle trunking
 discovery

Introduce a new mount option -- trunkdiscovery,notrunkdiscovery -- to
toggle whether or not the client will engage in actively discovery
of trunking locations.

v2 make notrunkdiscovery default

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: 1976b2b31462 ("NFSv4.1 query for fs_location attr on a new file system")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c           | 3 ++-
 fs/nfs/fs_context.c       | 8 ++++++++
 include/linux/nfs_fs_sb.h | 1 +
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d1f34229e11a..e828504cc396 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -857,7 +857,8 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 	}
 
 	if (clp->rpc_ops->discover_trunking != NULL &&
-			(server->caps & NFS_CAP_FS_LOCATIONS)) {
+			(server->caps & NFS_CAP_FS_LOCATIONS &&
+			 (server->flags & NFS_MOUNT_TRUNK_DISCOVERY))) {
 		error = clp->rpc_ops->discover_trunking(server, mntfh);
 		if (error < 0)
 			return error;
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index ea17fa1f31ec..e2d59bb5e6bb 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -80,6 +80,7 @@ enum nfs_param {
 	Opt_source,
 	Opt_tcp,
 	Opt_timeo,
+	Opt_trunkdiscovery,
 	Opt_udp,
 	Opt_v,
 	Opt_vers,
@@ -180,6 +181,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
 	fsparam_string("source",	Opt_source),
 	fsparam_flag  ("tcp",		Opt_tcp),
 	fsparam_u32   ("timeo",		Opt_timeo),
+	fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery),
 	fsparam_flag  ("udp",		Opt_udp),
 	fsparam_flag  ("v2",		Opt_v),
 	fsparam_flag  ("v3",		Opt_v),
@@ -529,6 +531,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 		else
 			ctx->flags &= ~NFS_MOUNT_NOCTO;
 		break;
+	case Opt_trunkdiscovery:
+		if (result.negated)
+			ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY;
+		else
+			ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY;
+		break;
 	case Opt_ac:
 		if (result.negated)
 			ctx->flags |= NFS_MOUNT_NOAC;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index ca0959e51e81..b0e3fd550122 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -151,6 +151,7 @@ struct nfs_server {
 #define NFS_MOUNT_SOFTREVAL		0x800000
 #define NFS_MOUNT_WRITE_EAGER		0x01000000
 #define NFS_MOUNT_WRITE_WAIT		0x02000000
+#define NFS_MOUNT_TRUNK_DISCOVERY	0x04000000
 
 	unsigned int		fattr_valid;	/* Valid attributes */
 	unsigned int		caps;		/* server capabilities */

From 648a4548d622c4ae965058db1a6b5b95c062789a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 22:27:13 -0400
Subject: [PATCH 142/229] NFS: Don't deadlock when cookie hashes collide

In the very rare case where the readdir reply contains multiple cookies
that map to the same hash value, we can end up deadlocking waiting for a
page lock that we already hold. In this case we should fail the page
lock by using grab_cache_page_nowait().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7e12102b29e7..17986c0019d4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -381,23 +381,28 @@ static void nfs_readdir_page_unlock_and_put(struct page *page)
 	put_page(page);
 }
 
-static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
-						u64 last_cookie,
-						u64 change_attr)
+static void nfs_readdir_page_init_and_validate(struct page *page, u64 cookie,
+					       u64 change_attr)
 {
-	pgoff_t index = nfs_readdir_page_cookie_hash(last_cookie);
+	if (PageUptodate(page)) {
+		if (nfs_readdir_page_validate(page, cookie, change_attr))
+			return;
+		nfs_readdir_clear_array(page);
+	}
+	nfs_readdir_page_init_array(page, cookie, change_attr);
+	SetPageUptodate(page);
+}
+
+static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
+						u64 cookie, u64 change_attr)
+{
+	pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
 	struct page *page;
 
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return NULL;
-	if (PageUptodate(page)) {
-		if (nfs_readdir_page_validate(page, last_cookie, change_attr))
-			return page;
-		nfs_readdir_clear_array(page);
-	}
-	nfs_readdir_page_init_array(page, last_cookie, change_attr);
-	SetPageUptodate(page);
+	nfs_readdir_page_init_and_validate(page, cookie, change_attr);
 	return page;
 }
 
@@ -435,11 +440,13 @@ static void nfs_readdir_page_set_eof(struct page *page)
 static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
 					      u64 cookie, u64 change_attr)
 {
+	pgoff_t index = nfs_readdir_page_cookie_hash(cookie);
 	struct page *page;
 
-	page = nfs_readdir_page_get_locked(mapping, cookie, change_attr);
+	page = grab_cache_page_nowait(mapping, index);
 	if (!page)
 		return NULL;
+	nfs_readdir_page_init_and_validate(page, cookie, change_attr);
 	if (nfs_readdir_page_last_cookie(page) != cookie)
 		nfs_readdir_page_reinit_array(page, cookie, change_attr);
 	return page;

From 2cc7cc01c15f57d056318c33705647f87dcd4aab Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Sat, 19 Mar 2022 22:30:00 +0300
Subject: [PATCH 143/229] jfs: fix divide error in dbNextAG

Syzbot reported divide error in dbNextAG(). The problem was in missing
validation check for malicious image.

Syzbot crafted an image with bmp->db_numag equal to 0. There wasn't any
validation checks, but dbNextAG() blindly use bmp->db_numag in divide
expression

Fix it by validating bmp->db_numag in dbMount() and return an error if
image is malicious

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-and-tested-by: syzbot+46f5c25af73eb8330eb6@syzkaller.appspotmail.com
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
---
 fs/jfs/jfs_dmap.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 91f4ec93dab1..d8502f4989d9 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -148,6 +148,7 @@ static const s8 budtab[256] = {
  *	0	- success
  *	-ENOMEM	- insufficient memory
  *	-EIO	- i/o error
+ *	-EINVAL - wrong bmap data
  */
 int dbMount(struct inode *ipbmap)
 {
@@ -179,6 +180,12 @@ int dbMount(struct inode *ipbmap)
 	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
 	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
 	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+	if (!bmp->db_numag) {
+		release_metapage(mp);
+		kfree(bmp);
+		return -EINVAL;
+	}
+
 	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
 	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
 	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);

From a53046291020ec41e09181396c1e829287b48d47 Mon Sep 17 00:00:00 2001
From: Haimin Zhang <tcs_kernel@tencent.com>
Date: Tue, 22 Mar 2022 21:59:17 +0800
Subject: [PATCH 144/229] jfs: prevent NULL deref in diFree

Add validation check for JFS_IP(ipimap)->i_imap to prevent a NULL deref
in diFree since diFree uses it without do any validations.
When function jfs_mount calls diMount to initialize fileset inode
allocation map, it can fail and JFS_IP(ipimap)->i_imap won't be
initialized. Then it calls diFreeSpecial to close fileset inode allocation
map inode and it will flow into jfs_evict_inode. Function jfs_evict_inode
just validates JFS_SBI(inode->i_sb)->ipimap, then calls diFree. diFree use
JFS_IP(ipimap)->i_imap directly, then it will cause a NULL deref.

Reported-by: TCS Robot <tcs_robot@tencent.com>
Signed-off-by: Haimin Zhang <tcs_kernel@tencent.com>
Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
---
 fs/jfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 57ab424c05ff..072821b50ab9 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,13 @@ void jfs_evict_inode(struct inode *inode)
 		dquot_initialize(inode);
 
 		if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+			struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
 			truncate_inode_pages_final(&inode->i_data);
 
 			if (test_cflag(COMMIT_Freewmap, inode))
 				jfs_free_zero_link(inode);
 
-			if (JFS_SBI(inode->i_sb)->ipimap)
+			if (ipimap && JFS_IP(ipimap)->i_imap)
 				diFree(inode);
 
 			/*

From ee1fee900537b5d9560e9f937402de5ddc8412f3 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Sat, 19 Mar 2022 02:08:37 +0100
Subject: [PATCH 145/229] ptrace: Check PTRACE_O_SUSPEND_SECCOMP permission on
 PTRACE_SEIZE

Setting PTRACE_O_SUSPEND_SECCOMP is supposed to be a highly privileged
operation because it allows the tracee to completely bypass all seccomp
filters on kernels with CONFIG_CHECKPOINT_RESTORE=y. It is only supposed to
be settable by a process with global CAP_SYS_ADMIN, and only if that
process is not subject to any seccomp filters at all.

However, while these permission checks were done on the PTRACE_SETOPTIONS
path, they were missing on the PTRACE_SEIZE path, which also sets
user-specified ptrace flags.

Move the permissions checks out into a helper function and let both
ptrace_attach() and ptrace_setoptions() call it.

Cc: stable@kernel.org
Fixes: 13c4a90119d2 ("seccomp: add ptrace options for suspend/resume")
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://lkml.kernel.org/r/20220319010838.1386861-1-jannh@google.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..ccc4b465775b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -371,6 +371,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
+static int check_ptrace_options(unsigned long data)
+{
+	if (data & ~(unsigned long)PTRACE_O_MASK)
+		return -EINVAL;
+
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+		    !IS_ENABLED(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+	return 0;
+}
+
 static int ptrace_attach(struct task_struct *task, long request,
 			 unsigned long addr,
 			 unsigned long flags)
@@ -382,8 +402,16 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (seize) {
 		if (addr != 0)
 			goto out;
+		/*
+		 * This duplicates the check in check_ptrace_options() because
+		 * ptrace_attach() and ptrace_setoptions() have historically
+		 * used different error codes for unknown ptrace options.
+		 */
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
 			goto out;
+		retval = check_ptrace_options(flags);
+		if (retval)
+			return retval;
 		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
 	} else {
 		flags = PT_PTRACED;
@@ -654,22 +682,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
 static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 {
 	unsigned flags;
+	int ret;
 
-	if (data & ~(unsigned long)PTRACE_O_MASK)
-		return -EINVAL;
-
-	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
-		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
-		    !IS_ENABLED(CONFIG_SECCOMP))
-			return -EINVAL;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
-		    current->ptrace & PT_SUSPEND_SECCOMP)
-			return -EPERM;
-	}
+	ret = check_ptrace_options(data);
+	if (ret)
+		return ret;
 
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;

From e47a62df29a0714cc2d5129516a3618337c84554 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 22 Mar 2022 09:11:44 -0400
Subject: [PATCH 146/229] NFS: Fix revalidation of empty readdir pages

If the page is empty, we need to check the array->last_cookie instead of
the first entry. Add a helper for the cases where we care.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 17986c0019d4..bac4cf1a308e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -252,6 +252,11 @@ static void nfs_readdir_page_array_free(struct page *page)
 	}
 }
 
+static u64 nfs_readdir_array_index_cookie(struct nfs_cache_array *array)
+{
+	return array->size == 0 ? array->last_cookie : array->array[0].cookie;
+}
+
 static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
 {
 	array->page_is_eof = 1;
@@ -369,7 +374,7 @@ static bool nfs_readdir_page_validate(struct page *page, u64 last_cookie,
 
 	if (array->change_attr != change_attr)
 		ret = false;
-	if (array->size > 0 && array->array[0].cookie != last_cookie)
+	if (nfs_readdir_array_index_cookie(array) != last_cookie)
 		ret = false;
 	kunmap_atomic(array);
 	return ret;
@@ -480,7 +485,7 @@ static void nfs_readdir_seek_next_array(struct nfs_cache_array *array,
 		desc->cache_entry_index = 0;
 		desc->page_index++;
 	} else
-		desc->last_cookie = array->array[0].cookie;
+		desc->last_cookie = nfs_readdir_array_index_cookie(array);
 }
 
 static void nfs_readdir_rewind_search(struct nfs_readdir_descriptor *desc)

From 89f42494f92f448747bd8a7ab1ae8b5d5520577d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 16 Mar 2022 19:10:43 -0400
Subject: [PATCH 147/229] SUNRPC: Don't call connect() more than once on a TCP
 socket

Avoid socket state races due to repeated calls to ->connect() using the
same socket. If connect() returns 0 due to the connection having
completed, but we are in fact in a closing state, then we may leave the
XPRT_CONNECTING flag set on the transport.

Reported-by: Enrico Scholz <enrico.scholz@sigma-chemnitz.de>
Fixes: 3be232f11a3c ("SUNRPC: Prevent immediate close+reconnect")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h |  1 +
 net/sunrpc/xprtsock.c           | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 8c2a712cb242..689062afdd61 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -89,5 +89,6 @@ struct sock_xprt {
 #define XPRT_SOCK_WAKE_WRITE	(5)
 #define XPRT_SOCK_WAKE_PENDING	(6)
 #define XPRT_SOCK_WAKE_DISCONNECT	(7)
+#define XPRT_SOCK_CONNECT_SENT	(8)
 
 #endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7e39f87cde2d..8f8a03c3315a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2235,10 +2235,15 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 
 	if (atomic_read(&xprt->swapper))
 		current->flags |= PF_MEMALLOC;
-	if (!sock) {
-		sock = xs_create_sock(xprt, transport,
-				xs_addr(xprt)->sa_family, SOCK_STREAM,
-				IPPROTO_TCP, true);
+
+	if (xprt_connected(xprt))
+		goto out;
+	if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT,
+			       &transport->sock_state) ||
+	    !sock) {
+		xs_reset_transport(transport);
+		sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family,
+				      SOCK_STREAM, IPPROTO_TCP, true);
 		if (IS_ERR(sock)) {
 			xprt_wake_pending_tasks(xprt, PTR_ERR(sock));
 			goto out;
@@ -2262,6 +2267,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 		fallthrough;
 	case -EINPROGRESS:
 		/* SYN_SENT! */
+		set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state);
 		if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
 			xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
 		fallthrough;
@@ -2323,13 +2329,9 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 
 	WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
 
-	if (transport->sock != NULL && !xprt_connecting(xprt)) {
+	if (transport->sock != NULL) {
 		dprintk("RPC:       xs_connect delayed xprt %p for %lu "
-				"seconds\n",
-				xprt, xprt->reestablish_timeout / HZ);
-
-		/* Start by resetting any existing state */
-		xs_reset_transport(transport);
+			"seconds\n", xprt, xprt->reestablish_timeout / HZ);
 
 		delay = xprt_reconnect_delay(xprt);
 		xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);

From 3b21f757c309c84a23a26d8cab20b743e0719705 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 16 Mar 2022 19:18:25 -0400
Subject: [PATCH 148/229] SUNRPC: Only save the TCP source port after the
 connection is complete

Since the RPC client uses a non-blocking connect(), we do not expect to
see it return '0' under normal circumstances.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8f8a03c3315a..d2bf3b49dbf4 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -58,6 +58,7 @@
 #include "sunrpc.h"
 
 static void xs_close(struct rpc_xprt *xprt);
+static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock);
 static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
 		struct socket *sock);
 
@@ -1025,6 +1026,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
 		xs_tcp_set_socket_timeouts(xprt, transport->sock);
 
+	xs_set_srcport(transport, transport->sock);
+
 	/* Continue transmitting the packet/record. We must be careful
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
@@ -2263,8 +2266,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)
 			sock->sk->sk_state);
 	switch (status) {
 	case 0:
-		xs_set_srcport(transport, sock);
-		fallthrough;
 	case -EINPROGRESS:
 		/* SYN_SENT! */
 		set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state);

From 7496b59f588dd52886fdbac7633608097543a0a5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 14 Mar 2022 21:02:10 -0400
Subject: [PATCH 149/229] SUNRPC: Fix socket waits for write buffer space

The socket layer requires that we use the socket lock to protect changes
to the sock->sk_write_pending field and others.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 60 ++++++++++++++++++++++++++++++-------------
 1 file changed, 42 insertions(+), 18 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d2bf3b49dbf4..68eee352d69a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -764,12 +764,12 @@ xs_stream_start_connect(struct sock_xprt *transport)
 /**
  * xs_nospace - handle transmit was incomplete
  * @req: pointer to RPC request
+ * @transport: pointer to struct sock_xprt
  *
  */
-static int xs_nospace(struct rpc_rqst *req)
+static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport)
 {
-	struct rpc_xprt *xprt = req->rq_xprt;
-	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_xprt *xprt = &transport->xprt;
 	struct sock *sk = transport->inet;
 	int ret = -EAGAIN;
 
@@ -780,16 +780,6 @@ static int xs_nospace(struct rpc_rqst *req)
 
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
-		/* wait for more buffer space */
-		sk->sk_write_pending++;
-		xprt_wait_for_buffer_space(xprt);
-	} else
-		ret = -ENOTCONN;
-
-	spin_unlock(&xprt->transport_lock);
-
-	/* Race breaker in case memory is freed before above code is called */
-	if (ret == -EAGAIN) {
 		struct socket_wq *wq;
 
 		rcu_read_lock();
@@ -797,8 +787,42 @@ static int xs_nospace(struct rpc_rqst *req)
 		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
 		rcu_read_unlock();
 
-		sk->sk_write_space(sk);
-	}
+		/* wait for more buffer space */
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_pending++;
+		xprt_wait_for_buffer_space(xprt);
+	} else
+		ret = -ENOTCONN;
+
+	spin_unlock(&xprt->transport_lock);
+	return ret;
+}
+
+static int xs_sock_nospace(struct rpc_rqst *req)
+{
+	struct sock_xprt *transport =
+		container_of(req->rq_xprt, struct sock_xprt, xprt);
+	struct sock *sk = transport->inet;
+	int ret = -EAGAIN;
+
+	lock_sock(sk);
+	if (!sock_writeable(sk))
+		ret = xs_nospace(req, transport);
+	release_sock(sk);
+	return ret;
+}
+
+static int xs_stream_nospace(struct rpc_rqst *req)
+{
+	struct sock_xprt *transport =
+		container_of(req->rq_xprt, struct sock_xprt, xprt);
+	struct sock *sk = transport->inet;
+	int ret = -EAGAIN;
+
+	lock_sock(sk);
+	if (!sk_stream_memory_free(sk))
+		ret = xs_nospace(req, transport);
+	release_sock(sk);
 	return ret;
 }
 
@@ -888,7 +912,7 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	case -ENOBUFS:
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_stream_nospace(req);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
@@ -964,7 +988,7 @@ process_status:
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_sock_nospace(req);
 		break;
 	case -ENETUNREACH:
 	case -ENOBUFS:
@@ -1086,7 +1110,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		status = xs_nospace(req);
+		status = xs_stream_nospace(req);
 		break;
 	case -ECONNRESET:
 	case -ECONNREFUSED:

From 2790a624d43084de590884934969e19c7a82316a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 15 Mar 2022 08:12:40 -0400
Subject: [PATCH 150/229] SUNRPC: Replace internal use of SOCKWQ_ASYNC_NOSPACE

The socket's SOCKWQ_ASYNC_NOSPACE can be cleared by various actors in
the socket layer, so replace it with our own flag in the transport
sock_state field.

Reported-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprtsock.h |  1 +
 net/sunrpc/xprtsock.c           | 22 ++++------------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 689062afdd61..3eb0079669c5 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -90,5 +90,6 @@ struct sock_xprt {
 #define XPRT_SOCK_WAKE_PENDING	(6)
 #define XPRT_SOCK_WAKE_DISCONNECT	(7)
 #define XPRT_SOCK_CONNECT_SENT	(8)
+#define XPRT_SOCK_NOSPACE	(9)
 
 #endif /* _LINUX_SUNRPC_XPRTSOCK_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 68eee352d69a..2450b31b807a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -780,14 +780,8 @@ static int xs_nospace(struct rpc_rqst *req, struct sock_xprt *transport)
 
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
-		struct socket_wq *wq;
-
-		rcu_read_lock();
-		wq = rcu_dereference(sk->sk_wq);
-		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
-		rcu_read_unlock();
-
 		/* wait for more buffer space */
+		set_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk->sk_write_pending++;
 		xprt_wait_for_buffer_space(xprt);
@@ -1151,6 +1145,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
 	clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
 	clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
 	clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+	clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state);
 }
 
 static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
@@ -1497,7 +1492,6 @@ static void xs_tcp_state_change(struct sock *sk)
 
 static void xs_write_space(struct sock *sk)
 {
-	struct socket_wq *wq;
 	struct sock_xprt *transport;
 	struct rpc_xprt *xprt;
 
@@ -1508,15 +1502,10 @@ static void xs_write_space(struct sock *sk)
 	if (unlikely(!(xprt = xprt_from_sock(sk))))
 		return;
 	transport = container_of(xprt, struct sock_xprt, xprt);
-	rcu_read_lock();
-	wq = rcu_dereference(sk->sk_wq);
-	if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
-		goto out;
-
+	if (!test_and_clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state))
+		return;
 	xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
 	sk->sk_write_pending--;
-out:
-	rcu_read_unlock();
 }
 
 /**
@@ -1857,7 +1846,6 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
 		sk->sk_user_data = xprt;
 		sk->sk_data_ready = xs_data_ready;
 		sk->sk_write_space = xs_udp_write_space;
-		sock_set_flag(sk, SOCK_FASYNC);
 		sk->sk_error_report = xs_error_report;
 
 		xprt_clear_connected(xprt);
@@ -2051,7 +2039,6 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_user_data = xprt;
 		sk->sk_data_ready = xs_data_ready;
 		sk->sk_write_space = xs_udp_write_space;
-		sock_set_flag(sk, SOCK_FASYNC);
 
 		xprt_set_connected(xprt);
 
@@ -2218,7 +2205,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		sk->sk_data_ready = xs_data_ready;
 		sk->sk_state_change = xs_tcp_state_change;
 		sk->sk_write_space = xs_tcp_write_space;
-		sock_set_flag(sk, SOCK_FASYNC);
 		sk->sk_error_report = xs_error_report;
 
 		/* socket options */

From d0afde5fc6fb13531e2434fc4b6a65f131671f68 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 14 Mar 2022 23:05:07 -0400
Subject: [PATCH 151/229] SUNRPC: Improve accuracy of socket ENOBUFS
 determination

The current code checks for whether or not the socket is in a writeable
state after we get an EAGAIN. That is racy, since we've dropped the
socket lock, so the amount of free buffer may have changed.

Instead, let's check whether the socket is writeable before we try to
write to it. If that was the case, we do expect the message to be at
least partially sent unless we're in a low memory situation.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 53 +++++++++++++++----------------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2450b31b807a..8909c768fe71 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -806,13 +806,15 @@ static int xs_sock_nospace(struct rpc_rqst *req)
 	return ret;
 }
 
-static int xs_stream_nospace(struct rpc_rqst *req)
+static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait)
 {
 	struct sock_xprt *transport =
 		container_of(req->rq_xprt, struct sock_xprt, xprt);
 	struct sock *sk = transport->inet;
 	int ret = -EAGAIN;
 
+	if (vm_wait)
+		return -ENOBUFS;
 	lock_sock(sk);
 	if (!sk_stream_memory_free(sk))
 		ret = xs_nospace(req, transport);
@@ -870,6 +872,7 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	struct msghdr msg = {
 		.msg_flags	= XS_SENDMSG_FLAGS,
 	};
+	bool vm_wait;
 	unsigned int sent;
 	int status;
 
@@ -882,15 +885,14 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	xs_pktdump("packet data:",
 			req->rq_svec->iov_base, req->rq_svec->iov_len);
 
+	vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
 	req->rq_xtime = ktime_get();
 	status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
 				   transport->xmit.offset, rm, &sent);
 	dprintk("RPC:       %s(%u) = %d\n",
 			__func__, xdr->len - transport->xmit.offset, status);
 
-	if (status == -EAGAIN && sock_writeable(transport->inet))
-		status = -ENOBUFS;
-
 	if (likely(sent > 0) || status == 0) {
 		transport->xmit.offset += sent;
 		req->rq_bytes_sent = transport->xmit.offset;
@@ -900,13 +902,12 @@ static int xs_local_send_request(struct rpc_rqst *req)
 			return 0;
 		}
 		status = -EAGAIN;
+		vm_wait = false;
 	}
 
 	switch (status) {
-	case -ENOBUFS:
-		break;
 	case -EAGAIN:
-		status = xs_stream_nospace(req);
+		status = xs_stream_nospace(req, vm_wait);
 		break;
 	default:
 		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
@@ -1024,7 +1025,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	struct msghdr msg = {
 		.msg_flags	= XS_SENDMSG_FLAGS,
 	};
-	bool vm_wait = false;
+	bool vm_wait;
 	unsigned int sent;
 	int status;
 
@@ -1051,7 +1052,10 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	 * called sendmsg(). */
 	req->rq_xtime = ktime_get();
 	tcp_sock_set_cork(transport->inet, true);
-	while (1) {
+
+	vm_wait = sk_stream_is_writeable(transport->inet) ? true : false;
+
+	do {
 		status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
 					   transport->xmit.offset, rm, &sent);
 
@@ -1072,31 +1076,10 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 
 		WARN_ON_ONCE(sent == 0 && status == 0);
 
-		if (status == -EAGAIN ) {
-			/*
-			 * Return EAGAIN if we're sure we're hitting the
-			 * socket send buffer limits.
-			 */
-			if (test_bit(SOCK_NOSPACE, &transport->sock->flags))
-				break;
-			/*
-			 * Did we hit a memory allocation failure?
-			 */
-			if (sent == 0) {
-				status = -ENOBUFS;
-				if (vm_wait)
-					break;
-				/* Retry, knowing now that we're below the
-				 * socket send buffer limit
-				 */
-				vm_wait = true;
-			}
-			continue;
-		}
-		if (status < 0)
-			break;
-		vm_wait = false;
-	}
+		if (sent > 0)
+			vm_wait = false;
+
+	} while (status == 0);
 
 	switch (status) {
 	case -ENOTSOCK:
@@ -1104,7 +1087,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 		/* Should we call xs_close() here? */
 		break;
 	case -EAGAIN:
-		status = xs_stream_nospace(req);
+		status = xs_stream_nospace(req, vm_wait);
 		break;
 	case -ECONNRESET:
 	case -ECONNREFUSED:

From 33e5c765bc1ea5e06ea7603637f14d727e6fcdf3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 14 Mar 2022 22:02:22 -0400
Subject: [PATCH 152/229] NFS: Fix memory allocation in rpc_malloc()

When in a low memory situation, we do want rpciod to kick off direct
reclaim in the case where that helps, however we don't want it looping
forever in mempool_alloc().
So first try allocating from the slab using GFP_KERNEL | __GFP_NORETRY,
and then fall back to a GFP_NOWAIT allocation from the mempool.

Ditto for rpc_alloc_task()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/sched.h |  1 +
 net/sunrpc/sched.c           | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 56710f8056d3..1d7a3e51b795 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -262,6 +262,7 @@ void		rpc_destroy_mempool(void);
 extern struct workqueue_struct *rpciod_workqueue;
 extern struct workqueue_struct *xprtiod_workqueue;
 void		rpc_prepare_task(struct rpc_task *task);
+gfp_t		rpc_task_gfp_mask(void);
 
 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
 {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 7c8f87ebdbc0..d59a033820be 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -57,6 +57,13 @@ struct workqueue_struct *rpciod_workqueue __read_mostly;
 struct workqueue_struct *xprtiod_workqueue __read_mostly;
 EXPORT_SYMBOL_GPL(xprtiod_workqueue);
 
+gfp_t rpc_task_gfp_mask(void)
+{
+	if (current->flags & PF_WQ_WORKER)
+		return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+	return GFP_KERNEL;
+}
+
 unsigned long
 rpc_task_timeout(const struct rpc_task *task)
 {
@@ -1030,15 +1037,15 @@ int rpc_malloc(struct rpc_task *task)
 	struct rpc_rqst *rqst = task->tk_rqstp;
 	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpc_buffer *buf;
-	gfp_t gfp = GFP_KERNEL;
-
-	if (RPC_IS_ASYNC(task))
-		gfp = GFP_NOWAIT | __GFP_NOWARN;
+	gfp_t gfp = rpc_task_gfp_mask();
 
 	size += sizeof(struct rpc_buffer);
-	if (size <= RPC_BUFFER_MAXSIZE)
-		buf = mempool_alloc(rpc_buffer_mempool, gfp);
-	else
+	if (size <= RPC_BUFFER_MAXSIZE) {
+		buf = kmem_cache_alloc(rpc_buffer_slabp, gfp);
+		/* Reach for the mempool if dynamic allocation fails */
+		if (!buf && RPC_IS_ASYNC(task))
+			buf = mempool_alloc(rpc_buffer_mempool, GFP_NOWAIT);
+	} else
 		buf = kmalloc(size, gfp);
 
 	if (!buf)

From 910ad38697d95bd32f45ba70fd6952f6c2956f28 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 17:37:01 -0400
Subject: [PATCH 153/229] NFS: Fix memory allocation in rpc_alloc_task()

As for rpc_malloc(), we first try allocating from the slab, then fall
back to a non-waiting allocation from the mempool.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/sched.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index d59a033820be..b258b87a3ec2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -1108,10 +1108,14 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
 	rpc_init_task_statistics(task);
 }
 
-static struct rpc_task *
-rpc_alloc_task(void)
+static struct rpc_task *rpc_alloc_task(void)
 {
-	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_KERNEL);
+	struct rpc_task *task;
+
+	task = kmem_cache_alloc(rpc_task_slabp, rpc_task_gfp_mask());
+	if (task)
+		return task;
+	return mempool_alloc(rpc_task_mempool, GFP_NOWAIT);
 }
 
 /*

From 059ee82b6462028ebace435bc94f5b082be0632a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 17:46:30 -0400
Subject: [PATCH 154/229] SUNRPC: Fix unx_lookup_cred() allocation

Default to the same mempool allocation strategy as for rpc_malloc().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/auth_unix.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index c629d366030e..1e091d3fa607 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -40,17 +40,19 @@ unx_destroy(struct rpc_auth *auth)
 /*
  * Lookup AUTH_UNIX creds for current process
  */
-static struct rpc_cred *
-unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
+static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
+					struct auth_cred *acred, int flags)
 {
-	gfp_t gfp = GFP_KERNEL;
 	struct rpc_cred *ret;
 
-	if (flags & RPCAUTH_LOOKUP_ASYNC)
-		gfp = GFP_NOWAIT | __GFP_NOWARN;
-	ret = mempool_alloc(unix_pool, gfp);
-	if (!ret)
-		return ERR_PTR(-ENOMEM);
+	ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
+	if (!ret) {
+		if (!(flags & RPCAUTH_LOOKUP_ASYNC))
+			return ERR_PTR(-ENOMEM);
+		ret = mempool_alloc(unix_pool, GFP_NOWAIT);
+		if (!ret)
+			return ERR_PTR(-ENOMEM);
+	}
 	rpcauth_init_cred(ret, acred, auth, &unix_credops);
 	ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE;
 	return ret;

From b2648015d4521de21ed3c9f48f718e023860b8c1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 13:20:16 -0400
Subject: [PATCH 155/229] SUNRPC: Make the rpciod and xprtiod slab allocation
 modes consistent

Make sure that rpciod and xprtiod are always using the same slab
allocation modes.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/backchannel_rqst.c |  8 ++++----
 net/sunrpc/rpcb_clnt.c        |  4 ++--
 net/sunrpc/socklib.c          |  3 ++-
 net/sunrpc/xprt.c             |  5 +----
 net/sunrpc/xprtsock.c         | 11 ++++++-----
 5 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 22a2c235abf1..5a6b61dcdf2d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -75,9 +75,9 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
 	return 0;
 }
 
-static
-struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
+static struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt)
 {
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 	struct rpc_rqst *req;
 
 	/* Pre-allocate one backchannel rpc_rqst */
@@ -154,7 +154,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
 	INIT_LIST_HEAD(&tmp_list);
 	for (i = 0; i < min_reqs; i++) {
 		/* Pre-allocate one backchannel rpc_rqst */
-		req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+		req = xprt_alloc_bc_req(xprt);
 		if (req == NULL) {
 			printk(KERN_ERR "Failed to create bc rpc_rqst\n");
 			goto out_free;
@@ -343,7 +343,7 @@ found:
 			break;
 		} else if (req)
 			break;
-		new = xprt_alloc_bc_req(xprt, GFP_KERNEL);
+		new = xprt_alloc_bc_req(xprt);
 	} while (new);
 	return req;
 }
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 0fdeb8666bfd..5a8e6d46809a 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -714,7 +714,7 @@ void rpcb_getport_async(struct rpc_task *task)
 		goto bailout_nofree;
 	}
 
-	map = kzalloc(sizeof(struct rpcbind_args), GFP_KERNEL);
+	map = kzalloc(sizeof(struct rpcbind_args), rpc_task_gfp_mask());
 	if (!map) {
 		status = -ENOMEM;
 		goto bailout_release_client;
@@ -730,7 +730,7 @@ void rpcb_getport_async(struct rpc_task *task)
 	case RPCBVERS_4:
 	case RPCBVERS_3:
 		map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID];
-		map->r_addr = rpc_sockaddr2uaddr(sap, GFP_KERNEL);
+		map->r_addr = rpc_sockaddr2uaddr(sap, rpc_task_gfp_mask());
 		if (!map->r_addr) {
 			status = -ENOMEM;
 			goto bailout_free_args;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index d52313af82bc..05b38bf68316 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -15,6 +15,7 @@
 #include <linux/pagemap.h>
 #include <linux/udp.h>
 #include <linux/sunrpc/msg_prot.h>
+#include <linux/sunrpc/sched.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/export.h>
 
@@ -222,7 +223,7 @@ static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg,
 {
 	int err;
 
-	err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+	err = xdr_alloc_bvec(xdr, rpc_task_gfp_mask());
 	if (err < 0)
 		return err;
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index bbe913121f43..744c6c1d536f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1679,15 +1679,12 @@ out:
 static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt)
 {
 	struct rpc_rqst *req = ERR_PTR(-EAGAIN);
-	gfp_t gfp_mask = GFP_KERNEL;
 
 	if (xprt->num_reqs >= xprt->max_reqs)
 		goto out;
 	++xprt->num_reqs;
 	spin_unlock(&xprt->reserve_lock);
-	if (current->flags & PF_WQ_WORKER)
-		gfp_mask |= __GFP_NORETRY | __GFP_NOWARN;
-	req = kzalloc(sizeof(*req), gfp_mask);
+	req = kzalloc(sizeof(*req), rpc_task_gfp_mask());
 	spin_lock(&xprt->reserve_lock);
 	if (req != NULL)
 		goto out;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 8909c768fe71..b52eaa8a0cda 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -428,9 +428,9 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
 		offset += want;
 	}
 
-	want = xs_alloc_sparse_pages(buf,
-			min_t(size_t, count - offset, buf->page_len),
-			GFP_KERNEL);
+	want = xs_alloc_sparse_pages(
+		buf, min_t(size_t, count - offset, buf->page_len),
+		GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
 	if (seek < want) {
 		ret = xs_read_bvec(sock, msg, flags, buf->bvec,
 				xdr_buf_pagecount(buf),
@@ -826,7 +826,8 @@ static void
 xs_stream_prepare_request(struct rpc_rqst *req)
 {
 	xdr_free_bvec(&req->rq_rcv_buf);
-	req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL);
+	req->rq_task->tk_status = xdr_alloc_bvec(
+		&req->rq_rcv_buf, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
 }
 
 /*
@@ -2487,7 +2488,7 @@ static int bc_malloc(struct rpc_task *task)
 		return -EINVAL;
 	}
 
-	page = alloc_page(GFP_KERNEL);
+	page = alloc_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
 	if (!page)
 		return -ENOMEM;
 

From 515dcdcd48736576c6f5c197814da6f81c60a21e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 12:34:19 -0400
Subject: [PATCH 156/229] NFS: nfsiod should not block forever in
 mempool_alloc()

The concern is that since nfsiod is sometimes required to kick off a
commit, it can get locked up waiting forever in mempool_alloc() instead
of failing gracefully and leaving the commit until later.

Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY,
then fall back to a non-blocking attempt to allocate from the memory
pool.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/internal.h      |  7 +++++++
 fs/nfs/pnfs_nfs.c      |  8 ++++++--
 fs/nfs/write.c         | 24 +++++++++---------------
 include/linux/nfs_fs.h |  2 +-
 4 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 194840a97e3a..57b0497105c8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -587,6 +587,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
 		!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
 }
 
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+	if (current->flags & PF_WQ_WORKER)
+		return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+	return GFP_KERNEL;
+}
+
 /* unlink.c */
 extern struct rpc_task *
 nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 316f68f96e57..657c242a18ff 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@ static struct nfs_commit_data *
 pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
 			     struct nfs_commit_info *cinfo)
 {
-	struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+	struct nfs_commit_data *data = nfs_commitdata_alloc();
 
 	if (!data)
 		return NULL;
@@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	unsigned int nreq = 0;
 
 	if (!list_empty(mds_pages)) {
-		data = nfs_commitdata_alloc(true);
+		data = nfs_commitdata_alloc();
+		if (!data) {
+			nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+			return -ENOMEM;
+		}
 		data->ds_commit_index = -1;
 		list_splice_init(mds_pages, &data->pages);
 		list_add_tail(&data->list, &list);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 599a82406d38..ef47e3700e4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool;
 static struct kmem_cache *nfs_cdata_cachep;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
 {
 	struct nfs_commit_data *p;
 
-	if (never_fail)
-		p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
-	else {
-		/* It is OK to do some reclaim, not no safe to wait
-		 * for anything to be returned to the pool.
-		 * mempool_alloc() cannot handle that particular combination,
-		 * so we need two separate attempts.
-		 */
+	p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+	if (!p) {
 		p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
-		if (!p)
-			p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
-					     __GFP_NOWARN | __GFP_NORETRY);
 		if (!p)
 			return NULL;
+		memset(p, 0, sizeof(*p));
 	}
-
-	memset(p, 0, sizeof(*p));
 	INIT_LIST_HEAD(&p->pages);
 	return p;
 }
@@ -1826,7 +1816,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	if (list_empty(head))
 		return 0;
 
-	data = nfs_commitdata_alloc(true);
+	data = nfs_commitdata_alloc();
+	if (!data) {
+		nfs_retry_commit(head, NULL, cinfo, -1);
+		return -ENOMEM;
+	}
 
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c47c448befc8..db305abafc9e 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -580,7 +580,7 @@ extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_page(struct inode *inode, struct page *page);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
+extern struct nfs_commit_data *nfs_commitdata_alloc(void);
 extern void nfs_commit_free(struct nfs_commit_data *data);
 bool nfs_commit_end(struct nfs_mds_commit_info *cinfo);
 

From 0bae835b63c53f86cdc524f5962e39409585b22c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 13:48:36 -0400
Subject: [PATCH 157/229] NFS: Avoid writeback threads getting stuck in
 mempool_alloc()

In a low memory situation, allow the NFS writeback code to fail without
getting stuck in infinite loops in mempool_alloc().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pagelist.c | 10 +++++-----
 fs/nfs/write.c    | 10 ++++++++--
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ad7f83dc9a2d..3156db526cc4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -90,10 +90,10 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
 	}
 }
 
-static inline struct nfs_page *
-nfs_page_alloc(void)
+static inline struct nfs_page *nfs_page_alloc(void)
 {
-	struct nfs_page	*p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+	struct nfs_page *p =
+		kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
 	if (p)
 		INIT_LIST_HEAD(&p->wb_list);
 	return p;
@@ -892,7 +892,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 	struct nfs_commit_info cinfo;
 	struct nfs_page_array *pg_array = &hdr->page_array;
 	unsigned int pagecount, pageused;
-	gfp_t gfp_flags = GFP_KERNEL;
+	gfp_t gfp_flags = nfs_io_gfp_mask();
 
 	pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
 	pg_array->npages = pagecount;
@@ -979,7 +979,7 @@ nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
 	desc->pg_mirrors_dynamic = NULL;
 	if (mirror_count == 1)
 		return desc->pg_mirrors_static;
-	ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
+	ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
 	if (ret != NULL) {
 		for (i = 0; i < mirror_count; i++)
 			nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ef47e3700e4b..e864ac836237 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -94,9 +94,15 @@ EXPORT_SYMBOL_GPL(nfs_commit_free);
 
 static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
+	struct nfs_pgio_header *p;
 
-	memset(p, 0, sizeof(*p));
+	p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+	if (!p) {
+		p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+		if (!p)
+			return NULL;
+		memset(p, 0, sizeof(*p));
+	}
 	p->rw_mode = FMODE_WRITE;
 	return p;
 }

From 63d8a41b1dbf24dfc2480cb28236e2f6844d89b9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 15:32:09 -0400
Subject: [PATCH 158/229] NFSv4/pnfs: Ensure pNFS allocation modes are
 consistent with nfsiod

Ensure that pNFS allocations that can be called from rpciod/nfsiod
callback can fail in low memory mode, so that the threads don't block
and loop forever.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42proc.c |  2 +-
 fs/nfs/pnfs.c      | 39 +++++++++++++++++----------------------
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 882bf84484ac..b841e267b764 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1017,7 +1017,7 @@ int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg,
 		return -EOPNOTSUPP;
 	if (n > NFS42_LAYOUTERROR_MAX)
 		return -EINVAL;
-	data = nfs42_alloc_layouterror_data(lseg, GFP_KERNEL);
+	data = nfs42_alloc_layouterror_data(lseg, nfs_io_gfp_mask());
 	if (!data)
 		return -ENOMEM;
 	for (i = 0; i < n; i++) {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f089e11fd001..de318bb5d349 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1233,7 +1233,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
 	int status = 0;
 
 	*pcred = NULL;
-	lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+	lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask());
 	if (unlikely(lrp == NULL)) {
 		status = -ENOMEM;
 		spin_lock(&ino->i_lock);
@@ -2206,7 +2206,7 @@ _pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
 	struct pnfs_layout_hdr *lo;
 
 	spin_lock(&ino->i_lock);
-	lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
+	lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask());
 	if (!lo)
 		goto out_unlock;
 	if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
@@ -2249,8 +2249,8 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data,
 	lo = _pnfs_grab_empty_layout(ino, ctx);
 	if (!lo)
 		return;
-	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
-					     &rng, GFP_KERNEL);
+	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+					     nfs_io_gfp_mask());
 	if (!lgp) {
 		pnfs_clear_first_layoutget(lo);
 		nfs_layoutget_end(lo);
@@ -2275,8 +2275,8 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data,
 	};
 	struct nfs4_layoutget *lgp;
 
-	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
-					     &rng, GFP_KERNEL);
+	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
+					     nfs_io_gfp_mask());
 	if (!lgp)
 		return;
 	data->lgp = lgp;
@@ -2691,13 +2691,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 		else
 			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
 
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   nfs_req_openctx(req),
-						   req_offset(req),
-						   rd_size,
-						   IOMODE_READ,
-						   false,
-						   GFP_KERNEL);
+		pgio->pg_lseg =
+			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+					   req_offset(req), rd_size,
+					   IOMODE_READ, false,
+					   nfs_io_gfp_mask());
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -2718,13 +2716,10 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	pnfs_generic_pg_check_layout(pgio);
 	pnfs_generic_pg_check_range(pgio, req);
 	if (pgio->pg_lseg == NULL) {
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   nfs_req_openctx(req),
-						   req_offset(req),
-						   wb_size,
-						   IOMODE_RW,
-						   false,
-						   GFP_KERNEL);
+		pgio->pg_lseg =
+			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+					   req_offset(req), wb_size, IOMODE_RW,
+					   false, nfs_io_gfp_mask());
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -3183,7 +3178,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 
 	status = -ENOMEM;
 	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
-	data = kzalloc(sizeof(*data), GFP_NOFS);
+	data = kzalloc(sizeof(*data), nfs_io_gfp_mask());
 	if (!data)
 		goto clear_layoutcommitting;
 
@@ -3250,7 +3245,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
 {
 	struct nfs4_threshold *thp;
 
-	thp = kzalloc(sizeof(*thp), GFP_KERNEL);
+	thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask());
 	if (!thp) {
 		dprintk("%s mdsthreshold allocation failed\n", __func__);
 		return NULL;

From 3e5f151e94c190c31a240d9458677caab4f6c44e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 15:34:22 -0400
Subject: [PATCH 159/229] pNFS/flexfiles: Ensure pNFS allocation modes are
 consistent with nfsiod

Ensure that pNFS flexfile allocations in rpciod/nfsiod callbacks can
fail in low memory mode, so that the threads don't block and loop
forever.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 50 +++++++++++---------------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e28f2177afb7..604be402ae13 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -663,7 +663,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(inode, GFP_KERNEL);
+		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
 }
 
 static void
@@ -694,7 +694,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode,
 	spin_unlock(&mirror->lock);
 
 	if (report)
-		pnfs_report_layoutstat(inode, GFP_KERNEL);
+		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
 }
 
 static void
@@ -806,13 +806,10 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
 		      bool strict_iomode)
 {
 	pnfs_put_lseg(pgio->pg_lseg);
-	pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-					   nfs_req_openctx(req),
-					   req_offset(req),
-					   req->wb_bytes,
-					   IOMODE_READ,
-					   strict_iomode,
-					   GFP_KERNEL);
+	pgio->pg_lseg =
+		pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+				   req_offset(req), req->wb_bytes, IOMODE_READ,
+				   strict_iomode, nfs_io_gfp_mask());
 	if (IS_ERR(pgio->pg_lseg)) {
 		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 		pgio->pg_lseg = NULL;
@@ -894,13 +891,10 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 retry:
 	ff_layout_pg_check_layout(pgio, req);
 	if (!pgio->pg_lseg) {
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   nfs_req_openctx(req),
-						   req_offset(req),
-						   req->wb_bytes,
-						   IOMODE_RW,
-						   false,
-						   GFP_KERNEL);
+		pgio->pg_lseg =
+			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+					   req_offset(req), req->wb_bytes,
+					   IOMODE_RW, false, nfs_io_gfp_mask());
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -953,13 +947,10 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 				    struct nfs_page *req)
 {
 	if (!pgio->pg_lseg) {
-		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-						   nfs_req_openctx(req),
-						   req_offset(req),
-						   req->wb_bytes,
-						   IOMODE_RW,
-						   false,
-						   GFP_KERNEL);
+		pgio->pg_lseg =
+			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
+					   req_offset(req), req->wb_bytes,
+					   IOMODE_RW, false, nfs_io_gfp_mask());
 		if (IS_ERR(pgio->pg_lseg)) {
 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
 			pgio->pg_lseg = NULL;
@@ -1258,7 +1249,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
-				       GFP_KERNEL);
+				       nfs_io_gfp_mask());
 
 	switch (status) {
 	case NFS4ERR_DELAY:
@@ -1973,7 +1964,8 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
 	struct inode *inode = lseg->pls_layout->plh_inode;
 	struct pnfs_commit_array *array, *new;
 
-	new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_KERNEL);
+	new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
+				      nfs_io_gfp_mask());
 	if (new) {
 		spin_lock(&inode->i_lock);
 		array = pnfs_add_commit_array(fl_cinfo, new, lseg);
@@ -2152,10 +2144,10 @@ ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
 	struct nfs4_flexfile_layoutreturn_args *ff_args;
 	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
 
-	ff_args = kmalloc(sizeof(*ff_args), GFP_KERNEL);
+	ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
 	if (!ff_args)
 		goto out_nomem;
-	ff_args->pages[0] = alloc_page(GFP_KERNEL);
+	ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
 	if (!ff_args->pages[0])
 		goto out_nomem_free;
 
@@ -2193,7 +2185,7 @@ ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
 		return;
 
 	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
-			       GFP_KERNEL);
+			       nfs_io_gfp_mask());
 	if (errors != NULL) {
 		const struct nfs4_ff_layout_ds_err *pos;
 		size_t n = 0;
@@ -2445,7 +2437,7 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
 
 	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
 	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
-				      GFP_KERNEL);
+				      nfs_io_gfp_mask());
 	if (!args->devinfo)
 		return -ENOMEM;
 

From a245832aaa9930f0ea91527cbd70521722b89313 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 21 Mar 2022 17:27:14 -0400
Subject: [PATCH 160/229] pNFS/files: Ensure pNFS allocation modes are
 consistent with nfsiod

Ensure that pNFS file commit allocations in rpciod/nfsiod callbacks can
fail in low memory mode, so that the threads don't block and loop
forever.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/filelayout/filelayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 9c96e3e5ed35..76deddab0a8f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1075,7 +1075,7 @@ filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
 	unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
 		fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
 
-	new = pnfs_alloc_commit_array(size, GFP_NOIO);
+	new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask());
 	if (new) {
 		spin_lock(&inode->i_lock);
 		array = pnfs_add_commit_array(fl_cinfo, new, lseg);

From 5e6ded2e7a5d9c71186acc8f51989ef6e6addda4 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sat, 19 Mar 2022 18:51:43 -0700
Subject: [PATCH 161/229] livepatch: Reorder to use before freeing a pointer

Clang static analysis reports this issue
livepatch-shadow-fix1.c:113:2: warning: Use of
  memory after it is freed
  pr_info("%s: dummy @ %p, prevented leak @ %p\n",
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The pointer is freed in the previous statement.
Reorder the pr_info to report before the free.

Similar issue in livepatch-shadow-fix2.c

Note that it is a false positive. pr_info() just prints the address.
The freed memory is not accessed. Well, the static analyzer could not
know this easily.

Signed-off-by: Tom Rix <trix@redhat.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: David Vernet <void@manifault.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
[pmladek@suse.com: Note about that it was false positive.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20220320015143.2208591-1-trix@redhat.com
---
 samples/livepatch/livepatch-shadow-fix1.c | 2 +-
 samples/livepatch/livepatch-shadow-fix2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
index 918ce17b43fd..6701641bf12d 100644
--- a/samples/livepatch/livepatch-shadow-fix1.c
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -109,9 +109,9 @@ static void livepatch_fix1_dummy_leak_dtor(void *obj, void *shadow_data)
 	void *d = obj;
 	int **shadow_leak = shadow_data;
 
-	kfree(*shadow_leak);
 	pr_info("%s: dummy @ %p, prevented leak @ %p\n",
 			 __func__, d, *shadow_leak);
+	kfree(*shadow_leak);
 }
 
 static void livepatch_fix1_dummy_free(struct dummy *d)
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
index 29fe5cd42047..361046a4f10c 100644
--- a/samples/livepatch/livepatch-shadow-fix2.c
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -61,9 +61,9 @@ static void livepatch_fix2_dummy_leak_dtor(void *obj, void *shadow_data)
 	void *d = obj;
 	int **shadow_leak = shadow_data;
 
-	kfree(*shadow_leak);
 	pr_info("%s: dummy @ %p, prevented leak @ %p\n",
 			 __func__, d, *shadow_leak);
+	kfree(*shadow_leak);
 }
 
 static void livepatch_fix2_dummy_free(struct dummy *d)

From dcbc65aac28360df5f5a3b613043ccc0e81da3cf Mon Sep 17 00:00:00 2001
From: Yang Li <yang.lee@linux.alibaba.com>
Date: Wed, 16 Mar 2022 07:51:48 +0800
Subject: [PATCH 162/229] ptrace: Remove duplicated include in ptrace.c

Fix following includecheck warning:
./arch/m68k/kernel/ptrace.c: linux/ptrace.h is included more than once.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
Fixes: 153474ba1a4a ("ptrace: Create ptrace_report_syscall_{entry,exit} in ptrace.h")
Link: https://lkml.kernel.org/r/20220315235148.54253-1-yang.lee@linux.alibaba.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/m68k/kernel/ptrace.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index a0c99fe3118e..6342ff4d2073 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -19,7 +19,6 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/signal.h>
-#include <linux/ptrace.h>
 
 #include <linux/uaccess.h>
 #include <asm/page.h>

From e97824ff663ce3509fe040431c713182c2f058b1 Mon Sep 17 00:00:00 2001
From: Miaohe Lin <linmiaohe@huawei.com>
Date: Tue, 22 Mar 2022 16:09:18 +0800
Subject: [PATCH 163/229] mm/mlock: fix two bugs in user_shm_lock()

user_shm_lock forgets to set allowed to 0 when get_ucounts fails. So the
later user_shm_unlock might do the extra dec_rlimit_ucounts. Also in the
RLIM_INFINITY case, user_shm_lock will success regardless of the value of
memlock where memblock == LONG_MAX && !capable(CAP_IPC_LOCK) should fail.
Fix all of these by changing the code to leave lock_limit at ULONG_MAX aka
RLIM_INFINITY, leave "allowed" initialized to 0 and remove the special case
of RLIM_INFINITY as nothing can be greater than ULONG_MAX.

Credit goes to Eric W. Biederman for proposing simplifying the code and
thus catching the later bug.

Fixes: d7c9e99aee48 ("Reimplement RLIMIT_MEMLOCK on top of ucounts")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: stable@vger.kernel.org
v1: https://lkml.kernel.org/r/20220310132417.41189-1-linmiaohe@huawei.com
v2: https://lkml.kernel.org/r/20220314064039.62972-1-linmiaohe@huawei.com
Link: https://lkml.kernel.org/r/20220322080918.59861-1-linmiaohe@huawei.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 mm/mlock.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/mlock.c b/mm/mlock.c
index 25934e7db3e1..37f969ec68fa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -827,13 +827,12 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
-	if (lock_limit == RLIM_INFINITY)
-		allowed = 1;
-	lock_limit >>= PAGE_SHIFT;
+	if (lock_limit != RLIM_INFINITY)
+		lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
 	memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 
-	if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
 		dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
 		goto out;
 	}

From 3848e96edf4788f772d83990022fa7023a233d83 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 8 Mar 2022 13:42:17 +1100
Subject: [PATCH 164/229] SUNRPC: avoid race between mod_timer() and
 del_timer_sync()

xprt_destory() claims XPRT_LOCKED and then calls del_timer_sync().
Both xprt_unlock_connect() and xprt_release() call
 ->release_xprt()
which drops XPRT_LOCKED and *then* xprt_schedule_autodisconnect()
which calls mod_timer().

This may result in mod_timer() being called *after* del_timer_sync().
When this happens, the timer may fire long after the xprt has been freed,
and run_timer_softirq() will probably crash.

The pairing of ->release_xprt() and xprt_schedule_autodisconnect() is
always called under ->transport_lock.  So if we take ->transport_lock to
call del_timer_sync(), we can be sure that mod_timer() will run first
(if it runs at all).

Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprt.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 744c6c1d536f..880bfe8dc7f6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -2104,7 +2104,14 @@ static void xprt_destroy(struct rpc_xprt *xprt)
 	 */
 	wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE);
 
+	/*
+	 * xprt_schedule_autodisconnect() can run after XPRT_LOCKED
+	 * is cleared.  We use ->transport_lock to ensure the mod_timer()
+	 * can only run *before* del_time_sync(), never after.
+	 */
+	spin_lock(&xprt->transport_lock);
 	del_timer_sync(&xprt->timer);
+	spin_unlock(&xprt->transport_lock);
 
 	/*
 	 * Destroy sockets etc from the system workqueue so they can

From de7a9e949f4f094741f708cd05572f932d009d02 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Wed, 23 Mar 2022 22:15:49 +0530
Subject: [PATCH 165/229] drivers/nvdimm: Fix build failure when
 CONFIG_PERF_EVENTS is not set

The following build failure occurs when CONFIG_PERF_EVENTS is not set
as generic pmu functions are not visible in that scenario.

|-- s390-randconfig-r044-20220313
|   |-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_migrate_context
|   |-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_register
|   `-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_unregister

Similar build failure in nds32 architecture:
nd_perf.c:(.text+0x21e): undefined reference to `perf_pmu_migrate_context'
nd_perf.c:(.text+0x434): undefined reference to `perf_pmu_register'
nd_perf.c:(.text+0x57c): undefined reference to `perf_pmu_unregister'

Fix this issue by adding check for CONFIG_PERF_EVENTS config option
and disabling the nvdimm perf interface incase this config is not set.

Also remove function declaration of perf_pmu_migrate_context,
perf_pmu_register, perf_pmu_unregister functions from nd.h as these are
common pmu functions which are part of perf_event.h and since we
are disabling nvdimm perf interface incase CONFIG_PERF_EVENTS option
is not set, we not need to declare them in nd.h

Also move the platform_device header file addition part from nd.h to
nd_perf.c and add stub functions for register_nvdimm_pmu and
unregister_nvdimm_pmu functions to handle CONFIG_PERF_EVENTS=n
case.

Fixes: 0fab1ba6ad6b ("drivers/nvdimm: Add perf interface to expose nvdimm performance stats") (Commit id based on libnvdimm-for-next tree)
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Link: https://lore.kernel.org/all/62317124.YBQFU33+s%2FwdvWGj%25lkp@intel.com/
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/r/20220323164550.109768-1-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/nvdimm/Makefile  |  2 +-
 drivers/nvdimm/nd_perf.c |  1 +
 include/linux/nd.h       | 16 ++++++++++++----
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 3fb806748716..ba0296dca9db 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -15,7 +15,7 @@ nd_e820-y := e820.o
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
-libnvdimm-y += nd_perf.o
+libnvdimm-$(CONFIG_PERF_EVENTS) += nd_perf.o
 libnvdimm-y += dimm.o
 libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c
index 314415894acf..433bbb68ae64 100644
--- a/drivers/nvdimm/nd_perf.c
+++ b/drivers/nvdimm/nd_perf.c
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt) "nvdimm_pmu: " fmt
 
 #include <linux/nd.h>
+#include <linux/platform_device.h>
 
 #define EVENT(_name, _code)     enum{_name = _code}
 
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 7b2ccbdc1cbc..b9771ba1ef87 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -9,7 +9,6 @@
 #include <linux/device.h>
 #include <linux/badblocks.h>
 #include <linux/perf_event.h>
-#include <linux/platform_device.h>
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
@@ -57,15 +56,24 @@ struct nvdimm_pmu {
 	struct cpumask arch_cpumask;
 };
 
+struct platform_device;
+
+#ifdef CONFIG_PERF_EVENTS
 extern ssize_t nvdimm_events_sysfs_show(struct device *dev,
 					struct device_attribute *attr,
 					char *page);
 
 int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm, struct platform_device *pdev);
 void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu);
-int perf_pmu_register(struct pmu *pmu, const char *name, int type);
-void perf_pmu_unregister(struct pmu *pmu);
+
+#else
+static inline int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm, struct platform_device *pdev)
+{
+	return -ENXIO;
+}
+
+static inline void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu) { }
+#endif
 
 struct nd_device_driver {
 	struct device_driver drv;

From d0007eb15c2a8113e847143c783ea83d93963741 Mon Sep 17 00:00:00 2001
From: Kajol Jain <kjain@linux.ibm.com>
Date: Wed, 23 Mar 2022 22:15:50 +0530
Subject: [PATCH 166/229] powerpc/papr_scm: Fix build failure when
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following build failure occurs when CONFIG_PERF_EVENTS is not set
as generic pmu functions are not visible in that scenario.

arch/powerpc/platforms/pseries/papr_scm.c:372:35: error: ‘struct perf_event’ has no member named ‘attr’
         p->nvdimm_events_map[event->attr.config],
                                   ^~
In file included from ./include/linux/list.h:5,
                 from ./include/linux/kobject.h:19,
                 from ./include/linux/of.h:17,
                 from arch/powerpc/platforms/pseries/papr_scm.c:5:
arch/powerpc/platforms/pseries/papr_scm.c: In function ‘papr_scm_pmu_event_init’:
arch/powerpc/platforms/pseries/papr_scm.c:389:49: error: ‘struct perf_event’ has no member named ‘pmu’
  struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
                                                 ^~
./include/linux/container_of.h:18:26: note: in definition of macro ‘container_of’
  void *__mptr = (void *)(ptr);     \
                          ^~~
arch/powerpc/platforms/pseries/papr_scm.c:389:30: note: in expansion of macro ‘to_nvdimm_pmu’
  struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
                              ^~~~~~~~~~~~~
In file included from ./include/linux/bits.h:22,
                 from ./include/linux/bitops.h:6,
                 from ./include/linux/of.h:15,
                 from arch/powerpc/platforms/pseries/papr_scm.c:5:

Fix the build issue by adding check for CONFIG_PERF_EVENTS config option
and also add stub function for papr_scm_pmu_register to handle
the CONFIG_PERF_EVENTS=n case. Also move the position of macro
"to_nvdimm_pmu" inorder to merge it in CONFIG_PERF_EVENTS=y block.

based on libnvdimm-for-next tree)

Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") (Commit id
Signed-off-by: Kajol Jain <kjain@linux.ibm.com>
Link: https://lore.kernel.org/r/20220323164550.109768-2-kjain@linux.ibm.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 arch/powerpc/platforms/pseries/papr_scm.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 4dd513d7c029..9dba9e71fde9 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -69,8 +69,6 @@
 #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
 #define PAPR_SCM_PERF_STATS_VERSION 0x1
 
-#define to_nvdimm_pmu(_pmu)	container_of(_pmu, struct nvdimm_pmu, pmu)
-
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
 	u8 stat_id[8];
@@ -346,6 +344,9 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p,
 	return 0;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+#define to_nvdimm_pmu(_pmu)	container_of(_pmu, struct nvdimm_pmu, pmu)
+
 static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count)
 {
 	struct papr_scm_perf_stat *stat;
@@ -558,6 +559,10 @@ pmu_err_print:
 	dev_info(&p->pdev->dev, "nvdimm pmu didn't register rc=%d\n", rc);
 }
 
+#else
+static void papr_scm_pmu_register(struct papr_scm_priv *p) { }
+#endif
+
 /*
  * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the
  * health information.

From 11cddee9c19f57bba335cff9787f23fb2d6d6f26 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 16 Mar 2022 06:21:33 +0100
Subject: [PATCH 167/229] MAINTAINERS: remove section LIBNVDIMM BLK:
 MMIO-APERTURE DRIVER

Commit f8669f1d6a86 ("nvdimm/blk: Delete the block-aperture window driver")
removes the file drivers/nvdimm/blk.c, but misses to adjust MAINTAINERS.

Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a
broken reference.

The section LIBNVDIMM BLK: MMIO-APERTURE DRIVER refers to the driver in
blk.c, and some more generic nvdimm code in region_devs.c.

As the driver is deleted, delete the section LIBNVDIMM BLK: MMIO-APERTURE
DRIVER in MAINTAINERS as well.

The remaining file region_devs.c is still covered by the section LIBNVDIMM:
NON-VOLATILE MEMORY DEVICE SUBSYSTEM, and all patches to region_devs.c will
still reach the same developers as before.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20220316052133.26212-1-lukas.bulwahn@gmail.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 MAINTAINERS | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index fca970a46e77..2a802dc83bc3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10933,17 +10933,6 @@ F:	drivers/ata/
 F:	include/linux/ata.h
 F:	include/linux/libata.h
 
-LIBNVDIMM BLK: MMIO-APERTURE DRIVER
-M:	Dan Williams <dan.j.williams@intel.com>
-M:	Vishal Verma <vishal.l.verma@intel.com>
-M:	Dave Jiang <dave.jiang@intel.com>
-L:	nvdimm@lists.linux.dev
-S:	Supported
-Q:	https://patchwork.kernel.org/project/linux-nvdimm/list/
-P:	Documentation/nvdimm/maintainer-entry-profile.rst
-F:	drivers/nvdimm/blk.c
-F:	drivers/nvdimm/region_devs.c
-
 LIBNVDIMM BTT: BLOCK TRANSLATION TABLE
 M:	Vishal Verma <vishal.l.verma@intel.com>
 M:	Dan Williams <dan.j.williams@intel.com>

From ada8d8d337ee970860c9844126e634df8076aa11 Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Mon, 21 Mar 2022 15:04:08 -0700
Subject: [PATCH 168/229] nvdimm/blk: Fix title level

make htmldocs fails with
Sphinx parallel build error:
docutils.utils.SystemMessage: ...nvdimm.rst:146:
  (SEVERE/4) Title level inconsistent:

PMEM-REGIONs, Atomic Sectors, and DAX
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The PMEM vs BLK section was removed without changing
the PMEM-REGIONS... title line.  Replace '^' with '_'.

Fixes: f8669f1d6a86 ("nvdimm/blk: Delete the block-aperture window driver")
Signed-off-by: Tom Rix <trix@redhat.com>
Link: https://lore.kernel.org/r/20220321220408.2381974-1-trix@redhat.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/driver-api/nvdimm/nvdimm.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst
index 7917f6471092..be8587a558e1 100644
--- a/Documentation/driver-api/nvdimm/nvdimm.rst
+++ b/Documentation/driver-api/nvdimm/nvdimm.rst
@@ -15,7 +15,7 @@ Version 13
 	    Supporting Documents
 	    Git Trees
 	LIBNVDIMM PMEM
-	        PMEM-REGIONs, Atomic Sectors, and DAX
+	    PMEM-REGIONs, Atomic Sectors, and DAX
 	Example NVDIMM Platform
 	LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
 	    LIBNDCTL: Context
@@ -144,7 +144,7 @@ can be immediately attached to nd_pmem. This latter mode is called
 label-less or "legacy".
 
 PMEM-REGIONs, Atomic Sectors, and DAX
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-------------------------------------
 
 For the cases where an application or filesystem still needs atomic sector
 update guarantees it can register a BTT on a PMEM device or partition.  See

From d645552e9bd96671079b27015294ec7f9748fa2b Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Thu, 24 Mar 2022 15:02:40 +0100
Subject: [PATCH 169/229] netfilter: egress: Report interface as outgoing

Otherwise packets in egress chains seem like they are being received by
the interface, not sent out via it.

Fixes: 42df6e1d221dd ("netfilter: Introduce egress hook")
Signed-off-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter_netdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/netfilter_netdev.h b/include/linux/netfilter_netdev.h
index e6487a691136..8676316547cc 100644
--- a/include/linux/netfilter_netdev.h
+++ b/include/linux/netfilter_netdev.h
@@ -99,7 +99,7 @@ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
 		return skb;
 
 	nf_hook_state_init(&state, NF_NETDEV_EGRESS,
-			   NFPROTO_NETDEV, dev, NULL, NULL,
+			   NFPROTO_NETDEV, NULL, dev, NULL,
 			   dev_net(dev), NULL);
 
 	/* nf assumes rcu_read_lock, not just read_lock_bh */

From f2dd495a8d589371289981d5ed33e6873df94ecc Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 21 Mar 2022 11:38:32 +0100
Subject: [PATCH 170/229] netfilter: nf_conntrack_tcp: preserve liberal flag in
 tcp options

Do not reset IP_CT_TCP_FLAG_BE_LIBERAL flag in out-of-sync scenarios
coming before the TCP window tracking, otherwise such connections will
fail in the window check.

Update tcp_options() to leave this flag in place and add a new helper
function to reset the tcp window state.

Based on patch from Sven Auhagen.

Fixes: c4832c7bbc3f ("netfilter: nf_ct_tcp: improve out-of-sync situation in TCP tracking")
Tested-by: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index d1582b888c0d..8ec55cd72572 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -341,8 +341,8 @@ static void tcp_options(const struct sk_buff *skb,
 	if (!ptr)
 		return;
 
-	state->td_scale =
-	state->flags = 0;
+	state->td_scale = 0;
+	state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL;
 
 	while (length > 0) {
 		int opcode=*ptr++;
@@ -862,6 +862,16 @@ static bool tcp_can_early_drop(const struct nf_conn *ct)
 	return false;
 }
 
+static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
+{
+	state->td_end		= 0;
+	state->td_maxend	= 0;
+	state->td_maxwin	= 0;
+	state->td_maxack	= 0;
+	state->td_scale		= 0;
+	state->flags		&= IP_CT_TCP_FLAG_BE_LIBERAL;
+}
+
 /* Returns verdict for packet, or -1 for invalid. */
 int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			    struct sk_buff *skb,
@@ -968,8 +978,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
 				ct->proto.tcp.last_flags;
-			memset(&ct->proto.tcp.seen[dir], 0,
-			       sizeof(struct ip_ct_tcp_state));
+			nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]);
 			break;
 		}
 		ct->proto.tcp.last_index = index;

From 3de24f3d7078f90c8488fc67446671503f44d63e Mon Sep 17 00:00:00 2001
From: Jakob Koschel <jakobkoschel@gmail.com>
Date: Thu, 24 Mar 2022 08:15:23 +0100
Subject: [PATCH 171/229] NFS: replace usage of found with dedicated list
 iterator variable

To move the list iterator variable into the list_for_each_entry_*()
macro in the future it should be avoided to use the list iterator
variable after the loop body.

To *never* use the list iterator variable after the loop it was
concluded to use a separate iterator variable instead of a
found boolean [1].

This removes the need to use a found variable and simply checking if
the variable was set, can determine if the break/goto was hit.

Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWXJ6w@mail.gmail.com/
Signed-off-by: Jakob Koschel <jakobkoschel@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42proc.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index b841e267b764..068c45b3bc1a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -175,9 +175,8 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 			     nfs4_stateid *src_stateid,
 			     bool *restart)
 {
-	struct nfs4_copy_state *copy, *tmp_copy;
+	struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
 	int status = NFS4_OK;
-	bool found_pending = false;
 	struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
 	struct nfs_open_context *src_ctx = nfs_file_open_context(src);
 
@@ -186,17 +185,17 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 		return -ENOMEM;
 
 	spin_lock(&dst_server->nfs_client->cl_lock);
-	list_for_each_entry(tmp_copy,
+	list_for_each_entry(iter,
 				&dst_server->nfs_client->pending_cb_stateids,
 				copies) {
-		if (memcmp(&res->write_res.stateid, &tmp_copy->stateid,
+		if (memcmp(&res->write_res.stateid, &iter->stateid,
 				NFS4_STATEID_SIZE))
 			continue;
-		found_pending = true;
-		list_del(&tmp_copy->copies);
+		tmp_copy = iter;
+		list_del(&iter->copies);
 		break;
 	}
-	if (found_pending) {
+	if (tmp_copy) {
 		spin_unlock(&dst_server->nfs_client->cl_lock);
 		kfree(copy);
 		copy = tmp_copy;

From 82ee41b85cef16b4be1f4732650012d9baaedddd Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Thu, 24 Mar 2022 10:22:58 -0400
Subject: [PATCH 172/229] SUNRPC don't resend a task on an offlined transport

When a task is being retried, due to an NFS error, if the assigned
transport has been put offline and the task is relocatable pick a new
transport.

Fixes: 6f081693e7b2b ("sunrpc: remove an offlined xprt using sysfs")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/clnt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0f54a56d19d2..8bf2af8546d2 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1065,7 +1065,9 @@ rpc_task_get_next_xprt(struct rpc_clnt *clnt)
 static
 void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
 {
-	if (task->tk_xprt)
+	if (task->tk_xprt &&
+			!(test_bit(XPRT_OFFLINE, &task->tk_xprt->state) &&
+                        (task->tk_flags & RPC_TASK_MOVEABLE)))
 		return;
 	if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
 		task->tk_xprt = rpc_task_get_first_xprt(clnt);

From 1d15d121cc2ad4d016a7dc1493132a9696f91fc5 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Thu, 24 Mar 2022 10:38:42 -0400
Subject: [PATCH 173/229] NFSv4.1: don't retry BIND_CONN_TO_SESSION on session
 error

There is no reason to retry the operation if a session error had
occurred in such case result structure isn't filled out.

Fixes: dff58530c4ca ("NFSv4.1: fix handling of backchannel binding in BIND_CONN_TO_SESSION")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dd7a4c2a3f05..e3f5b380cefe 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8340,6 +8340,7 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
 	case -NFS4ERR_DEADSESSION:
 		nfs4_schedule_session_recovery(clp->cl_session,
 				task->tk_status);
+		return;
 	}
 	if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
 			res->dir != NFS4_CDFS4_BOTH) {

From c1cb81429df462eca1b6ba615cddd21dd3103c46 Mon Sep 17 00:00:00 2001
From: Daniel Thompson <daniel.thompson@linaro.org>
Date: Fri, 28 Jan 2022 14:40:55 +0000
Subject: [PATCH 174/229] kdb: Fix the putarea helper function

Currently kdb_putarea_size() uses copy_from_kernel_nofault() to write *to*
arbitrary kernel memory. This is obviously wrong and means the memory
modify ('mm') command is a serious risk to debugger stability: if we poke
to a bad address we'll double-fault and lose our debug session.

Fix this the (very) obvious way.

Note that there are two Fixes: tags because the API was renamed and this
patch will only trivially backport as far as the rename (and this is
probably enough). Nevertheless Christoph's rename did not introduce this
problem so I wanted to record that!

Fixes: fe557319aa06 ("maccess: rename probe_kernel_{read,write} to copy_{from,to}_kernel_nofault")
Fixes: 5d5314d6795f ("kdb: core for kgdb back end (1 of 2)")
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Link: https://lore.kernel.org/r/20220128144055.207267-1-daniel.thompson@linaro.org
---
 kernel/debug/kdb/kdb_support.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..85cb51c4a17e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
  */
 int kdb_putarea_size(unsigned long addr, void *res, size_t size)
 {
-	int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+	int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
 	if (ret) {
 		if (!KDB_STATE(SUPPRESS)) {
 			kdb_func_printf("Bad address 0x%lx\n", addr);

From ffba2123e1714a27e9362fda57c42155dda37efc Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Tue, 22 Mar 2022 20:32:55 -0700
Subject: [PATCH 175/229] net: stmmac: dwmac-qcom-ethqos: Enable RGMII
 functional clock on resume

When the Qualcomm ethqos driver is properly described in its associated
GDSC power-domain, the hardware will be powered down and loose its state
between qcom_ethqos_probe() and stmmac_init_dma_engine().

The result of this is that the functional clock from the RGMII IO macro
is no longer provides and the DMA software reset in dwmac4_dma_reset()
will time out, due to lacking clock signal.

Re-enable the functional clock, as part of the Qualcomm specific clock
enablement sequence to avoid this problem.

The final clock configuration will be adjusted by ethqos_fix_mac_speed()
once the link is being brought up.

Fixes: a7c30e62d4b8 ("net: stmmac: Add driver for Qualcomm ethqos")
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Tested-and-reviewed-by: Bhupesh Sharma <bhupesh.sharma@linaro.org>
Link: https://lore.kernel.org/r/20220323033255.2282930-1-bjorn.andersson@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
index 0cc28c79cc61..835caa15d55f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c
@@ -487,6 +487,13 @@ static int ethqos_clks_config(void *priv, bool enabled)
 			dev_err(&ethqos->pdev->dev, "rgmii_clk enable failed\n");
 			return ret;
 		}
+
+		/* Enable functional clock to prevent DMA reset to timeout due
+		 * to lacking PHY clock after the hardware block has been power
+		 * cycled. The actual configuration will be adjusted once
+		 * ethqos_fix_mac_speed() is invoked.
+		 */
+		ethqos_set_func_clk_en(ethqos);
 	} else {
 		clk_disable_unprepare(ethqos->rgmii_clk);
 	}

From 109d899452ba17996eccec7ae8249fb1f8900a16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= <alsi@bang-olufsen.dk>
Date: Wed, 23 Mar 2022 13:42:25 +0100
Subject: [PATCH 176/229] net: dsa: realtek: make interface drivers depend on
 OF
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The kernel test robot reported build warnings with a randconfig that
built realtek-{smi,mdio} without CONFIG_OF set. Since both interface
drivers are using OF and will not probe without, add the corresponding
dependency to Kconfig.

Link: https://lore.kernel.org/all/202203231233.Xx73Y40o-lkp@intel.com/
Link: https://lore.kernel.org/all/202203231439.ycl0jg50-lkp@intel.com/
Fixes: aac94001067d ("net: dsa: realtek: add new mdio interface for drivers")
Fixes: 765c39a4fafe ("net: dsa: realtek: convert subdrivers into modules")
Signed-off-by: Alvin Šipraga <alsi@bang-olufsen.dk>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Luiz Angelo Daros de Luca <luizluca@gmail.com>
Link: https://lore.kernel.org/r/20220323124225.91763-1-alvin@pqrs.dk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/realtek/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/realtek/Kconfig b/drivers/net/dsa/realtek/Kconfig
index b7427a8292b2..1aa79735355f 100644
--- a/drivers/net/dsa/realtek/Kconfig
+++ b/drivers/net/dsa/realtek/Kconfig
@@ -12,6 +12,7 @@ menuconfig NET_DSA_REALTEK
 config NET_DSA_REALTEK_MDIO
 	tristate "Realtek MDIO connected switch driver"
 	depends on NET_DSA_REALTEK
+	depends on OF
 	help
 	  Select to enable support for registering switches configured
 	  through MDIO.
@@ -19,6 +20,7 @@ config NET_DSA_REALTEK_MDIO
 config NET_DSA_REALTEK_SMI
 	tristate "Realtek SMI connected switch driver"
 	depends on NET_DSA_REALTEK
+	depends on OF
 	help
 	  Select to enable support for registering switches connected
 	  through SMI.

From 9fe087dda5bf097007b263664051bc0e84f6580d Mon Sep 17 00:00:00 2001
From: Greg Jesionowski <jesionowskigreg@gmail.com>
Date: Wed, 23 Mar 2022 15:00:24 -0700
Subject: [PATCH 177/229] net: usb: ax88179_178a: add Allied Telesis AT-UMCs

Adds the driver_info and IDs for the AX88179 based Allied Telesis AT-UMC
family of devices.

Signed-off-by: Greg Jesionowski <jesionowskigreg@gmail.com>
Link: https://lore.kernel.org/r/20220323220024.GA36800@test-HP-EliteDesk-800-G1-SFF
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/ax88179_178a.c | 51 ++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index a31098981a65..e2fa56b92685 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1872,6 +1872,45 @@ static const struct driver_info mct_info = {
 	.tx_fixup = ax88179_tx_fixup,
 };
 
+static const struct driver_info at_umc2000_info = {
+	.description = "AT-UMC2000 USB 3.0/USB 3.1 Gen 1 to Gigabit Ethernet Adapter",
+	.bind   = ax88179_bind,
+	.unbind = ax88179_unbind,
+	.status = ax88179_status,
+	.link_reset = ax88179_link_reset,
+	.reset  = ax88179_reset,
+	.stop   = ax88179_stop,
+	.flags  = FLAG_ETHER | FLAG_FRAMING_AX,
+	.rx_fixup = ax88179_rx_fixup,
+	.tx_fixup = ax88179_tx_fixup,
+};
+
+static const struct driver_info at_umc200_info = {
+	.description = "AT-UMC200 USB 3.0/USB 3.1 Gen 1 to Fast Ethernet Adapter",
+	.bind   = ax88179_bind,
+	.unbind = ax88179_unbind,
+	.status = ax88179_status,
+	.link_reset = ax88179_link_reset,
+	.reset  = ax88179_reset,
+	.stop   = ax88179_stop,
+	.flags  = FLAG_ETHER | FLAG_FRAMING_AX,
+	.rx_fixup = ax88179_rx_fixup,
+	.tx_fixup = ax88179_tx_fixup,
+};
+
+static const struct driver_info at_umc2000sp_info = {
+	.description = "AT-UMC2000/SP USB 3.0/USB 3.1 Gen 1 to Gigabit Ethernet Adapter",
+	.bind   = ax88179_bind,
+	.unbind = ax88179_unbind,
+	.status = ax88179_status,
+	.link_reset = ax88179_link_reset,
+	.reset  = ax88179_reset,
+	.stop   = ax88179_stop,
+	.flags  = FLAG_ETHER | FLAG_FRAMING_AX,
+	.rx_fixup = ax88179_rx_fixup,
+	.tx_fixup = ax88179_tx_fixup,
+};
+
 static const struct usb_device_id products[] = {
 {
 	/* ASIX AX88179 10/100/1000 */
@@ -1913,6 +1952,18 @@ static const struct usb_device_id products[] = {
 	/* Magic Control Technology U3-A9003 USB 3.0 Gigabit Ethernet Adapter */
 	USB_DEVICE(0x0711, 0x0179),
 	.driver_info = (unsigned long)&mct_info,
+}, {
+	/* Allied Telesis AT-UMC2000 USB 3.0/USB 3.1 Gen 1 to Gigabit Ethernet Adapter */
+	USB_DEVICE(0x07c9, 0x000e),
+	.driver_info = (unsigned long)&at_umc2000_info,
+}, {
+	/* Allied Telesis AT-UMC200 USB 3.0/USB 3.1 Gen 1 to Fast Ethernet Adapter */
+	USB_DEVICE(0x07c9, 0x000f),
+	.driver_info = (unsigned long)&at_umc200_info,
+}, {
+	/* Allied Telesis AT-UMC2000/SP USB 3.0/USB 3.1 Gen 1 to Gigabit Ethernet Adapter */
+	USB_DEVICE(0x07c9, 0x0010),
+	.driver_info = (unsigned long)&at_umc2000sp_info,
 },
 	{ },
 };

From 4b5f1ad5566ada230aaa2ce861b28d1895f1ea68 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 23 Mar 2022 18:36:23 +0100
Subject: [PATCH 178/229] vsock/virtio: initialize vdev->priv before using VQs

When we fill VQs with empty buffers and kick the host, it may send
an interrupt. `vdev->priv` must be initialized before this since it
is used in the virtqueue callbacks.

Fixes: 0deab087b16a ("vsock/virtio: use RCU to avoid use-after-free on the_virtio_vsock")
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/virtio_transport.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 5afc194a58bb..3e5513934c9f 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -622,6 +622,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	INIT_WORK(&vsock->event_work, virtio_transport_event_work);
 	INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
 
+	vdev->priv = vsock;
+
 	mutex_lock(&vsock->tx_lock);
 	vsock->tx_run = true;
 	mutex_unlock(&vsock->tx_lock);
@@ -639,7 +641,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
 		vsock->seqpacket_allow = true;
 
-	vdev->priv = vsock;
 	rcu_assign_pointer(the_virtio_vsock, vsock);
 
 	mutex_unlock(&the_virtio_vsock_mutex);

From c1011c0b3a9c8d2065f425407475cbcc812540b7 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 23 Mar 2022 18:36:24 +0100
Subject: [PATCH 179/229] vsock/virtio: read the negotiated features before
 using VQs

Complete the driver configuration, reading the negotiated features,
before using the VQs in the virtio_vsock_probe().

Fixes: 53efbba12cc7 ("virtio/vsock: enable SEQPACKET for transport")
Suggested-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/virtio_transport.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 3e5513934c9f..3954d3be9083 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -622,6 +622,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	INIT_WORK(&vsock->event_work, virtio_transport_event_work);
 	INIT_WORK(&vsock->send_pkt_work, virtio_transport_send_pkt_work);
 
+	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
+		vsock->seqpacket_allow = true;
+
 	vdev->priv = vsock;
 
 	mutex_lock(&vsock->tx_lock);
@@ -638,9 +641,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 	vsock->event_run = true;
 	mutex_unlock(&vsock->event_lock);
 
-	if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
-		vsock->seqpacket_allow = true;
-
 	rcu_assign_pointer(the_virtio_vsock, vsock);
 
 	mutex_unlock(&the_virtio_vsock_mutex);

From 88704454ef8b00ea91537ae0d47d9348077e0e72 Mon Sep 17 00:00:00 2001
From: Stefano Garzarella <sgarzare@redhat.com>
Date: Wed, 23 Mar 2022 18:36:25 +0100
Subject: [PATCH 180/229] vsock/virtio: enable VQs early on probe

virtio spec requires drivers to set DRIVER_OK before using VQs.
This is set automatically after probe returns, but virtio-vsock
driver uses VQs in the probe function to fill rx and event VQs
with new buffers.

Let's fix this, calling virtio_device_ready() before using VQs
in the probe function.

Fixes: 0ea9e1d3a9e3 ("VSOCK: Introduce virtio_transport.ko")
Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/vmw_vsock/virtio_transport.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 3954d3be9083..ba1c8cc0c467 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -627,6 +627,8 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 
 	vdev->priv = vsock;
 
+	virtio_device_ready(vdev);
+
 	mutex_lock(&vsock->tx_lock);
 	vsock->tx_run = true;
 	mutex_unlock(&vsock->tx_lock);

From 421ab1be43bd015ffe744f4ea25df4f19d1ce6fe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 25 Mar 2022 10:37:31 -0400
Subject: [PATCH 181/229] SUNRPC: Do not dereference non-socket transports in
 sysfs

Do not cast the struct xprt to a sock_xprt unless we know it is a UDP or
TCP transport. Otherwise the call to lock the mutex will scribble over
whatever structure is actually there. This has been seen to cause hard
system lockups when the underlying transport was RDMA.

Fixes: b49ea673e119 ("SUNRPC: lock against ->sock changing during sysfs read")
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h     |  3 ++
 include/linux/sunrpc/xprtsock.h |  1 -
 net/sunrpc/sysfs.c              | 55 ++++++++++++++++-----------------
 net/sunrpc/xprtsock.c           | 26 ++++++++++++++--
 4 files changed, 54 insertions(+), 31 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 955ea4d7af0b..eef5e87c03b4 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -139,6 +139,9 @@ struct rpc_xprt_ops {
 	void		(*rpcbind)(struct rpc_task *task);
 	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
+	int		(*get_srcaddr)(struct rpc_xprt *xprt, char *buf,
+				       size_t buflen);
+	unsigned short	(*get_srcport)(struct rpc_xprt *xprt);
 	int		(*buf_alloc)(struct rpc_task *task);
 	void		(*buf_free)(struct rpc_task *task);
 	void		(*prepare_request)(struct rpc_rqst *req);
diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h
index 3eb0079669c5..38284f25eddf 100644
--- a/include/linux/sunrpc/xprtsock.h
+++ b/include/linux/sunrpc/xprtsock.h
@@ -10,7 +10,6 @@
 
 int		init_socket_xprt(void);
 void		cleanup_socket_xprt(void);
-unsigned short	get_srcport(struct rpc_xprt *);
 
 #define RPC_MIN_RESVPORT	(1U)
 #define RPC_MAX_RESVPORT	(65535U)
diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 05c758da6a92..9d8a7d9f3e41 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -97,7 +97,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj,
 		return 0;
 	ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
 	xprt_put(xprt);
-	return ret + 1;
+	return ret;
 }
 
 static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
@@ -105,33 +105,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
 					   char *buf)
 {
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
-	struct sockaddr_storage saddr;
-	struct sock_xprt *sock;
-	ssize_t ret = -1;
+	size_t buflen = PAGE_SIZE;
+	ssize_t ret = -ENOTSOCK;
 
 	if (!xprt || !xprt_connected(xprt)) {
-		xprt_put(xprt);
-		return -ENOTCONN;
+		ret = -ENOTCONN;
+	} else if (xprt->ops->get_srcaddr) {
+		ret = xprt->ops->get_srcaddr(xprt, buf, buflen);
+		if (ret > 0) {
+			if (ret < buflen - 1) {
+				buf[ret] = '\n';
+				ret++;
+				buf[ret] = '\0';
+			}
+		}
 	}
-
-	sock = container_of(xprt, struct sock_xprt, xprt);
-	mutex_lock(&sock->recv_mutex);
-	if (sock->sock == NULL ||
-	    kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0)
-		goto out;
-
-	ret = sprintf(buf, "%pISc\n", &saddr);
-out:
-	mutex_unlock(&sock->recv_mutex);
 	xprt_put(xprt);
-	return ret + 1;
+	return ret;
 }
 
 static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
-					struct kobj_attribute *attr,
-					char *buf)
+					struct kobj_attribute *attr, char *buf)
 {
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
+	unsigned short srcport = 0;
+	size_t buflen = PAGE_SIZE;
 	ssize_t ret;
 
 	if (!xprt || !xprt_connected(xprt)) {
@@ -139,7 +137,11 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
 		return -ENOTCONN;
 	}
 
-	ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n"
+	if (xprt->ops->get_srcport)
+		srcport = xprt->ops->get_srcport(xprt);
+
+	ret = snprintf(buf, buflen,
+		       "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n"
 		       "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n"
 		       "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n"
 		       "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n"
@@ -147,14 +149,11 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
 		       xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs,
 		       xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen,
 		       xprt->sending.qlen, xprt->pending.qlen,
-		       xprt->backlog.qlen, xprt->main,
-		       (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ?
-		       get_srcport(xprt) : 0,
+		       xprt->backlog.qlen, xprt->main, srcport,
 		       atomic_long_read(&xprt->queuelen),
-		       (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ?
-				xprt->address_strings[RPC_DISPLAY_PORT] : "0");
+		       xprt->address_strings[RPC_DISPLAY_PORT]);
 	xprt_put(xprt);
-	return ret + 1;
+	return ret;
 }
 
 static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
@@ -201,7 +200,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 	}
 
 	xprt_put(xprt);
-	return ret + 1;
+	return ret;
 }
 
 static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj,
@@ -220,7 +219,7 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj,
 		      xprt_switch->xps_nunique_destaddr_xprts,
 		      atomic_long_read(&xprt_switch->xps_queuelen));
 	xprt_switch_put(xprt_switch);
-	return ret + 1;
+	return ret;
 }
 
 static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b52eaa8a0cda..78af7518f263 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1638,7 +1638,7 @@ static int xs_get_srcport(struct sock_xprt *transport)
 	return port;
 }
 
-unsigned short get_srcport(struct rpc_xprt *xprt)
+static unsigned short xs_sock_srcport(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
 	unsigned short ret = 0;
@@ -1648,7 +1648,25 @@ unsigned short get_srcport(struct rpc_xprt *xprt)
 	mutex_unlock(&sock->recv_mutex);
 	return ret;
 }
-EXPORT_SYMBOL(get_srcport);
+
+static int xs_sock_srcaddr(struct rpc_xprt *xprt, char *buf, size_t buflen)
+{
+	struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt);
+	union {
+		struct sockaddr sa;
+		struct sockaddr_storage st;
+	} saddr;
+	int ret = -ENOTCONN;
+
+	mutex_lock(&sock->recv_mutex);
+	if (sock->sock) {
+		ret = kernel_getsockname(sock->sock, &saddr.sa);
+		if (ret >= 0)
+			ret = snprintf(buf, buflen, "%pISc", &saddr.sa);
+	}
+	mutex_unlock(&sock->recv_mutex);
+	return ret;
+}
 
 static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port)
 {
@@ -2622,6 +2640,8 @@ static const struct rpc_xprt_ops xs_udp_ops = {
 	.rpcbind		= rpcb_getport_async,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
+	.get_srcaddr		= xs_sock_srcaddr,
+	.get_srcport		= xs_sock_srcport,
 	.buf_alloc		= rpc_malloc,
 	.buf_free		= rpc_free,
 	.send_request		= xs_udp_send_request,
@@ -2644,6 +2664,8 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
 	.rpcbind		= rpcb_getport_async,
 	.set_port		= xs_set_port,
 	.connect		= xs_connect,
+	.get_srcaddr		= xs_sock_srcaddr,
+	.get_srcport		= xs_sock_srcport,
 	.buf_alloc		= rpc_malloc,
 	.buf_free		= rpc_free,
 	.prepare_request	= xs_stream_prepare_request,

From ebbe788731cb52a0ef69cf962813b302ef5b399e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 24 Mar 2022 17:19:59 -0400
Subject: [PATCH 182/229] SUNRPC: Don't return error values in sysfs read of
 closed files

Instead of returning an error value, which ends up being the return
value for the read() system call, it is more elegant to simply return
the error as a string value.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/sysfs.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c
index 9d8a7d9f3e41..a3a2f8aeb80e 100644
--- a/net/sunrpc/sysfs.c
+++ b/net/sunrpc/sysfs.c
@@ -93,10 +93,13 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj,
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
 	ssize_t ret;
 
-	if (!xprt)
-		return 0;
+	if (!xprt) {
+		ret = sprintf(buf, "<closed>\n");
+		goto out;
+	}
 	ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
 	xprt_put(xprt);
+out:
 	return ret;
 }
 
@@ -106,10 +109,10 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
 {
 	struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj);
 	size_t buflen = PAGE_SIZE;
-	ssize_t ret = -ENOTSOCK;
+	ssize_t ret;
 
 	if (!xprt || !xprt_connected(xprt)) {
-		ret = -ENOTCONN;
+		ret = sprintf(buf, "<closed>\n");
 	} else if (xprt->ops->get_srcaddr) {
 		ret = xprt->ops->get_srcaddr(xprt, buf, buflen);
 		if (ret > 0) {
@@ -118,8 +121,10 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj,
 				ret++;
 				buf[ret] = '\0';
 			}
-		}
-	}
+		} else
+			ret = sprintf(buf, "<closed>\n");
+	} else
+		ret = sprintf(buf, "<not a socket>\n");
 	xprt_put(xprt);
 	return ret;
 }
@@ -133,8 +138,8 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
 	ssize_t ret;
 
 	if (!xprt || !xprt_connected(xprt)) {
-		xprt_put(xprt);
-		return -ENOTCONN;
+		ret = sprintf(buf, "<closed>\n");
+		goto out;
 	}
 
 	if (xprt->ops->get_srcport)
@@ -152,6 +157,7 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj,
 		       xprt->backlog.qlen, xprt->main, srcport,
 		       atomic_long_read(&xprt->queuelen),
 		       xprt->address_strings[RPC_DISPLAY_PORT]);
+out:
 	xprt_put(xprt);
 	return ret;
 }
@@ -165,10 +171,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj,
 	int locked, connected, connecting, close_wait, bound, binding,
 	    closing, congested, cwnd_wait, write_space, offline, remove;
 
-	if (!xprt)
-		return 0;
-
-	if (!xprt->state) {
+	if (!(xprt && xprt->state)) {
 		ret = sprintf(buf, "state=CLOSED\n");
 	} else {
 		locked = test_bit(XPRT_LOCKED, &xprt->state);

From 7000ef38052b219583f17a10ef2e7ca99b5e585e Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Wed, 23 Mar 2022 11:26:02 +0100
Subject: [PATCH 183/229] Documentation: amd-pstate: grammar and sentence
 structure updates

Signed-off-by: Jan Engelhardt <jengelh@inai.de>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/admin-guide/pm/amd-pstate.rst | 135 ++++++++++----------
 1 file changed, 67 insertions(+), 68 deletions(-)

diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 2f066df4ee9c..c83db37695da 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -19,7 +19,7 @@ Linux kernel. The new mechanism is based on Collaborative Processor
 Performance Control (CPPC) which provides finer grain frequency management
 than legacy ACPI hardware P-States. Current AMD CPU/APU platforms are using
 the ACPI P-states driver to manage CPU frequency and clocks with switching
-only in 3 P-states. CPPC replaces the ACPI P-states controls, allows a
+only in 3 P-states. CPPC replaces the ACPI P-states controls and allows a
 flexible, low-latency interface for the Linux kernel to directly
 communicate the performance hints to hardware.
 
@@ -27,7 +27,7 @@ communicate the performance hints to hardware.
 ``ondemand``, etc. to manage the performance hints which are provided by
 CPPC hardware functionality that internally follows the hardware
 specification (for details refer to AMD64 Architecture Programmer's Manual
-Volume 2: System Programming [1]_). Currently ``amd-pstate`` supports basic
+Volume 2: System Programming [1]_). Currently, ``amd-pstate`` supports basic
 frequency control function according to kernel governors on some of the
 Zen2 and Zen3 processors, and we will implement more AMD specific functions
 in future after we verify them on the hardware and SBIOS.
@@ -41,9 +41,9 @@ continuous, abstract, and unit-less performance value in a scale that is
 not tied to a specific performance state / frequency. This is an ACPI
 standard [2]_ which software can specify application performance goals and
 hints as a relative target to the infrastructure limits. AMD processors
-provides the low latency register model (MSR) instead of AML code
+provide the low latency register model (MSR) instead of an AML code
 interpreter for performance adjustments. ``amd-pstate`` will initialize a
-``struct cpufreq_driver`` instance ``amd_pstate_driver`` with the callbacks
+``struct cpufreq_driver`` instance, ``amd_pstate_driver``, with the callbacks
 to manage each performance update behavior. ::
 
  Highest Perf ------>+-----------------------+                         +-----------------------+
@@ -91,26 +91,26 @@ AMD CPPC Performance Capability
 Highest Performance (RO)
 .........................
 
-It is the absolute maximum performance an individual processor may reach,
+This is the absolute maximum performance an individual processor may reach,
 assuming ideal conditions. This performance level may not be sustainable
 for long durations and may only be achievable if other platform components
-are in a specific state; for example, it may require other processors be in
+are in a specific state; for example, it may require other processors to be in
 an idle state. This would be equivalent to the highest frequencies
 supported by the processor.
 
 Nominal (Guaranteed) Performance (RO)
 ......................................
 
-It is the maximum sustained performance level of the processor, assuming
-ideal operating conditions. In absence of an external constraint (power,
-thermal, etc.) this is the performance level the processor is expected to
+This is the maximum sustained performance level of the processor, assuming
+ideal operating conditions. In the absence of an external constraint (power,
+thermal, etc.), this is the performance level the processor is expected to
 be able to maintain continuously. All cores/processors are expected to be
 able to sustain their nominal performance state simultaneously.
 
 Lowest non-linear Performance (RO)
 ...................................
 
-It is the lowest performance level at which nonlinear power savings are
+This is the lowest performance level at which nonlinear power savings are
 achieved, for example, due to the combined effects of voltage and frequency
 scaling. Above this threshold, lower performance levels should be generally
 more energy efficient than higher performance levels. This register
@@ -119,7 +119,7 @@ effectively conveys the most efficient performance level to ``amd-pstate``.
 Lowest Performance (RO)
 ........................
 
-It is the absolute lowest performance level of the processor. Selecting a
+This is the absolute lowest performance level of the processor. Selecting a
 performance level lower than the lowest nonlinear performance level may
 cause an efficiency penalty but should reduce the instantaneous power
 consumption of the processor.
@@ -149,14 +149,14 @@ a relative number. This can be expressed as percentage of nominal
 performance (infrastructure max). Below the nominal sustained performance
 level, desired performance expresses the average performance level of the
 processor subject to hardware. Above the nominal performance level,
-processor must provide at least nominal performance requested and go higher
+the processor must provide at least nominal performance requested and go higher
 if current operating conditions allow.
 
 Energy Performance Preference (EPP) (RW)
 .........................................
 
-Provides a hint to the hardware if software wants to bias toward performance
-(0x0) or energy efficiency (0xff).
+This attribute provides a hint to the hardware if software wants to bias
+toward performance (0x0) or energy efficiency (0xff).
 
 
 Key Governors Support
@@ -173,35 +173,34 @@ operating frequencies supported by the hardware. Users can check the
 ``amd-pstate`` mainly supports ``schedutil`` and ``ondemand`` for dynamic
 frequency control. It is to fine tune the processor configuration on
 ``amd-pstate`` to the ``schedutil`` with CPU CFS scheduler. ``amd-pstate``
-registers adjust_perf callback to implement the CPPC similar performance
-update behavior. It is initialized by ``sugov_start`` and then populate the
-CPU's update_util_data pointer to assign ``sugov_update_single_perf`` as
-the utilization update callback function in CPU scheduler. CPU scheduler
-will call ``cpufreq_update_util`` and assign the target performance
-according to the ``struct sugov_cpu`` that utilization update belongs to.
-Then ``amd-pstate`` updates the desired performance according to the CPU
+registers the adjust_perf callback to implement performance update behavior
+similar to CPPC. It is initialized by ``sugov_start`` and then populates the
+CPU's update_util_data pointer to assign ``sugov_update_single_perf`` as the
+utilization update callback function in the CPU scheduler. The CPU scheduler
+will call ``cpufreq_update_util`` and assigns the target performance according
+to the ``struct sugov_cpu`` that the utilization update belongs to.
+Then, ``amd-pstate`` updates the desired performance according to the CPU
 scheduler assigned.
 
 
 Processor Support
 =======================
 
-The ``amd-pstate`` initialization will fail if the _CPC in ACPI SBIOS is
-not existed at the detected processor, and it uses ``acpi_cpc_valid`` to
-check the _CPC existence. All Zen based processors support legacy ACPI
-hardware P-States function, so while the ``amd-pstate`` fails to be
-initialized, the kernel will fall back to initialize ``acpi-cpufreq``
-driver.
+The ``amd-pstate`` initialization will fail if the ``_CPC`` entry in the ACPI
+SBIOS does not exist in the detected processor. It uses ``acpi_cpc_valid``
+to check the existence of ``_CPC``. All Zen based processors support the legacy
+ACPI hardware P-States function, so when ``amd-pstate`` fails initialization,
+the kernel will fall back to initialize the ``acpi-cpufreq`` driver.
 
 There are two types of hardware implementations for ``amd-pstate``: one is
 `Full MSR Support <perf_cap_>`_ and another is `Shared Memory Support
-<perf_cap_>`_. It can use :c:macro:`X86_FEATURE_CPPC` feature flag (for
-details refer to Processor Programming Reference (PPR) for AMD Family
-19h Model 51h, Revision A1 Processors [3]_) to indicate the different
-types. ``amd-pstate`` is to register different ``static_call`` instances
-for different hardware implementations.
+<perf_cap_>`_. It can use the :c:macro:`X86_FEATURE_CPPC` feature flag to
+indicate the different types. (For details, refer to the Processor Programming
+Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors [3]_.)
+``amd-pstate`` is to register different ``static_call`` instances for different
+hardware implementations.
 
-Currently, some of Zen2 and Zen3 processors support ``amd-pstate``. In the
+Currently, some of the Zen2 and Zen3 processors support ``amd-pstate``. In the
 future, it will be supported on more and more AMD processors.
 
 Full MSR Support
@@ -210,18 +209,18 @@ Full MSR Support
 Some new Zen3 processors such as Cezanne provide the MSR registers directly
 while the :c:macro:`X86_FEATURE_CPPC` CPU feature flag is set.
 ``amd-pstate`` can handle the MSR register to implement the fast switch
-function in ``CPUFreq`` that can shrink latency of frequency control on the
-interrupt context. The functions with ``pstate_xxx`` prefix represent the
-operations of MSR registers.
+function in ``CPUFreq`` that can reduce the latency of frequency control in
+interrupt context. The functions with a ``pstate_xxx`` prefix represent the
+operations on MSR registers.
 
 Shared Memory Support
 ----------------------
 
-If :c:macro:`X86_FEATURE_CPPC` CPU feature flag is not set, that means the
-processor supports shared memory solution. In this case, ``amd-pstate``
+If the :c:macro:`X86_FEATURE_CPPC` CPU feature flag is not set, the
+processor supports the shared memory solution. In this case, ``amd-pstate``
 uses the ``cppc_acpi`` helper methods to implement the callback functions
-that defined on ``static_call``. The functions with ``cppc_xxx`` prefix
-represent the operations of acpi cppc helpers for shared memory solution.
+that are defined on ``static_call``. The functions with the ``cppc_xxx`` prefix
+represent the operations of ACPI CPPC helpers for the shared memory solution.
 
 
 AMD P-States and ACPI hardware P-States always can be supported in one
@@ -234,7 +233,7 @@ User Space Interface in ``sysfs``
 ==================================
 
 ``amd-pstate`` exposes several global attributes (files) in ``sysfs`` to
-control its functionality at the system level. They located in the
+control its functionality at the system level. They are located in the
 ``/sys/devices/system/cpu/cpufreq/policyX/`` directory and affect all CPUs. ::
 
  root@hr-test1:/home/ray# ls /sys/devices/system/cpu/cpufreq/policy0/*amd*
@@ -246,38 +245,38 @@ control its functionality at the system level. They located in the
 ``amd_pstate_highest_perf / amd_pstate_max_freq``
 
 Maximum CPPC performance and CPU frequency that the driver is allowed to
-set in percent of the maximum supported CPPC performance level (the highest
+set, in percent of the maximum supported CPPC performance level (the highest
 performance supported in `AMD CPPC Performance Capability <perf_cap_>`_).
-In some of ASICs, the highest CPPC performance is not the one in the _CPC
-table, so we need to expose it to sysfs. If boost is not active but
-supported, this maximum frequency will be larger than the one in
+In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
+table, so we need to expose it to sysfs. If boost is not active, but
+still supported, this maximum frequency will be larger than the one in
 ``cpuinfo``.
 This attribute is read-only.
 
 ``amd_pstate_lowest_nonlinear_freq``
 
-The lowest non-linear CPPC CPU frequency that the driver is allowed to set
-in percent of the maximum supported CPPC performance level (Please see the
+The lowest non-linear CPPC CPU frequency that the driver is allowed to set,
+in percent of the maximum supported CPPC performance level. (Please see the
 lowest non-linear performance in `AMD CPPC Performance Capability
-<perf_cap_>`_).
+<perf_cap_>`_.)
 This attribute is read-only.
 
-For other performance and frequency values, we can read them back from
+Other performance and frequency values can be read back from
 ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`.
 
 
 ``amd-pstate`` vs ``acpi-cpufreq``
 ======================================
 
-On majority of AMD platforms supported by ``acpi-cpufreq``, the ACPI tables
-provided by the platform firmware used for CPU performance scaling, but
-only provides 3 P-states on AMD processors.
-However, on modern AMD APU and CPU series, it provides the collaborative
-processor performance control according to ACPI protocol and customize this
-for AMD platforms. That is fine-grain and continuous frequency range
+On the majority of AMD platforms supported by ``acpi-cpufreq``, the ACPI tables
+provided by the platform firmware are used for CPU performance scaling, but
+only provide 3 P-states on AMD processors.
+However, on modern AMD APU and CPU series, hardware provides the Collaborative
+Processor Performance Control according to the ACPI protocol and customizes this
+for AMD platforms. That is, fine-grained and continuous frequency ranges
 instead of the legacy hardware P-states. ``amd-pstate`` is the kernel
-module which supports the new AMD P-States mechanism on most of future AMD
-platforms. The AMD P-States mechanism will be the more performance and energy
+module which supports the new AMD P-States mechanism on most of the future AMD
+platforms. The AMD P-States mechanism is the more performance and energy
 efficiency frequency management method on AMD processors.
 
 Kernel Module Options for ``amd-pstate``
@@ -287,25 +286,25 @@ Kernel Module Options for ``amd-pstate``
 Use a module param (shared_mem) to enable related processors manually with
 **amd_pstate.shared_mem=1**.
 Due to the performance issue on the processors with `Shared Memory Support
-<perf_cap_>`_, so we disable it for the moment and will enable this by default
-once we address performance issue on this solution.
+<perf_cap_>`_, we disable it presently and will re-enable this by default
+once we address performance issue with this solution.
 
-The way to check whether current processor is `Full MSR Support <perf_cap_>`_
+To check whether the current processor is using `Full MSR Support <perf_cap_>`_
 or `Shared Memory Support <perf_cap_>`_ : ::
 
   ray@hr-test1:~$ lscpu | grep cppc
   Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm
 
-If CPU Flags have cppc, then this processor supports `Full MSR Support
-<perf_cap_>`_. Otherwise it supports `Shared Memory Support <perf_cap_>`_.
+If the CPU flags have ``cppc``, then this processor supports `Full MSR Support
+<perf_cap_>`_. Otherwise, it supports `Shared Memory Support <perf_cap_>`_.
 
 
 ``cpupower`` tool support for ``amd-pstate``
 ===============================================
 
-``amd-pstate`` is supported on ``cpupower`` tool that can be used to dump the frequency
-information. And it is in progress to support more and more operations for new
-``amd-pstate`` module with this tool. ::
+``amd-pstate`` is supported by the ``cpupower`` tool, which can be used to dump
+frequency information. Development is in progress to support more and more
+operations for the new ``amd-pstate`` module with this tool. ::
 
  root@hr-test1:/home/ray# cpupower frequency-info
  analyzing CPU 0:
@@ -336,10 +335,10 @@ Trace Events
 --------------
 
 There are two static trace events that can be used for ``amd-pstate``
-diagnostics.  One of them is the cpu_frequency trace event generally used
+diagnostics. One of them is the ``cpu_frequency`` trace event generally used
 by ``CPUFreq``, and the other one is the ``amd_pstate_perf`` trace event
 specific to ``amd-pstate``.  The following sequence of shell commands can
-be used to enable them and see their output (if the kernel is generally
+be used to enable them and see their output (if the kernel is
 configured to support event tracing). ::
 
  root@hr-test1:/home/ray# cd /sys/kernel/tracing/
@@ -364,7 +363,7 @@ configured to support event tracing). ::
           <idle>-0       [003] d.s..  4995.980971: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=3 changed=false fast_switch=true
           <idle>-0       [011] d.s..  4995.980996: amd_pstate_perf: amd_min_perf=85 amd_des_perf=85 amd_max_perf=166 cpu_id=11 changed=false fast_switch=true
 
-The cpu_frequency trace event will be triggered either by the ``schedutil`` scaling
+The ``cpu_frequency`` trace event will be triggered either by the ``schedutil`` scaling
 governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the
 policies with other scaling governors).
 

From ff32baa1f39b1adb519479a51e7acbcbfdd2206c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20K=C4=85dzio=C5=82ka?=
 <niedzejkob@invisiblethingslab.com>
Date: Wed, 23 Mar 2022 02:21:03 +0100
Subject: [PATCH 184/229] xen: don't hang when resuming PCI device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a xen domain with at least two VCPUs has a PCI device attached which
enters the D3hot state during suspend, the kernel may hang while
resuming, depending on the core on which an async resume task gets
scheduled.

The bug occurs because xen's do_suspend calls dpm_resume_start while
only the timer of the boot CPU has been resumed (when xen_suspend called
syscore_resume), before calling xen_arch_suspend to resume the timers of
the other CPUs. This breaks pci_dev_d3_sleep.

Thus this patch moves the call to xen_arch_resume before the call to
dpm_resume_start, eliminating the hangs and restoring the stack-like
structure of the suspend/restore procedure.

Signed-off-by: Jakub Kądziołka <niedzejkob@invisiblethingslab.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/r/20220323012103.2537-1-niedzejkob@invisiblethingslab.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 drivers/xen/manage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 374d36de7f5a..3d5a384d65f7 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -141,6 +141,8 @@ static void do_suspend(void)
 
 	raw_notifier_call_chain(&xen_resume_notifier, 0, NULL);
 
+	xen_arch_resume();
+
 	dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE);
 
 	if (err) {
@@ -148,8 +150,6 @@ static void do_suspend(void)
 		si.cancelled = 1;
 	}
 
-	xen_arch_resume();
-
 out_resume:
 	if (!si.cancelled)
 		xs_resume();

From de2ae403b4c0e79a3410e63bc448542fbb9f9bfc Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Fri, 25 Mar 2022 15:20:02 +0100
Subject: [PATCH 185/229] xen: fix is_xen_pmu()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

is_xen_pmu() is taking the cpu number as parameter, but it is not using
it. Instead it just tests whether the Xen PMU initialization on the
current cpu did succeed. As this test is done by checking a percpu
pointer, preemption needs to be disabled in order to avoid switching
the cpu while doing the test. While resuming from suspend() this seems
not to be the case:

[   88.082751] ACPI: PM: Low-level resume complete
[   88.087933] ACPI: EC: EC started
[   88.091464] ACPI: PM: Restoring platform NVS memory
[   88.097166] xen_acpi_processor: Uploading Xen processor PM info
[   88.103850] Enabling non-boot CPUs ...
[   88.108128] installing Xen timer for CPU 1
[   88.112763] BUG: using smp_processor_id() in preemptible [00000000] code: systemd-sleep/7138
[   88.122256] caller is is_xen_pmu+0x12/0x30
[   88.126937] CPU: 0 PID: 7138 Comm: systemd-sleep Tainted: G        W         5.16.13-2.fc32.qubes.x86_64 #1
[   88.137939] Hardware name: Star Labs StarBook/StarBook, BIOS 7.97 03/21/2022
[   88.145930] Call Trace:
[   88.148757]  <TASK>
[   88.151193]  dump_stack_lvl+0x48/0x5e
[   88.155381]  check_preemption_disabled+0xde/0xe0
[   88.160641]  is_xen_pmu+0x12/0x30
[   88.164441]  xen_smp_intr_init_pv+0x75/0x100

Fix that by replacing is_xen_pmu() by a simple boolean variable which
reflects the Xen PMU initialization state on cpu 0.

Modify xen_pmu_init() to return early in case it is being called for a
cpu other than cpu 0 and the boolean variable not being set.

Fixes: bf6dfb154d93 ("xen/PMU: PMU emulation code")
Reported-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/r/20220325142002.31789-1-jgross@suse.com
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/xen/pmu.c    | 10 ++++------
 arch/x86/xen/pmu.h    |  3 ++-
 arch/x86/xen/smp_pv.c |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 89dd6b1708b0..21ecbe754cb2 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -506,10 +506,7 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
 	return ret;
 }
 
-bool is_xen_pmu(int cpu)
-{
-	return (get_xenpmu_data() != NULL);
-}
+bool is_xen_pmu;
 
 void xen_pmu_init(int cpu)
 {
@@ -520,7 +517,7 @@ void xen_pmu_init(int cpu)
 
 	BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
 
-	if (xen_hvm_domain())
+	if (xen_hvm_domain() || (cpu != 0 && !is_xen_pmu))
 		return;
 
 	xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
@@ -541,7 +538,8 @@ void xen_pmu_init(int cpu)
 	per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
 	per_cpu(xenpmu_shared, cpu).flags = 0;
 
-	if (cpu == 0) {
+	if (!is_xen_pmu) {
+		is_xen_pmu = true;
 		perf_register_guest_info_callbacks(&xen_guest_cbs);
 		xen_pmu_arch_init();
 	}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
index 0e83a160589b..65c58894fc79 100644
--- a/arch/x86/xen/pmu.h
+++ b/arch/x86/xen/pmu.h
@@ -4,6 +4,8 @@
 
 #include <xen/interface/xenpmu.h>
 
+extern bool is_xen_pmu;
+
 irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
 #ifdef CONFIG_XEN_HAVE_VPMU
 void xen_pmu_init(int cpu);
@@ -12,7 +14,6 @@ void xen_pmu_finish(int cpu);
 static inline void xen_pmu_init(int cpu) {}
 static inline void xen_pmu_finish(int cpu) {}
 #endif
-bool is_xen_pmu(int cpu);
 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
 int pmu_apic_update(uint32_t reg);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 4a6019238ee7..688aa8b6ae29 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -129,7 +129,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
 	per_cpu(xen_irq_work, cpu).irq = rc;
 	per_cpu(xen_irq_work, cpu).name = callfunc_name;
 
-	if (is_xen_pmu(cpu)) {
+	if (is_xen_pmu) {
 		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
 		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
 					     xen_pmu_irq_handler,

From feb13dcb1818b775fbd9191f797be67cd605f03e Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 24 Mar 2022 18:12:10 +0200
Subject: [PATCH 186/229] net: enetc: report software timestamping via
 SO_TIMESTAMPING

Let user space properly determine that the enetc driver provides
software timestamps.

Fixes: 4caefbce06d1 ("enetc: add software timestamping")
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Claudiu Manoil <claudiu.manoil@nxp.com>
Link: https://lore.kernel.org/r/20220324161210.4122281-1-vladimir.oltean@nxp.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
index fa5b4f885b17..60ec64bfb3f0 100644
--- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
+++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c
@@ -674,7 +674,10 @@ static int enetc_get_ts_info(struct net_device *ndev,
 #ifdef CONFIG_FSL_ENETC_PTP_CLOCK
 	info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
 				SOF_TIMESTAMPING_RX_HARDWARE |
-				SOF_TIMESTAMPING_RAW_HARDWARE;
+				SOF_TIMESTAMPING_RAW_HARDWARE |
+				SOF_TIMESTAMPING_TX_SOFTWARE |
+				SOF_TIMESTAMPING_RX_SOFTWARE |
+				SOF_TIMESTAMPING_SOFTWARE;
 
 	info->tx_types = (1 << HWTSTAMP_TX_OFF) |
 			 (1 << HWTSTAMP_TX_ON) |

From 264a9c5c9dff40f92a2a9ad1757d59a4438114fd Mon Sep 17 00:00:00 2001
From: Casper Andersson <casper.casan@gmail.com>
Date: Thu, 24 Mar 2022 12:38:52 +0100
Subject: [PATCH 187/229] net: sparx5: Remove unused GLAG handling in PGID

Removes PGID handling for GLAG since it is not used
yet. According to feedback on previous patch.
https://lore.kernel.org/netdev/20220322081823.wqbx7vud4q7qtjuq@wse-c0155/T/#t

Signed-off-by: Casper Andersson <casper.casan@gmail.com>
Reviewed-by: Steen Hegelund <Steen.Hegelund@microchip.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/microchip/sparx5/sparx5_main.h |  4 ----
 .../net/ethernet/microchip/sparx5/sparx5_pgid.c | 17 -----------------
 2 files changed, 21 deletions(-)

diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
index 7a04b8f2a546..8e77d7ee8e68 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
@@ -69,9 +69,6 @@ enum sparx5_vlan_port_type {
 #define PGID_TABLE_SIZE	       3290
 
 #define PGID_MCAST_START 65
-#define PGID_GLAG_START 833
-#define PGID_GLAG_END 1088
-
 #define IFH_LEN                9 /* 36 bytes */
 #define NULL_VID               0
 #define SPX5_MACT_PULL_DELAY   (2 * HZ)
@@ -374,7 +371,6 @@ enum sparx5_pgid_type {
 	SPX5_PGID_FREE,
 	SPX5_PGID_RESERVED,
 	SPX5_PGID_MULTICAST,
-	SPX5_PGID_GLAG
 };
 
 void sparx5_pgid_init(struct sparx5 *spx5);
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c b/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
index 90366fcb9958..851a559269e1 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
@@ -15,28 +15,11 @@ void sparx5_pgid_init(struct sparx5 *spx5)
 		spx5->pgid_map[i] = SPX5_PGID_RESERVED;
 }
 
-int sparx5_pgid_alloc_glag(struct sparx5 *spx5, u16 *idx)
-{
-	int i;
-
-	for (i = PGID_GLAG_START; i <= PGID_GLAG_END; i++)
-		if (spx5->pgid_map[i] == SPX5_PGID_FREE) {
-			spx5->pgid_map[i] = SPX5_PGID_GLAG;
-			*idx = i;
-			return 0;
-		}
-
-	return -EBUSY;
-}
-
 int sparx5_pgid_alloc_mcast(struct sparx5 *spx5, u16 *idx)
 {
 	int i;
 
 	for (i = PGID_MCAST_START; i < PGID_TABLE_SIZE; i++) {
-		if (i == PGID_GLAG_START)
-			i = PGID_GLAG_END + 1;
-
 		if (spx5->pgid_map[i] == SPX5_PGID_FREE) {
 			spx5->pgid_map[i] = SPX5_PGID_MULTICAST;
 			*idx = i;

From ad238fc6de7d1adece5167df09464912c098b021 Mon Sep 17 00:00:00 2001
From: Casper Andersson <casper.casan@gmail.com>
Date: Thu, 24 Mar 2022 12:38:53 +0100
Subject: [PATCH 188/229] net: sparx5: Refactor mdb handling according to
 feedback

- Remove mact_lookup and use new mact_find instead.
- Make pgid_read_mask function.
- Set PGID arbiter to start searching at PGID_BASE + 8.

This is according to feedback on previous patch.
https://lore.kernel.org/netdev/20220322081823.wqbx7vud4q7qtjuq@wse-c0155/T/#t

Signed-off-by: Casper Andersson <casper.casan@gmail.com>
Reviewed-by: Steen Hegelund <Steen.Hegelund@microchip.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../microchip/sparx5/sparx5_mactable.c        | 19 ++++---------------
 .../ethernet/microchip/sparx5/sparx5_main.h   |  3 ++-
 .../ethernet/microchip/sparx5/sparx5_pgid.c   |  3 +++
 .../microchip/sparx5/sparx5_switchdev.c       | 18 ++++++++----------
 .../ethernet/microchip/sparx5/sparx5_vlan.c   |  7 +++++++
 5 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c b/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c
index 35abb3d0ce19..a5837dbe0c7e 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c
@@ -212,19 +212,7 @@ bool sparx5_mact_find(struct sparx5 *sparx5,
 
 	mutex_unlock(&sparx5->lock);
 
-	return ret == 0;
-}
-
-static int sparx5_mact_lookup(struct sparx5 *sparx5,
-			      const unsigned char mac[ETH_ALEN],
-			      u16 vid)
-{
-	u32 pcfg2;
-
-	if (sparx5_mact_find(sparx5, mac, vid, &pcfg2))
-		return 1;
-
-	return 0;
+	return ret;
 }
 
 int sparx5_mact_forget(struct sparx5 *sparx5,
@@ -305,9 +293,10 @@ int sparx5_add_mact_entry(struct sparx5 *sparx5,
 {
 	struct sparx5_mact_entry *mact_entry;
 	int ret;
+	u32 cfg2;
 
-	ret = sparx5_mact_lookup(sparx5, addr, vid);
-	if (ret)
+	ret = sparx5_mact_find(sparx5, addr, vid, &cfg2);
+	if (!ret)
 		return 0;
 
 	/* In case the entry already exists, don't add it again to SW,
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
index 8e77d7ee8e68..b197129044b5 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
@@ -65,10 +65,10 @@ enum sparx5_vlan_port_type {
 #define PGID_IPV6_MC_CTRL      (PGID_BASE + 5)
 #define PGID_BCAST	       (PGID_BASE + 6)
 #define PGID_CPU	       (PGID_BASE + 7)
+#define PGID_MCAST_START       (PGID_BASE + 8)
 
 #define PGID_TABLE_SIZE	       3290
 
-#define PGID_MCAST_START 65
 #define IFH_LEN                9 /* 36 bytes */
 #define NULL_VID               0
 #define SPX5_MACT_PULL_DELAY   (2 * HZ)
@@ -325,6 +325,7 @@ void sparx5_mact_init(struct sparx5 *sparx5);
 
 /* sparx5_vlan.c */
 void sparx5_pgid_update_mask(struct sparx5_port *port, int pgid, bool enable);
+void sparx5_pgid_read_mask(struct sparx5 *sparx5, int pgid, u32 portmask[3]);
 void sparx5_update_fwd(struct sparx5 *sparx5);
 void sparx5_vlan_init(struct sparx5 *sparx5);
 void sparx5_vlan_port_setup(struct sparx5 *sparx5, int portno);
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c b/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
index 851a559269e1..af8b435009f4 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_pgid.c
@@ -19,6 +19,9 @@ int sparx5_pgid_alloc_mcast(struct sparx5 *spx5, u16 *idx)
 {
 	int i;
 
+	/* The multicast area starts at index 65, but the first 7
+	 * are reserved for flood masks and CPU. Start alloc after that.
+	 */
 	for (i = PGID_MCAST_START; i < PGID_TABLE_SIZE; i++) {
 		if (spx5->pgid_map[i] == SPX5_PGID_FREE) {
 			spx5->pgid_map[i] = SPX5_PGID_MULTICAST;
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
index 2d8e0b81c839..5389fffc694a 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c
@@ -406,11 +406,11 @@ static int sparx5_handle_port_mdb_add(struct net_device *dev,
 
 	res = sparx5_mact_find(spx5, v->addr, vid, &mact_entry);
 
-	if (res) {
+	if (res == 0) {
 		pgid_idx = LRN_MAC_ACCESS_CFG_2_MAC_ENTRY_ADDR_GET(mact_entry);
 
-		/* MC_IDX has an offset of 65 in the PGID table. */
-		pgid_idx += PGID_MCAST_START;
+		/* MC_IDX starts after the port masks in the PGID table */
+		pgid_idx += SPX5_PORTS;
 		sparx5_pgid_update_mask(port, pgid_idx, true);
 	} else {
 		err = sparx5_pgid_alloc_mcast(spx5, &pgid_idx);
@@ -468,17 +468,15 @@ static int sparx5_handle_port_mdb_del(struct net_device *dev,
 
 	res = sparx5_mact_find(spx5, v->addr, vid, &mact_entry);
 
-	if (res) {
+	if (res == 0) {
 		pgid_idx = LRN_MAC_ACCESS_CFG_2_MAC_ENTRY_ADDR_GET(mact_entry);
 
-		/* MC_IDX has an offset of 65 in the PGID table. */
-		pgid_idx += PGID_MCAST_START;
+		/* MC_IDX starts after the port masks in the PGID table */
+		pgid_idx += SPX5_PORTS;
 		sparx5_pgid_update_mask(port, pgid_idx, false);
 
-		pgid_entry[0] = spx5_rd(spx5, ANA_AC_PGID_CFG(pgid_idx));
-		pgid_entry[1] = spx5_rd(spx5, ANA_AC_PGID_CFG1(pgid_idx));
-		pgid_entry[2] = spx5_rd(spx5, ANA_AC_PGID_CFG2(pgid_idx));
-		if (pgid_entry[0] == 0 && pgid_entry[1] == 0 && pgid_entry[2] == 0) {
+		sparx5_pgid_read_mask(spx5, pgid_idx, pgid_entry);
+		if (bitmap_empty((unsigned long *)pgid_entry, SPX5_PORTS)) {
 			/* No ports are in MC group. Remove entry */
 			err = sparx5_mdb_del_entry(dev, spx5, v->addr, vid, pgid_idx);
 			if (err)
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c
index 8e56ffa1c4f7..37e4ac965849 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_vlan.c
@@ -138,6 +138,13 @@ void sparx5_pgid_update_mask(struct sparx5_port *port, int pgid, bool enable)
 	}
 }
 
+void sparx5_pgid_read_mask(struct sparx5 *spx5, int pgid, u32 portmask[3])
+{
+	portmask[0] = spx5_rd(spx5, ANA_AC_PGID_CFG(pgid));
+	portmask[1] = spx5_rd(spx5, ANA_AC_PGID_CFG1(pgid));
+	portmask[2] = spx5_rd(spx5, ANA_AC_PGID_CFG2(pgid));
+}
+
 void sparx5_update_fwd(struct sparx5 *sparx5)
 {
 	DECLARE_BITMAP(workmask, SPX5_PORTS);

From ccb18f05535c96d26e2d559d402acb87700fc5a7 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Thu, 24 Mar 2022 20:54:47 +0800
Subject: [PATCH 189/229] net: hns3: fix bug when PF set the duplicate MAC
 address for VFs

If the MAC address A is configured to vport A and then vport B. The MAC
address of vport A in the hardware becomes invalid. If the address of
vport A is changed to MAC address B, the driver needs to delete the MAC
address A of vport A. Due to the MAC address A of vport A has become
invalid in the hardware entry, so "-ENOENT" is returned. In this case, the
"used_umv_size" value recorded in driver is not updated. As a result, the
MAC entry status of the software is inconsistent with that of the hardware.

Therefore, the driver updates the umv size even if the MAC entry cannot be
found. Ensure that the software and hardware status is consistent.

Fixes: ee4bcd3b7ae4 ("net: hns3: refactor the MAC address configure")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 78d0498bdabc..0e1a110e7847 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -8438,12 +8438,11 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport,
 	hnae3_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0);
 	hclge_prepare_mac_addr(&req, addr, false);
 	ret = hclge_remove_mac_vlan_tbl(vport, &req);
-	if (!ret) {
+	if (!ret || ret == -ENOENT) {
 		mutex_lock(&hdev->vport_lock);
 		hclge_update_umv_space(vport, true);
 		mutex_unlock(&hdev->vport_lock);
-	} else if (ret == -ENOENT) {
-		ret = 0;
+		return 0;
 	}
 
 	return ret;

From c0f46de30c965d4ba208b5bf1a6d3437a7556ee2 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Thu, 24 Mar 2022 20:54:48 +0800
Subject: [PATCH 190/229] net: hns3: fix port base vlan add fail when
 concurrent with reset

Currently, Port base vlan is initiated by PF and configured to its VFs,
by using command "ip link set <pf name> vf <vf id> vlan <vlan id>".
When a global reset was triggered, the hardware vlan table and the soft
recorded vlan information will be cleared by PF, and restored them until
VFs were ready. There is a short time window between the table had been
cleared and before table restored. If configured a new port base vlan tag
at this moment, driver will check the soft recorded vlan information,
and find there hasn't the old tag in it, which causing a warning print.

Due to the port base vlan is managed by PF, so the VFs's port base vlan
restoring should be handled by PF when PF was ready.

This patch fixes it.

Fixes: 039ba863e8d7 ("net: hns3: optimize the filter table entries handling when resetting")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 68 +++++++++++++------
 .../hisilicon/hns3/hns3pf/hclge_main.h        |  3 +
 2 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 0e1a110e7847..34c6280c9af2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1872,6 +1872,7 @@ static int hclge_alloc_vport(struct hclge_dev *hdev)
 		vport->vf_info.link_state = IFLA_VF_LINK_STATE_AUTO;
 		vport->mps = HCLGE_MAC_DEFAULT_FRAME;
 		vport->port_base_vlan_cfg.state = HNAE3_PORT_BASE_VLAN_DISABLE;
+		vport->port_base_vlan_cfg.tbl_sta = true;
 		vport->rxvlan_cfg.rx_vlan_offload_en = true;
 		vport->req_vlan_fltr_en = true;
 		INIT_LIST_HEAD(&vport->vlan_list);
@@ -9915,34 +9916,52 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev)
 	}
 }
 
+void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev)
+{
+	struct hclge_vlan_info *vlan_info;
+	struct hclge_vport *vport;
+	u16 vlan_proto;
+	u16 vlan_id;
+	u16 state;
+	int vf_id;
+	int ret;
+
+	/* PF should restore all vfs port base vlan */
+	for (vf_id = 0; vf_id < hdev->num_alloc_vfs; vf_id++) {
+		vport = &hdev->vport[vf_id + HCLGE_VF_VPORT_START_NUM];
+		vlan_info = vport->port_base_vlan_cfg.tbl_sta ?
+			    &vport->port_base_vlan_cfg.vlan_info :
+			    &vport->port_base_vlan_cfg.old_vlan_info;
+
+		vlan_id = vlan_info->vlan_tag;
+		vlan_proto = vlan_info->vlan_proto;
+		state = vport->port_base_vlan_cfg.state;
+
+		if (state != HNAE3_PORT_BASE_VLAN_DISABLE) {
+			clear_bit(vport->vport_id, hdev->vlan_table[vlan_id]);
+			ret = hclge_set_vlan_filter_hw(hdev, htons(vlan_proto),
+						       vport->vport_id,
+						       vlan_id, false);
+			vport->port_base_vlan_cfg.tbl_sta = ret == 0;
+		}
+	}
+}
+
 void hclge_restore_vport_vlan_table(struct hclge_vport *vport)
 {
 	struct hclge_vport_vlan_cfg *vlan, *tmp;
 	struct hclge_dev *hdev = vport->back;
-	u16 vlan_proto;
-	u16 vlan_id;
-	u16 state;
 	int ret;
 
-	vlan_proto = vport->port_base_vlan_cfg.vlan_info.vlan_proto;
-	vlan_id = vport->port_base_vlan_cfg.vlan_info.vlan_tag;
-	state = vport->port_base_vlan_cfg.state;
-
-	if (state != HNAE3_PORT_BASE_VLAN_DISABLE) {
-		clear_bit(vport->vport_id, hdev->vlan_table[vlan_id]);
-		hclge_set_vlan_filter_hw(hdev, htons(vlan_proto),
-					 vport->vport_id, vlan_id,
-					 false);
-		return;
-	}
-
-	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
-		ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q),
-					       vport->vport_id,
-					       vlan->vlan_id, false);
-		if (ret)
-			break;
-		vlan->hd_tbl_status = true;
+	if (vport->port_base_vlan_cfg.state == HNAE3_PORT_BASE_VLAN_DISABLE) {
+		list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
+			ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q),
+						       vport->vport_id,
+						       vlan->vlan_id, false);
+			if (ret)
+				break;
+			vlan->hd_tbl_status = true;
+		}
 	}
 }
 
@@ -9983,6 +10002,7 @@ static void hclge_restore_hw_table(struct hclge_dev *hdev)
 	struct hnae3_handle *handle = &vport->nic;
 
 	hclge_restore_mac_table_common(vport);
+	hclge_restore_vport_port_base_vlan_config(hdev);
 	hclge_restore_vport_vlan_table(vport);
 	set_bit(HCLGE_STATE_FD_USER_DEF_CHANGED, &hdev->state);
 	hclge_restore_fd_entries(handle);
@@ -10039,6 +10059,8 @@ static int hclge_update_vlan_filter_entries(struct hclge_vport *vport,
 						 false);
 	}
 
+	vport->port_base_vlan_cfg.tbl_sta = false;
+
 	/* force add VLAN 0 */
 	ret = hclge_set_vf_vlan_common(hdev, vport->vport_id, false, 0);
 	if (ret)
@@ -10128,7 +10150,9 @@ out:
 	else
 		nic->port_base_vlan_state = HNAE3_PORT_BASE_VLAN_ENABLE;
 
+	vport->port_base_vlan_cfg.old_vlan_info = *old_vlan_info;
 	vport->port_base_vlan_cfg.vlan_info = *vlan_info;
+	vport->port_base_vlan_cfg.tbl_sta = true;
 	hclge_set_vport_vlan_fltr_change(vport);
 
 	return 0;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index fc92ae385e30..a0f619c77eae 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -985,7 +985,9 @@ struct hclge_vlan_info {
 
 struct hclge_port_base_vlan_config {
 	u16 state;
+	bool tbl_sta;
 	struct hclge_vlan_info vlan_info;
+	struct hclge_vlan_info old_vlan_info;
 };
 
 struct hclge_vf_info {
@@ -1100,6 +1102,7 @@ void hclge_rm_vport_all_mac_table(struct hclge_vport *vport, bool is_del_list,
 void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list);
 void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev);
 void hclge_restore_mac_table_common(struct hclge_vport *vport);
+void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev);
 void hclge_restore_vport_vlan_table(struct hclge_vport *vport);
 int hclge_update_port_base_vlan_cfg(struct hclge_vport *vport, u16 state,
 				    struct hclge_vlan_info *vlan_info);

From 1932a624ab88ff407d1a1d567fe581faa15dc725 Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Thu, 24 Mar 2022 20:54:49 +0800
Subject: [PATCH 191/229] net: hns3: add vlan list lock to protect vlan list

When adding port base VLAN, vf VLAN need to remove from HW and modify
the vlan state in vf VLAN list as false. If the periodicity task is
freeing the same node, it may cause "use after free" error.
This patch adds a vlan list lock to protect the vlan list.

Fixes: c6075b193462 ("net: hns3: Record VF vlan tables")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 38 +++++++++++++++++--
 .../hisilicon/hns3/hns3pf/hclge_main.h        |  1 +
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 34c6280c9af2..67d60e00170f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -9818,19 +9818,28 @@ static void hclge_add_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id,
 				       bool writen_to_tbl)
 {
 	struct hclge_vport_vlan_cfg *vlan, *tmp;
+	struct hclge_dev *hdev = vport->back;
 
-	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node)
-		if (vlan->vlan_id == vlan_id)
+	mutex_lock(&hdev->vport_lock);
+
+	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
+		if (vlan->vlan_id == vlan_id) {
+			mutex_unlock(&hdev->vport_lock);
 			return;
+		}
+	}
 
 	vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
-	if (!vlan)
+	if (!vlan) {
+		mutex_unlock(&hdev->vport_lock);
 		return;
+	}
 
 	vlan->hd_tbl_status = writen_to_tbl;
 	vlan->vlan_id = vlan_id;
 
 	list_add_tail(&vlan->node, &vport->vlan_list);
+	mutex_unlock(&hdev->vport_lock);
 }
 
 static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport)
@@ -9839,6 +9848,8 @@ static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport)
 	struct hclge_dev *hdev = vport->back;
 	int ret;
 
+	mutex_lock(&hdev->vport_lock);
+
 	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
 		if (!vlan->hd_tbl_status) {
 			ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q),
@@ -9848,12 +9859,16 @@ static int hclge_add_vport_all_vlan_table(struct hclge_vport *vport)
 				dev_err(&hdev->pdev->dev,
 					"restore vport vlan list failed, ret=%d\n",
 					ret);
+
+				mutex_unlock(&hdev->vport_lock);
 				return ret;
 			}
 		}
 		vlan->hd_tbl_status = true;
 	}
 
+	mutex_unlock(&hdev->vport_lock);
+
 	return 0;
 }
 
@@ -9863,6 +9878,8 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id,
 	struct hclge_vport_vlan_cfg *vlan, *tmp;
 	struct hclge_dev *hdev = vport->back;
 
+	mutex_lock(&hdev->vport_lock);
+
 	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
 		if (vlan->vlan_id == vlan_id) {
 			if (is_write_tbl && vlan->hd_tbl_status)
@@ -9877,6 +9894,8 @@ static void hclge_rm_vport_vlan_table(struct hclge_vport *vport, u16 vlan_id,
 			break;
 		}
 	}
+
+	mutex_unlock(&hdev->vport_lock);
 }
 
 void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list)
@@ -9884,6 +9903,8 @@ void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list)
 	struct hclge_vport_vlan_cfg *vlan, *tmp;
 	struct hclge_dev *hdev = vport->back;
 
+	mutex_lock(&hdev->vport_lock);
+
 	list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
 		if (vlan->hd_tbl_status)
 			hclge_set_vlan_filter_hw(hdev,
@@ -9899,6 +9920,7 @@ void hclge_rm_vport_all_vlan_table(struct hclge_vport *vport, bool is_del_list)
 		}
 	}
 	clear_bit(vport->vport_id, hdev->vf_vlan_full);
+	mutex_unlock(&hdev->vport_lock);
 }
 
 void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev)
@@ -9907,6 +9929,8 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev)
 	struct hclge_vport *vport;
 	int i;
 
+	mutex_lock(&hdev->vport_lock);
+
 	for (i = 0; i < hdev->num_alloc_vport; i++) {
 		vport = &hdev->vport[i];
 		list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
@@ -9914,6 +9938,8 @@ void hclge_uninit_vport_vlan_table(struct hclge_dev *hdev)
 			kfree(vlan);
 		}
 	}
+
+	mutex_unlock(&hdev->vport_lock);
 }
 
 void hclge_restore_vport_port_base_vlan_config(struct hclge_dev *hdev)
@@ -9953,6 +9979,8 @@ void hclge_restore_vport_vlan_table(struct hclge_vport *vport)
 	struct hclge_dev *hdev = vport->back;
 	int ret;
 
+	mutex_lock(&hdev->vport_lock);
+
 	if (vport->port_base_vlan_cfg.state == HNAE3_PORT_BASE_VLAN_DISABLE) {
 		list_for_each_entry_safe(vlan, tmp, &vport->vlan_list, node) {
 			ret = hclge_set_vlan_filter_hw(hdev, htons(ETH_P_8021Q),
@@ -9963,6 +9991,8 @@ void hclge_restore_vport_vlan_table(struct hclge_vport *vport)
 			vlan->hd_tbl_status = true;
 		}
 	}
+
+	mutex_unlock(&hdev->vport_lock);
 }
 
 /* For global reset and imp reset, hardware will clear the mac table,
@@ -11861,8 +11891,8 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
 	hclge_misc_irq_uninit(hdev);
 	hclge_devlink_uninit(hdev);
 	hclge_pci_uninit(hdev);
-	mutex_destroy(&hdev->vport_lock);
 	hclge_uninit_vport_vlan_table(hdev);
+	mutex_destroy(&hdev->vport_lock);
 	ae_dev->priv = NULL;
 }
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index a0f619c77eae..c70239758bb2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -1033,6 +1033,7 @@ struct hclge_vport {
 	spinlock_t mac_list_lock; /* protect mac address need to add/detele */
 	struct list_head uc_mac_list;   /* Store VF unicast table */
 	struct list_head mc_mac_list;   /* Store VF multicast table */
+
 	struct list_head vlan_list;     /* Store VF vlan table */
 };
 

From 190cd8a72b0181c543ecada6243be3a50636941b Mon Sep 17 00:00:00 2001
From: Jian Shen <shenjian15@huawei.com>
Date: Thu, 24 Mar 2022 20:54:50 +0800
Subject: [PATCH 192/229] net: hns3: refine the process when PF set VF VLAN

Currently, when PF set VF VLAN, it sends notify mailbox to VF
if VF alive. VF stop its traffic, and send request mailbox
to PF, then PF updates VF VLAN. It's a bit complex. If VF is
killed before sending request, PF will not set VF VLAN without
any log.

This patch refines the process, PF can set VF VLAN direclty,
and then notify the VF. If VF is resetting at that time, the
notify may be dropped, so VF should query it after reset finished.

Fixes: 92f11ea177cd ("net: hns3: fix set port based VLAN issue for VF")
Signed-off-by: Jian Shen <shenjian15@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../hisilicon/hns3/hns3pf/hclge_main.c         | 18 +++++++++++++-----
 .../hisilicon/hns3/hns3vf/hclgevf_main.c       |  5 +++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 67d60e00170f..279a1771fe7c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -8993,11 +8993,16 @@ static int hclge_set_vf_mac(struct hnae3_handle *handle, int vf,
 
 	ether_addr_copy(vport->vf_info.mac, mac_addr);
 
+	/* there is a timewindow for PF to know VF unalive, it may
+	 * cause send mailbox fail, but it doesn't matter, VF will
+	 * query it when reinit.
+	 */
 	if (test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state)) {
 		dev_info(&hdev->pdev->dev,
 			 "MAC of VF %d has been set to %s, and it will be reinitialized!\n",
 			 vf, format_mac_addr);
-		return hclge_inform_reset_assert_to_vf(vport);
+		(void)hclge_inform_reset_assert_to_vf(vport);
+		return 0;
 	}
 
 	dev_info(&hdev->pdev->dev, "MAC of VF %d has been set to %s\n",
@@ -10250,14 +10255,17 @@ static int hclge_set_vf_vlan_filter(struct hnae3_handle *handle, int vfid,
 		return ret;
 	}
 
-	/* for DEVICE_VERSION_V3, vf doesn't need to know about the port based
+	/* there is a timewindow for PF to know VF unalive, it may
+	 * cause send mailbox fail, but it doesn't matter, VF will
+	 * query it when reinit.
+	 * for DEVICE_VERSION_V3, vf doesn't need to know about the port based
 	 * VLAN state.
 	 */
 	if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3 &&
 	    test_bit(HCLGE_VPORT_STATE_ALIVE, &vport->state))
-		hclge_push_vf_port_base_vlan_info(&hdev->vport[0],
-						  vport->vport_id, state,
-						  &vlan_info);
+		(void)hclge_push_vf_port_base_vlan_info(&hdev->vport[0],
+							vport->vport_id,
+							state, &vlan_info);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index 93389bec8d89..342d7cdf6285 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -2862,6 +2862,11 @@ static int hclgevf_reset_hdev(struct hclgevf_dev *hdev)
 		return ret;
 	}
 
+	/* get current port based vlan state from PF */
+	ret = hclgevf_get_port_base_vlan_filter_state(hdev);
+	if (ret)
+		return ret;
+
 	set_bit(HCLGEVF_STATE_PROMISC_CHANGED, &hdev->state);
 
 	hclgevf_init_rxd_adv_layout(hdev);

From 2d327a79ee176930dc72c131a970c891d367c1dc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 24 Mar 2022 20:58:27 -0700
Subject: [PATCH 193/229] llc: only change llc->dev when bind() succeeds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

My latest patch, attempting to fix the refcount leak in a minimal
way turned out to add a new bug.

Whenever the bind operation fails before we attempt to grab
a reference count on a device, we might release the device refcount
of a prior successful bind() operation.

syzbot was not happy about this [1].

Note to stable teams:

Make sure commit b37a46683739 ("netdevice: add the case if dev is NULL")
is already present in your trees.

[1]
general protection fault, probably for non-canonical address 0xdffffc0000000070: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000380-0x0000000000000387]
CPU: 1 PID: 3590 Comm: syz-executor361 Tainted: G        W         5.17.0-syzkaller-04796-g169e77764adc #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:llc_ui_connect+0x400/0xcb0 net/llc/af_llc.c:500
Code: 80 3c 02 00 0f 85 fc 07 00 00 4c 8b a5 38 05 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d bc 24 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 a9 07 00 00 49 8b b4 24 80 03 00 00 4c 89 f2 48
RSP: 0018:ffffc900038cfcc0 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff8880756eb600 RCX: 0000000000000000
RDX: 0000000000000070 RSI: ffffc900038cfe3e RDI: 0000000000000380
RBP: ffff888015ee5000 R08: 0000000000000001 R09: ffff888015ee5535
R10: ffffed1002bdcaa6 R11: 0000000000000000 R12: 0000000000000000
R13: ffffc900038cfe37 R14: ffffc900038cfe38 R15: ffff888015ee5012
FS:  0000555555acd300(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000280 CR3: 0000000077db6000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 __sys_connect_file+0x155/0x1a0 net/socket.c:1900
 __sys_connect+0x161/0x190 net/socket.c:1917
 __do_sys_connect net/socket.c:1927 [inline]
 __se_sys_connect net/socket.c:1924 [inline]
 __x64_sys_connect+0x6f/0xb0 net/socket.c:1924
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f016acb90b9
Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffd417947f8 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f016acb90b9
RDX: 0000000000000010 RSI: 0000000020000140 RDI: 0000000000000003
RBP: 00007f016ac7d0a0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f016ac7d130
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:llc_ui_connect+0x400/0xcb0 net/llc/af_llc.c:500

Fixes: 764f4eb6846f ("llc: fix netdevice reference leaks in llc_ui_bind()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: 赵子轩 <beraphin@gmail.com>
Cc: Stoyan Manolov <smanolov@suse.de>
Link: https://lore.kernel.org/r/20220325035827.360418-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/llc/af_llc.c | 59 ++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c86256064743..7f555d2e5357 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -275,6 +275,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
 {
 	struct sock *sk = sock->sk;
 	struct llc_sock *llc = llc_sk(sk);
+	struct net_device *dev = NULL;
 	struct llc_sap *sap;
 	int rc = -EINVAL;
 
@@ -286,16 +287,15 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
 		goto out;
 	rc = -ENODEV;
 	if (sk->sk_bound_dev_if) {
-		llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
-		if (llc->dev && addr->sllc_arphrd != llc->dev->type) {
-			dev_put(llc->dev);
-			llc->dev = NULL;
+		dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
+		if (dev && addr->sllc_arphrd != dev->type) {
+			dev_put(dev);
+			dev = NULL;
 		}
 	} else
-		llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
-	if (!llc->dev)
+		dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
+	if (!dev)
 		goto out;
-	netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
 	rc = -EUSERS;
 	llc->laddr.lsap = llc_ui_autoport();
 	if (!llc->laddr.lsap)
@@ -304,6 +304,12 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
 	sap = llc_sap_open(llc->laddr.lsap, NULL);
 	if (!sap)
 		goto out;
+
+	/* Note: We do not expect errors from this point. */
+	llc->dev = dev;
+	netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
+	dev = NULL;
+
 	memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN);
 	memcpy(&llc->addr, addr, sizeof(llc->addr));
 	/* assign new connection to its SAP */
@@ -311,10 +317,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
 	sock_reset_flag(sk, SOCK_ZAPPED);
 	rc = 0;
 out:
-	if (rc) {
-		dev_put_track(llc->dev, &llc->dev_tracker);
-		llc->dev = NULL;
-	}
+	dev_put(dev);
 	return rc;
 }
 
@@ -337,6 +340,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 	struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
 	struct sock *sk = sock->sk;
 	struct llc_sock *llc = llc_sk(sk);
+	struct net_device *dev = NULL;
 	struct llc_sap *sap;
 	int rc = -EINVAL;
 
@@ -352,25 +356,27 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 	rc = -ENODEV;
 	rcu_read_lock();
 	if (sk->sk_bound_dev_if) {
-		llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
-		if (llc->dev) {
+		dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
+		if (dev) {
 			if (is_zero_ether_addr(addr->sllc_mac))
-				memcpy(addr->sllc_mac, llc->dev->dev_addr,
+				memcpy(addr->sllc_mac, dev->dev_addr,
 				       IFHWADDRLEN);
-			if (addr->sllc_arphrd != llc->dev->type ||
+			if (addr->sllc_arphrd != dev->type ||
 			    !ether_addr_equal(addr->sllc_mac,
-					      llc->dev->dev_addr)) {
+					      dev->dev_addr)) {
 				rc = -EINVAL;
-				llc->dev = NULL;
+				dev = NULL;
 			}
 		}
-	} else
-		llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
+	} else {
+		dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
 					   addr->sllc_mac);
-	dev_hold_track(llc->dev, &llc->dev_tracker, GFP_ATOMIC);
+	}
+	dev_hold(dev);
 	rcu_read_unlock();
-	if (!llc->dev)
+	if (!dev)
 		goto out;
+
 	if (!addr->sllc_sap) {
 		rc = -EUSERS;
 		addr->sllc_sap = llc_ui_autoport();
@@ -402,6 +408,12 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 			goto out_put;
 		}
 	}
+
+	/* Note: We do not expect errors from this point. */
+	llc->dev = dev;
+	netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
+	dev = NULL;
+
 	llc->laddr.lsap = addr->sllc_sap;
 	memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN);
 	memcpy(&llc->addr, addr, sizeof(llc->addr));
@@ -412,10 +424,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
 out_put:
 	llc_sap_put(sap);
 out:
-	if (rc) {
-		dev_put_track(llc->dev, &llc->dev_tracker);
-		llc->dev = NULL;
-	}
+	dev_put(dev);
 	release_sock(sk);
 	return rc;
 }

From bf8bfc4336f7a34e48b3bbd19b1542bf085bdc3d Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 24 Mar 2022 16:24:38 -0700
Subject: [PATCH 194/229] net: phy: broadcom: Fix brcm_fet_config_init()

A Broadcom AC201 PHY (same entry as 5241) would be flagged by the
Broadcom UniMAC MDIO controller as not completing the turn around
properly since the PHY expects 65 MDC clock cycles to complete a write
cycle, and the MDIO controller was only sending 64 MDC clock cycles as
determined by looking at a scope shot.

This would make the subsequent read fail with the UniMAC MDIO controller
command field having MDIO_READ_FAIL set and we would abort the
brcm_fet_config_init() function and thus not probe the PHY at all.

After issuing a software reset, wait for at least 1ms which is well
above the 1us reset delay advertised by the datasheet and issue a dummy
read to let the PHY turn around the line properly. This read
specifically ignores -EIO which would be returned by MDIO controllers
checking for the line being turned around.

If we have a genuine reaad failure, the next read of the interrupt
status register would pick it up anyway.

Fixes: d7a2ed9248a3 ("broadcom: Add AC131 phy support")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220324232438.1156812-1-f.fainelli@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/broadcom.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
index 3c683e0e40e9..e36809aa6d30 100644
--- a/drivers/net/phy/broadcom.c
+++ b/drivers/net/phy/broadcom.c
@@ -11,6 +11,7 @@
  */
 
 #include "bcm-phy-lib.h"
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/phy.h>
 #include <linux/brcmphy.h>
@@ -602,6 +603,26 @@ static int brcm_fet_config_init(struct phy_device *phydev)
 	if (err < 0)
 		return err;
 
+	/* The datasheet indicates the PHY needs up to 1us to complete a reset,
+	 * build some slack here.
+	 */
+	usleep_range(1000, 2000);
+
+	/* The PHY requires 65 MDC clock cycles to complete a write operation
+	 * and turnaround the line properly.
+	 *
+	 * We ignore -EIO here as the MDIO controller (e.g.: mdio-bcm-unimac)
+	 * may flag the lack of turn-around as a read failure. This is
+	 * particularly true with this combination since the MDIO controller
+	 * only used 64 MDC cycles. This is not a critical failure in this
+	 * specific case and it has no functional impact otherwise, so we let
+	 * that one go through. If there is a genuine bus error, the next read
+	 * of MII_BRCM_FET_INTREG will error out.
+	 */
+	err = phy_read(phydev, MII_BMCR);
+	if (err < 0 && err != -EIO)
+		return err;
+
 	reg = phy_read(phydev, MII_BRCM_FET_INTREG);
 	if (reg < 0)
 		return reg;

From b50d3b46f84282d795ae3076111acb75ae1031f3 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 24 Mar 2022 22:05:14 +0200
Subject: [PATCH 195/229] selftests: test_vxlan_under_vrf: Fix broken test case

The purpose of the last test case is to test VXLAN encapsulation and
decapsulation when the underlay lookup takes place in a non-default VRF.
This is achieved by enslaving the physical device of the tunnel to a
VRF.

The binding of the VXLAN UDP socket to the VRF happens when the VXLAN
device itself is opened, not when its physical device is opened. This
was also mentioned in the cited commit ("tests that moving the underlay
from a VRF to another works when down/up the VXLAN interface"), but the
test did something else.

Fix it by reopening the VXLAN device instead of its physical device.

Before:

 # ./test_vxlan_under_vrf.sh
 Checking HV connectivity                                           [ OK ]
 Check VM connectivity through VXLAN (underlay in the default VRF)  [ OK ]
 Check VM connectivity through VXLAN (underlay in a VRF)            [FAIL]

After:

 # ./test_vxlan_under_vrf.sh
 Checking HV connectivity                                           [ OK ]
 Check VM connectivity through VXLAN (underlay in the default VRF)  [ OK ]
 Check VM connectivity through VXLAN (underlay in a VRF)            [ OK ]

Fixes: 03f1c26b1c56 ("test/net: Add script for VXLAN underlay in a VRF")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20220324200514.1638326-1-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/test_vxlan_under_vrf.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
index ea5a7a808f12..1fd1250ebc66 100755
--- a/tools/testing/selftests/net/test_vxlan_under_vrf.sh
+++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
@@ -120,11 +120,11 @@ echo "[ OK ]"
 
 # Move the underlay to a non-default VRF
 ip -netns hv-1 link set veth0 vrf vrf-underlay
-ip -netns hv-1 link set veth0 down
-ip -netns hv-1 link set veth0 up
+ip -netns hv-1 link set vxlan0 down
+ip -netns hv-1 link set vxlan0 up
 ip -netns hv-2 link set veth0 vrf vrf-underlay
-ip -netns hv-2 link set veth0 down
-ip -netns hv-2 link set veth0 up
+ip -netns hv-2 link set vxlan0 down
+ip -netns hv-2 link set vxlan0 up
 
 echo -n "Check VM connectivity through VXLAN (underlay in a VRF)            "
 ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null || (echo "[FAIL]"; false)

From d02d81efc7564b4d5446a02e0214a164cf00b1f3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 25 Mar 2022 21:51:03 -0400
Subject: [PATCH 196/229] NFS: Don't loop forever in nfs_do_recoalesce()

If __nfs_pageio_add_request() fails to add the request, it will return
with either desc->pg_error < 0, or mirror->pg_recoalesce will be set, so
we are guaranteed either to exit the function altogether, or to loop.

However if there is nothing left in mirror->pg_list to coalesce, we must
exit, so make sure that we clear mirror->pg_recoalesce every time we
loop.

Reported-by: Olga Kornievskaia <aglo@umich.edu>
Fixes: 70536bf4eb07 ("NFS: Clean up reset of the mirror accounting variables")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pagelist.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 3156db526cc4..9157dd19b8b4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1218,6 +1218,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
 
 	do {
 		list_splice_init(&mirror->pg_list, &head);
+		mirror->pg_recoalesce = 0;
 
 		while (!list_empty(&head)) {
 			struct nfs_page *req;

From 8778372118023e2258612c03573c47efef41d755 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Sat, 26 Mar 2022 17:51:00 +0800
Subject: [PATCH 197/229] net: hns3: fix ethtool tx copybreak buf size
 indicating not aligned issue

When use ethtoool set tx copybreak buf size to a large value
which causes order exceeding 10 or memory is not enough,
it causes allocating tx copybreak buffer failed and print
"the active tx spare buf is 0, not enabled tx spare buffer",
however, use --get-tunable parameter query tx copybreak buf
size and it indicates setting value not 0.

So, it's necessary to change the print value from setting
value to 0.

Set kinfo.tx_spare_buf_size to 0 when set tx copybreak buf size failed.

Fixes: e445f08af2b1 ("net: hns3: add support to set/get tx copybreak buf size via ethtool for hns3 driver")
Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 20 +++++++++++--------
 .../ethernet/hisilicon/hns3/hns3_ethtool.c    |  3 ++-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 0b8a73c40b12..16137238ddbf 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1028,13 +1028,12 @@ static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring,
 
 static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 {
+	u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size;
 	struct hns3_tx_spare *tx_spare;
 	struct page *page;
-	u32 alloc_size;
 	dma_addr_t dma;
 	int order;
 
-	alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size;
 	if (!alloc_size)
 		return;
 
@@ -1044,30 +1043,35 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 	if (!tx_spare) {
 		/* The driver still work without the tx spare buffer */
 		dev_warn(ring_to_dev(ring), "failed to allocate hns3_tx_spare\n");
-		return;
+		goto devm_kzalloc_error;
 	}
 
 	page = alloc_pages_node(dev_to_node(ring_to_dev(ring)),
 				GFP_KERNEL, order);
 	if (!page) {
 		dev_warn(ring_to_dev(ring), "failed to allocate tx spare pages\n");
-		devm_kfree(ring_to_dev(ring), tx_spare);
-		return;
+		goto alloc_pages_error;
 	}
 
 	dma = dma_map_page(ring_to_dev(ring), page, 0,
 			   PAGE_SIZE << order, DMA_TO_DEVICE);
 	if (dma_mapping_error(ring_to_dev(ring), dma)) {
 		dev_warn(ring_to_dev(ring), "failed to map pages for tx spare\n");
-		put_page(page);
-		devm_kfree(ring_to_dev(ring), tx_spare);
-		return;
+		goto dma_mapping_error;
 	}
 
 	tx_spare->dma = dma;
 	tx_spare->buf = page_address(page);
 	tx_spare->len = PAGE_SIZE << order;
 	ring->tx_spare = tx_spare;
+	return;
+
+dma_mapping_error:
+	put_page(page);
+alloc_pages_error:
+	devm_kfree(ring_to_dev(ring), tx_spare);
+devm_kzalloc_error:
+	ring->tqp->handle->kinfo.tx_spare_buf_size = 0;
 }
 
 /* Use hns3_tx_spare_space() to make sure there is enough buffer
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 6469238ae090..ae30dbe7ef52 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1818,7 +1818,8 @@ static int hns3_set_tunable(struct net_device *netdev,
 		old_tx_spare_buf_size = h->kinfo.tx_spare_buf_size;
 		new_tx_spare_buf_size = *(u32 *)data;
 		ret = hns3_set_tx_spare_buf_size(netdev, new_tx_spare_buf_size);
-		if (ret) {
+		if (ret ||
+		    (!priv->ring->tx_spare && new_tx_spare_buf_size != 0)) {
 			int ret1;
 
 			netdev_warn(netdev,

From a89cbb16995bf15582e0d1bdb922ad1a54a2fa8c Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Sat, 26 Mar 2022 17:51:01 +0800
Subject: [PATCH 198/229] net: hns3: add max order judgement for tx spare
 buffer

Add max order judgement for tx spare buffer to avoid triggering
call trace, print related fail information instead, when user
set tx spare buf size to a large value which causes order
exceeding 10.

Fixes: e445f08af2b1 ("net: hns3: add support to set/get tx copybreak buf size via ethtool for hns3 driver")
Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 16137238ddbf..530ba8bef503 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1038,6 +1038,12 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring)
 		return;
 
 	order = get_order(alloc_size);
+	if (order >= MAX_ORDER) {
+		if (net_ratelimit())
+			dev_warn(ring_to_dev(ring), "failed to allocate tx spare buffer, exceed to max order\n");
+		return;
+	}
+
 	tx_spare = devm_kzalloc(ring_to_dev(ring), sizeof(*tx_spare),
 				GFP_KERNEL);
 	if (!tx_spare) {

From 671cb8cbb9c9e24b681d21b1bfae991e2386ac73 Mon Sep 17 00:00:00 2001
From: Peng Li <lipeng321@huawei.com>
Date: Sat, 26 Mar 2022 17:51:02 +0800
Subject: [PATCH 199/229] net: hns3: clean residual vf config after disable
 sriov

After disable sriov, VF still has some config and info need to be
cleaned, which configured by PF. This patch clean the HW config
and SW struct vport->vf_info.

Fixes: fa8d82e853e8 ("net: hns3: Add support of .sriov_configure in HNS3 driver")
Signed-off-by: Peng Li<lipeng321@huawei.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  3 ++
 .../net/ethernet/hisilicon/hns3/hns3_enet.c   | 18 +++++++
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 50 +++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 6f18c9a03231..d44dd7091fa1 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -537,6 +537,8 @@ struct hnae3_ae_dev {
  *   Get 1588 rx hwstamp
  * get_ts_info
  *   Get phc info
+ * clean_vf_config
+ *   Clean residual vf info after disable sriov
  */
 struct hnae3_ae_ops {
 	int (*init_ae_dev)(struct hnae3_ae_dev *ae_dev);
@@ -730,6 +732,7 @@ struct hnae3_ae_ops {
 			   struct ethtool_ts_info *info);
 	int (*get_link_diagnosis_info)(struct hnae3_handle *handle,
 				       u32 *status_code);
+	void (*clean_vf_config)(struct hnae3_ae_dev *ae_dev, int num_vfs);
 };
 
 struct hnae3_dcb_ops {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 530ba8bef503..14dc12c2155d 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -3060,6 +3060,21 @@ static int hns3_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return ret;
 }
 
+/**
+ * hns3_clean_vf_config
+ * @pdev: pointer to a pci_dev structure
+ * @num_vfs: number of VFs allocated
+ *
+ * Clean residual vf config after disable sriov
+ **/
+static void hns3_clean_vf_config(struct pci_dev *pdev, int num_vfs)
+{
+	struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev);
+
+	if (ae_dev->ops->clean_vf_config)
+		ae_dev->ops->clean_vf_config(ae_dev, num_vfs);
+}
+
 /* hns3_remove - Device removal routine
  * @pdev: PCI device information struct
  */
@@ -3098,7 +3113,10 @@ static int hns3_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
 		else
 			return num_vfs;
 	} else if (!pci_vfs_assigned(pdev)) {
+		int num_vfs_pre = pci_num_vf(pdev);
+
 		pci_disable_sriov(pdev);
+		hns3_clean_vf_config(pdev, num_vfs_pre);
 	} else {
 		dev_warn(&pdev->dev,
 			 "Unable to free VFs because some are assigned to VMs.\n");
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 279a1771fe7c..2a5e6a241d52 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -12724,6 +12724,55 @@ static int hclge_get_link_diagnosis_info(struct hnae3_handle *handle,
 	return 0;
 }
 
+/* After disable sriov, VF still has some config and info need clean,
+ * which configed by PF.
+ */
+static void hclge_clear_vport_vf_info(struct hclge_vport *vport, int vfid)
+{
+	struct hclge_dev *hdev = vport->back;
+	struct hclge_vlan_info vlan_info;
+	int ret;
+
+	/* after disable sriov, clean VF rate configured by PF */
+	ret = hclge_tm_qs_shaper_cfg(vport, 0);
+	if (ret)
+		dev_err(&hdev->pdev->dev,
+			"failed to clean vf%d rate config, ret = %d\n",
+			vfid, ret);
+
+	vlan_info.vlan_tag = 0;
+	vlan_info.qos = 0;
+	vlan_info.vlan_proto = ETH_P_8021Q;
+	ret = hclge_update_port_base_vlan_cfg(vport,
+					      HNAE3_PORT_BASE_VLAN_DISABLE,
+					      &vlan_info);
+	if (ret)
+		dev_err(&hdev->pdev->dev,
+			"failed to clean vf%d port base vlan, ret = %d\n",
+			vfid, ret);
+
+	ret = hclge_set_vf_spoofchk_hw(hdev, vport->vport_id, false);
+	if (ret)
+		dev_err(&hdev->pdev->dev,
+			"failed to clean vf%d spoof config, ret = %d\n",
+			vfid, ret);
+
+	memset(&vport->vf_info, 0, sizeof(vport->vf_info));
+}
+
+static void hclge_clean_vport_config(struct hnae3_ae_dev *ae_dev, int num_vfs)
+{
+	struct hclge_dev *hdev = ae_dev->priv;
+	struct hclge_vport *vport;
+	int i;
+
+	for (i = 0; i < num_vfs; i++) {
+		vport = &hdev->vport[i + HCLGE_VF_VPORT_START_NUM];
+
+		hclge_clear_vport_vf_info(vport, i);
+	}
+}
+
 static const struct hnae3_ae_ops hclge_ops = {
 	.init_ae_dev = hclge_init_ae_dev,
 	.uninit_ae_dev = hclge_uninit_ae_dev,
@@ -12825,6 +12874,7 @@ static const struct hnae3_ae_ops hclge_ops = {
 	.get_rx_hwts = hclge_ptp_get_rx_hwts,
 	.get_ts_info = hclge_ptp_get_ts_info,
 	.get_link_diagnosis_info = hclge_get_link_diagnosis_info,
+	.clean_vf_config = hclge_clean_vport_config,
 };
 
 static struct hnae3_ae_algo ae_algo = {

From f5cd60169f981ca737c9e49c446506dfafc90a35 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Sat, 26 Mar 2022 17:51:03 +0800
Subject: [PATCH 200/229] net: hns3: add netdev reset check for
 hns3_set_tunable()

When pci device reset failed, it does uninit operation and priv->ring
is NULL, it causes accessing NULL pointer error.

Add netdev reset check for hns3_set_tunable() to fix it.

Fixes: 99f6b5fb5f63 ("net: hns3: use bounce buffer when rx page can not be reused")
Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index ae30dbe7ef52..49e7b022caaa 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -1766,9 +1766,6 @@ static int hns3_set_tx_spare_buf_size(struct net_device *netdev,
 	struct hnae3_handle *h = priv->ae_handle;
 	int ret;
 
-	if (hns3_nic_resetting(netdev))
-		return -EBUSY;
-
 	h->kinfo.tx_spare_buf_size = data;
 
 	ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT);
@@ -1799,6 +1796,11 @@ static int hns3_set_tunable(struct net_device *netdev,
 	struct hnae3_handle *h = priv->ae_handle;
 	int i, ret = 0;
 
+	if (hns3_nic_resetting(netdev) || !priv->ring) {
+		netdev_err(netdev, "failed to set tunable value, dev resetting!");
+		return -EBUSY;
+	}
+
 	switch (tuna->id) {
 	case ETHTOOL_TX_COPYBREAK:
 		priv->tx_copybreak = *(u32 *)data;

From 4d07c5936c2508ddd1cfd49b0a91d94cb4d1f0e8 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao288@hisilicon.com>
Date: Sat, 26 Mar 2022 17:51:04 +0800
Subject: [PATCH 201/229] net: hns3: add NULL pointer check for
 hns3_set/get_ringparam()

When pci devices init failed and haven't reinit, priv->ring is
NULL and hns3_set/get_ringparam() will access priv->ring. it
causes call trace.

So, add NULL pointer check for hns3_set/get_ringparam() to
avoid this situation.

Fixes: 5668abda0931 ("net: hns3: add support for set_ringparam")
Signed-off-by: Hao Chen <chenhao288@hisilicon.com>
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 49e7b022caaa..f4da77452126 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -653,8 +653,8 @@ static void hns3_get_ringparam(struct net_device *netdev,
 	struct hnae3_handle *h = priv->ae_handle;
 	int rx_queue_index = h->kinfo.num_tqps;
 
-	if (hns3_nic_resetting(netdev)) {
-		netdev_err(netdev, "dev resetting!");
+	if (hns3_nic_resetting(netdev) || !priv->ring) {
+		netdev_err(netdev, "failed to get ringparam value, due to dev resetting or uninited\n");
 		return;
 	}
 
@@ -1074,8 +1074,14 @@ static int hns3_check_ringparam(struct net_device *ndev,
 {
 #define RX_BUF_LEN_2K 2048
 #define RX_BUF_LEN_4K 4096
-	if (hns3_nic_resetting(ndev))
+
+	struct hns3_nic_priv *priv = netdev_priv(ndev);
+
+	if (hns3_nic_resetting(ndev) || !priv->ring) {
+		netdev_err(ndev, "failed to set ringparam value, due to dev resetting or uninited\n");
 		return -EBUSY;
+	}
+
 
 	if (param->rx_mini_pending || param->rx_jumbo_pending)
 		return -EINVAL;

From ad0ecaef6a2c07e67ef9fe163c007f7b3dad8643 Mon Sep 17 00:00:00 2001
From: Guangbin Huang <huangguangbin2@huawei.com>
Date: Sat, 26 Mar 2022 17:51:05 +0800
Subject: [PATCH 202/229] net: hns3: fix phy can not link up when autoneg off
 and reset

Currently, function hclge_mdio_read() will return 0 if during reset(the
cmd state will be set to disable).

If use general phy driver, the phy_state_machine() will update phy speed
every second in function genphy_read_status_fixed() when PHY is set to
autoneg off, no matter of link down or link up.

If phy driver happens to read BMCR register during reset, phy speed will
be updated to 10Mpbs as BMCR register value is 0. So it may call phy can
not link up if previous speed is not 10Mpbs.

To fix this problem, function hclge_mdio_read() should return -EBUSY if
the cmd state is disable. So does function hclge_mdio_write().

Fixes: 1c1249380992 ("net: hns3: bugfix for hclge_mdio_write and hclge_mdio_read")
Signed-off-by: Guangbin Huang <huangguangbin2@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
index 63d2be4349e3..03d63b6a9b2b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
@@ -48,7 +48,7 @@ static int hclge_mdio_write(struct mii_bus *bus, int phyid, int regnum,
 	int ret;
 
 	if (test_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state))
-		return 0;
+		return -EBUSY;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, false);
 
@@ -86,7 +86,7 @@ static int hclge_mdio_read(struct mii_bus *bus, int phyid, int regnum)
 	int ret;
 
 	if (test_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state))
-		return 0;
+		return -EBUSY;
 
 	hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MDIO_CONFIG, true);
 

From 08be6b13db23f68146c600dd5adfd92e99d9ec6e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 26 Mar 2022 11:02:34 -0700
Subject: [PATCH 203/229] net: sparx5: depends on PTP_1588_CLOCK_OPTIONAL

Fix build errors when PTP_1588_CLOCK=m and SPARX5_SWTICH=y.

arc-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_ethtool.o: in function `sparx5_get_ts_info':
sparx5_ethtool.c:(.text+0x146): undefined reference to `ptp_clock_index'
arc-linux-ld: sparx5_ethtool.c:(.text+0x146): undefined reference to `ptp_clock_index'
arc-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_ptp.o: in function `sparx5_ptp_init':
sparx5_ptp.c:(.text+0xd56): undefined reference to `ptp_clock_register'
arc-linux-ld: sparx5_ptp.c:(.text+0xd56): undefined reference to `ptp_clock_register'
arc-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_ptp.o: in function `sparx5_ptp_deinit':
sparx5_ptp.c:(.text+0xf30): undefined reference to `ptp_clock_unregister'
arc-linux-ld: sparx5_ptp.c:(.text+0xf30): undefined reference to `ptp_clock_unregister'
arc-linux-ld: sparx5_ptp.c:(.text+0xf38): undefined reference to `ptp_clock_unregister'
arc-linux-ld: sparx5_ptp.c:(.text+0xf46): undefined reference to `ptp_clock_unregister'
arc-linux-ld: drivers/net/ethernet/microchip/sparx5/sparx5_ptp.o:sparx5_ptp.c:(.text+0xf46): more undefined references to `ptp_clock_unregister' follow

Fixes: 3cfa11bac9bb ("net: sparx5: add the basic sparx5 driver")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Horatiu Vultur <horatiu.vultur@microchip.com>
Cc: UNGLinuxDriver@microchip.com
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Steen Hegelund <steen.hegelund@microchip.com>
Cc: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Cc: Lars Povlsen <lars.povlsen@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/sparx5/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/microchip/sparx5/Kconfig b/drivers/net/ethernet/microchip/sparx5/Kconfig
index 7bdbb2d09a14..85b24edb65d5 100644
--- a/drivers/net/ethernet/microchip/sparx5/Kconfig
+++ b/drivers/net/ethernet/microchip/sparx5/Kconfig
@@ -4,6 +4,7 @@ config SPARX5_SWITCH
 	depends on HAS_IOMEM
 	depends on OF
 	depends on ARCH_SPARX5 || COMPILE_TEST
+	depends on PTP_1588_CLOCK_OPTIONAL
 	select PHYLINK
 	select PHY_SPARX5_SERDES
 	select RESET_CONTROLLER

From 1521db37f0d42334a88e8ff28198a27d1ed5cd7b Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sat, 26 Mar 2022 10:20:03 -0700
Subject: [PATCH 204/229] qlcnic: dcb: default to returning -EOPNOTSUPP

Clang static analysis reports this issue
qlcnic_dcb.c:382:10: warning: Assigned value is
  garbage or undefined
  mbx_out = *val;
          ^ ~~~~

val is set in the qlcnic_dcb_query_hw_capability() wrapper.
If there is no query_hw_capability op in dcp, success is
returned without setting the val.

For this and similar wrappers, return -EOPNOTSUPP.

Fixes: 14d385b99059 ("qlcnic: dcb: Query adapter DCB capabilities.")
Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
index 5d79ee4370bc..7519773eaca6 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_dcb.h
@@ -51,7 +51,7 @@ static inline int qlcnic_dcb_get_hw_capability(struct qlcnic_dcb *dcb)
 	if (dcb && dcb->ops->get_hw_capability)
 		return dcb->ops->get_hw_capability(dcb);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static inline void qlcnic_dcb_free(struct qlcnic_dcb *dcb)
@@ -65,7 +65,7 @@ static inline int qlcnic_dcb_attach(struct qlcnic_dcb *dcb)
 	if (dcb && dcb->ops->attach)
 		return dcb->ops->attach(dcb);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static inline int
@@ -74,7 +74,7 @@ qlcnic_dcb_query_hw_capability(struct qlcnic_dcb *dcb, char *buf)
 	if (dcb && dcb->ops->query_hw_capability)
 		return dcb->ops->query_hw_capability(dcb, buf);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static inline void qlcnic_dcb_get_info(struct qlcnic_dcb *dcb)
@@ -89,7 +89,7 @@ qlcnic_dcb_query_cee_param(struct qlcnic_dcb *dcb, char *buf, u8 type)
 	if (dcb && dcb->ops->query_cee_param)
 		return dcb->ops->query_cee_param(dcb, buf, type);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static inline int qlcnic_dcb_get_cee_cfg(struct qlcnic_dcb *dcb)
@@ -97,7 +97,7 @@ static inline int qlcnic_dcb_get_cee_cfg(struct qlcnic_dcb *dcb)
 	if (dcb && dcb->ops->get_cee_cfg)
 		return dcb->ops->get_cee_cfg(dcb);
 
-	return 0;
+	return -EOPNOTSUPP;
 }
 
 static inline void qlcnic_dcb_aen_handler(struct qlcnic_dcb *dcb, void *msg)

From 7781607938c8371d4c2b243527430241c62e39c2 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Sat, 26 Mar 2022 18:43:46 +0800
Subject: [PATCH 205/229] net/x25: Fix null-ptr-deref caused by x25_disconnect

When the link layer is terminating, x25->neighbour will be set to NULL
in x25_disconnect(). As a result, it could cause null-ptr-deref bugs in
x25_sendmsg(),x25_recvmsg() and x25_connect(). One of the bugs is
shown below.

    (Thread 1)                 |  (Thread 2)
x25_link_terminated()          | x25_recvmsg()
 x25_kill_by_neigh()           |  ...
  x25_disconnect()             |  lock_sock(sk)
   ...                         |  ...
   x25->neighbour = NULL //(1) |
   ...                         |  x25->neighbour->extended //(2)

The code sets NULL to x25->neighbour in position (1) and dereferences
x25->neighbour in position (2), which could cause null-ptr-deref bug.

This patch adds lock_sock() in x25_kill_by_neigh() in order to synchronize
with x25_sendmsg(), x25_recvmsg() and x25_connect(). What`s more, the
sock held by lock_sock() is not NULL, because it is extracted from x25_list
and uses x25_list_lock to synchronize.

Fixes: 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect")
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Reviewed-by: Lin Ma <linma@zju.edu.cn>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/af_x25.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 3583354a7d7f..3a171828638b 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1765,10 +1765,15 @@ void x25_kill_by_neigh(struct x25_neigh *nb)
 
 	write_lock_bh(&x25_list_lock);
 
-	sk_for_each(s, &x25_list)
-		if (x25_sk(s)->neighbour == nb)
+	sk_for_each(s, &x25_list) {
+		if (x25_sk(s)->neighbour == nb) {
+			write_unlock_bh(&x25_list_lock);
+			lock_sock(s);
 			x25_disconnect(s, ENETUNREACH, 0, 0);
-
+			release_sock(s);
+			write_lock_bh(&x25_list_lock);
+		}
+	}
 	write_unlock_bh(&x25_list_lock);
 
 	/* Remove any related forwards */

From 0906f3a3df07835e37077d8971aac65347f2ed57 Mon Sep 17 00:00:00 2001
From: Zheng Yongjun <zhengyongjun3@huawei.com>
Date: Sat, 26 Mar 2022 08:12:39 +0000
Subject: [PATCH 206/229] net: sparx5: switchdev: fix possible NULL pointer
 dereference

As the possible failure of the allocation, devm_kzalloc() may return NULL
pointer.
Therefore, it should be better to check the 'db' in order to prevent
the dereference of NULL pointer.

Fixes: 10615907e9b51 ("net: sparx5: switchdev: adding frame DMA functionality")
Signed-off-by: Zheng Yongjun <zhengyongjun3@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c b/drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c
index 2dc87584023a..1e9ff365459e 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_fdma.c
@@ -422,6 +422,8 @@ static int sparx5_fdma_tx_alloc(struct sparx5 *sparx5)
 			db_hw->dataptr = phys;
 			db_hw->status = 0;
 			db = devm_kzalloc(sparx5->dev, sizeof(*db), GFP_KERNEL);
+			if (!db)
+				return -ENOMEM;
 			db->cpu_addr = cpu_addr;
 			list_add_tail(&db->list, &tx->db_list);
 		}

From 33b5bc9e703383e396f275d51fc4bafa48dbae5a Mon Sep 17 00:00:00 2001
From: Tom Rix <trix@redhat.com>
Date: Sat, 26 Mar 2022 09:03:06 -0700
Subject: [PATCH 207/229] octeontx2-af: initialize action variable

Clang static analysis reports this representative issue
rvu_npc.c:898:15: warning: Assigned value is garbage
  or undefined
  req.match_id = action.match_id;
               ^ ~~~~~~~~~~~~~~~

The initial setting of action is conditional on
 if (is_mcam_entry_enabled(...))
The later check of action.op will sometimes be garbage.
So initialize action.

Reduce setting of
  *(u64 *)&action = 0x00;
to
  *(u64 *)&action = 0;

Fixes: 967db3529eca ("octeontx2-af: add support for multicast/promisc packet replication feature")
Signed-off-by: Tom Rix <trix@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/af/rvu_npc.c   | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
index 91f86d77cd41..3a31fb8cc155 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
@@ -605,7 +605,7 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
 	struct npc_install_flow_req req = { 0 };
 	struct npc_install_flow_rsp rsp = { 0 };
 	struct npc_mcam *mcam = &rvu->hw->mcam;
-	struct nix_rx_action action;
+	struct nix_rx_action action = { 0 };
 	int blkaddr, index;
 
 	/* AF's and SDP VFs work in promiscuous mode */
@@ -626,7 +626,6 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
 		*(u64 *)&action = npc_get_mcam_action(rvu, mcam,
 						      blkaddr, index);
 	} else {
-		*(u64 *)&action = 0x00;
 		action.op = NIX_RX_ACTIONOP_UCAST;
 		action.pf_func = pcifunc;
 	}
@@ -657,7 +656,7 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 	struct npc_mcam *mcam = &rvu->hw->mcam;
 	struct rvu_hwinfo *hw = rvu->hw;
 	int blkaddr, ucast_idx, index;
-	struct nix_rx_action action;
+	struct nix_rx_action action = { 0 };
 	u64 relaxed_mask;
 
 	if (!hw->cap.nix_rx_multicast && is_cgx_vf(rvu, pcifunc))
@@ -685,14 +684,14 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 						      blkaddr, ucast_idx);
 
 	if (action.op != NIX_RX_ACTIONOP_RSS) {
-		*(u64 *)&action = 0x00;
+		*(u64 *)&action = 0;
 		action.op = NIX_RX_ACTIONOP_UCAST;
 	}
 
 	/* RX_ACTION set to MCAST for CGX PF's */
 	if (hw->cap.nix_rx_multicast && pfvf->use_mce_list &&
 	    is_pf_cgxmapped(rvu, rvu_get_pf(pcifunc))) {
-		*(u64 *)&action = 0x00;
+		*(u64 *)&action = 0;
 		action.op = NIX_RX_ACTIONOP_MCAST;
 		pfvf = rvu_get_pfvf(rvu, pcifunc & ~RVU_PFVF_FUNC_MASK);
 		action.index = pfvf->promisc_mce_idx;
@@ -832,7 +831,7 @@ void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf,
 	struct rvu_hwinfo *hw = rvu->hw;
 	int blkaddr, ucast_idx, index;
 	u8 mac_addr[ETH_ALEN] = { 0 };
-	struct nix_rx_action action;
+	struct nix_rx_action action = { 0 };
 	struct rvu_pfvf *pfvf;
 	u16 vf_func;
 
@@ -861,14 +860,14 @@ void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf,
 							blkaddr, ucast_idx);
 
 	if (action.op != NIX_RX_ACTIONOP_RSS) {
-		*(u64 *)&action = 0x00;
+		*(u64 *)&action = 0;
 		action.op = NIX_RX_ACTIONOP_UCAST;
 		action.pf_func = pcifunc;
 	}
 
 	/* RX_ACTION set to MCAST for CGX PF's */
 	if (hw->cap.nix_rx_multicast && pfvf->use_mce_list) {
-		*(u64 *)&action = 0x00;
+		*(u64 *)&action = 0;
 		action.op = NIX_RX_ACTIONOP_MCAST;
 		action.index = pfvf->mcast_mce_idx;
 	}

From 5c7e49be96ea24776a5b5a07c732c477294add00 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 25 Mar 2022 16:27:09 -0700
Subject: [PATCH 208/229] selftests: tls: skip cmsg_to_pipe tests with TLS=n

These are negative tests, testing TLS code rejects certain
operations. They won't pass without TLS enabled, pure TCP
accepts those operations.

Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Fixes: d87d67fd61ef ("selftests: tls: test splicing cmsgs")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/tls.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 6e468e0f42f7..5d70b04c482c 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -683,6 +683,9 @@ TEST_F(tls, splice_cmsg_to_pipe)
 	char buf[10];
 	int p[2];
 
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
 	ASSERT_GE(pipe(p), 0);
 	EXPECT_EQ(tls_send_cmsg(self->fd, 100, test_str, send_len, 0), 10);
 	EXPECT_EQ(splice(self->cfd, NULL, p[1], NULL, send_len, 0), -1);
@@ -703,6 +706,9 @@ TEST_F(tls, splice_dec_cmsg_to_pipe)
 	char buf[10];
 	int p[2];
 
+	if (self->notls)
+		SKIP(return, "no TLS support");
+
 	ASSERT_GE(pipe(p), 0);
 	EXPECT_EQ(tls_send_cmsg(self->fd, 100, test_str, send_len, 0), 10);
 	EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);

From 5ae6acf1d00be462d7b08b4a8748798ef595ae5a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 25 Mar 2022 09:50:21 -0700
Subject: [PATCH 209/229] net/smc: fix a memory leak in smc_sysctl_net_exit()

Recently added smc_sysctl_net_exit() forgot to free
the memory allocated from smc_sysctl_net_init()
for non initial network namespace.

Fixes: 462791bbfa35 ("net/smc: add sysctl interface for SMC")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Tony Lu <tonylu@linux.alibaba.com>
Cc: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_sysctl.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index bae19419e755..cf3ab1334c00 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -61,5 +61,10 @@ err_alloc:
 
 void __net_exit smc_sysctl_net_exit(struct net *net)
 {
+	struct ctl_table *table;
+
+	table = net->smc.smc_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->smc.smc_hdr);
+	if (!net_eq(net, &init_net))
+		kfree(table);
 }

From bcb74e132a76ce0502bb33d5b65533a4ed72d159 Mon Sep 17 00:00:00 2001
From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Date: Thu, 24 Mar 2022 16:22:10 -0300
Subject: [PATCH 210/229] net/sched: act_ct: fix ref leak when switching zones

When switching zones or network namespaces without doing a ct clear in
between, it is now leaking a reference to the old ct entry. That's
because tcf_ct_skb_nfct_cached() returns false and
tcf_ct_flow_table_lookup() may simply overwrite it.

The fix is to, as the ct entry is not reusable, free it already at
tcf_ct_skb_nfct_cached().

Reported-by: Florian Westphal <fw@strlen.de>
Fixes: 2f131de361f6 ("net/sched: act_ct: Fix flow table lookup after ct clear or switching zones")
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_ct.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 6a34f7b80a6d..b1f502fce595 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -666,22 +666,25 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
 	if (!ct)
 		return false;
 	if (!net_eq(net, read_pnet(&ct->ct_net)))
-		return false;
+		goto drop_ct;
 	if (nf_ct_zone(ct)->id != zone_id)
-		return false;
+		goto drop_ct;
 
 	/* Force conntrack entry direction. */
 	if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
 		if (nf_ct_is_confirmed(ct))
 			nf_ct_kill(ct);
 
-		nf_ct_put(ct);
-		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
-
-		return false;
+		goto drop_ct;
 	}
 
 	return true;
+
+drop_ct:
+	nf_ct_put(ct);
+	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+
+	return false;
 }
 
 /* Trim the skb to the length specified by the IP/IPv6 header,

From 33758c891479ea1c736abfee64b5225925875557 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vasily.averin@linux.dev>
Date: Thu, 24 Mar 2022 21:05:50 +0300
Subject: [PATCH 211/229] memcg: enable accounting for nft objects

nftables replaces iptables, but it lacks memcg accounting.

This patch account most of the memory allocation associated with nft
and should protect the host from misusing nft inside a memcg restricted
container.

Signed-off-by: Vasily Averin <vvs@openvz.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/core.c          |  2 +-
 net/netfilter/nf_tables_api.c | 44 +++++++++++++++++------------------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 8a77a3fd69bc..77ae3e8d344c 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -58,7 +58,7 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 	if (num == 0)
 		return NULL;
 
-	e = kvzalloc(alloc, GFP_KERNEL);
+	e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
 	if (e)
 		e->num_hook_entries = num;
 	return e;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1f5a0eece0d1..a312dc961cab 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1113,16 +1113,16 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	err = -ENOMEM;
-	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
 	if (table == NULL)
 		goto err_kzalloc;
 
-	table->name = nla_strdup(attr, GFP_KERNEL);
+	table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
 	if (table->name == NULL)
 		goto err_strdup;
 
 	if (nla[NFTA_TABLE_USERDATA]) {
-		table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL);
+		table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
 		if (table->udata == NULL)
 			goto err_table_udata;
 
@@ -1803,7 +1803,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
 	struct nft_hook *hook;
 	int err;
 
-	hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL);
+	hook = kmalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
 	if (!hook) {
 		err = -ENOMEM;
 		goto err_hook_alloc;
@@ -2026,7 +2026,7 @@ static struct nft_rule_blob *nf_tables_chain_alloc_rules(unsigned int size)
 	if (size > INT_MAX)
 		return NULL;
 
-	blob = kvmalloc(size, GFP_KERNEL);
+	blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
 	if (!blob)
 		return NULL;
 
@@ -2126,7 +2126,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		if (err < 0)
 			return err;
 
-		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL);
+		basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
 		if (basechain == NULL) {
 			nft_chain_release_hook(&hook);
 			return -ENOMEM;
@@ -2156,7 +2156,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		if (flags & NFT_CHAIN_HW_OFFLOAD)
 			return -EOPNOTSUPP;
 
-		chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+		chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
 		if (chain == NULL)
 			return -ENOMEM;
 
@@ -2169,7 +2169,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	chain->table = table;
 
 	if (nla[NFTA_CHAIN_NAME]) {
-		chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+		chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
 	} else {
 		if (!(flags & NFT_CHAIN_BINDING)) {
 			err = -EINVAL;
@@ -2177,7 +2177,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		}
 
 		snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
-		chain->name = kstrdup(name, GFP_KERNEL);
+		chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
 	}
 
 	if (!chain->name) {
@@ -2186,7 +2186,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	}
 
 	if (nla[NFTA_CHAIN_USERDATA]) {
-		chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL);
+		chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
 		if (chain->udata == NULL) {
 			err = -ENOMEM;
 			goto err_destroy_chain;
@@ -2349,7 +2349,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 		char *name;
 
 		err = -ENOMEM;
-		name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+		name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
 		if (!name)
 			goto err;
 
@@ -2797,7 +2797,7 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
 		goto err1;
 
 	err = -ENOMEM;
-	expr = kzalloc(expr_info.ops->size, GFP_KERNEL);
+	expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
 	if (expr == NULL)
 		goto err2;
 
@@ -3405,7 +3405,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
 	}
 
 	err = -ENOMEM;
-	rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL);
+	rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
 	if (rule == NULL)
 		goto err_release_expr;
 
@@ -3818,7 +3818,7 @@ cont:
 		free_page((unsigned long)inuse);
 	}
 
-	set->name = kasprintf(GFP_KERNEL, name, min + n);
+	set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
 	if (!set->name)
 		return -ENOMEM;
 
@@ -4382,11 +4382,11 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
 	alloc_size = sizeof(*set) + size + udlen;
 	if (alloc_size < size || alloc_size > INT_MAX)
 		return -ENOMEM;
-	set = kvzalloc(alloc_size, GFP_KERNEL);
+	set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
 	if (!set)
 		return -ENOMEM;
 
-	name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
+	name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
 	if (!name) {
 		err = -ENOMEM;
 		goto err_set_name;
@@ -5921,7 +5921,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	err = -ENOMEM;
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
 				      elem.key_end.val.data, elem.data.val.data,
-				      timeout, expiration, GFP_KERNEL);
+				      timeout, expiration, GFP_KERNEL_ACCOUNT);
 	if (elem.priv == NULL)
 		goto err_parse_data;
 
@@ -6165,7 +6165,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	err = -ENOMEM;
 	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
 				      elem.key_end.val.data, NULL, 0, 0,
-				      GFP_KERNEL);
+				      GFP_KERNEL_ACCOUNT);
 	if (elem.priv == NULL)
 		goto fail_elem;
 
@@ -6477,7 +6477,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 	}
 
 	err = -ENOMEM;
-	obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
+	obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
 	if (!obj)
 		goto err2;
 
@@ -6643,7 +6643,7 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
 	obj->key.table = table;
 	obj->handle = nf_tables_alloc_handle(table);
 
-	obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
+	obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
 	if (!obj->key.name) {
 		err = -ENOMEM;
 		goto err_strdup;
@@ -7404,7 +7404,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
-	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL);
+	flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
 	if (!flowtable)
 		return -ENOMEM;
 
@@ -7412,7 +7412,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb,
 	flowtable->handle = nf_tables_alloc_handle(table);
 	INIT_LIST_HEAD(&flowtable->hook_list);
 
-	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
+	flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
 	if (!flowtable->name) {
 		err = -ENOMEM;
 		goto err1;

From 7c9d845f0612e5bcd23456a2ec43be8ac43458f1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 28 Mar 2022 08:36:34 -0400
Subject: [PATCH 212/229] NFSv4/pNFS: Fix another issue with a list iterator
 pointing to the head

In nfs4_callback_devicenotify(), if we don't find a matching entry for
the deviceid, we're left with a pointer to 'struct nfs_server' that
actually points to the list of super blocks associated with our struct
nfs_client.
Furthermore, even if we have a valid pointer, nothing pins the super
block, and so the struct nfs_server could end up getting freed while
we're using it.

Since all we want is a pointer to the struct pnfs_layoutdriver_type,
let's skip all the iteration over super blocks, and just use APIs to
find the layout driver directly.

Reported-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Fixes: 1be5683b03a7 ("pnfs: CB_NOTIFY_DEVICEID")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/callback_proc.c | 27 +++++++++------------------
 fs/nfs/pnfs.c          | 11 +++++++++++
 fs/nfs/pnfs.h          |  2 ++
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 39d1ec870d90..c8520284dda7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -358,12 +358,11 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
 				  struct cb_process_state *cps)
 {
 	struct cb_devicenotifyargs *args = argp;
+	const struct pnfs_layoutdriver_type *ld = NULL;
 	uint32_t i;
 	__be32 res = 0;
-	struct nfs_client *clp = cps->clp;
-	struct nfs_server *server = NULL;
 
-	if (!clp) {
+	if (!cps->clp) {
 		res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
 		goto out;
 	}
@@ -371,23 +370,15 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp,
 	for (i = 0; i < args->ndevs; i++) {
 		struct cb_devicenotifyitem *dev = &args->devs[i];
 
-		if (!server ||
-		    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
-			rcu_read_lock();
-			list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
-				if (server->pnfs_curr_ld &&
-				    server->pnfs_curr_ld->id == dev->cbd_layout_type) {
-					rcu_read_unlock();
-					goto found;
-				}
-			rcu_read_unlock();
-			continue;
+		if (!ld || ld->id != dev->cbd_layout_type) {
+			pnfs_put_layoutdriver(ld);
+			ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+			if (!ld)
+				continue;
 		}
-
-	found:
-		nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+		nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
 	}
-
+	pnfs_put_layoutdriver(ld);
 out:
 	kfree(args->devs);
 	return res;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index de318bb5d349..856c962273c7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -92,6 +92,17 @@ find_pnfs_driver(u32 id)
 	return local;
 }
 
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+	return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+	if (ld)
+		module_put(ld->owner);
+}
+
 void
 unset_pnfs_layoutdriver(struct nfs_server *nfss)
 {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f4d7548d67b2..07f11489e4e9 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -234,6 +234,8 @@ struct pnfs_devicelist {
 
 extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
 
 /* nfs4proc.c */
 extern size_t max_response_pages(struct nfs_server *server);

From d9142e1cf3bbdaf21337767114ecab26fe702d47 Mon Sep 17 00:00:00 2001
From: Naresh Kamboju <naresh.kamboju@linaro.org>
Date: Mon, 28 Mar 2022 19:16:50 +0530
Subject: [PATCH 213/229] selftests: net: Add tls config dependency for tls
 selftests

selftest net tls test cases need TLS=m without this the test hangs.
Enabling config TLS solves this problem and runs to complete.
  - CONFIG_TLS=m

Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index ead7963b9bf0..cecb921a0dbf 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -43,5 +43,6 @@ CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_BAREUDP=m
 CONFIG_IPV6_IOAM6_LWTUNNEL=y
+CONFIG_TLS=m
 CONFIG_CRYPTO_SM4=y
 CONFIG_AMT=m

From dcf500065fabe27676dfe7b4ba521a4f1e0fc8ac Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Date: Mon, 28 Mar 2022 15:27:08 +0900
Subject: [PATCH 214/229] net: bnxt_ptp: fix compilation error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Broadcom bnxt_ptp driver does not compile with GCC 11.2.2 when
CONFIG_WERROR is enabled. The following error is generated:

drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c: In function ‘bnxt_ptp_enable’:
drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c:400:43: error: array
subscript 255 is above array bounds of ‘struct pps_pin[4]’
[-Werror=array-bounds]
  400 |  ptp->pps_info.pins[pin_id].event = BNXT_PPS_EVENT_EXTERNAL;
      |  ~~~~~~~~~~~~~~~~~~^~~~~~~~
In file included from drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c:20:
drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h:75:24: note: while
referencing ‘pins’
   75 |         struct pps_pin pins[BNXT_MAX_TSIO_PINS];
      |                        ^~~~
cc1: all warnings being treated as errors

This is due to the function ptp_find_pin() returning a pin ID of -1 when
a valid pin is not found and this error never being checked.
Change the TSIO_PIN_VALID() function to also check that a pin ID is not
negative and use this macro in bnxt_ptp_enable() to check the result of
the calls to ptp_find_pin() to return an error early for invalid pins.
This fixes the compilation error.

Cc: <stable@vger.kernel.org>
Fixes: 9e518f25802c ("bnxt_en: 1PPS functions to configure TSIO pins")
Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Reviewed-by: Michael Chan <michael.chan@broadcom.com>
Link: https://lore.kernel.org/r/20220328062708.207079-1-damien.lemoal@opensource.wdc.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 6 +++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
index a0b321a19361..9c2ad5e67a5d 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c
@@ -382,7 +382,7 @@ static int bnxt_ptp_enable(struct ptp_clock_info *ptp_info,
 	struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg,
 						ptp_info);
 	struct bnxt *bp = ptp->bp;
-	u8 pin_id;
+	int pin_id;
 	int rc;
 
 	switch (rq->type) {
@@ -390,6 +390,8 @@ static int bnxt_ptp_enable(struct ptp_clock_info *ptp_info,
 		/* Configure an External PPS IN */
 		pin_id = ptp_find_pin(ptp->ptp_clock, PTP_PF_EXTTS,
 				      rq->extts.index);
+		if (!TSIO_PIN_VALID(pin_id))
+			return -EOPNOTSUPP;
 		if (!on)
 			break;
 		rc = bnxt_ptp_cfg_pin(bp, pin_id, BNXT_PPS_PIN_PPS_IN);
@@ -403,6 +405,8 @@ static int bnxt_ptp_enable(struct ptp_clock_info *ptp_info,
 		/* Configure a Periodic PPS OUT */
 		pin_id = ptp_find_pin(ptp->ptp_clock, PTP_PF_PEROUT,
 				      rq->perout.index);
+		if (!TSIO_PIN_VALID(pin_id))
+			return -EOPNOTSUPP;
 		if (!on)
 			break;
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h
index 373baf45884b..530b9922608c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h
@@ -31,7 +31,7 @@ struct pps_pin {
 	u8 state;
 };
 
-#define TSIO_PIN_VALID(pin) ((pin) < (BNXT_MAX_TSIO_PINS))
+#define TSIO_PIN_VALID(pin) ((pin) >= 0 && (pin) < (BNXT_MAX_TSIO_PINS))
 
 #define EVENT_DATA2_PPS_EVENT_TYPE(data2)				\
 	((data2) & ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE)

From b0cf9b4b26e697886bf0986579953c16cf92d3d2 Mon Sep 17 00:00:00 2001
From: Brian Cain <bcain@quicinc.com>
Date: Fri, 28 Jan 2022 10:46:30 -0600
Subject: [PATCH 215/229] MAINTAINERS: update hexagon maintainer email, tree

Some email infrastructure changes required this switch.

Signed-off-by: Brian Cain <bcain@quicinc.com>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 741825cff43c..d91f6c6e3d3b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16290,8 +16290,9 @@ F:	drivers/misc/fastrpc.c
 F:	include/uapi/misc/fastrpc.h
 
 QUALCOMM HEXAGON ARCHITECTURE
-M:	Brian Cain <bcain@codeaurora.org>
+M:	Brian Cain <bcain@quicinc.com>
 L:	linux-hexagon@vger.kernel.org
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/bcain/linux.git
 S:	Supported
 F:	arch/hexagon/
 

From 6da69b1da130e7d96766042750cd9f902e890eba Mon Sep 17 00:00:00 2001
From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Date: Mon, 28 Mar 2022 11:24:31 +0800
Subject: [PATCH 216/229] net: dsa: bcm_sf2_cfp: fix an incorrect NULL check on
 list iterator

The bug is here:
	return rule;

The list iterator value 'rule' will *always* be set and non-NULL
by list_for_each_entry(), so it is incorrect to assume that the
iterator value will be NULL if the list is empty or no element
is found.

To fix the bug, return 'rule' when found, otherwise return NULL.

Fixes: ae7a5aff783c7 ("net: dsa: bcm_sf2: Keep copy of inserted rules")
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
Link: https://lore.kernel.org/r/20220328032431.22538-1-xiam0nd.tong@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/dsa/bcm_sf2_cfp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c
index a7e2fcf2df2c..edbe5e7f1cb6 100644
--- a/drivers/net/dsa/bcm_sf2_cfp.c
+++ b/drivers/net/dsa/bcm_sf2_cfp.c
@@ -567,14 +567,14 @@ static void bcm_sf2_cfp_slice_ipv6(struct bcm_sf2_priv *priv,
 static struct cfp_rule *bcm_sf2_cfp_rule_find(struct bcm_sf2_priv *priv,
 					      int port, u32 location)
 {
-	struct cfp_rule *rule = NULL;
+	struct cfp_rule *rule;
 
 	list_for_each_entry(rule, &priv->cfp.rules_list, next) {
 		if (rule->port == port && rule->fs.location == location)
-			break;
+			return rule;
 	}
 
-	return rule;
+	return NULL;
 }
 
 static int bcm_sf2_cfp_rule_cmp(struct bcm_sf2_priv *priv, int port,

From f32404ae1bb9a7428a3c77419672a28895d185bf Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 25 Mar 2022 22:50:23 +0100
Subject: [PATCH 217/229] net: move net_unlink_todo() out of the header

There's no reason for this to be in netdevice.h, it's all
just used in dev.c. Also make it no longer inline and let
the compiler decide to do that by itself.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20220325225023.f49b9056fe1c.I6b901a2df00000837a9bd251a8dd259bd23f5ded@changeid
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ----------
 net/core/dev.c            | 10 ++++++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cd7a597c55b1..59e27a2b7bf0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4601,16 +4601,6 @@ bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
 						     struct list_head **iter);
 
-#ifdef CONFIG_LOCKDEP
-static LIST_HEAD(net_unlink_list);
-
-static inline void net_unlink_todo(struct net_device *dev)
-{
-	if (list_empty(&dev->unlink_list))
-		list_add_tail(&dev->unlink_list, &net_unlink_list);
-}
-#endif
-
 /* iterate through upper list, must be called under RCU read lock */
 #define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
 	for (iter = &(dev)->adj_list.upper, \
diff --git a/net/core/dev.c b/net/core/dev.c
index 8a5109479dbe..8c6c08446556 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7193,6 +7193,16 @@ static int __netdev_update_upper_level(struct net_device *dev,
 	return 0;
 }
 
+#ifdef CONFIG_LOCKDEP
+static LIST_HEAD(net_unlink_list);
+
+static void net_unlink_todo(struct net_device *dev)
+{
+	if (list_empty(&dev->unlink_list))
+		list_add_tail(&dev->unlink_list, &net_unlink_list);
+}
+#endif
+
 static int __netdev_update_lower_level(struct net_device *dev,
 				       struct netdev_nested_priv *priv)
 {

From 906b3d64913c19a50d5c553f21b54d2f4ce3ded7 Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Mon, 28 Mar 2022 14:10:36 +0800
Subject: [PATCH 218/229] net/smc: Send out the remaining data in sndbuf before
 close

The current autocork algorithms will delay the data transmission
in BH context to smc_release_cb() when sock_lock is hold by user.

So there is a possibility that when connection is being actively
closed (sock_lock is hold by user now), some corked data still
remains in sndbuf, waiting to be sent by smc_release_cb(). This
will cause:

- smc_close_stream_wait(), which is called under the sock_lock,
  has a high probability of timeout because data transmission is
  delayed until sock_lock is released.

- Unexpected data sends may happen after connction closed and use
  the rtoken which has been deleted by remote peer through
  LLC_DELETE_RKEY messages.

So this patch will try to send out the remaining corked data in
sndbuf before active close process, to ensure data integrity and
avoid unexpected data transmission after close.

Reported-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>
Fixes: 6b88af839d20 ("net/smc: don't send in the BH context if sock_owned_by_user")
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Acked-by: Karsten Graul <kgraul@linux.ibm.com>
Link: https://lore.kernel.org/r/1648447836-111521-1-git-send-email-guwen@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/smc/smc_close.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index 292e4d904ab6..676cb2333d3c 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -57,6 +57,9 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
 	if (!smc_tx_prepared_sends(&smc->conn))
 		return;
 
+	/* Send out corked data remaining in sndbuf */
+	smc_tx_pending(&smc->conn);
+
 	smc->wait_close_tx_prepared = 1;
 	add_wait_queue(sk_sleep(sk), &wait);
 	while (!signal_pending(current) && timeout) {

From 20695e9a9fd39103d1b0669470ae74030b7aa196 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 28 Mar 2022 14:29:04 -0700
Subject: [PATCH 219/229] Revert "selftests: net: Add tls config dependency for
 tls selftests"

This reverts commit d9142e1cf3bbdaf21337767114ecab26fe702d47.

The test is supposed to run cleanly with TLS is disabled,
to test compatibility with TCP behavior. I can't repro
the failure [1], the problem should be debugged rather
than papered over.

Link: https://lore.kernel.org/all/20220325161203.7000698c@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com/ [1]
Fixes: d9142e1cf3bb ("selftests: net: Add tls config dependency for tls selftests")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20220328212904.2685395-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/net/config | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index cecb921a0dbf..ead7963b9bf0 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -43,6 +43,5 @@ CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_MIRRED=m
 CONFIG_BAREUDP=m
 CONFIG_IPV6_IOAM6_LWTUNNEL=y
-CONFIG_TLS=m
 CONFIG_CRYPTO_SM4=y
 CONFIG_AMT=m

From dcb09a08d8d4052f8e8567ae2baddfbaf1d9c11f Mon Sep 17 00:00:00 2001
From: "jason-jh.lin" <jason-jh.lin@mediatek.com>
Date: Wed, 9 Mar 2022 21:46:59 +0800
Subject: [PATCH 220/229] Revert "dt-bindings: display: mediatek: add ethdr
 definition for mt8195"

This reverts commit e7dcfe64204a5cd9a74a9ca7d9c7a22434dc7fe5.

Because examples property of mediatek,ethdr.yaml should base on [1][2].
Reverting it until [1][2] are applied.

[1] dt-bindings: mediatek: mt8195: Add binding for MM IOMMU
https://patchwork.kernel.org/project/linux-mediatek/patch/20220217113453.13658-2-yong.wu@mediatek.com/
[2] dt-bindings: reset: mt8195: add vdosys1 reset control bit
https://patchwork.kernel.org/project/linux-mediatek/patch/20220222100741.30138-5-nancy.lin@mediatek.com/

Signed-off-by: jason-jh.lin <jason-jh.lin@mediatek.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220309134702.9942-2-jason-jh.lin@mediatek.com
---
 .../display/mediatek/mediatek,ethdr.yaml      | 147 ------------------
 1 file changed, 147 deletions(-)
 delete mode 100644 Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml

diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml
deleted file mode 100644
index 131eed5eeeb7..000000000000
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ethdr.yaml
+++ /dev/null
@@ -1,147 +0,0 @@
-# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-%YAML 1.2
----
-$id: http://devicetree.org/schemas/display/mediatek/mediatek,ethdr.yaml#
-$schema: http://devicetree.org/meta-schemas/core.yaml#
-
-title: Mediatek Ethdr Device Tree Bindings
-
-maintainers:
-  - Chun-Kuang Hu <chunkuang.hu@kernel.org>
-  - Philipp Zabel <p.zabel@pengutronix.de>
-
-description: |
-  ETHDR is designed for HDR video and graphics conversion in the external display path.
-  It handles multiple HDR input types and performs tone mapping, color space/color
-  format conversion, and then combine different layers, output the required HDR or
-  SDR signal to the subsequent display path. This engine is composed of two video
-  frontends, two graphic frontends, one video backend and a mixer. ETHDR has two
-  DMA function blocks, DS and ADL. These two function blocks read the pre-programmed
-  registers from DRAM and set them to HW in the v-blanking period.
-
-properties:
-  compatible:
-    items:
-      - const: mediatek,mt8195-disp-ethdr
-  reg:
-    maxItems: 7
-  reg-names:
-    items:
-      - const: mixer
-      - const: vdo_fe0
-      - const: vdo_fe1
-      - const: gfx_fe0
-      - const: gfx_fe1
-      - const: vdo_be
-      - const: adl_ds
-  interrupts:
-    minItems: 1
-  iommus:
-    description: The compatible property is DMA function blocks.
-      Should point to the respective IOMMU block with master port as argument,
-      see Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml for
-      details.
-    minItems: 1
-    maxItems: 2
-  clocks:
-    items:
-      - description: mixer clock
-      - description: video frontend 0 clock
-      - description: video frontend 1 clock
-      - description: graphic frontend 0 clock
-      - description: graphic frontend 1 clock
-      - description: video backend clock
-      - description: autodownload and menuload clock
-      - description: video frontend 0 async clock
-      - description: video frontend 1 async clock
-      - description: graphic frontend 0 async clock
-      - description: graphic frontend 1 async clock
-      - description: video backend async clock
-      - description: ethdr top clock
-  clock-names:
-    items:
-      - const: mixer
-      - const: vdo_fe0
-      - const: vdo_fe1
-      - const: gfx_fe0
-      - const: gfx_fe1
-      - const: vdo_be
-      - const: adl_ds
-      - const: vdo_fe0_async
-      - const: vdo_fe1_async
-      - const: gfx_fe0_async
-      - const: gfx_fe1_async
-      - const: vdo_be_async
-      - const: ethdr_top
-  power-domains:
-    maxItems: 1
-  resets:
-    maxItems: 5
-  mediatek,gce-client-reg:
-    $ref: /schemas/types.yaml#/definitions/phandle-array
-    description: The register of display function block to be set by gce.
-      There are 4 arguments in this property, gce node, subsys id, offset and
-      register size. The subsys id is defined in the gce header of each chips
-      include/include/dt-bindings/gce/<chip>-gce.h, mapping to the register of
-      display function block.
-
-required:
-  - compatible
-  - reg
-  - clocks
-  - clock-names
-  - interrupts
-  - power-domains
-
-additionalProperties: false
-
-examples:
-  - |
-
-    disp_ethdr@1c114000 {
-            compatible = "mediatek,mt8195-disp-ethdr";
-            reg = <0 0x1c114000 0 0x1000>,
-                  <0 0x1c115000 0 0x1000>,
-                  <0 0x1c117000 0 0x1000>,
-                  <0 0x1c119000 0 0x1000>,
-                  <0 0x1c11A000 0 0x1000>,
-                  <0 0x1c11B000 0 0x1000>,
-                  <0 0x1c11C000 0 0x1000>;
-            reg-names = "mixer", "vdo_fe0", "vdo_fe1", "gfx_fe0", "gfx_fe1",
-                        "vdo_be", "adl_ds";
-            mediatek,gce-client-reg = <&gce0 SUBSYS_1c11XXXX 0x4000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0x5000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0x7000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0x9000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0xA000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0xB000 0x1000>,
-                                      <&gce0 SUBSYS_1c11XXXX 0xC000 0x1000>;
-            clocks = <&vdosys1 CLK_VDO1_DISP_MIXER>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_FE0>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_FE1>,
-                     <&vdosys1 CLK_VDO1_HDR_GFX_FE0>,
-                     <&vdosys1 CLK_VDO1_HDR_GFX_FE1>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_BE>,
-                     <&vdosys1 CLK_VDO1_26M_SLOW>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_FE0_DL_ASYNC>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_FE1_DL_ASYNC>,
-                     <&vdosys1 CLK_VDO1_HDR_GFX_FE0_DL_ASYNC>,
-                     <&vdosys1 CLK_VDO1_HDR_GFX_FE1_DL_ASYNC>,
-                     <&vdosys1 CLK_VDO1_HDR_VDO_BE_DL_ASYNC>,
-                     <&topckgen CLK_TOP_ETHDR_SEL>;
-            clock-names = "mixer", "vdo_fe0", "vdo_fe1", "gfx_fe0", "gfx_fe1",
-                          "vdo_be", "adl_ds", "vdo_fe0_async", "vdo_fe1_async",
-                          "gfx_fe0_async", "gfx_fe1_async","vdo_be_async",
-                          "ethdr_top";
-            power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>;
-            iommus = <&iommu_vpp M4U_PORT_L3_HDR_DS>,
-                     <&iommu_vpp M4U_PORT_L3_HDR_ADL>;
-            interrupts = <GIC_SPI 517 IRQ_TYPE_LEVEL_HIGH 0>; /* disp mixer */
-            resets = <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_FE0_DL_ASYNC>,
-                     <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_FE1_DL_ASYNC>,
-                     <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_GFX_FE0_DL_ASYNC>,
-                     <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_GFX_FE1_DL_ASYNC>,
-                     <&vdosys1 MT8195_VDOSYS1_SW1_RST_B_HDR_VDO_BE_DL_ASYNC>;
-    };
-
-...

From ab487888d5dfb37b71a256114e8fa6fafdbb3163 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Wed, 9 Mar 2022 21:47:00 +0800
Subject: [PATCH 221/229] dt-bindings: display: mediatek, mutex: Fix mediatek,
 gce-events type

The mediatek,gce-events property needs as value an array of uint32
corresponding to the CMDQ events to listen to, and not any phandle.

Fixes: 4ed545e7d100 ("dt-bindings: display: mediatek: disp: split each block to individual yaml")
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: jason-jh.lin <jason-jh.lin@mediatek.com>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220309134702.9942-3-jason-jh.lin@mediatek.com
---
 .../devicetree/bindings/display/mediatek/mediatek,mutex.yaml    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
index 6eca525eced0..842ba7b07a34 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
@@ -58,7 +58,7 @@ properties:
       The event id which is mapping to the specific hardware event signal
       to gce. The event id is defined in the gce header
       include/dt-bindings/gce/<chip>-gce.h of each chips.
-    $ref: /schemas/types.yaml#/definitions/phandle-array
+    $ref: /schemas/types.yaml#/definitions/uint32-array
 
 required:
   - compatible

From 10f17b2054a7ab3b86a8d333d99146fff3e9c728 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Wed, 9 Mar 2022 21:47:01 +0800
Subject: [PATCH 222/229] dt-bindings: display: mediatek, ovl: Fix 'iommu'
 required property typo

The property is called 'iommus' and not 'iommu'. Fix this typo.

Fixes: 4ed545e7d100 ("dt-bindings: display: mediatek: disp: split each block to individual yaml")
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: jason-jh.lin <jason-jh.lin@mediatek.com>
Acked-by: Rob Herring <robh@kernel.org>
Acked-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220309134702.9942-4-jason-jh.lin@mediatek.com
---
 .../devicetree/bindings/display/mediatek/mediatek,ovl.yaml      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
index 93d5c68a2dbd..fc691d00c60e 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
@@ -75,7 +75,7 @@ required:
   - interrupts
   - power-domains
   - clocks
-  - iommu
+  - iommus
 
 additionalProperties: false
 

From bff4e302a6679b0df8ac950979faabbc8f3ae961 Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Wed, 9 Mar 2022 21:47:02 +0800
Subject: [PATCH 223/229] dt-bindings: display: mediatek: Fix examples on new
 bindings

To avoid failure of dt_binding_check perform a slight refactoring
of the examples: the main block is kept, but that required fixing
the address and size cells, plus the inclusion of missing dt-bindings
headers, required to parse some of the values assigned to various
properties.

Fixes: 4ed545e7d100 ("dt-bindings: display: mediatek: disp: split each block to individual yaml")
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: jason-jh.lin <jason-jh.lin@mediatek.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Tested-by: jason-jh.lin <jason-jh.lin@medaitek.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220309134702.9942-5-jason-jh.lin@mediatek.com
---
 .../display/mediatek/mediatek,aal.yaml        | 23 ++++++---
 .../display/mediatek/mediatek,ccorr.yaml      | 23 ++++++---
 .../display/mediatek/mediatek,color.yaml      | 23 ++++++---
 .../display/mediatek/mediatek,dither.yaml     | 23 ++++++---
 .../display/mediatek/mediatek,dpi.yaml        |  3 +-
 .../display/mediatek/mediatek,dsc.yaml        | 23 ++++++---
 .../display/mediatek/mediatek,gamma.yaml      | 23 ++++++---
 .../display/mediatek/mediatek,merge.yaml      | 47 +++++++++----------
 .../display/mediatek/mediatek,mutex.yaml      | 25 ++++++----
 .../display/mediatek/mediatek,od.yaml         | 14 ++++--
 .../display/mediatek/mediatek,ovl-2l.yaml     | 26 ++++++----
 .../display/mediatek/mediatek,ovl.yaml        | 26 ++++++----
 .../display/mediatek/mediatek,postmask.yaml   | 23 ++++++---
 .../display/mediatek/mediatek,rdma.yaml       | 28 +++++++----
 .../display/mediatek/mediatek,split.yaml      | 17 +++++--
 .../display/mediatek/mediatek,ufoe.yaml       | 19 +++++---
 .../display/mediatek/mediatek,wdma.yaml       | 26 ++++++----
 17 files changed, 259 insertions(+), 133 deletions(-)

diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml
index 225f9dd726d2..61f0ed1e388f 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,aal.yaml
@@ -66,12 +66,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
 
-    aal@14015000 {
-        compatible = "mediatek,mt8173-disp-aal";
-        reg = <0 0x14015000 0 0x1000>;
-        interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_AAL>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x5000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        aal@14015000 {
+           compatible = "mediatek,mt8173-disp-aal";
+           reg = <0 0x14015000 0 0x1000>;
+           interrupts = <GIC_SPI 189 IRQ_TYPE_LEVEL_LOW>;
+           power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+           clocks = <&mmsys CLK_MM_DISP_AAL>;
+           mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x5000 0x1000>;
+       };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml
index 6894b6999412..0ed53b6238f0 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ccorr.yaml
@@ -65,12 +65,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8183-clk.h>
+    #include <dt-bindings/power/mt8183-power.h>
+    #include <dt-bindings/gce/mt8183-gce.h>
 
-    ccorr0: ccorr@1400f000 {
-        compatible = "mediatek,mt8183-disp-ccorr";
-        reg = <0 0x1400f000 0 0x1000>;
-        interrupts = <GIC_SPI 232 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
-        clocks = <&mmsys CLK_MM_DISP_CCORR0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xf000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        ccorr0: ccorr@1400f000 {
+            compatible = "mediatek,mt8183-disp-ccorr";
+            reg = <0 0x1400f000 0 0x1000>;
+            interrupts = <GIC_SPI 232 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+            clocks = <&mmsys CLK_MM_DISP_CCORR0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xf000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml
index bc83155b3b4c..3ad842eb5668 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,color.yaml
@@ -75,12 +75,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
 
-    color0: color@14013000 {
-        compatible = "mediatek,mt8173-disp-color";
-        reg = <0 0x14013000 0 0x1000>;
-        interrupts = <GIC_SPI 187 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_COLOR0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x3000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        color0: color@14013000 {
+            compatible = "mediatek,mt8173-disp-color";
+            reg = <0 0x14013000 0 0x1000>;
+            interrupts = <GIC_SPI 187 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_COLOR0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x3000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml
index 9d89297f5f1d..6657549af165 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dither.yaml
@@ -65,12 +65,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8183-clk.h>
+    #include <dt-bindings/power/mt8183-power.h>
+    #include <dt-bindings/gce/mt8183-gce.h>
 
-    dither0: dither@14012000 {
-        compatible = "mediatek,mt8183-disp-dither";
-        reg = <0 0x14012000 0 0x1000>;
-        interrupts = <GIC_SPI 235 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
-        clocks = <&mmsys CLK_MM_DISP_DITHER0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x2000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        dither0: dither@14012000 {
+            compatible = "mediatek,mt8183-disp-dither";
+            reg = <0 0x14012000 0 0x1000>;
+            interrupts = <GIC_SPI 235 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+            clocks = <&mmsys CLK_MM_DISP_DITHER0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x2000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
index dd2896a40ff0..843f89d6053f 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
@@ -70,8 +70,7 @@ examples:
   - |
     #include <dt-bindings/interrupt-controller/arm-gic.h>
     #include <dt-bindings/clock/mt8173-clk.h>
-    #include <dt-bindings/interrupt-controller/arm-gic.h>
-    #include <dt-bindings/interrupt-controller/irq.h>
+
     dpi0: dpi@1401d000 {
         compatible = "mediatek,mt8173-dpi";
         reg = <0x1401d000 0x1000>;
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml
index 1ec083eff824..49248864514b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dsc.yaml
@@ -60,12 +60,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8195-clk.h>
+    #include <dt-bindings/power/mt8195-power.h>
+    #include <dt-bindings/gce/mt8195-gce.h>
 
-    dsc0: disp_dsc_wrap@1c009000 {
-        compatible = "mediatek,mt8195-disp-dsc";
-        reg = <0 0x1c009000 0 0x1000>;
-        interrupts = <GIC_SPI 645 IRQ_TYPE_LEVEL_HIGH 0>;
-        power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS0>;
-        clocks = <&vdosys0 CLK_VDO0_DSC_WRAP0>;
-        mediatek,gce-client-reg = <&gce1 SUBSYS_1c00XXXX 0x9000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        dsc0: disp_dsc_wrap@1c009000 {
+            compatible = "mediatek,mt8195-disp-dsc";
+            reg = <0 0x1c009000 0 0x1000>;
+            interrupts = <GIC_SPI 645 IRQ_TYPE_LEVEL_HIGH 0>;
+            power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS0>;
+            clocks = <&vdosys0 CLK_VDO0_DSC_WRAP0>;
+            mediatek,gce-client-reg = <&gce1 SUBSYS_1c00XXXX 0x9000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml
index 247baad147b3..78442339314f 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,gamma.yaml
@@ -66,12 +66,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
 
-    gamma@14016000 {
-        compatible = "mediatek,mt8173-disp-gamma";
-        reg = <0 0x14016000 0 0x1000>;
-        interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_GAMMA>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x6000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        gamma@14016000 {
+            compatible = "mediatek,mt8173-disp-gamma";
+            reg = <0 0x14016000 0 0x1000>;
+            interrupts = <GIC_SPI 190 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_GAMMA>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x6000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml
index d5cd69b7f501..d635c5dcb68b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,merge.yaml
@@ -38,18 +38,16 @@ properties:
       Documentation/devicetree/bindings/power/power-domain.yaml for details.
 
   clocks:
+    minItems: 1
     maxItems: 2
-    items:
-      - description: MERGE Clock
-      - description: MERGE Async Clock
-          Controlling the synchronous process between MERGE and other display
-          function blocks cross clock domain.
 
   clock-names:
-    maxItems: 2
-    items:
-      - const: merge
-      - const: merge_async
+    oneOf:
+      - items:
+          - const: merge
+      - items:
+          - const: merge
+          - const: merge_async
 
   mediatek,merge-fifo-en:
     description:
@@ -88,23 +86,20 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
 
-    merge@14017000 {
-        compatible = "mediatek,mt8173-disp-merge";
-        reg = <0 0x14017000 0 0x1000>;
-        power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_MERGE>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        merge@14017000 {
+            compatible = "mediatek,mt8173-disp-merge";
+            reg = <0 0x14017000 0 0x1000>;
+            power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_MERGE>;
+            clock-names = "merge";
+        };
     };
 
-    merge5: disp_vpp_merge5@1c110000 {
-        compatible = "mediatek,mt8195-disp-merge";
-        reg = <0 0x1c110000 0 0x1000>;
-        interrupts = <GIC_SPI 507 IRQ_TYPE_LEVEL_HIGH 0>;
-        clocks = <&vdosys1 CLK_VDO1_VPP_MERGE4>,
-                 <&vdosys1 CLK_VDO1_MERGE4_DL_ASYNC>;
-        clock-names = "merge","merge_async";
-        power-domains = <&spm MT8195_POWER_DOMAIN_VDOSYS1>;
-        mediatek,gce-client-reg = <&gce1 SUBSYS_1c11XXXX 0x0000 0x1000>;
-        mediatek,merge-fifo-en = <1>;
-        resets = <&vdosys1 MT8195_VDOSYS1_SW0_RST_B_MERGE4_DL_ASYNC>;
-    };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
index 842ba7b07a34..00e6a1041a9b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,mutex.yaml
@@ -71,13 +71,22 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
 
-    mutex: mutex@14020000 {
-        compatible = "mediatek,mt8173-disp-mutex";
-        reg = <0 0x14020000 0 0x1000>;
-        interrupts = <GIC_SPI 169 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_MUTEX_32K>;
-        mediatek,gce-events = <CMDQ_EVENT_MUTEX0_STREAM_EOF>,
-                              <CMDQ_EVENT_MUTEX1_STREAM_EOF>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        mutex: mutex@14020000 {
+            compatible = "mediatek,mt8173-disp-mutex";
+            reg = <0 0x14020000 0 0x1000>;
+            interrupts = <GIC_SPI 169 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_MUTEX_32K>;
+            mediatek,gce-events = <CMDQ_EVENT_MUTEX0_STREAM_EOF>,
+                                  <CMDQ_EVENT_MUTEX1_STREAM_EOF>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml
index 7519db315217..853fcb9db2be 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,od.yaml
@@ -45,9 +45,15 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/clock/mt8173-clk.h>
 
-    od@14023000 {
-        compatible = "mediatek,mt8173-disp-od";
-        reg = <0 0x14023000 0 0x1000>;
-        clocks = <&mmsys CLK_MM_DISP_OD>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        od@14023000 {
+            compatible = "mediatek,mt8173-disp-od";
+            reg = <0 0x14023000 0 0x1000>;
+            clocks = <&mmsys CLK_MM_DISP_OD>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml
index e3cef99d0f98..da999ba53b7c 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl-2l.yaml
@@ -66,13 +66,23 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8183-clk.h>
+    #include <dt-bindings/power/mt8183-power.h>
+    #include <dt-bindings/gce/mt8183-gce.h>
+    #include <dt-bindings/memory/mt8183-larb-port.h>
 
-    ovl_2l0: ovl@14009000 {
-        compatible = "mediatek,mt8183-disp-ovl-2l";
-        reg = <0 0x14009000 0 0x1000>;
-        interrupts = <GIC_SPI 226 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
-        clocks = <&mmsys CLK_MM_DISP_OVL0_2L>;
-        iommus = <&iommu M4U_PORT_DISP_2L_OVL0_LARB0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0x9000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        ovl_2l0: ovl@14009000 {
+            compatible = "mediatek,mt8183-disp-ovl-2l";
+            reg = <0 0x14009000 0 0x1000>;
+            interrupts = <GIC_SPI 226 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&spm MT8183_POWER_DOMAIN_DISP>;
+            clocks = <&mmsys CLK_MM_DISP_OVL0_2L>;
+            iommus = <&iommu M4U_PORT_DISP_2L_OVL0_LARB0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0x9000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
index fc691d00c60e..f77094e61443 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ovl.yaml
@@ -81,13 +81,23 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
+    #include <dt-bindings/memory/mt8173-larb-port.h>
 
-    ovl0: ovl@1400c000 {
-        compatible = "mediatek,mt8173-disp-ovl";
-        reg = <0 0x1400c000 0 0x1000>;
-        interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_OVL0>;
-        iommus = <&iommu M4U_PORT_DISP_OVL0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xc000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        ovl0: ovl@1400c000 {
+            compatible = "mediatek,mt8173-disp-ovl";
+            reg = <0 0x1400c000 0 0x1000>;
+            interrupts = <GIC_SPI 180 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_OVL0>;
+            iommus = <&iommu M4U_PORT_DISP_OVL0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xc000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml
index 6ac1da2e8871..2d769422e29f 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,postmask.yaml
@@ -58,12 +58,21 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8192-clk.h>
+    #include <dt-bindings/power/mt8192-power.h>
+    #include <dt-bindings/gce/mt8192-gce.h>
 
-    postmask0: postmask@1400d000 {
-        compatible = "mediatek,mt8192-disp-postmask";
-        reg = <0 0x1400d000 0 0x1000>;
-        interrupts = <GIC_SPI 262 IRQ_TYPE_LEVEL_HIGH 0>;
-        power-domains = <&scpsys MT8192_POWER_DOMAIN_DISP>;
-        clocks = <&mmsys CLK_MM_DISP_POSTMASK0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xd000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        postmask0: postmask@1400d000 {
+            compatible = "mediatek,mt8192-disp-postmask";
+            reg = <0 0x1400d000 0 0x1000>;
+            interrupts = <GIC_SPI 262 IRQ_TYPE_LEVEL_HIGH 0>;
+            power-domains = <&scpsys MT8192_POWER_DOMAIN_DISP>;
+            clocks = <&mmsys CLK_MM_DISP_POSTMASK0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xd000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml
index b56e22fbcd52..e8c72afa0630 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,rdma.yaml
@@ -94,14 +94,24 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
+    #include <dt-bindings/memory/mt8173-larb-port.h>
 
-    rdma0: rdma@1400e000 {
-        compatible = "mediatek,mt8173-disp-rdma";
-        reg = <0 0x1400e000 0 0x1000>;
-        interrupts = <GIC_SPI 182 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_RDMA0>;
-        iommus = <&iommu M4U_PORT_DISP_RDMA0>;
-        mediatek,rdma-fifosize = <8192>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xe000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        rdma0: rdma@1400e000 {
+            compatible = "mediatek,mt8173-disp-rdma";
+            reg = <0 0x1400e000 0 0x1000>;
+            interrupts = <GIC_SPI 182 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_RDMA0>;
+            iommus = <&iommu M4U_PORT_DISP_RDMA0>;
+            mediatek,rdma-fifo-size = <8192>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1400XXXX 0xe000 0x1000>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
index 4f08e89c1067..35ace1f322e8 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
@@ -49,10 +49,17 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
 
-    split0: split@14018000 {
-        compatible = "mediatek,mt8173-disp-split";
-        reg = <0 0x14018000 0 0x1000>;
-        power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_SPLIT0>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        split0: split@14018000 {
+            compatible = "mediatek,mt8173-disp-split";
+            reg = <0 0x14018000 0 0x1000>;
+            power-domains = <&spm MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_SPLIT0>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml
index 6e8748529e73..b8bb135fe96b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,ufoe.yaml
@@ -51,11 +51,18 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
 
-    ufoe@1401a000 {
-        compatible = "mediatek,mt8173-disp-ufoe";
-        reg = <0 0x1401a000 0 0x1000>;
-        interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_UFOE>;
+        ufoe@1401a000 {
+            compatible = "mediatek,mt8173-disp-ufoe";
+            reg = <0 0x1401a000 0 0x1000>;
+            interrupts = <GIC_SPI 191 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_UFOE>;
+        };
     };
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml
index f9f00a518edf..7d7cc1ab526b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,wdma.yaml
@@ -64,13 +64,23 @@ additionalProperties: false
 
 examples:
   - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/clock/mt8173-clk.h>
+    #include <dt-bindings/power/mt8173-power.h>
+    #include <dt-bindings/gce/mt8173-gce.h>
+    #include <dt-bindings/memory/mt8173-larb-port.h>
 
-    wdma0: wdma@14011000 {
-        compatible = "mediatek,mt8173-disp-wdma";
-        reg = <0 0x14011000 0 0x1000>;
-        interrupts = <GIC_SPI 185 IRQ_TYPE_LEVEL_LOW>;
-        power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-        clocks = <&mmsys CLK_MM_DISP_WDMA0>;
-        iommus = <&iommu M4U_PORT_DISP_WDMA0>;
-        mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x1000 0x1000>;
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        wdma0: wdma@14011000 {
+            compatible = "mediatek,mt8173-disp-wdma";
+            reg = <0 0x14011000 0 0x1000>;
+            interrupts = <GIC_SPI 185 IRQ_TYPE_LEVEL_LOW>;
+            power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
+            clocks = <&mmsys CLK_MM_DISP_WDMA0>;
+            iommus = <&iommu M4U_PORT_DISP_WDMA0>;
+            mediatek,gce-client-reg = <&gce SUBSYS_1401XXXX 0x1000 0x1000>;
+        };
     };

From 09a2fb41ba67dcb45f259efd1d2baafe4a6be1a7 Mon Sep 17 00:00:00 2001
From: Biao Huang <biao.huang@mediatek.com>
Date: Thu, 24 Mar 2022 09:21:12 +0800
Subject: [PATCH 224/229] dt-bindings: net: snps,dwmac: modify available values
 of PBL

PBL can be any of the following values: 1, 2, 4, 8, 16 or 32
according to the datasheet, so modify available values of PBL in
snps,dwmac.yaml.

Signed-off-by: Biao Huang <biao.huang@mediatek.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220324012112.7016-2-biao.huang@mediatek.com
---
 Documentation/devicetree/bindings/net/snps,dwmac.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
index 7eb43707e601..2d5248f5b919 100644
--- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -340,21 +340,21 @@ allOf:
           description:
             Programmable Burst Length (tx and rx)
           $ref: /schemas/types.yaml#/definitions/uint32
-          enum: [2, 4, 8]
+          enum: [1, 2, 4, 8, 16, 32]
 
         snps,txpbl:
           description:
             Tx Programmable Burst Length. If set, DMA tx will use this
             value rather than snps,pbl.
           $ref: /schemas/types.yaml#/definitions/uint32
-          enum: [2, 4, 8]
+          enum: [1, 2, 4, 8, 16, 32]
 
         snps,rxpbl:
           description:
             Rx Programmable Burst Length. If set, DMA rx will use this
             value rather than snps,pbl.
           $ref: /schemas/types.yaml#/definitions/uint32
-          enum: [2, 4, 8]
+          enum: [1, 2, 4, 8, 16, 32]
 
         snps,no-pbl-x8:
           $ref: /schemas/types.yaml#/definitions/flag

From a50e431bbc6fc5768ed26be5fab5b149b7b8b1fe Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 1 Mar 2022 17:35:00 -0600
Subject: [PATCH 225/229] dt-bindings: media: mediatek,vcodec: Fix addressing
 cell sizes

'dma-ranges' in the example is written for cell sizes of 2 cells, but
the schema and example specify sizes of 1 cell. As the h/w has a bus
address of >32-bits, cell sizes of 2 is correct. Update the schema's
'#address-cells' and '#size-cells' to be 2 and adjust the example
throughout.

There's no error currently because dtc only checks 'dma-ranges' is a
correct multiple number of cells (3) and the schema checking is based on
bracketing of entries.

Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20220301233501.2110047-1-robh@kernel.org
---
 .../media/mediatek,vcodec-subdev-decoder.yaml | 120 +++++++++---------
 1 file changed, 63 insertions(+), 57 deletions(-)

diff --git a/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml b/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
index d587fc3e39fb..7687be0f50aa 100644
--- a/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
+++ b/Documentation/devicetree/bindings/media/mediatek,vcodec-subdev-decoder.yaml
@@ -72,10 +72,10 @@ properties:
       Describes the physical address space of IOMMU maps to memory.
 
   "#address-cells":
-    const: 1
+    const: 2
 
   "#size-cells":
-    const: 1
+    const: 2
 
   ranges: true
 
@@ -205,61 +205,67 @@ examples:
     #include <dt-bindings/clock/mt8192-clk.h>
     #include <dt-bindings/power/mt8192-power.h>
 
-    video-codec@16000000 {
-        compatible = "mediatek,mt8192-vcodec-dec";
-        mediatek,scp = <&scp>;
-        iommus = <&iommu0 M4U_PORT_L4_VDEC_MC_EXT>;
-        dma-ranges = <0x1 0x0 0x0 0x40000000 0x0 0xfff00000>;
-        #address-cells = <1>;
-        #size-cells = <1>;
-        ranges = <0 0x16000000 0x40000>;
-        reg = <0x16000000 0x1000>;		/* VDEC_SYS */
-        vcodec-lat@10000 {
-            compatible = "mediatek,mtk-vcodec-lat";
-            reg = <0x10000 0x800>;
-            interrupts = <GIC_SPI 426 IRQ_TYPE_LEVEL_HIGH 0>;
-            iommus = <&iommu0 M4U_PORT_L5_VDEC_LAT0_VLD_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_VLD2_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_AVC_MV_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_PRED_RD_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_TILE_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_WDMA_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_LAT0_RG_CTRL_DMA_EXT>,
-                <&iommu0 M4U_PORT_L5_VDEC_UFO_ENC_EXT>;
-            clocks = <&topckgen CLK_TOP_VDEC_SEL>,
-                <&vdecsys_soc CLK_VDEC_SOC_VDEC>,
-                <&vdecsys_soc CLK_VDEC_SOC_LAT>,
-                <&vdecsys_soc CLK_VDEC_SOC_LARB1>,
-                <&topckgen CLK_TOP_MAINPLL_D4>;
-            clock-names = "sel", "soc-vdec", "soc-lat", "vdec", "top";
-            assigned-clocks = <&topckgen CLK_TOP_VDEC_SEL>;
-            assigned-clock-parents = <&topckgen CLK_TOP_MAINPLL_D4>;
-            power-domains = <&spm MT8192_POWER_DOMAIN_VDEC>;
-        };
+    bus@16000000 {
+        #address-cells = <2>;
+        #size-cells = <2>;
+        ranges = <0 0x16000000 0x16000000 0 0x40000>;
 
-        vcodec-core@25000 {
-            compatible = "mediatek,mtk-vcodec-core";
-            reg = <0x25000 0x1000>;
-            interrupts = <GIC_SPI 425 IRQ_TYPE_LEVEL_HIGH 0>;
-            iommus = <&iommu0 M4U_PORT_L4_VDEC_MC_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_UFO_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_PP_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_PRED_RD_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_PRED_WR_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_PPWRAP_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_TILE_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_VLD_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_VLD2_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_AVC_MV_EXT>,
-                <&iommu0 M4U_PORT_L4_VDEC_RG_CTRL_DMA_EXT>;
-            clocks = <&topckgen CLK_TOP_VDEC_SEL>,
-                <&vdecsys CLK_VDEC_VDEC>,
-                <&vdecsys CLK_VDEC_LAT>,
-                <&vdecsys CLK_VDEC_LARB1>,
-                <&topckgen CLK_TOP_MAINPLL_D4>;
-            clock-names = "sel", "soc-vdec", "soc-lat", "vdec", "top";
-            assigned-clocks = <&topckgen CLK_TOP_VDEC_SEL>;
-            assigned-clock-parents = <&topckgen CLK_TOP_MAINPLL_D4>;
-            power-domains = <&spm MT8192_POWER_DOMAIN_VDEC2>;
+        video-codec@16000000 {
+            compatible = "mediatek,mt8192-vcodec-dec";
+            mediatek,scp = <&scp>;
+            iommus = <&iommu0 M4U_PORT_L4_VDEC_MC_EXT>;
+            dma-ranges = <0x1 0x0 0x0 0x40000000 0x0 0xfff00000>;
+            #address-cells = <2>;
+            #size-cells = <2>;
+            ranges = <0 0 0 0x16000000 0 0x40000>;
+            reg = <0 0x16000000 0 0x1000>;		/* VDEC_SYS */
+            vcodec-lat@10000 {
+                compatible = "mediatek,mtk-vcodec-lat";
+                reg = <0 0x10000 0 0x800>;
+                interrupts = <GIC_SPI 426 IRQ_TYPE_LEVEL_HIGH 0>;
+                iommus = <&iommu0 M4U_PORT_L5_VDEC_LAT0_VLD_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_VLD2_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_AVC_MV_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_PRED_RD_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_TILE_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_WDMA_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_LAT0_RG_CTRL_DMA_EXT>,
+                    <&iommu0 M4U_PORT_L5_VDEC_UFO_ENC_EXT>;
+                clocks = <&topckgen CLK_TOP_VDEC_SEL>,
+                    <&vdecsys_soc CLK_VDEC_SOC_VDEC>,
+                    <&vdecsys_soc CLK_VDEC_SOC_LAT>,
+                    <&vdecsys_soc CLK_VDEC_SOC_LARB1>,
+                    <&topckgen CLK_TOP_MAINPLL_D4>;
+                clock-names = "sel", "soc-vdec", "soc-lat", "vdec", "top";
+                assigned-clocks = <&topckgen CLK_TOP_VDEC_SEL>;
+                assigned-clock-parents = <&topckgen CLK_TOP_MAINPLL_D4>;
+                power-domains = <&spm MT8192_POWER_DOMAIN_VDEC>;
+            };
+
+            vcodec-core@25000 {
+                compatible = "mediatek,mtk-vcodec-core";
+                reg = <0 0x25000 0 0x1000>;
+                interrupts = <GIC_SPI 425 IRQ_TYPE_LEVEL_HIGH 0>;
+                iommus = <&iommu0 M4U_PORT_L4_VDEC_MC_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_UFO_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_PP_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_PRED_RD_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_PRED_WR_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_PPWRAP_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_TILE_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_VLD_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_VLD2_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_AVC_MV_EXT>,
+                    <&iommu0 M4U_PORT_L4_VDEC_RG_CTRL_DMA_EXT>;
+                clocks = <&topckgen CLK_TOP_VDEC_SEL>,
+                    <&vdecsys CLK_VDEC_VDEC>,
+                    <&vdecsys CLK_VDEC_LAT>,
+                    <&vdecsys CLK_VDEC_LARB1>,
+                    <&topckgen CLK_TOP_MAINPLL_D4>;
+                clock-names = "sel", "soc-vdec", "soc-lat", "vdec", "top";
+                assigned-clocks = <&topckgen CLK_TOP_VDEC_SEL>;
+                assigned-clock-parents = <&topckgen CLK_TOP_MAINPLL_D4>;
+                power-domains = <&spm MT8192_POWER_DOMAIN_VDEC2>;
+            };
         };
     };

From 22a41e9a5044bf3519f05b4a00e99af34bfeb40c Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 25 Mar 2022 16:56:52 -0500
Subject: [PATCH 226/229] dt-bindings: Fix missing '/schemas' in $ref paths

Absolute paths in $ref should always begin with '/schemas'. The tools
mostly work with it omitted, but for correctness the path should be
everything except the hostname as that is taken from the schema's $id
value. This scheme is defined in the json-schema spec.

Cc: Hector Martin <marcan@marcan.st>
Cc: Sven Peter <sven@svenpeter.dev>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Vivien Didelot <vivien.didelot@gmail.com>
Cc: Florian Fainelli <f.fainelli@gmail.com>
Cc: Vladimir Oltean <olteanv@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Chunfeng Yun <chunfeng.yun@mediatek.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mukesh Savaliya <msavaliy@codeaurora.org>
Cc: Akash Asthana <akashast@codeaurora.org>
Cc: Bayi Cheng <bayi.cheng@mediatek.com>
Cc: Chuanhong Guo <gch981213@gmail.com>
Cc: Min Guo <min.guo@mediatek.com>
Cc: netdev@vger.kernel.org
Cc: linux-spi@vger.kernel.org
Cc: linux-usb@vger.kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Mark Brown <broonie@debian.org>
Link: https://lore.kernel.org/r/20220325215652.525383-1-robh@kernel.org
---
 Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml   | 2 +-
 Documentation/devicetree/bindings/net/dsa/dsa-port.yaml       | 2 +-
 Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml  | 2 +-
 .../devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml         | 2 +-
 Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml | 2 +-
 Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml       | 2 +-
 Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml      | 4 ++--
 Documentation/devicetree/bindings/usb/mediatek,musb.yaml      | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml b/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml
index b6b5d3a912b3..0dc957a56d35 100644
--- a/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml
+++ b/Documentation/devicetree/bindings/arm/apple/apple,pmgr.yaml
@@ -42,7 +42,7 @@ patternProperties:
     description:
       The individual power management domains within this controller
     type: object
-    $ref: /power/apple,pmgr-pwrstate.yaml#
+    $ref: /schemas/power/apple,pmgr-pwrstate.yaml#
 
 required:
   - compatible
diff --git a/Documentation/devicetree/bindings/net/dsa/dsa-port.yaml b/Documentation/devicetree/bindings/net/dsa/dsa-port.yaml
index a6b6b36a3f81..09317e16cb5d 100644
--- a/Documentation/devicetree/bindings/net/dsa/dsa-port.yaml
+++ b/Documentation/devicetree/bindings/net/dsa/dsa-port.yaml
@@ -15,7 +15,7 @@ description:
   Ethernet switch port Description
 
 allOf:
-  - $ref: "http://devicetree.org/schemas/net/ethernet-controller.yaml#"
+  - $ref: /schemas/net/ethernet-controller.yaml#
 
 properties:
   reg:
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml
index a776cd37c297..95fcb43675d6 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,geni-se.yaml
@@ -103,7 +103,7 @@ patternProperties:
                  supports up to 50MHz, up to four chip selects, programmable
                  data path from 4 bits to 32 bits and numerous protocol
                  variants.
-    $ref: /spi/spi-controller.yaml#
+    $ref: /schemas/spi/spi-controller.yaml#
 
     properties:
       compatible:
diff --git a/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml b/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml
index be3cc7faed53..41e60fe4b09f 100644
--- a/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml
+++ b/Documentation/devicetree/bindings/spi/mediatek,spi-mtk-nor.yaml
@@ -18,7 +18,7 @@ description: |
   capability of this controller.
 
 allOf:
-  - $ref: /spi/spi-controller.yaml#
+  - $ref: /schemas/spi/spi-controller.yaml#
 
 properties:
   compatible:
diff --git a/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
index 055524fe8327..5a60fba14bba 100644
--- a/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
+++ b/Documentation/devicetree/bindings/spi/qcom,spi-qcom-qspi.yaml
@@ -16,7 +16,7 @@ description: The QSPI controller allows SPI protocol communication in single,
   as NOR flash.
 
 allOf:
-  - $ref: /spi/spi-controller.yaml#
+  - $ref: /schemas/spi/spi-controller.yaml#
 
 properties:
   compatible:
diff --git a/Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml b/Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml
index fe014020da69..a3ab1a1f1eb4 100644
--- a/Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml
+++ b/Documentation/devicetree/bindings/spi/sprd,spi-adi.yaml
@@ -44,7 +44,7 @@ description: |
   compatibility.
 
 allOf:
-  - $ref: /spi/spi-controller.yaml#
+  - $ref: /schemas/spi/spi-controller.yaml#
 
 properties:
   compatible:
diff --git a/Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml b/Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml
index 77db1233516e..df766f8de872 100644
--- a/Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml
+++ b/Documentation/devicetree/bindings/usb/mediatek,mtu3.yaml
@@ -132,7 +132,7 @@ properties:
     default: host
 
   connector:
-    $ref: /connector/usb-connector.yaml#
+    $ref: /schemas/connector/usb-connector.yaml#
     description:
       Connector for dual role switch, especially for "gpio-usb-b-connector"
     type: object
@@ -191,7 +191,7 @@ properties:
 patternProperties:
   "^usb@[0-9a-f]+$":
     type: object
-    $ref: /usb/mediatek,mtk-xhci.yaml#
+    $ref: /schemas/usb/mediatek,mtk-xhci.yaml#
     description:
       The xhci should be added as subnode to mtu3 as shown in the following
       example if the host mode is enabled.
diff --git a/Documentation/devicetree/bindings/usb/mediatek,musb.yaml b/Documentation/devicetree/bindings/usb/mediatek,musb.yaml
index 03d62d60ce5f..11a33f9b1f17 100644
--- a/Documentation/devicetree/bindings/usb/mediatek,musb.yaml
+++ b/Documentation/devicetree/bindings/usb/mediatek,musb.yaml
@@ -63,7 +63,7 @@ properties:
     maxItems: 1
 
   connector:
-    $ref: /connector/usb-connector.yaml#
+    $ref: /schemas/connector/usb-connector.yaml#
     description: Connector for dual role switch
     type: object
 

From 37fcacb50be7071d146144a6c5c5bf0194b9a1cf Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 29 Mar 2022 14:56:31 +0200
Subject: [PATCH 227/229] phy: PHY_FSL_LYNX_28G should depend on
 ARCH_LAYERSCAPE

Freescale Layerscape Lynx 28G SerDes PHYs are only present on
Freescale/NXP Layerscape SoCs.

Move PHY_FSL_LYNX_28G outside the block for ARCH_MXC, as the latter
is meant for i.MX8 SoCs, which is a different family than Layerscape.
Add a dependency on ARCH_LAYERSCAPE, to prevent asking the user about
this driver when configuring a kernel without Layerscape SoC support.

Fixes: 02e2af20f4f9f2aa ("Merge tag 'char-misc-5.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc")
Fixes: 8f73b37cf3fbda67 ("phy: add support for the Layerscape SerDes 28G")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/phy/freescale/Kconfig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/phy/freescale/Kconfig b/drivers/phy/freescale/Kconfig
index 8d945211c7b4..f9c54cd02036 100644
--- a/drivers/phy/freescale/Kconfig
+++ b/drivers/phy/freescale/Kconfig
@@ -26,14 +26,15 @@ config PHY_FSL_IMX8M_PCIE
 	  Enable this to add support for the PCIE PHY as found on
 	  i.MX8M family of SOCs.
 
+endif
+
 config PHY_FSL_LYNX_28G
 	tristate "Freescale Layerscape Lynx 28G SerDes PHY support"
 	depends on OF
+	depends on ARCH_LAYERSCAPE || COMPILE_TEST
 	select GENERIC_PHY
 	help
 	  Enable this to add support for the Lynx SerDes 28G PHY as
 	  found on NXP's Layerscape platforms such as LX2160A.
 	  Used to change the protocol running on SerDes lanes at runtime.
 	  Only useful for a restricted set of Ethernet protocols.
-
-endif

From 1c24a186398f59c80adb9a967486b65c1423a59d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 29 Mar 2022 15:06:39 -0700
Subject: [PATCH 228/229] fs: fd tables have to be multiples of BITS_PER_LONG

This has always been the rule: fdtables have several bitmaps in them,
and as a result they have to be sized properly for bitmaps.  We walk
those bitmaps in chunks of 'unsigned long' in serveral cases, but even
when we don't, we use the regular kernel bitops that are defined to work
on arrays of 'unsigned long', not on some byte array.

Now, the distinction between arrays of bytes and 'unsigned long'
normally only really ends up being noticeable on big-endian systems, but
Fedor Pchelkin and Alexey Khoroshilov reported that copy_fd_bitmaps()
could be called with an argument that wasn't even a multiple of
BITS_PER_BYTE.  And then it fails to do the proper copy even on
little-endian machines.

The bug wasn't in copy_fd_bitmap(), but in sane_fdtable_size(), which
didn't actually sanitize the fdtable size sufficiently, and never made
sure it had the proper BITS_PER_LONG alignment.

That's partly because the alignment historically came not from having to
explicitly align things, but simply from previous fdtable sizes, and
from count_open_files(), which counts the file descriptors by walking
them one 'unsigned long' word at a time and thus naturally ends up doing
sizing in the proper 'chunks of unsigned long'.

But with the introduction of close_range(), we now have an external
source of "this is how many files we want to have", and so
sane_fdtable_size() needs to do a better job.

This also adds that explicit alignment to alloc_fdtable(), although
there it is mainly just for documentation at a source code level.  The
arithmetic we do there to pick a reasonable fdtable size already aligns
the result sufficiently.

In fact,clang notices that the added ALIGN() in that function doesn't
actually do anything, and does not generate any extra code for it.

It turns out that gcc ends up confusing itself by combining a previous
constant-sized shift operation with the variable-sized shift operations
in roundup_pow_of_two().  And probably due to that doesn't notice that
the ALIGN() is a no-op.  But that's a (tiny) gcc misfeature that doesn't
matter.  Having the explicit alignment makes sense, and would actually
matter on a 128-bit architecture if we ever go there.

This also adds big comments above both functions about how fdtable sizes
have to have that BITS_PER_LONG alignment.

Fixes: 60997c3d45d9 ("close_range: add CLOSE_RANGE_UNSHARE")
Reported-by: Fedor Pchelkin <aissur0002@gmail.com>
Reported-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
Link: https://lore.kernel.org/all/20220326114009.1690-1-aissur0002@gmail.com/
Tested-and-acked-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/fs/file.c b/fs/file.c
index 97d212a9b814..c01c29417ae6 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -87,6 +87,21 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
 }
 
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
 static struct fdtable * alloc_fdtable(unsigned int nr)
 {
 	struct fdtable *fdt;
@@ -102,6 +117,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
+	nr = ALIGN(nr, BITS_PER_LONG);
 	/*
 	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
 	 * had been set lower between the check in expand_files() and here.  Deal
@@ -269,11 +285,25 @@ static unsigned int count_open_files(struct fdtable *fdt)
 	return i;
 }
 
+/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Becuase that's really what it is.
+ */
 static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
 {
 	unsigned int count;
 
 	count = count_open_files(fdt);
+	max_fds = ALIGN(max_fds, BITS_PER_LONG);
 	if (max_fds < NR_OPEN_DEFAULT)
 		max_fds = NR_OPEN_DEFAULT;
 	return min(count, max_fds);

From d888c83fcec75194a8a48ccd283953bdba7b2550 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 29 Mar 2022 23:29:18 -0700
Subject: [PATCH 229/229] fs: fix fd table size alignment properly

Jason Donenfeld reports that my commit 1c24a186398f ("fs: fd tables have
to be multiples of BITS_PER_LONG") doesn't work, and the reason is an
embarrassing brown-paper-bag bug.

Yes, we want to align the number of fds to BITS_PER_LONG, and yes, the
reason they might not be aligned is because the incoming 'max_fd'
argument might not be aligned.

But aligining the argument - while simple - will cause a "infinitely
big" maxfd (eg NR_OPEN_MAX) to just overflow to zero.  Which most
definitely isn't what we want either.

The obvious fix was always just to do the alignment last, but I had
moved it earlier just to make the patch smaller and the code look
simpler.  Duh.  It certainly made _me_ look simple.

Fixes: 1c24a186398f ("fs: fd tables have to be multiples of BITS_PER_LONG")
Reported-and-tested-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Fedor Pchelkin <aissur0002@gmail.com>
Cc: Alexey Khoroshilov <khoroshilov@ispras.ru>
Cc: Christian Brauner <brauner@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/file.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index c01c29417ae6..ee9317346702 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -303,10 +303,9 @@ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
 	unsigned int count;
 
 	count = count_open_files(fdt);
-	max_fds = ALIGN(max_fds, BITS_PER_LONG);
 	if (max_fds < NR_OPEN_DEFAULT)
 		max_fds = NR_OPEN_DEFAULT;
-	return min(count, max_fds);
+	return ALIGN(min(count, max_fds), BITS_PER_LONG);
 }
 
 /*