From d98793b5d4256faae76177178456214f55bc7083 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 27 Oct 2020 14:34:09 -0700
Subject: [PATCH 001/230] dmaengine: idxd: fix wq config registers offset
 programming

DSA spec v1.1 [1] updated to include a stride size register for WQ
configuration that will specify how much space is reserved for the WQ
configuration register set. This change is expected to be in the final
gen1 DSA hardware. Fix the driver to use WQCFG_OFFSET() for all WQ
offset calculation and fixup WQCFG_OFFSET() to use the new calculated
wq size.

[1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html

Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators")
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/160383444959.48058.14249265538404901781.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/device.c    | 29 ++++++++++++++---------------
 drivers/dma/idxd/idxd.h      |  3 ++-
 drivers/dma/idxd/init.c      |  5 +++++
 drivers/dma/idxd/registers.h | 23 ++++++++++++++++++++++-
 4 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 200b9109cacf..506ac85c4a7f 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -295,7 +295,7 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq)
 	int i, wq_offset;
 
 	lockdep_assert_held(&idxd->dev_lock);
-	memset(&wq->wqcfg, 0, sizeof(wq->wqcfg));
+	memset(wq->wqcfg, 0, idxd->wqcfg_size);
 	wq->type = IDXD_WQT_NONE;
 	wq->size = 0;
 	wq->group = NULL;
@@ -304,8 +304,8 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq)
 	clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
 	memset(wq->name, 0, WQ_NAME_SIZE);
 
-	for (i = 0; i < 8; i++) {
-		wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32);
+	for (i = 0; i < WQCFG_STRIDES(idxd); i++) {
+		wq_offset = WQCFG_OFFSET(idxd, wq->id, i);
 		iowrite32(0, idxd->reg_base + wq_offset);
 		dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n",
 			wq->id, i, wq_offset,
@@ -539,10 +539,10 @@ static int idxd_wq_config_write(struct idxd_wq *wq)
 	if (!wq->group)
 		return 0;
 
-	memset(&wq->wqcfg, 0, sizeof(union wqcfg));
+	memset(wq->wqcfg, 0, idxd->wqcfg_size);
 
 	/* byte 0-3 */
-	wq->wqcfg.wq_size = wq->size;
+	wq->wqcfg->wq_size = wq->size;
 
 	if (wq->size == 0) {
 		dev_warn(dev, "Incorrect work queue size: 0\n");
@@ -550,22 +550,21 @@ static int idxd_wq_config_write(struct idxd_wq *wq)
 	}
 
 	/* bytes 4-7 */
-	wq->wqcfg.wq_thresh = wq->threshold;
+	wq->wqcfg->wq_thresh = wq->threshold;
 
 	/* byte 8-11 */
-	wq->wqcfg.priv = !!(wq->type == IDXD_WQT_KERNEL);
-	wq->wqcfg.mode = 1;
-
-	wq->wqcfg.priority = wq->priority;
+	wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL);
+	wq->wqcfg->mode = 1;
+	wq->wqcfg->priority = wq->priority;
 
 	/* bytes 12-15 */
-	wq->wqcfg.max_xfer_shift = ilog2(wq->max_xfer_bytes);
-	wq->wqcfg.max_batch_shift = ilog2(wq->max_batch_size);
+	wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes);
+	wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size);
 
 	dev_dbg(dev, "WQ %d CFGs\n", wq->id);
-	for (i = 0; i < 8; i++) {
-		wq_offset = idxd->wqcfg_offset + wq->id * 32 + i * sizeof(u32);
-		iowrite32(wq->wqcfg.bits[i], idxd->reg_base + wq_offset);
+	for (i = 0; i < WQCFG_STRIDES(idxd); i++) {
+		wq_offset = WQCFG_OFFSET(idxd, wq->id, i);
+		iowrite32(wq->wqcfg->bits[i], idxd->reg_base + wq_offset);
 		dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n",
 			wq->id, i, wq_offset,
 			ioread32(idxd->reg_base + wq_offset));
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index c64df197e724..d48f193daacc 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -103,7 +103,7 @@ struct idxd_wq {
 	u32 priority;
 	enum idxd_wq_state state;
 	unsigned long flags;
-	union wqcfg wqcfg;
+	union wqcfg *wqcfg;
 	u32 vec_ptr;		/* interrupt steering */
 	struct dsa_hw_desc **hw_descs;
 	int num_descs;
@@ -183,6 +183,7 @@ struct idxd_device {
 	int max_wq_size;
 	int token_limit;
 	int nr_tokens;		/* non-reserved tokens */
+	unsigned int wqcfg_size;
 
 	union sw_err_reg sw_err;
 	wait_queue_head_t cmd_waitq;
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 11e5ce168177..0a4432b063b5 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -178,6 +178,9 @@ static int idxd_setup_internals(struct idxd_device *idxd)
 		wq->idxd_cdev.minor = -1;
 		wq->max_xfer_bytes = idxd->max_xfer_bytes;
 		wq->max_batch_size = idxd->max_batch_size;
+		wq->wqcfg = devm_kzalloc(dev, idxd->wqcfg_size, GFP_KERNEL);
+		if (!wq->wqcfg)
+			return -ENOMEM;
 	}
 
 	for (i = 0; i < idxd->max_engines; i++) {
@@ -251,6 +254,8 @@ static void idxd_read_caps(struct idxd_device *idxd)
 	dev_dbg(dev, "total workqueue size: %u\n", idxd->max_wq_size);
 	idxd->max_wqs = idxd->hw.wq_cap.num_wqs;
 	dev_dbg(dev, "max workqueues: %u\n", idxd->max_wqs);
+	idxd->wqcfg_size = 1 << (idxd->hw.wq_cap.wqcfg_size + IDXD_WQCFG_MIN);
+	dev_dbg(dev, "wqcfg size: %u\n", idxd->wqcfg_size);
 
 	/* reading operation capabilities */
 	for (i = 0; i < 4; i++) {
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index a39e7ae6b3d9..aef5a902829e 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -43,7 +43,8 @@ union wq_cap_reg {
 	struct {
 		u64 total_wq_size:16;
 		u64 num_wqs:8;
-		u64 rsvd:24;
+		u64 wqcfg_size:4;
+		u64 rsvd:20;
 		u64 shared_mode:1;
 		u64 dedicated_mode:1;
 		u64 rsvd2:1;
@@ -55,6 +56,7 @@ union wq_cap_reg {
 	u64 bits;
 } __packed;
 #define IDXD_WQCAP_OFFSET		0x20
+#define IDXD_WQCFG_MIN			5
 
 union group_cap_reg {
 	struct {
@@ -333,4 +335,23 @@ union wqcfg {
 	};
 	u32 bits[8];
 } __packed;
+
+/*
+ * This macro calculates the offset into the WQCFG register
+ * idxd - struct idxd *
+ * n - wq id
+ * ofs - the index of the 32b dword for the config register
+ *
+ * The WQCFG register block is divided into groups per each wq. The n index
+ * allows us to move to the register group that's for that particular wq.
+ * Each register is 32bits. The ofs gives us the number of register to access.
+ */
+#define WQCFG_OFFSET(_idxd_dev, n, ofs) \
+({\
+	typeof(_idxd_dev) __idxd_dev = (_idxd_dev);	\
+	(__idxd_dev)->wqcfg_offset + (n) * (__idxd_dev)->wqcfg_size + sizeof(u32) * (ofs);	\
+})
+
+#define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32))
+
 #endif

From 8145dce88a788f178dfe19376e5d8645d1b87e05 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 8 Oct 2020 09:18:28 -0500
Subject: [PATCH 002/230] dmaengine: stm32-mdma: Use struct_size() in kzalloc()

Make use of the new struct_size() helper instead of the offsetof() idiom.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20201008141828.GA20325@embeddedor
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-mdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c
index 08cfbfab837b..29e5e30524bb 100644
--- a/drivers/dma/stm32-mdma.c
+++ b/drivers/dma/stm32-mdma.c
@@ -339,7 +339,7 @@ static struct stm32_mdma_desc *stm32_mdma_alloc_desc(
 	struct stm32_mdma_desc *desc;
 	int i;
 
-	desc = kzalloc(offsetof(typeof(*desc), node[count]), GFP_NOWAIT);
+	desc = kzalloc(struct_size(desc, node, count), GFP_NOWAIT);
 	if (!desc)
 		return NULL;
 

From dafd8fe27a9999ef14094dddfd5aaccd49750e1d Mon Sep 17 00:00:00 2001
From: Surendran K <surendran.k@samsung.com>
Date: Fri, 16 Oct 2020 16:03:47 +0530
Subject: [PATCH 003/230] dmaengine: pl330: Remove unreachable code

_setup_req(..) never returns negative value.
Hence the condition ret < 0 is never met

Signed-off-by: Surendran K <surendran.k@samsung.com>
Link: https://lore.kernel.org/r/20201016103347.63084-1-surendran.k@samsung.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/pl330.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c
index e9f0101d92fa..8355586c9788 100644
--- a/drivers/dma/pl330.c
+++ b/drivers/dma/pl330.c
@@ -1527,8 +1527,6 @@ static int pl330_submit_req(struct pl330_thread *thrd,
 
 	/* First dry run to check if req is acceptable */
 	ret = _setup_req(pl330, 1, thrd, idx, &xs);
-	if (ret < 0)
-		goto xfer_exit;
 
 	if (ret > pl330->mcbufsz / 2) {
 		dev_info(pl330->ddma.dev, "%s:%d Try increasing mcbufsz (%i/%i)\n",

From cb0362b6ec537619b842b54e794ae8a87a5122bb Mon Sep 17 00:00:00 2001
From: Eugen Hristev <eugen.hristev@microchip.com>
Date: Fri, 16 Oct 2020 11:17:51 +0300
Subject: [PATCH 004/230] dt-bindings: dmaengine: at_xdmac: add compatible with
 microchip,sama7g5

Add compatible to sama7g5 SoC.

Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201016081754.288488-1-eugen.hristev@microchip.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/devicetree/bindings/dma/atmel-xdma.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/dma/atmel-xdma.txt b/Documentation/devicetree/bindings/dma/atmel-xdma.txt
index 4dc398e1a371..510b7f25ba24 100644
--- a/Documentation/devicetree/bindings/dma/atmel-xdma.txt
+++ b/Documentation/devicetree/bindings/dma/atmel-xdma.txt
@@ -2,7 +2,8 @@
 
 * XDMA Controller
 Required properties:
-- compatible: Should be "atmel,sama5d4-dma" or "microchip,sam9x60-dma".
+- compatible: Should be "atmel,sama5d4-dma", "microchip,sam9x60-dma" or
+  "microchip,sama7g5-dma".
 - reg: Should contain DMA registers location and length.
 - interrupts: Should contain DMA interrupt.
 - #dma-cells: Must be <1>, used to represent the number of integer cells in

From 60f88c031d94f906f01df1cb155f7a866812a716 Mon Sep 17 00:00:00 2001
From: Eugen Hristev <eugen.hristev@microchip.com>
Date: Fri, 16 Oct 2020 12:37:25 +0300
Subject: [PATCH 005/230] dmaengine: at_xdmac: adapt perid for mem2mem
 operations

The PERID in the CC register for mem2mem operations must match an unused
PERID.
The PERID field is 7 bits, but the selected value is 0x3f.
On later products we can have more reserved PERIDs for actual peripherals,
thus this needs to be increased to maximum size.
Changing the value to 0x7f, which is the maximum for 7 bits field.

Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
Reviewed-by: Tudor Ambarus <tudor.ambarus@microchip.com>
Link: https://lore.kernel.org/r/20201016093725.289880-1-eugen.hristev@microchip.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/at_xdmac.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 3b53115db268..9f8fc9271a74 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -867,7 +867,7 @@ at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
 	 */
-	u32			chan_cc = AT_XDMAC_CC_PERID(0x3f)
+	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
 					| AT_XDMAC_CC_DIF(0)
 					| AT_XDMAC_CC_SIF(0)
 					| AT_XDMAC_CC_MBSIZE_SIXTEEN
@@ -1049,7 +1049,7 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
 	 */
-	u32			chan_cc = AT_XDMAC_CC_PERID(0x3f)
+	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
 					| AT_XDMAC_CC_DAM_INCREMENTED_AM
 					| AT_XDMAC_CC_SAM_INCREMENTED_AM
 					| AT_XDMAC_CC_DIF(0)
@@ -1155,7 +1155,7 @@ static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
 	 */
-	u32			chan_cc = AT_XDMAC_CC_PERID(0x3f)
+	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
 					| AT_XDMAC_CC_DAM_UBS_AM
 					| AT_XDMAC_CC_SAM_INCREMENTED_AM
 					| AT_XDMAC_CC_DIF(0)

From 2bec35a529b7d3bf28ee0d90ec157560fdf8d4e4 Mon Sep 17 00:00:00 2001
From: Eugen Hristev <eugen.hristev@microchip.com>
Date: Fri, 16 Oct 2020 12:38:50 +0300
Subject: [PATCH 006/230] dmaengine: at_xdmac: add support for sama7g5 based
 at_xdmac

SAMA7G5 SoC uses a slightly different variant of the AT_XDMAC.
Added support by a new compatible and a layout struct that copes
to the specific version considering the compatible string.
Only the differences in register map are present in the layout struct.
I reworked the register access for this part that has the differences.
Also the Source/Destination Interface bits are no longer valid for this
variant of the XDMAC. Thus, the layout also has a bool for specifying
whether these bits are required or not.

Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
Link: https://lore.kernel.org/r/20201016093850.290053-1-eugen.hristev@microchip.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/at_xdmac.c | 110 +++++++++++++++++++++++++++++++----------
 1 file changed, 84 insertions(+), 26 deletions(-)

diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 9f8fc9271a74..5a5c621206f2 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -38,13 +38,6 @@
 #define AT_XDMAC_GE		0x1C	/* Global Channel Enable Register */
 #define AT_XDMAC_GD		0x20	/* Global Channel Disable Register */
 #define AT_XDMAC_GS		0x24	/* Global Channel Status Register */
-#define AT_XDMAC_GRS		0x28	/* Global Channel Read Suspend Register */
-#define AT_XDMAC_GWS		0x2C	/* Global Write Suspend Register */
-#define AT_XDMAC_GRWS		0x30	/* Global Channel Read Write Suspend Register */
-#define AT_XDMAC_GRWR		0x34	/* Global Channel Read Write Resume Register */
-#define AT_XDMAC_GSWR		0x38	/* Global Channel Software Request Register */
-#define AT_XDMAC_GSWS		0x3C	/* Global channel Software Request Status Register */
-#define AT_XDMAC_GSWF		0x40	/* Global Channel Software Flush Request Register */
 #define AT_XDMAC_VERSION	0xFFC	/* XDMAC Version Register */
 
 /* Channel relative registers offsets */
@@ -150,8 +143,6 @@
 #define AT_XDMAC_CSUS		0x30	/* Channel Source Microblock Stride */
 #define AT_XDMAC_CDUS		0x34	/* Channel Destination Microblock Stride */
 
-#define AT_XDMAC_CHAN_REG_BASE	0x50	/* Channel registers base address */
-
 /* Microblock control members */
 #define AT_XDMAC_MBR_UBC_UBLEN_MAX	0xFFFFFFUL	/* Maximum Microblock Length */
 #define AT_XDMAC_MBR_UBC_NDE		(0x1 << 24)	/* Next Descriptor Enable */
@@ -179,6 +170,27 @@ enum atc_status {
 	AT_XDMAC_CHAN_IS_PAUSED,
 };
 
+struct at_xdmac_layout {
+	/* Global Channel Read Suspend Register */
+	u8				grs;
+	/* Global Write Suspend Register */
+	u8				gws;
+	/* Global Channel Read Write Suspend Register */
+	u8				grws;
+	/* Global Channel Read Write Resume Register */
+	u8				grwr;
+	/* Global Channel Software Request Register */
+	u8				gswr;
+	/* Global channel Software Request Status Register */
+	u8				gsws;
+	/* Global Channel Software Flush Request Register */
+	u8				gswf;
+	/* Channel reg base */
+	u8				chan_cc_reg_base;
+	/* Source/Destination Interface must be specified or not */
+	bool				sdif;
+};
+
 /* ----- Channels ----- */
 struct at_xdmac_chan {
 	struct dma_chan			chan;
@@ -212,6 +224,7 @@ struct at_xdmac {
 	struct clk		*clk;
 	u32			save_gim;
 	struct dma_pool		*at_xdmac_desc_pool;
+	const struct at_xdmac_layout	*layout;
 	struct at_xdmac_chan	chan[];
 };
 
@@ -244,9 +257,33 @@ struct at_xdmac_desc {
 	struct list_head		xfer_node;
 } __aligned(sizeof(u64));
 
+static const struct at_xdmac_layout at_xdmac_sama5d4_layout = {
+	.grs = 0x28,
+	.gws = 0x2C,
+	.grws = 0x30,
+	.grwr = 0x34,
+	.gswr = 0x38,
+	.gsws = 0x3C,
+	.gswf = 0x40,
+	.chan_cc_reg_base = 0x50,
+	.sdif = true,
+};
+
+static const struct at_xdmac_layout at_xdmac_sama7g5_layout = {
+	.grs = 0x30,
+	.gws = 0x38,
+	.grws = 0x40,
+	.grwr = 0x44,
+	.gswr = 0x48,
+	.gsws = 0x4C,
+	.gswf = 0x50,
+	.chan_cc_reg_base = 0x60,
+	.sdif = false,
+};
+
 static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb)
 {
-	return atxdmac->regs + (AT_XDMAC_CHAN_REG_BASE + chan_nb * 0x40);
+	return atxdmac->regs + (atxdmac->layout->chan_cc_reg_base + chan_nb * 0x40);
 }
 
 #define at_xdmac_read(atxdmac, reg) readl_relaxed((atxdmac)->regs + (reg))
@@ -345,8 +382,10 @@ static void at_xdmac_start_xfer(struct at_xdmac_chan *atchan,
 	first->active_xfer = true;
 
 	/* Tell xdmac where to get the first descriptor. */
-	reg = AT_XDMAC_CNDA_NDA(first->tx_dma_desc.phys)
-	      | AT_XDMAC_CNDA_NDAIF(atchan->memif);
+	reg = AT_XDMAC_CNDA_NDA(first->tx_dma_desc.phys);
+	if (atxdmac->layout->sdif)
+		reg |= AT_XDMAC_CNDA_NDAIF(atchan->memif);
+
 	at_xdmac_chan_write(atchan, AT_XDMAC_CNDA, reg);
 
 	/*
@@ -541,6 +580,7 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan,
 				      enum dma_transfer_direction direction)
 {
 	struct at_xdmac_chan	*atchan = to_at_xdmac_chan(chan);
+	struct at_xdmac		*atxdmac = to_at_xdmac(atchan->chan.device);
 	int			csize, dwidth;
 
 	if (direction == DMA_DEV_TO_MEM) {
@@ -548,12 +588,14 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan,
 			AT91_XDMAC_DT_PERID(atchan->perid)
 			| AT_XDMAC_CC_DAM_INCREMENTED_AM
 			| AT_XDMAC_CC_SAM_FIXED_AM
-			| AT_XDMAC_CC_DIF(atchan->memif)
-			| AT_XDMAC_CC_SIF(atchan->perif)
 			| AT_XDMAC_CC_SWREQ_HWR_CONNECTED
 			| AT_XDMAC_CC_DSYNC_PER2MEM
 			| AT_XDMAC_CC_MBSIZE_SIXTEEN
 			| AT_XDMAC_CC_TYPE_PER_TRAN;
+		if (atxdmac->layout->sdif)
+			atchan->cfg |= AT_XDMAC_CC_DIF(atchan->memif) |
+				       AT_XDMAC_CC_SIF(atchan->perif);
+
 		csize = ffs(atchan->sconfig.src_maxburst) - 1;
 		if (csize < 0) {
 			dev_err(chan2dev(chan), "invalid src maxburst value\n");
@@ -571,12 +613,14 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan,
 			AT91_XDMAC_DT_PERID(atchan->perid)
 			| AT_XDMAC_CC_DAM_FIXED_AM
 			| AT_XDMAC_CC_SAM_INCREMENTED_AM
-			| AT_XDMAC_CC_DIF(atchan->perif)
-			| AT_XDMAC_CC_SIF(atchan->memif)
 			| AT_XDMAC_CC_SWREQ_HWR_CONNECTED
 			| AT_XDMAC_CC_DSYNC_MEM2PER
 			| AT_XDMAC_CC_MBSIZE_SIXTEEN
 			| AT_XDMAC_CC_TYPE_PER_TRAN;
+		if (atxdmac->layout->sdif)
+			atchan->cfg |= AT_XDMAC_CC_DIF(atchan->perif) |
+				       AT_XDMAC_CC_SIF(atchan->memif);
+
 		csize = ffs(atchan->sconfig.dst_maxburst) - 1;
 		if (csize < 0) {
 			dev_err(chan2dev(chan), "invalid src maxburst value\n");
@@ -866,10 +910,12 @@ at_xdmac_interleaved_queue_desc(struct dma_chan *chan,
 	 * ERRATA: Even if useless for memory transfers, the PERID has to not
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
+	 * For SAMA7G5x case, the SIF and DIF fields are no longer used.
+	 * Thus, no need to have the SIF/DIF interfaces here.
+	 * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as
+	 * zero.
 	 */
 	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
-					| AT_XDMAC_CC_DIF(0)
-					| AT_XDMAC_CC_SIF(0)
 					| AT_XDMAC_CC_MBSIZE_SIXTEEN
 					| AT_XDMAC_CC_TYPE_MEM_TRAN;
 
@@ -1048,12 +1094,14 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	 * ERRATA: Even if useless for memory transfers, the PERID has to not
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
+	 * For SAMA7G5x case, the SIF and DIF fields are no longer used.
+	 * Thus, no need to have the SIF/DIF interfaces here.
+	 * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as
+	 * zero.
 	 */
 	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
 					| AT_XDMAC_CC_DAM_INCREMENTED_AM
 					| AT_XDMAC_CC_SAM_INCREMENTED_AM
-					| AT_XDMAC_CC_DIF(0)
-					| AT_XDMAC_CC_SIF(0)
 					| AT_XDMAC_CC_MBSIZE_SIXTEEN
 					| AT_XDMAC_CC_TYPE_MEM_TRAN;
 	unsigned long		irqflags;
@@ -1154,12 +1202,14 @@ static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan,
 	 * ERRATA: Even if useless for memory transfers, the PERID has to not
 	 * match the one of another channel. If not, it could lead to spurious
 	 * flag status.
+	 * For SAMA7G5x case, the SIF and DIF fields are no longer used.
+	 * Thus, no need to have the SIF/DIF interfaces here.
+	 * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as
+	 * zero.
 	 */
 	u32			chan_cc = AT_XDMAC_CC_PERID(0x7f)
 					| AT_XDMAC_CC_DAM_UBS_AM
 					| AT_XDMAC_CC_SAM_INCREMENTED_AM
-					| AT_XDMAC_CC_DIF(0)
-					| AT_XDMAC_CC_SIF(0)
 					| AT_XDMAC_CC_MBSIZE_SIXTEEN
 					| AT_XDMAC_CC_MEMSET_HW_MODE
 					| AT_XDMAC_CC_TYPE_MEM_TRAN;
@@ -1438,7 +1488,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 	mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC;
 	value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM;
 	if ((desc->lld.mbr_cfg & mask) == value) {
-		at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask);
+		at_xdmac_write(atxdmac, atxdmac->layout->gswf, atchan->mask);
 		while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS))
 			cpu_relax();
 	}
@@ -1496,7 +1546,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
 	 * FIFO flush ensures that data are really written.
 	 */
 	if ((desc->lld.mbr_cfg & mask) == value) {
-		at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask);
+		at_xdmac_write(atxdmac, atxdmac->layout->gswf, atchan->mask);
 		while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS))
 			cpu_relax();
 	}
@@ -1761,7 +1811,7 @@ static int at_xdmac_device_pause(struct dma_chan *chan)
 		return 0;
 
 	spin_lock_irqsave(&atchan->lock, flags);
-	at_xdmac_write(atxdmac, AT_XDMAC_GRWS, atchan->mask);
+	at_xdmac_write(atxdmac, atxdmac->layout->grws, atchan->mask);
 	while (at_xdmac_chan_read(atchan, AT_XDMAC_CC)
 	       & (AT_XDMAC_CC_WRIP | AT_XDMAC_CC_RDIP))
 		cpu_relax();
@@ -1784,7 +1834,7 @@ static int at_xdmac_device_resume(struct dma_chan *chan)
 		return 0;
 	}
 
-	at_xdmac_write(atxdmac, AT_XDMAC_GRWR, atchan->mask);
+	at_xdmac_write(atxdmac, atxdmac->layout->grwr, atchan->mask);
 	clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status);
 	spin_unlock_irqrestore(&atchan->lock, flags);
 
@@ -1986,6 +2036,10 @@ static int at_xdmac_probe(struct platform_device *pdev)
 	atxdmac->regs = base;
 	atxdmac->irq = irq;
 
+	atxdmac->layout = of_device_get_match_data(&pdev->dev);
+	if (!atxdmac->layout)
+		return -ENODEV;
+
 	atxdmac->clk = devm_clk_get(&pdev->dev, "dma_clk");
 	if (IS_ERR(atxdmac->clk)) {
 		dev_err(&pdev->dev, "can't get dma_clk\n");
@@ -2128,6 +2182,10 @@ static const struct dev_pm_ops atmel_xdmac_dev_pm_ops = {
 static const struct of_device_id atmel_xdmac_dt_ids[] = {
 	{
 		.compatible = "atmel,sama5d4-dma",
+		.data = &at_xdmac_sama5d4_layout,
+	}, {
+		.compatible = "microchip,sama7g5-dma",
+		.data = &at_xdmac_sama7g5_layout,
 	}, {
 		/* sentinel */
 	}

From f40566f220a1a7ab3ac4de8d594a4a038bace156 Mon Sep 17 00:00:00 2001
From: Eugen Hristev <eugen.hristev@microchip.com>
Date: Fri, 16 Oct 2020 12:39:18 +0300
Subject: [PATCH 007/230] dmaengine: at_xdmac: add AXI priority support and
 recommended settings

The sama7g5 version of the XDMAC supports priority configuration and
outstanding capabilities.
Add defines for the specific registers for this configuration, together
with recommended settings.
However the settings are very different if the XDMAC is a mem2mem or a
per2mem controller.
Thus, we need to differentiate according to device tree property.

Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
Link: https://lore.kernel.org/r/20201016093918.290137-1-eugen.hristev@microchip.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/at_xdmac.c | 47 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c
index 5a5c621206f2..fe45ad5d06c4 100644
--- a/drivers/dma/at_xdmac.c
+++ b/drivers/dma/at_xdmac.c
@@ -30,7 +30,24 @@
 #define		AT_XDMAC_FIFO_SZ(i)	(((i) >> 5) & 0x7FF)		/* Number of Bytes */
 #define		AT_XDMAC_NB_REQ(i)	((((i) >> 16) & 0x3F) + 1)	/* Number of Peripheral Requests Minus One */
 #define AT_XDMAC_GCFG		0x04	/* Global Configuration Register */
+#define		AT_XDMAC_WRHP(i)		(((i) & 0xF) << 4)
+#define		AT_XDMAC_WRMP(i)		(((i) & 0xF) << 8)
+#define		AT_XDMAC_WRLP(i)		(((i) & 0xF) << 12)
+#define		AT_XDMAC_RDHP(i)		(((i) & 0xF) << 16)
+#define		AT_XDMAC_RDMP(i)		(((i) & 0xF) << 20)
+#define		AT_XDMAC_RDLP(i)		(((i) & 0xF) << 24)
+#define		AT_XDMAC_RDSG(i)		(((i) & 0xF) << 28)
+#define AT_XDMAC_GCFG_M2M	(AT_XDMAC_RDLP(0xF) | AT_XDMAC_WRLP(0xF))
+#define AT_XDMAC_GCFG_P2M	(AT_XDMAC_RDSG(0x1) | AT_XDMAC_RDHP(0x3) | \
+				AT_XDMAC_WRHP(0x5))
 #define AT_XDMAC_GWAC		0x08	/* Global Weighted Arbiter Configuration Register */
+#define		AT_XDMAC_PW0(i)		(((i) & 0xF) << 0)
+#define		AT_XDMAC_PW1(i)		(((i) & 0xF) << 4)
+#define		AT_XDMAC_PW2(i)		(((i) & 0xF) << 8)
+#define		AT_XDMAC_PW3(i)		(((i) & 0xF) << 12)
+#define AT_XDMAC_GWAC_M2M	0
+#define AT_XDMAC_GWAC_P2M	(AT_XDMAC_PW0(0xF) | AT_XDMAC_PW2(0xF))
+
 #define AT_XDMAC_GIE		0x0C	/* Global Interrupt Enable Register */
 #define AT_XDMAC_GID		0x10	/* Global Interrupt Disable Register */
 #define AT_XDMAC_GIM		0x14	/* Global Interrupt Mask Register */
@@ -189,6 +206,8 @@ struct at_xdmac_layout {
 	u8				chan_cc_reg_base;
 	/* Source/Destination Interface must be specified or not */
 	bool				sdif;
+	/* AXI queue priority configuration supported */
+	bool				axi_config;
 };
 
 /* ----- Channels ----- */
@@ -267,6 +286,7 @@ static const struct at_xdmac_layout at_xdmac_sama5d4_layout = {
 	.gswf = 0x40,
 	.chan_cc_reg_base = 0x50,
 	.sdif = true,
+	.axi_config = false,
 };
 
 static const struct at_xdmac_layout at_xdmac_sama7g5_layout = {
@@ -279,6 +299,7 @@ static const struct at_xdmac_layout at_xdmac_sama7g5_layout = {
 	.gswf = 0x50,
 	.chan_cc_reg_base = 0x60,
 	.sdif = false,
+	.axi_config = true,
 };
 
 static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb)
@@ -1997,6 +2018,30 @@ static int atmel_xdmac_resume(struct device *dev)
 }
 #endif /* CONFIG_PM_SLEEP */
 
+static void at_xdmac_axi_config(struct platform_device *pdev)
+{
+	struct at_xdmac	*atxdmac = (struct at_xdmac *)platform_get_drvdata(pdev);
+	bool dev_m2m = false;
+	u32 dma_requests;
+
+	if (!atxdmac->layout->axi_config)
+		return; /* Not supported */
+
+	if (!of_property_read_u32(pdev->dev.of_node, "dma-requests",
+				  &dma_requests)) {
+		dev_info(&pdev->dev, "controller in mem2mem mode.\n");
+		dev_m2m = true;
+	}
+
+	if (dev_m2m) {
+		at_xdmac_write(atxdmac, AT_XDMAC_GCFG, AT_XDMAC_GCFG_M2M);
+		at_xdmac_write(atxdmac, AT_XDMAC_GWAC, AT_XDMAC_GWAC_M2M);
+	} else {
+		at_xdmac_write(atxdmac, AT_XDMAC_GCFG, AT_XDMAC_GCFG_P2M);
+		at_xdmac_write(atxdmac, AT_XDMAC_GWAC, AT_XDMAC_GWAC_P2M);
+	}
+}
+
 static int at_xdmac_probe(struct platform_device *pdev)
 {
 	struct at_xdmac	*atxdmac;
@@ -2141,6 +2186,8 @@ static int at_xdmac_probe(struct platform_device *pdev)
 	dev_info(&pdev->dev, "%d channels, mapped at 0x%p\n",
 		 nr_channels, atxdmac->regs);
 
+	at_xdmac_axi_config(pdev);
+
 	return 0;
 
 err_dma_unregister:

From 68f35add4ba4c850a33878633e10c02d7ba32d84 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Mon, 19 Oct 2020 17:57:55 +0200
Subject: [PATCH 008/230] dmaengine: ppc4xx: make ppc440spe_adma_chan_list
 static

The ppc440spe_adma_chan_list file-scope variable is not used outside of
the unit so it can be made static.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201019155756.21445-1-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ppc4xx/adma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c
index 71cdaaa8134c..fea598550582 100644
--- a/drivers/dma/ppc4xx/adma.c
+++ b/drivers/dma/ppc4xx/adma.c
@@ -69,7 +69,7 @@ struct ppc_dma_chan_ref {
 };
 
 /* The list of channels exported by ppc440spe ADMA */
-struct list_head
+static struct list_head
 ppc440spe_adma_chan_list = LIST_HEAD_INIT(ppc440spe_adma_chan_list);
 
 /* This flag is set when want to refetch the xor chain in the interrupt

From 212a93ca435ec847bdfe238f225dd6e8b77927e9 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Mon, 19 Oct 2020 17:57:56 +0200
Subject: [PATCH 009/230] dmaengine: ppc4xx: remove xor_hw_desc assignment
 without reading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The xor_hw_desc local variable is assigned but never read:

    drivers/dma/ppc4xx/adma.c: In function ‘ppc440spe_desc_set_src_mult’:
    drivers/dma/ppc4xx/adma.c:562:17: warning: variable ‘xor_hw_desc’ set but not used [-Wunused-but-set-variable]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201019155756.21445-2-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ppc4xx/adma.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c
index fea598550582..df7704053d91 100644
--- a/drivers/dma/ppc4xx/adma.c
+++ b/drivers/dma/ppc4xx/adma.c
@@ -559,7 +559,6 @@ static void ppc440spe_desc_set_src_mult(struct ppc440spe_adma_desc_slot *desc,
 			int sg_index, unsigned char mult_value)
 {
 	struct dma_cdb *dma_hw_desc;
-	struct xor_cb *xor_hw_desc;
 	u32 *psgu;
 
 	switch (chan->device->id) {
@@ -590,7 +589,6 @@ static void ppc440spe_desc_set_src_mult(struct ppc440spe_adma_desc_slot *desc,
 		*psgu |= cpu_to_le32(mult_value << mult_index);
 		break;
 	case PPC440SPE_XOR_ID:
-		xor_hw_desc = desc->hw_desc;
 		break;
 	default:
 		BUG();

From 8e50d392652f20616a136165dff516b86baf5e49 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 27 Oct 2020 10:34:35 -0700
Subject: [PATCH 010/230] dmaengine: idxd: Add shared workqueue support

Add shared workqueue support that includes the support of Shared Virtual
memory (SVM) or in similar terms On Demand Paging (ODP). The shared
workqueue uses the enqcmds command in kernel and will respond with retry if
the workqueue is full. Shared workqueue only works when there is PASID
support from the IOMMU.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/160382007499.3911367.26043087963708134.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/Kconfig          |  10 +++
 drivers/dma/idxd/cdev.c      |  49 +++++++++++++-
 drivers/dma/idxd/device.c    |  90 +++++++++++++++++++++++--
 drivers/dma/idxd/dma.c       |   9 ---
 drivers/dma/idxd/idxd.h      |  28 +++++++-
 drivers/dma/idxd/init.c      |  91 +++++++++++++++++++------
 drivers/dma/idxd/registers.h |   2 +
 drivers/dma/idxd/submit.c    |  35 ++++++++--
 drivers/dma/idxd/sysfs.c     | 127 +++++++++++++++++++++++++++++++++++
 9 files changed, 398 insertions(+), 43 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 518a1437862a..6a908785a5f7 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -296,6 +296,16 @@ config INTEL_IDXD
 
 	  If unsure, say N.
 
+# Config symbol that collects all the dependencies that's necessary to
+# support shared virtual memory for the devices supported by idxd.
+config INTEL_IDXD_SVM
+	bool "Accelerator Shared Virtual Memory Support"
+	depends on INTEL_IDXD
+	depends on INTEL_IOMMU_SVM
+	depends on PCI_PRI
+	depends on PCI_PASID
+	depends on PCI_IOV
+
 config INTEL_IOATDMA
 	tristate "Intel I/OAT DMA support"
 	depends on PCI && X86_64
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index c3976156db2f..010b820d8f74 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -11,6 +11,7 @@
 #include <linux/cdev.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
+#include <linux/iommu.h>
 #include <uapi/linux/idxd.h>
 #include "registers.h"
 #include "idxd.h"
@@ -32,7 +33,9 @@ static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = {
 struct idxd_user_context {
 	struct idxd_wq *wq;
 	struct task_struct *task;
+	unsigned int pasid;
 	unsigned int flags;
+	struct iommu_sva *sva;
 };
 
 enum idxd_cdev_cleanup {
@@ -75,6 +78,8 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp)
 	struct idxd_wq *wq;
 	struct device *dev;
 	int rc = 0;
+	struct iommu_sva *sva;
+	unsigned int pasid;
 
 	wq = inode_wq(inode);
 	idxd = wq->idxd;
@@ -95,6 +100,34 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp)
 
 	ctx->wq = wq;
 	filp->private_data = ctx;
+
+	if (device_pasid_enabled(idxd)) {
+		sva = iommu_sva_bind_device(dev, current->mm, NULL);
+		if (IS_ERR(sva)) {
+			rc = PTR_ERR(sva);
+			dev_err(dev, "pasid allocation failed: %d\n", rc);
+			goto failed;
+		}
+
+		pasid = iommu_sva_get_pasid(sva);
+		if (pasid == IOMMU_PASID_INVALID) {
+			iommu_sva_unbind_device(sva);
+			goto failed;
+		}
+
+		ctx->sva = sva;
+		ctx->pasid = pasid;
+
+		if (wq_dedicated(wq)) {
+			rc = idxd_wq_set_pasid(wq, pasid);
+			if (rc < 0) {
+				iommu_sva_unbind_device(sva);
+				dev_err(dev, "wq set pasid failed: %d\n", rc);
+				goto failed;
+			}
+		}
+	}
+
 	idxd_wq_get(wq);
 	mutex_unlock(&wq->wq_lock);
 	return 0;
@@ -111,13 +144,27 @@ static int idxd_cdev_release(struct inode *node, struct file *filep)
 	struct idxd_wq *wq = ctx->wq;
 	struct idxd_device *idxd = wq->idxd;
 	struct device *dev = &idxd->pdev->dev;
+	int rc;
 
 	dev_dbg(dev, "%s called\n", __func__);
 	filep->private_data = NULL;
 
 	/* Wait for in-flight operations to complete. */
-	idxd_wq_drain(wq);
+	if (wq_shared(wq)) {
+		idxd_device_drain_pasid(idxd, ctx->pasid);
+	} else {
+		if (device_pasid_enabled(idxd)) {
+			/* The wq disable in the disable pasid function will drain the wq */
+			rc = idxd_wq_disable_pasid(wq);
+			if (rc < 0)
+				dev_err(dev, "wq disable pasid failed.\n");
+		} else {
+			idxd_wq_drain(wq);
+		}
+	}
 
+	if (ctx->sva)
+		iommu_sva_unbind_device(ctx->sva);
 	kfree(ctx);
 	mutex_lock(&wq->wq_lock);
 	idxd_wq_put(wq);
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 506ac85c4a7f..2f09eb89a906 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -273,10 +273,9 @@ int idxd_wq_map_portal(struct idxd_wq *wq)
 	start = pci_resource_start(pdev, IDXD_WQ_BAR);
 	start = start + wq->id * IDXD_PORTAL_SIZE;
 
-	wq->dportal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE);
-	if (!wq->dportal)
+	wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE);
+	if (!wq->portal)
 		return -ENOMEM;
-	dev_dbg(dev, "wq %d portal mapped at %p\n", wq->id, wq->dportal);
 
 	return 0;
 }
@@ -285,7 +284,61 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq)
 {
 	struct device *dev = &wq->idxd->pdev->dev;
 
-	devm_iounmap(dev, wq->dportal);
+	devm_iounmap(dev, wq->portal);
+}
+
+int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid)
+{
+	struct idxd_device *idxd = wq->idxd;
+	int rc;
+	union wqcfg wqcfg;
+	unsigned int offset;
+	unsigned long flags;
+
+	rc = idxd_wq_disable(wq);
+	if (rc < 0)
+		return rc;
+
+	offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX);
+	spin_lock_irqsave(&idxd->dev_lock, flags);
+	wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset);
+	wqcfg.pasid_en = 1;
+	wqcfg.pasid = pasid;
+	iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset);
+	spin_unlock_irqrestore(&idxd->dev_lock, flags);
+
+	rc = idxd_wq_enable(wq);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+int idxd_wq_disable_pasid(struct idxd_wq *wq)
+{
+	struct idxd_device *idxd = wq->idxd;
+	int rc;
+	union wqcfg wqcfg;
+	unsigned int offset;
+	unsigned long flags;
+
+	rc = idxd_wq_disable(wq);
+	if (rc < 0)
+		return rc;
+
+	offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX);
+	spin_lock_irqsave(&idxd->dev_lock, flags);
+	wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset);
+	wqcfg.pasid_en = 0;
+	wqcfg.pasid = 0;
+	iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset);
+	spin_unlock_irqrestore(&idxd->dev_lock, flags);
+
+	rc = idxd_wq_enable(wq);
+	if (rc < 0)
+		return rc;
+
+	return 0;
 }
 
 void idxd_wq_disable_cleanup(struct idxd_wq *wq)
@@ -468,6 +521,17 @@ void idxd_device_reset(struct idxd_device *idxd)
 	spin_unlock_irqrestore(&idxd->dev_lock, flags);
 }
 
+void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid)
+{
+	struct device *dev = &idxd->pdev->dev;
+	u32 operand;
+
+	operand = pasid;
+	dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_DRAIN_PASID, operand);
+	idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL);
+	dev_dbg(dev, "pasid %d drained\n", pasid);
+}
+
 /* Device configuration bits */
 static void idxd_group_config_write(struct idxd_group *group)
 {
@@ -554,9 +618,21 @@ static int idxd_wq_config_write(struct idxd_wq *wq)
 
 	/* byte 8-11 */
 	wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL);
-	wq->wqcfg->mode = 1;
+	if (wq_dedicated(wq))
+		wq->wqcfg->mode = 1;
+
+	if (device_pasid_enabled(idxd)) {
+		wq->wqcfg->pasid_en = 1;
+		if (wq->type == IDXD_WQT_KERNEL && wq_dedicated(wq))
+			wq->wqcfg->pasid = idxd->pasid;
+	}
+
 	wq->wqcfg->priority = wq->priority;
 
+	if (idxd->hw.gen_cap.block_on_fault &&
+	    test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags))
+		wq->wqcfg->bof = 1;
+
 	/* bytes 12-15 */
 	wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes);
 	wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size);
@@ -664,8 +740,8 @@ static int idxd_wqs_setup(struct idxd_device *idxd)
 		if (!wq->size)
 			continue;
 
-		if (!wq_dedicated(wq)) {
-			dev_warn(dev, "No shared workqueue support.\n");
+		if (wq_shared(wq) && !device_swq_supported(idxd)) {
+			dev_warn(dev, "No shared wq support but configured.\n");
 			return -EINVAL;
 		}
 
diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c
index 0c892cbd72e0..8ed2773d8285 100644
--- a/drivers/dma/idxd/dma.c
+++ b/drivers/dma/idxd/dma.c
@@ -61,8 +61,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq,
 					 u64 addr_f1, u64 addr_f2, u64 len,
 					 u64 compl, u32 flags)
 {
-	struct idxd_device *idxd = wq->idxd;
-
 	hw->flags = flags;
 	hw->opcode = opcode;
 	hw->src_addr = addr_f1;
@@ -70,13 +68,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq,
 	hw->xfer_size = len;
 	hw->priv = !!(wq->type == IDXD_WQT_KERNEL);
 	hw->completion_addr = compl;
-
-	/*
-	 * Descriptor completion vectors are 1-8 for MSIX. We will round
-	 * robin through the 8 vectors.
-	 */
-	wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1;
-	hw->int_handle =  wq->vec_ptr;
 }
 
 static struct dma_async_tx_descriptor *
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index d48f193daacc..dcac3bb5a0d0 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -59,6 +59,7 @@ enum idxd_wq_state {
 
 enum idxd_wq_flag {
 	WQ_FLAG_DEDICATED = 0,
+	WQ_FLAG_BLOCK_ON_FAULT,
 };
 
 enum idxd_wq_type {
@@ -86,10 +87,11 @@ enum idxd_op_type {
 enum idxd_complete_type {
 	IDXD_COMPLETE_NORMAL = 0,
 	IDXD_COMPLETE_ABORT,
+	IDXD_COMPLETE_DEV_FAIL,
 };
 
 struct idxd_wq {
-	void __iomem *dportal;
+	void __iomem *portal;
 	struct device conf_dev;
 	struct idxd_cdev idxd_cdev;
 	struct idxd_device *idxd;
@@ -145,6 +147,7 @@ enum idxd_device_state {
 enum idxd_device_flag {
 	IDXD_FLAG_CONFIGURABLE = 0,
 	IDXD_FLAG_CMD_RUNNING,
+	IDXD_FLAG_PASID_ENABLED,
 };
 
 struct idxd_device {
@@ -167,6 +170,9 @@ struct idxd_device {
 	struct idxd_wq *wqs;
 	struct idxd_engine *engines;
 
+	struct iommu_sva *sva;
+	unsigned int pasid;
+
 	int num_groups;
 
 	u32 msix_perm_offset;
@@ -215,11 +221,28 @@ struct idxd_desc {
 
 extern struct bus_type dsa_bus_type;
 
+extern bool support_enqcmd;
+
 static inline bool wq_dedicated(struct idxd_wq *wq)
 {
 	return test_bit(WQ_FLAG_DEDICATED, &wq->flags);
 }
 
+static inline bool wq_shared(struct idxd_wq *wq)
+{
+	return !test_bit(WQ_FLAG_DEDICATED, &wq->flags);
+}
+
+static inline bool device_pasid_enabled(struct idxd_device *idxd)
+{
+	return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags);
+}
+
+static inline bool device_swq_supported(struct idxd_device *idxd)
+{
+	return (support_enqcmd && device_pasid_enabled(idxd));
+}
+
 enum idxd_portal_prot {
 	IDXD_PORTAL_UNLIMITED = 0,
 	IDXD_PORTAL_LIMITED,
@@ -288,6 +311,7 @@ void idxd_device_reset(struct idxd_device *idxd);
 void idxd_device_cleanup(struct idxd_device *idxd);
 int idxd_device_config(struct idxd_device *idxd);
 void idxd_device_wqs_clear_state(struct idxd_device *idxd);
+void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid);
 
 /* work queue control */
 int idxd_wq_alloc_resources(struct idxd_wq *wq);
@@ -298,6 +322,8 @@ void idxd_wq_drain(struct idxd_wq *wq);
 int idxd_wq_map_portal(struct idxd_wq *wq);
 void idxd_wq_unmap_portal(struct idxd_wq *wq);
 void idxd_wq_disable_cleanup(struct idxd_wq *wq);
+int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid);
+int idxd_wq_disable_pasid(struct idxd_wq *wq);
 
 /* submission */
 int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc);
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 0a4432b063b5..1639f3b2aa58 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -14,6 +14,8 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/device.h>
 #include <linux/idr.h>
+#include <linux/intel-svm.h>
+#include <linux/iommu.h>
 #include <uapi/linux/idxd.h>
 #include <linux/dmaengine.h>
 #include "../dmaengine.h"
@@ -26,6 +28,8 @@ MODULE_AUTHOR("Intel Corporation");
 
 #define DRV_NAME "idxd"
 
+bool support_enqcmd;
+
 static struct idr idxd_idrs[IDXD_TYPE_MAX];
 static struct mutex idxd_idr_lock;
 
@@ -53,6 +57,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
 	struct idxd_irq_entry *irq_entry;
 	int i, msixcnt;
 	int rc = 0;
+	union msix_perm mperm;
 
 	msixcnt = pci_msix_vec_count(pdev);
 	if (msixcnt < 0) {
@@ -131,6 +136,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
 
 	idxd_unmask_error_interrupts(idxd);
 
+	/* Setup MSIX permission table */
+	mperm.bits = 0;
+	mperm.pasid = idxd->pasid;
+	mperm.pasid_en = device_pasid_enabled(idxd);
+	for (i = 1; i < msixcnt; i++)
+		iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8);
+
 	return 0;
 
  err_no_irq:
@@ -265,8 +277,7 @@ static void idxd_read_caps(struct idxd_device *idxd)
 	}
 }
 
-static struct idxd_device *idxd_alloc(struct pci_dev *pdev,
-				      void __iomem * const *iomap)
+static struct idxd_device *idxd_alloc(struct pci_dev *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct idxd_device *idxd;
@@ -276,12 +287,45 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev,
 		return NULL;
 
 	idxd->pdev = pdev;
-	idxd->reg_base = iomap[IDXD_MMIO_BAR];
 	spin_lock_init(&idxd->dev_lock);
 
 	return idxd;
 }
 
+static int idxd_enable_system_pasid(struct idxd_device *idxd)
+{
+	int flags;
+	unsigned int pasid;
+	struct iommu_sva *sva;
+
+	flags = SVM_FLAG_SUPERVISOR_MODE;
+
+	sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags);
+	if (IS_ERR(sva)) {
+		dev_warn(&idxd->pdev->dev,
+			 "iommu sva bind failed: %ld\n", PTR_ERR(sva));
+		return PTR_ERR(sva);
+	}
+
+	pasid = iommu_sva_get_pasid(sva);
+	if (pasid == IOMMU_PASID_INVALID) {
+		iommu_sva_unbind_device(sva);
+		return -ENODEV;
+	}
+
+	idxd->sva = sva;
+	idxd->pasid = pasid;
+	dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid);
+	return 0;
+}
+
+static void idxd_disable_system_pasid(struct idxd_device *idxd)
+{
+
+	iommu_sva_unbind_device(idxd->sva);
+	idxd->sva = NULL;
+}
+
 static int idxd_probe(struct idxd_device *idxd)
 {
 	struct pci_dev *pdev = idxd->pdev;
@@ -292,6 +336,14 @@ static int idxd_probe(struct idxd_device *idxd)
 	idxd_device_init_reset(idxd);
 	dev_dbg(dev, "IDXD reset complete\n");
 
+	if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) {
+		rc = idxd_enable_system_pasid(idxd);
+		if (rc < 0)
+			dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc);
+		else
+			set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags);
+	}
+
 	idxd_read_caps(idxd);
 	idxd_read_table_offsets(idxd);
 
@@ -322,29 +374,29 @@ static int idxd_probe(struct idxd_device *idxd)
 	idxd_mask_error_interrupts(idxd);
 	idxd_mask_msix_vectors(idxd);
  err_setup:
+	if (device_pasid_enabled(idxd))
+		idxd_disable_system_pasid(idxd);
 	return rc;
 }
 
 static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
-	void __iomem * const *iomap;
 	struct device *dev = &pdev->dev;
 	struct idxd_device *idxd;
 	int rc;
-	unsigned int mask;
 
 	rc = pcim_enable_device(pdev);
 	if (rc)
 		return rc;
 
-	dev_dbg(dev, "Mapping BARs\n");
-	mask = (1 << IDXD_MMIO_BAR);
-	rc = pcim_iomap_regions(pdev, mask, DRV_NAME);
-	if (rc)
-		return rc;
+	dev_dbg(dev, "Alloc IDXD context\n");
+	idxd = idxd_alloc(pdev);
+	if (!idxd)
+		return -ENOMEM;
 
-	iomap = pcim_iomap_table(pdev);
-	if (!iomap)
+	dev_dbg(dev, "Mapping BARs\n");
+	idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0);
+	if (!idxd->reg_base)
 		return -ENOMEM;
 
 	dev_dbg(dev, "Set DMA masks\n");
@@ -360,11 +412,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (rc)
 		return rc;
 
-	dev_dbg(dev, "Alloc IDXD context\n");
-	idxd = idxd_alloc(pdev, iomap);
-	if (!idxd)
-		return -ENOMEM;
-
 	idxd_set_type(idxd);
 
 	dev_dbg(dev, "Set PCI master\n");
@@ -452,6 +499,8 @@ static void idxd_remove(struct pci_dev *pdev)
 	dev_dbg(&pdev->dev, "%s called\n", __func__);
 	idxd_cleanup_sysfs(idxd);
 	idxd_shutdown(pdev);
+	if (device_pasid_enabled(idxd))
+		idxd_disable_system_pasid(idxd);
 	mutex_lock(&idxd_idr_lock);
 	idr_remove(&idxd_idrs[idxd->type], idxd->id);
 	mutex_unlock(&idxd_idr_lock);
@@ -470,7 +519,7 @@ static int __init idxd_init_module(void)
 	int err, i;
 
 	/*
-	 * If the CPU does not support write512, there's no point in
+	 * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
 	 * enumerating the device. We can not utilize it.
 	 */
 	if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) {
@@ -478,8 +527,10 @@ static int __init idxd_init_module(void)
 		return -ENODEV;
 	}
 
-	pr_info("%s: Intel(R) Accelerator Devices Driver %s\n",
-		DRV_NAME, IDXD_DRIVER_VERSION);
+	if (!boot_cpu_has(X86_FEATURE_ENQCMD))
+		pr_warn("Platform does not have ENQCMD(S) support.\n");
+	else
+		support_enqcmd = true;
 
 	mutex_init(&idxd_idr_lock);
 	for (i = 0; i < IDXD_TYPE_MAX; i++)
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index aef5a902829e..1041c2779f9b 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -336,6 +336,8 @@ union wqcfg {
 	u32 bits[8];
 } __packed;
 
+#define WQCFG_PASID_IDX                2
+
 /*
  * This macro calculates the offset into the WQCFG register
  * idxd - struct idxd *
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index 156a1ee233aa..efca5d8468a6 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -11,11 +11,22 @@
 static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
 {
 	struct idxd_desc *desc;
+	struct idxd_device *idxd = wq->idxd;
 
 	desc = wq->descs[idx];
 	memset(desc->hw, 0, sizeof(struct dsa_hw_desc));
 	memset(desc->completion, 0, sizeof(struct dsa_completion_record));
 	desc->cpu = cpu;
+
+	if (device_pasid_enabled(idxd))
+		desc->hw->pasid = idxd->pasid;
+
+	/*
+	 * Descriptor completion vectors are 1-8 for MSIX. We will round
+	 * robin through the 8 vectors.
+	 */
+	wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1;
+	desc->hw->int_handle = wq->vec_ptr;
 	return desc;
 }
 
@@ -70,18 +81,32 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
 	struct idxd_device *idxd = wq->idxd;
 	int vec = desc->hw->int_handle;
 	void __iomem *portal;
+	int rc;
 
 	if (idxd->state != IDXD_DEV_ENABLED)
 		return -EIO;
 
-	portal = wq->dportal + idxd_get_wq_portal_offset(IDXD_PORTAL_UNLIMITED);
+	portal = wq->portal + idxd_get_wq_portal_offset(IDXD_PORTAL_LIMITED);
+
 	/*
-	 * The wmb() flushes writes to coherent DMA data before possibly
-	 * triggering a DMA read. The wmb() is necessary even on UP because
-	 * the recipient is a device.
+	 * The wmb() flushes writes to coherent DMA data before
+	 * possibly triggering a DMA read. The wmb() is necessary
+	 * even on UP because the recipient is a device.
 	 */
 	wmb();
-	iosubmit_cmds512(portal, desc->hw, 1);
+	if (wq_dedicated(wq)) {
+		iosubmit_cmds512(portal, desc->hw, 1);
+	} else {
+		/*
+		 * It's not likely that we would receive queue full rejection
+		 * since the descriptor allocation gates at wq size. If we
+		 * receive a -EAGAIN, that means something went wrong such as the
+		 * device is not accepting descriptor at all.
+		 */
+		rc = enqcmds(portal, desc->hw);
+		if (rc < 0)
+			return rc;
+	}
 
 	/*
 	 * Pending the descriptor to the lockless list for the irq_entry
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 07a5db06a29a..6d292eb79bf3 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -175,6 +175,30 @@ static int idxd_config_bus_probe(struct device *dev)
 			return -EINVAL;
 		}
 
+		/* Shared WQ checks */
+		if (wq_shared(wq)) {
+			if (!device_swq_supported(idxd)) {
+				dev_warn(dev,
+					 "PASID not enabled and shared WQ.\n");
+				mutex_unlock(&wq->wq_lock);
+				return -ENXIO;
+			}
+			/*
+			 * Shared wq with the threshold set to 0 means the user
+			 * did not set the threshold or transitioned from a
+			 * dedicated wq but did not set threshold. A value
+			 * of 0 would effectively disable the shared wq. The
+			 * driver does not allow a value of 0 to be set for
+			 * threshold via sysfs.
+			 */
+			if (wq->threshold == 0) {
+				dev_warn(dev,
+					 "Shared WQ and threshold 0.\n");
+				mutex_unlock(&wq->wq_lock);
+				return -EINVAL;
+			}
+		}
+
 		rc = idxd_wq_alloc_resources(wq);
 		if (rc < 0) {
 			mutex_unlock(&wq->wq_lock);
@@ -875,6 +899,8 @@ static ssize_t wq_mode_store(struct device *dev,
 	if (sysfs_streq(buf, "dedicated")) {
 		set_bit(WQ_FLAG_DEDICATED, &wq->flags);
 		wq->threshold = 0;
+	} else if (sysfs_streq(buf, "shared") && device_swq_supported(idxd)) {
+		clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
 	} else {
 		return -EINVAL;
 	}
@@ -973,6 +999,87 @@ static ssize_t wq_priority_store(struct device *dev,
 static struct device_attribute dev_attr_wq_priority =
 		__ATTR(priority, 0644, wq_priority_show, wq_priority_store);
 
+static ssize_t wq_block_on_fault_show(struct device *dev,
+				      struct device_attribute *attr, char *buf)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+
+	return sprintf(buf, "%u\n",
+		       test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags));
+}
+
+static ssize_t wq_block_on_fault_store(struct device *dev,
+				       struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+	struct idxd_device *idxd = wq->idxd;
+	bool bof;
+	int rc;
+
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+		return -EPERM;
+
+	if (wq->state != IDXD_WQ_DISABLED)
+		return -ENXIO;
+
+	rc = kstrtobool(buf, &bof);
+	if (rc < 0)
+		return rc;
+
+	if (bof)
+		set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags);
+	else
+		clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags);
+
+	return count;
+}
+
+static struct device_attribute dev_attr_wq_block_on_fault =
+		__ATTR(block_on_fault, 0644, wq_block_on_fault_show,
+		       wq_block_on_fault_store);
+
+static ssize_t wq_threshold_show(struct device *dev,
+				 struct device_attribute *attr, char *buf)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+
+	return sprintf(buf, "%u\n", wq->threshold);
+}
+
+static ssize_t wq_threshold_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+	struct idxd_device *idxd = wq->idxd;
+	unsigned int val;
+	int rc;
+
+	rc = kstrtouint(buf, 0, &val);
+	if (rc < 0)
+		return -EINVAL;
+
+	if (val > wq->size || val <= 0)
+		return -EINVAL;
+
+	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
+		return -EPERM;
+
+	if (wq->state != IDXD_WQ_DISABLED)
+		return -ENXIO;
+
+	if (test_bit(WQ_FLAG_DEDICATED, &wq->flags))
+		return -EINVAL;
+
+	wq->threshold = val;
+
+	return count;
+}
+
+static struct device_attribute dev_attr_wq_threshold =
+		__ATTR(threshold, 0644, wq_threshold_show, wq_threshold_store);
+
 static ssize_t wq_type_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
@@ -1044,6 +1151,13 @@ static ssize_t wq_name_store(struct device *dev,
 	if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0)
 		return -EINVAL;
 
+	/*
+	 * This is temporarily placed here until we have SVM support for
+	 * dmaengine.
+	 */
+	if (wq->type == IDXD_WQT_KERNEL && device_pasid_enabled(wq->idxd))
+		return -EOPNOTSUPP;
+
 	memset(wq->name, 0, WQ_NAME_SIZE + 1);
 	strncpy(wq->name, buf, WQ_NAME_SIZE);
 	strreplace(wq->name, '\n', '\0');
@@ -1154,6 +1268,8 @@ static struct attribute *idxd_wq_attributes[] = {
 	&dev_attr_wq_mode.attr,
 	&dev_attr_wq_size.attr,
 	&dev_attr_wq_priority.attr,
+	&dev_attr_wq_block_on_fault.attr,
+	&dev_attr_wq_threshold.attr,
 	&dev_attr_wq_type.attr,
 	&dev_attr_wq_name.attr,
 	&dev_attr_wq_cdev_minor.attr,
@@ -1305,6 +1421,16 @@ static ssize_t clients_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(clients);
 
+static ssize_t pasid_enabled_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct idxd_device *idxd =
+		container_of(dev, struct idxd_device, conf_dev);
+
+	return sprintf(buf, "%u\n", device_pasid_enabled(idxd));
+}
+static DEVICE_ATTR_RO(pasid_enabled);
+
 static ssize_t state_show(struct device *dev,
 			  struct device_attribute *attr, char *buf)
 {
@@ -1424,6 +1550,7 @@ static struct attribute *idxd_device_attributes[] = {
 	&dev_attr_gen_cap.attr,
 	&dev_attr_configurable.attr,
 	&dev_attr_clients.attr,
+	&dev_attr_pasid_enabled.attr,
 	&dev_attr_state.attr,
 	&dev_attr_errors.attr,
 	&dev_attr_max_tokens.attr,

From e4f4d8cdeb9a2fe746411c0b9a7538b5c0232c1e Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 27 Oct 2020 10:34:40 -0700
Subject: [PATCH 011/230] dmaengine: idxd: Clean up descriptors with fault
 error

Add code to "complete" a descriptor when the descriptor or its completion
address hit a fault error when SVA mode is being used. This error can be
triggered due to bad programming by the user. A lock is introduced in order
to protect the descriptor completion lists since the fault handler will run
from the system work queue after being scheduled in the interrupt handler.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/160382008092.3911367.12766483427643278985.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/idxd.h |   5 ++
 drivers/dma/idxd/init.c |   1 +
 drivers/dma/idxd/irq.c  | 146 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 140 insertions(+), 12 deletions(-)

diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index dcac3bb5a0d0..7e54209c433a 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -34,6 +34,11 @@ struct idxd_irq_entry {
 	int id;
 	struct llist_head pending_llist;
 	struct list_head work_list;
+	/*
+	 * Lock to protect access between irq thread process descriptor
+	 * and irq thread processing error descriptor.
+	 */
+	spinlock_t list_lock;
 };
 
 struct idxd_group {
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 1639f3b2aa58..c24106efc16e 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -97,6 +97,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd)
 	for (i = 0; i < msixcnt; i++) {
 		idxd->irq_entries[i].id = i;
 		idxd->irq_entries[i].idxd = idxd;
+		spin_lock_init(&idxd->irq_entries[i].list_lock);
 	}
 
 	msix = &idxd->msix_entries[0];
diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c
index 17a65a13fb64..593a2f6ed16c 100644
--- a/drivers/dma/idxd/irq.c
+++ b/drivers/dma/idxd/irq.c
@@ -11,6 +11,24 @@
 #include "idxd.h"
 #include "registers.h"
 
+enum irq_work_type {
+	IRQ_WORK_NORMAL = 0,
+	IRQ_WORK_PROCESS_FAULT,
+};
+
+struct idxd_fault {
+	struct work_struct work;
+	u64 addr;
+	struct idxd_device *idxd;
+};
+
+static int irq_process_work_list(struct idxd_irq_entry *irq_entry,
+				 enum irq_work_type wtype,
+				 int *processed, u64 data);
+static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
+				     enum irq_work_type wtype,
+				     int *processed, u64 data);
+
 static void idxd_device_reinit(struct work_struct *work)
 {
 	struct idxd_device *idxd = container_of(work, struct idxd_device, work);
@@ -44,6 +62,46 @@ static void idxd_device_reinit(struct work_struct *work)
 	idxd_device_wqs_clear_state(idxd);
 }
 
+static void idxd_device_fault_work(struct work_struct *work)
+{
+	struct idxd_fault *fault = container_of(work, struct idxd_fault, work);
+	struct idxd_irq_entry *ie;
+	int i;
+	int processed;
+	int irqcnt = fault->idxd->num_wq_irqs + 1;
+
+	for (i = 1; i < irqcnt; i++) {
+		ie = &fault->idxd->irq_entries[i];
+		irq_process_work_list(ie, IRQ_WORK_PROCESS_FAULT,
+				      &processed, fault->addr);
+		if (processed)
+			break;
+
+		irq_process_pending_llist(ie, IRQ_WORK_PROCESS_FAULT,
+					  &processed, fault->addr);
+		if (processed)
+			break;
+	}
+
+	kfree(fault);
+}
+
+static int idxd_device_schedule_fault_process(struct idxd_device *idxd,
+					      u64 fault_addr)
+{
+	struct idxd_fault *fault;
+
+	fault = kmalloc(sizeof(*fault), GFP_ATOMIC);
+	if (!fault)
+		return -ENOMEM;
+
+	fault->addr = fault_addr;
+	fault->idxd = idxd;
+	INIT_WORK(&fault->work, idxd_device_fault_work);
+	queue_work(idxd->wq, &fault->work);
+	return 0;
+}
+
 irqreturn_t idxd_irq_handler(int vec, void *data)
 {
 	struct idxd_irq_entry *irq_entry = data;
@@ -125,6 +183,15 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 	if (!err)
 		goto out;
 
+	/*
+	 * This case should rarely happen and typically is due to software
+	 * programming error by the driver.
+	 */
+	if (idxd->sw_err.valid &&
+	    idxd->sw_err.desc_valid &&
+	    idxd->sw_err.fault_addr)
+		idxd_device_schedule_fault_process(idxd, idxd->sw_err.fault_addr);
+
 	gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET);
 	if (gensts.state == IDXD_DEVICE_STATE_HALT) {
 		idxd->state = IDXD_DEV_HALTED;
@@ -152,57 +219,110 @@ irqreturn_t idxd_misc_thread(int vec, void *data)
 	return IRQ_HANDLED;
 }
 
+static bool process_fault(struct idxd_desc *desc, u64 fault_addr)
+{
+	/*
+	 * Completion address can be bad as well. Check fault address match for descriptor
+	 * and completion address.
+	 */
+	if ((u64)desc->hw == fault_addr ||
+	    (u64)desc->completion == fault_addr) {
+		idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL);
+		return true;
+	}
+
+	return false;
+}
+
+static bool complete_desc(struct idxd_desc *desc)
+{
+	if (desc->completion->status) {
+		idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL);
+		return true;
+	}
+
+	return false;
+}
+
 static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
-				     int *processed)
+				     enum irq_work_type wtype,
+				     int *processed, u64 data)
 {
 	struct idxd_desc *desc, *t;
 	struct llist_node *head;
 	int queued = 0;
+	bool completed = false;
+	unsigned long flags;
 
 	*processed = 0;
 	head = llist_del_all(&irq_entry->pending_llist);
 	if (!head)
-		return 0;
+		goto out;
 
 	llist_for_each_entry_safe(desc, t, head, llnode) {
-		if (desc->completion->status) {
-			idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL);
+		if (wtype == IRQ_WORK_NORMAL)
+			completed = complete_desc(desc);
+		else if (wtype == IRQ_WORK_PROCESS_FAULT)
+			completed = process_fault(desc, data);
+
+		if (completed) {
 			idxd_free_desc(desc->wq, desc);
 			(*processed)++;
+			if (wtype == IRQ_WORK_PROCESS_FAULT)
+				break;
 		} else {
-			list_add_tail(&desc->list, &irq_entry->work_list);
+			spin_lock_irqsave(&irq_entry->list_lock, flags);
+			list_add_tail(&desc->list,
+				      &irq_entry->work_list);
+			spin_unlock_irqrestore(&irq_entry->list_lock, flags);
 			queued++;
 		}
 	}
 
+ out:
 	return queued;
 }
 
 static int irq_process_work_list(struct idxd_irq_entry *irq_entry,
-				 int *processed)
+				 enum irq_work_type wtype,
+				 int *processed, u64 data)
 {
 	struct list_head *node, *next;
 	int queued = 0;
+	bool completed = false;
+	unsigned long flags;
 
 	*processed = 0;
+	spin_lock_irqsave(&irq_entry->list_lock, flags);
 	if (list_empty(&irq_entry->work_list))
-		return 0;
+		goto out;
 
 	list_for_each_safe(node, next, &irq_entry->work_list) {
 		struct idxd_desc *desc =
 			container_of(node, struct idxd_desc, list);
 
-		if (desc->completion->status) {
+		spin_unlock_irqrestore(&irq_entry->list_lock, flags);
+		if (wtype == IRQ_WORK_NORMAL)
+			completed = complete_desc(desc);
+		else if (wtype == IRQ_WORK_PROCESS_FAULT)
+			completed = process_fault(desc, data);
+
+		if (completed) {
+			spin_lock_irqsave(&irq_entry->list_lock, flags);
 			list_del(&desc->list);
-			/* process and callback */
-			idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL);
+			spin_unlock_irqrestore(&irq_entry->list_lock, flags);
 			idxd_free_desc(desc->wq, desc);
 			(*processed)++;
+			if (wtype == IRQ_WORK_PROCESS_FAULT)
+				return queued;
 		} else {
 			queued++;
 		}
+		spin_lock_irqsave(&irq_entry->list_lock, flags);
 	}
 
+ out:
+	spin_unlock_irqrestore(&irq_entry->list_lock, flags);
 	return queued;
 }
 
@@ -230,12 +350,14 @@ static int idxd_desc_process(struct idxd_irq_entry *irq_entry)
 	 * 5. Repeat until no more descriptors.
 	 */
 	do {
-		rc = irq_process_work_list(irq_entry, &processed);
+		rc = irq_process_work_list(irq_entry, IRQ_WORK_NORMAL,
+					   &processed, 0);
 		total += processed;
 		if (rc != 0)
 			continue;
 
-		rc = irq_process_pending_llist(irq_entry, &processed);
+		rc = irq_process_pending_llist(irq_entry, IRQ_WORK_NORMAL,
+					       &processed, 0);
 		total += processed;
 	} while (rc != 0);
 

From 4749f51ddd8aee5c7aa11e6593a54f96374a77ad Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 27 Oct 2020 10:34:46 -0700
Subject: [PATCH 012/230] dmaengine: idxd: Add ABI documentation for shared wq

Add the sysfs attribute bits in ABI/stable for shared wq support.

Signed-off-by: Jing Lin <jing.lin@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/160382008649.3911367.10851752182908509837.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/ABI/stable/sysfs-driver-dma-idxd | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
index b44183880935..5ea81ffd3c1a 100644
--- a/Documentation/ABI/stable/sysfs-driver-dma-idxd
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -77,6 +77,13 @@ Contact:        dmaengine@vger.kernel.org
 Description:    The operation capability bit mask specify the operation types
 		supported by the this device.
 
+What:		/sys/bus/dsa/devices/dsa<m>/pasid_enabled
+Date:		Oct 27, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	To indicate if PASID (process address space identifier) is
+		enabled or not for this device.
+
 What:           /sys/bus/dsa/devices/dsa<m>/state
 Date:           Oct 25, 2019
 KernelVersion:  5.6.0
@@ -122,6 +129,13 @@ KernelVersion:	5.10.0
 Contact:	dmaengine@vger.kernel.org
 Description:	The last executed device administrative command's status/error.
 
+What:		/sys/bus/dsa/devices/wq<m>.<n>/block_on_fault
+Date:		Oct 27, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	To indicate block on fault is allowed or not for the work queue
+		to support on demand paging.
+
 What:           /sys/bus/dsa/devices/wq<m>.<n>/group_id
 Date:           Oct 25, 2019
 KernelVersion:  5.6.0

From 5a71270197f39ff3e526cf130bc5d6e84db8f2d7 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 30 Oct 2020 08:49:06 -0700
Subject: [PATCH 013/230] dmaengine: idxd: Update calculation of group offset
 to be more readable

Create helper macros to make group offset calculation more readable.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/160407294683.839093.10740868559754142070.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/device.c    | 12 +++++-------
 drivers/dma/idxd/registers.h | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index 2f09eb89a906..d6f551dcbcb6 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -543,24 +543,22 @@ static void idxd_group_config_write(struct idxd_group *group)
 	dev_dbg(dev, "Writing group %d cfg registers\n", group->id);
 
 	/* setup GRPWQCFG */
-	for (i = 0; i < 4; i++) {
-		grpcfg_offset = idxd->grpcfg_offset +
-			group->id * 64 + i * sizeof(u64);
-		iowrite64(group->grpcfg.wqs[i],
-			  idxd->reg_base + grpcfg_offset);
+	for (i = 0; i < GRPWQCFG_STRIDES; i++) {
+		grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i);
+		iowrite64(group->grpcfg.wqs[i], idxd->reg_base + grpcfg_offset);
 		dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n",
 			group->id, i, grpcfg_offset,
 			ioread64(idxd->reg_base + grpcfg_offset));
 	}
 
 	/* setup GRPENGCFG */
-	grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 32;
+	grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id);
 	iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset);
 	dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id,
 		grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset));
 
 	/* setup GRPFLAGS */
-	grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 40;
+	grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id);
 	iowrite32(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset);
 	dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n",
 		group->id, grpcfg_offset,
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 1041c2779f9b..6f2f736097e5 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -356,4 +356,22 @@ union wqcfg {
 
 #define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32))
 
+#define GRPCFG_SIZE		64
+#define GRPWQCFG_STRIDES	4
+
+/*
+ * This macro calculates the offset into the GRPCFG register
+ * idxd - struct idxd *
+ * n - wq id
+ * ofs - the index of the 32b dword for the config register
+ *
+ * The WQCFG register block is divided into groups per each wq. The n index
+ * allows us to move to the register group that's for that particular wq.
+ * Each register is 32bits. The ofs gives us the number of register to access.
+ */
+#define GRPWQCFG_OFFSET(idxd_dev, n, ofs) ((idxd_dev)->grpcfg_offset +\
+					   (n) * GRPCFG_SIZE + sizeof(u64) * (ofs))
+#define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32)
+#define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40)
+
 #endif

From 2f8417a967d571bf8fb81cba95d7acf508ed334f Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 30 Oct 2020 08:51:56 -0700
Subject: [PATCH 014/230] dmaengine: idxd: define table offset multiplier

Convert table offset multiplier magic number to a define.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/160407311690.839435.6941865731867828234.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/init.c      | 17 +++++++----------
 drivers/dma/idxd/registers.h |  2 ++
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index c24106efc16e..45b0eac640c3 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -214,17 +214,14 @@ static void idxd_read_table_offsets(struct idxd_device *idxd)
 	struct device *dev = &idxd->pdev->dev;
 
 	offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET);
-	offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET
-			+ sizeof(u64));
-	idxd->grpcfg_offset = offsets.grpcfg * 0x100;
+	offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + sizeof(u64));
+	idxd->grpcfg_offset = offsets.grpcfg * IDXD_TABLE_MULT;
 	dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset);
-	idxd->wqcfg_offset = offsets.wqcfg * 0x100;
-	dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n",
-		idxd->wqcfg_offset);
-	idxd->msix_perm_offset = offsets.msix_perm * 0x100;
-	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n",
-		idxd->msix_perm_offset);
-	idxd->perfmon_offset = offsets.perfmon * 0x100;
+	idxd->wqcfg_offset = offsets.wqcfg * IDXD_TABLE_MULT;
+	dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", idxd->wqcfg_offset);
+	idxd->msix_perm_offset = offsets.msix_perm * IDXD_TABLE_MULT;
+	dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset);
+	idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT;
 	dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset);
 }
 
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 6f2f736097e5..d29a58ee2651 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -102,6 +102,8 @@ union offsets_reg {
 	u64 bits[2];
 } __packed;
 
+#define IDXD_TABLE_MULT			0x100
+
 #define IDXD_GENCFG_OFFSET		0x80
 union gencfg_reg {
 	struct {

From 842067940a3e3fc008a60fee388e000219b32632 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 3 Nov 2020 20:39:38 +0200
Subject: [PATCH 015/230] dmaengine: dw: Enable runtime PM

When consumer requests channel power on the DMA controller device
and otherwise on the freeing channel resources.

Note, in some cases consumer acquires channel at the ->probe() stage and
releases it at the ->remove() stage. It will mean that DMA controller device
will be powered during all this time if there is no assist from hardware
to idle it. The above mentioned cases should be investigated separately
and individually.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20201103183938.64752-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw/core.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c
index 7ab83fe601ed..19a23767533a 100644
--- a/drivers/dma/dw/core.c
+++ b/drivers/dma/dw/core.c
@@ -982,8 +982,11 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 
 	dev_vdbg(chan2dev(chan), "%s\n", __func__);
 
+	pm_runtime_get_sync(dw->dma.dev);
+
 	/* ASSERT:  channel is idle */
 	if (dma_readl(dw, CH_EN) & dwc->mask) {
+		pm_runtime_put_sync_suspend(dw->dma.dev);
 		dev_dbg(chan2dev(chan), "DMA channel not idle?\n");
 		return -EIO;
 	}
@@ -1000,6 +1003,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 	 * We need controller-specific data to set up slave transfers.
 	 */
 	if (chan->private && !dw_dma_filter(chan, chan->private)) {
+		pm_runtime_put_sync_suspend(dw->dma.dev);
 		dev_warn(chan2dev(chan), "Wrong controller-specific data\n");
 		return -EINVAL;
 	}
@@ -1043,6 +1047,8 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
 	if (!dw->in_use)
 		do_dw_dma_off(dw);
 
+	pm_runtime_put_sync_suspend(dw->dma.dev);
+
 	dev_vdbg(chan2dev(chan), "%s: done\n", __func__);
 }
 

From 6349753276a657b423460b93c2f511c73bb0a118 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 4 Nov 2020 12:31:31 +0200
Subject: [PATCH 016/230] dmaengine: idma64: Switch to use __maybe_unused
 instead of ifdeffery

ifdeffery is prone to errors and makes code harder to read.
Switch to use __maybe_unused instead of ifdeffery.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20201104103131.89907-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idma64.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/idma64.c b/drivers/dma/idma64.c
index f5a84c846394..f4c07ad3be15 100644
--- a/drivers/dma/idma64.c
+++ b/drivers/dma/idma64.c
@@ -667,9 +667,7 @@ static int idma64_platform_remove(struct platform_device *pdev)
 	return idma64_remove(chip);
 }
 
-#ifdef CONFIG_PM_SLEEP
-
-static int idma64_pm_suspend(struct device *dev)
+static int __maybe_unused idma64_pm_suspend(struct device *dev)
 {
 	struct idma64_chip *chip = dev_get_drvdata(dev);
 
@@ -677,7 +675,7 @@ static int idma64_pm_suspend(struct device *dev)
 	return 0;
 }
 
-static int idma64_pm_resume(struct device *dev)
+static int __maybe_unused idma64_pm_resume(struct device *dev)
 {
 	struct idma64_chip *chip = dev_get_drvdata(dev);
 
@@ -685,8 +683,6 @@ static int idma64_pm_resume(struct device *dev)
 	return 0;
 }
 
-#endif /* CONFIG_PM_SLEEP */
-
 static const struct dev_pm_ops idma64_dev_pm_ops = {
 	SET_SYSTEM_SLEEP_PM_OPS(idma64_pm_suspend, idma64_pm_resume)
 };

From 69973b4895b35a67409c8ce1c3d62090470ef47b Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Fri, 30 Oct 2020 22:30:00 +0200
Subject: [PATCH 017/230] dmaengine: ti: k3-udma-glue: move psi-l pairing in
 channel en/dis functions

The NAVSS UDMA will stuck if target IP module is disabled by PM while PSI-L
threads are paired UDMA<->IP and no further transfers is possible. This
could be the case for IPs J721E Main CPSW (cpsw9g).

Hence, to avoid such situation do PSI-L threads pairing only when UDMA
channel is going to be enabled as at this time DMA consumer module expected
to be active already.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201030203000.4281-1-grygorii.strashko@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-glue.c | 64 +++++++++++++++++++++--------------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c
index a367584f0d7b..dfb65e382ab9 100644
--- a/drivers/dma/ti/k3-udma-glue.c
+++ b/drivers/dma/ti/k3-udma-glue.c
@@ -303,19 +303,6 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev,
 		goto err;
 	}
 
-	ret = xudma_navss_psil_pair(tx_chn->common.udmax,
-				    tx_chn->common.src_thread,
-				    tx_chn->common.dst_thread);
-	if (ret) {
-		dev_err(dev, "PSI-L request err %d\n", ret);
-		goto err;
-	}
-
-	tx_chn->psil_paired = true;
-
-	/* reset TX RT registers */
-	k3_udma_glue_disable_tx_chn(tx_chn);
-
 	k3_udma_glue_dump_tx_chn(tx_chn);
 
 	return tx_chn;
@@ -378,6 +365,18 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_pop_tx_chn);
 
 int k3_udma_glue_enable_tx_chn(struct k3_udma_glue_tx_channel *tx_chn)
 {
+	int ret;
+
+	ret = xudma_navss_psil_pair(tx_chn->common.udmax,
+				    tx_chn->common.src_thread,
+				    tx_chn->common.dst_thread);
+	if (ret) {
+		dev_err(tx_chn->common.dev, "PSI-L request err %d\n", ret);
+		return ret;
+	}
+
+	tx_chn->psil_paired = true;
+
 	xudma_tchanrt_write(tx_chn->udma_tchanx, UDMA_CHAN_RT_PEER_RT_EN_REG,
 			    UDMA_PEER_RT_EN_ENABLE);
 
@@ -398,6 +397,13 @@ void k3_udma_glue_disable_tx_chn(struct k3_udma_glue_tx_channel *tx_chn)
 	xudma_tchanrt_write(tx_chn->udma_tchanx,
 			    UDMA_CHAN_RT_PEER_RT_EN_REG, 0);
 	k3_udma_glue_dump_tx_rt_chn(tx_chn, "txchn dis2");
+
+	if (tx_chn->psil_paired) {
+		xudma_navss_psil_unpair(tx_chn->common.udmax,
+					tx_chn->common.src_thread,
+					tx_chn->common.dst_thread);
+		tx_chn->psil_paired = false;
+	}
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_disable_tx_chn);
 
@@ -815,19 +821,6 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name,
 			goto err;
 	}
 
-	ret = xudma_navss_psil_pair(rx_chn->common.udmax,
-				    rx_chn->common.src_thread,
-				    rx_chn->common.dst_thread);
-	if (ret) {
-		dev_err(dev, "PSI-L request err %d\n", ret);
-		goto err;
-	}
-
-	rx_chn->psil_paired = true;
-
-	/* reset RX RT registers */
-	k3_udma_glue_disable_rx_chn(rx_chn);
-
 	k3_udma_glue_dump_rx_chn(rx_chn);
 
 	return rx_chn;
@@ -1052,12 +1045,24 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_rx_flow_disable);
 
 int k3_udma_glue_enable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 {
+	int ret;
+
 	if (rx_chn->remote)
 		return -EINVAL;
 
 	if (rx_chn->flows_ready < rx_chn->flow_num)
 		return -EINVAL;
 
+	ret = xudma_navss_psil_pair(rx_chn->common.udmax,
+				    rx_chn->common.src_thread,
+				    rx_chn->common.dst_thread);
+	if (ret) {
+		dev_err(rx_chn->common.dev, "PSI-L request err %d\n", ret);
+		return ret;
+	}
+
+	rx_chn->psil_paired = true;
+
 	xudma_rchanrt_write(rx_chn->udma_rchanx, UDMA_CHAN_RT_CTL_REG,
 			    UDMA_CHAN_RT_CTL_EN);
 
@@ -1078,6 +1083,13 @@ void k3_udma_glue_disable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 	xudma_rchanrt_write(rx_chn->udma_rchanx, UDMA_CHAN_RT_CTL_REG, 0);
 
 	k3_udma_glue_dump_rx_rt_chn(rx_chn, "rxrt dis2");
+
+	if (rx_chn->psil_paired) {
+		xudma_navss_psil_unpair(rx_chn->common.udmax,
+					rx_chn->common.src_thread,
+					rx_chn->common.dst_thread);
+		rx_chn->psil_paired = false;
+	}
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_disable_rx_chn);
 

From f3b1024908ec02d7af4d01a2659901828844bdd2 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:43 +1300
Subject: [PATCH 018/230] dmaengine: ipu_idmac: remove redundant irqsave and
 restore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Link: https://lore.kernel.org/r/20201027215252.25820-2-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ipu/ipu_idmac.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c
index 38036db284cb..104ad420abbe 100644
--- a/drivers/dma/ipu/ipu_idmac.c
+++ b/drivers/dma/ipu/ipu_idmac.c
@@ -1160,14 +1160,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 	struct idmac_tx_desc *desc, *descnew;
 	bool done = false;
 	u32 ready0, ready1, curbuf, err;
-	unsigned long flags;
 	struct dmaengine_desc_callback cb;
 
 	/* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */
 
 	dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer);
 
-	spin_lock_irqsave(&ipu_data.lock, flags);
+	spin_lock(&ipu_data.lock);
 
 	ready0	= idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY);
 	ready1	= idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY);
@@ -1176,7 +1175,7 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 
 	if (err & (1 << chan_id)) {
 		idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4);
-		spin_unlock_irqrestore(&ipu_data.lock, flags);
+		spin_unlock(&ipu_data.lock);
 		/*
 		 * Doing this
 		 * ichan->sg[0] = ichan->sg[1] = NULL;
@@ -1188,7 +1187,7 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 			 chan_id, ready0, ready1, curbuf);
 		return IRQ_HANDLED;
 	}
-	spin_unlock_irqrestore(&ipu_data.lock, flags);
+	spin_unlock(&ipu_data.lock);
 
 	/* Other interrupts do not interfere with this channel */
 	spin_lock(&ichan->lock);
@@ -1251,9 +1250,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id)
 		if (unlikely(sgnew)) {
 			ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer);
 		} else {
-			spin_lock_irqsave(&ipu_data.lock, flags);
+			spin_lock(&ipu_data.lock);
 			ipu_ic_disable_task(&ipu_data, chan_id);
-			spin_unlock_irqrestore(&ipu_data.lock, flags);
+			spin_unlock(&ipu_data.lock);
 			ichan->status = IPU_CHANNEL_READY;
 			/* Continue to check for complete descriptor */
 		}

From e991c06ed714c64d3d79696513370f894a4ab751 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:44 +1300
Subject: [PATCH 019/230] dmaengine: ti: k3-udma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201027215252.25820-3-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 82cf6c77f5c9..e508280b3d70 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -1020,13 +1020,12 @@ static irqreturn_t udma_ring_irq_handler(int irq, void *data)
 {
 	struct udma_chan *uc = data;
 	struct udma_desc *d;
-	unsigned long flags;
 	dma_addr_t paddr = 0;
 
 	if (udma_pop_from_ring(uc, &paddr) || !paddr)
 		return IRQ_HANDLED;
 
-	spin_lock_irqsave(&uc->vc.lock, flags);
+	spin_lock(&uc->vc.lock);
 
 	/* Teardown completion message */
 	if (cppi5_desc_is_tdcm(paddr)) {
@@ -1077,7 +1076,7 @@ static irqreturn_t udma_ring_irq_handler(int irq, void *data)
 		}
 	}
 out:
-	spin_unlock_irqrestore(&uc->vc.lock, flags);
+	spin_unlock(&uc->vc.lock);
 
 	return IRQ_HANDLED;
 }
@@ -1086,9 +1085,8 @@ static irqreturn_t udma_udma_irq_handler(int irq, void *data)
 {
 	struct udma_chan *uc = data;
 	struct udma_desc *d;
-	unsigned long flags;
 
-	spin_lock_irqsave(&uc->vc.lock, flags);
+	spin_lock(&uc->vc.lock);
 	d = uc->desc;
 	if (d) {
 		d->tr_idx = (d->tr_idx + 1) % d->sglen;
@@ -1103,7 +1101,7 @@ static irqreturn_t udma_udma_irq_handler(int irq, void *data)
 		}
 	}
 
-	spin_unlock_irqrestore(&uc->vc.lock, flags);
+	spin_unlock(&uc->vc.lock);
 
 	return IRQ_HANDLED;
 }

From 302b3b38236a70bffb13f3a90a52bd630841aa90 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:45 +1300
Subject: [PATCH 020/230] dmaengine: sf-pdma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Cc: Green Wan <green.wan@sifive.com>
Link: https://lore.kernel.org/r/20201027215252.25820-4-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/sf-pdma/sf-pdma.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/sf-pdma/sf-pdma.c b/drivers/dma/sf-pdma/sf-pdma.c
index 528deb5d9f31..981ddcc42d70 100644
--- a/drivers/dma/sf-pdma/sf-pdma.c
+++ b/drivers/dma/sf-pdma/sf-pdma.c
@@ -326,10 +326,9 @@ static irqreturn_t sf_pdma_done_isr(int irq, void *dev_id)
 {
 	struct sf_pdma_chan *chan = dev_id;
 	struct pdma_regs *regs = &chan->regs;
-	unsigned long flags;
 	u64 residue;
 
-	spin_lock_irqsave(&chan->vchan.lock, flags);
+	spin_lock(&chan->vchan.lock);
 	writel((readl(regs->ctrl)) & ~PDMA_DONE_STATUS_MASK, regs->ctrl);
 	residue = readq(regs->residue);
 
@@ -346,7 +345,7 @@ static irqreturn_t sf_pdma_done_isr(int irq, void *dev_id)
 		sf_pdma_xfer_desc(chan);
 	}
 
-	spin_unlock_irqrestore(&chan->vchan.lock, flags);
+	spin_unlock(&chan->vchan.lock);
 
 	return IRQ_HANDLED;
 }
@@ -355,11 +354,10 @@ static irqreturn_t sf_pdma_err_isr(int irq, void *dev_id)
 {
 	struct sf_pdma_chan *chan = dev_id;
 	struct pdma_regs *regs = &chan->regs;
-	unsigned long flags;
 
-	spin_lock_irqsave(&chan->lock, flags);
+	spin_lock(&chan->lock);
 	writel((readl(regs->ctrl)) & ~PDMA_ERR_STATUS_MASK, regs->ctrl);
-	spin_unlock_irqrestore(&chan->lock, flags);
+	spin_unlock(&chan->lock);
 
 	tasklet_schedule(&chan->err_tasklet);
 

From 654115e3f62671a6b8bcee702697dbdfc6fce1a3 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:46 +1300
Subject: [PATCH 021/230] dmaengine: tegra210-adma: remove redundant irqsave
 and irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Jon Hunter <jonathanh@nvidia.com>
Cc: Laxman Dewangan <ldewangan@nvidia.com>
Cc: Thierry Reding <thierry.reding@gmail.com>
Link: https://lore.kernel.org/r/20201027215252.25820-5-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/tegra210-adma.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c
index c5fa2ef74abc..4735742e826d 100644
--- a/drivers/dma/tegra210-adma.c
+++ b/drivers/dma/tegra210-adma.c
@@ -408,19 +408,18 @@ static irqreturn_t tegra_adma_isr(int irq, void *dev_id)
 {
 	struct tegra_adma_chan *tdc = dev_id;
 	unsigned long status;
-	unsigned long flags;
 
-	spin_lock_irqsave(&tdc->vc.lock, flags);
+	spin_lock(&tdc->vc.lock);
 
 	status = tegra_adma_irq_clear(tdc);
 	if (status == 0 || !tdc->desc) {
-		spin_unlock_irqrestore(&tdc->vc.lock, flags);
+		spin_unlock(&tdc->vc.lock);
 		return IRQ_NONE;
 	}
 
 	vchan_cyclic_callback(&tdc->desc->vd);
 
-	spin_unlock_irqrestore(&tdc->vc.lock, flags);
+	spin_unlock(&tdc->vc.lock);
 
 	return IRQ_HANDLED;
 }

From 280e7f90d4520682d0a18b33fa43fd8cccab3439 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:47 +1300
Subject: [PATCH 022/230] dmaengine: milbeaut-xdmac: remove redundant irqsave
 and irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Link: https://lore.kernel.org/r/20201027215252.25820-6-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/milbeaut-xdmac.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/milbeaut-xdmac.c b/drivers/dma/milbeaut-xdmac.c
index 85a597228fb0..584c931e807a 100644
--- a/drivers/dma/milbeaut-xdmac.c
+++ b/drivers/dma/milbeaut-xdmac.c
@@ -160,10 +160,9 @@ static irqreturn_t milbeaut_xdmac_interrupt(int irq, void *dev_id)
 {
 	struct milbeaut_xdmac_chan *mc = dev_id;
 	struct milbeaut_xdmac_desc *md;
-	unsigned long flags;
 	u32 val;
 
-	spin_lock_irqsave(&mc->vc.lock, flags);
+	spin_lock(&mc->vc.lock);
 
 	/* Ack and Stop */
 	val = FIELD_PREP(M10V_XDDSD_IS_MASK, 0x0);
@@ -177,7 +176,7 @@ static irqreturn_t milbeaut_xdmac_interrupt(int irq, void *dev_id)
 
 	milbeaut_xdmac_start(mc);
 out:
-	spin_unlock_irqrestore(&mc->vc.lock, flags);
+	spin_unlock(&mc->vc.lock);
 	return IRQ_HANDLED;
 }
 

From 1ff206561920c9998058e52ea465347d4944b635 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:48 +1300
Subject: [PATCH 023/230] dmaengine: k3dma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Link: https://lore.kernel.org/r/20201027215252.25820-7-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/k3dma.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c
index f609a84c493c..d0b2e601e3e5 100644
--- a/drivers/dma/k3dma.c
+++ b/drivers/dma/k3dma.c
@@ -223,24 +223,23 @@ static irqreturn_t k3_dma_int_handler(int irq, void *dev_id)
 		i = __ffs(stat);
 		stat &= ~BIT(i);
 		if (likely(tc1 & BIT(i)) || (tc2 & BIT(i))) {
-			unsigned long flags;
 
 			p = &d->phy[i];
 			c = p->vchan;
 			if (c && (tc1 & BIT(i))) {
-				spin_lock_irqsave(&c->vc.lock, flags);
+				spin_lock(&c->vc.lock);
 				if (p->ds_run != NULL) {
 					vchan_cookie_complete(&p->ds_run->vd);
 					p->ds_done = p->ds_run;
 					p->ds_run = NULL;
 				}
-				spin_unlock_irqrestore(&c->vc.lock, flags);
+				spin_unlock(&c->vc.lock);
 			}
 			if (c && (tc2 & BIT(i))) {
-				spin_lock_irqsave(&c->vc.lock, flags);
+				spin_lock(&c->vc.lock);
 				if (p->ds_run != NULL)
 					vchan_cyclic_callback(&p->ds_run->vd);
-				spin_unlock_irqrestore(&c->vc.lock, flags);
+				spin_unlock(&c->vc.lock);
 			}
 			irq_chan |= BIT(i);
 		}

From d9c8d4b278d167bf7cba83ec6fc15a0a17dfc407 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:49 +1300
Subject: [PATCH 024/230] dmaengine: hisi_dma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Zhou Wang <wangzhou1@hisilicon.com>
Link: https://lore.kernel.org/r/20201027215252.25820-8-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/hisi_dma.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/hisi_dma.c b/drivers/dma/hisi_dma.c
index e1a958ae7925..a259ee010e9b 100644
--- a/drivers/dma/hisi_dma.c
+++ b/drivers/dma/hisi_dma.c
@@ -431,9 +431,8 @@ static irqreturn_t hisi_dma_irq(int irq, void *data)
 	struct hisi_dma_dev *hdma_dev = chan->hdma_dev;
 	struct hisi_dma_desc *desc;
 	struct hisi_dma_cqe *cqe;
-	unsigned long flags;
 
-	spin_lock_irqsave(&chan->vc.lock, flags);
+	spin_lock(&chan->vc.lock);
 
 	desc = chan->desc;
 	cqe = chan->cq + chan->cq_head;
@@ -452,7 +451,7 @@ static irqreturn_t hisi_dma_irq(int irq, void *data)
 		chan->desc = NULL;
 	}
 
-	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	spin_unlock(&chan->vc.lock);
 
 	return IRQ_HANDLED;
 }

From 8c94b83e0c370e72c131f5da8cb19c2cb858a1bf Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:50 +1300
Subject: [PATCH 025/230] dmaengine: moxart-dma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Link: https://lore.kernel.org/r/20201027215252.25820-9-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/moxart-dma.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/moxart-dma.c b/drivers/dma/moxart-dma.c
index 347146a6e1d0..74755093e14b 100644
--- a/drivers/dma/moxart-dma.c
+++ b/drivers/dma/moxart-dma.c
@@ -524,7 +524,6 @@ static irqreturn_t moxart_dma_interrupt(int irq, void *devid)
 	struct moxart_dmadev *mc = devid;
 	struct moxart_chan *ch = &mc->slave_chans[0];
 	unsigned int i;
-	unsigned long flags;
 	u32 ctrl;
 
 	dev_dbg(chan2dev(&ch->vc.chan), "%s\n", __func__);
@@ -541,14 +540,14 @@ static irqreturn_t moxart_dma_interrupt(int irq, void *devid)
 		if (ctrl & APB_DMA_FIN_INT_STS) {
 			ctrl &= ~APB_DMA_FIN_INT_STS;
 			if (ch->desc) {
-				spin_lock_irqsave(&ch->vc.lock, flags);
+				spin_lock(&ch->vc.lock);
 				if (++ch->sgidx < ch->desc->sglen) {
 					moxart_dma_start_sg(ch, ch->sgidx);
 				} else {
 					vchan_cookie_complete(&ch->desc->vd);
 					moxart_dma_start_desc(&ch->vc.chan);
 				}
-				spin_unlock_irqrestore(&ch->vc.lock, flags);
+				spin_unlock(&ch->vc.lock);
 			}
 		}
 

From 618a8e383bbdf9ce5a252797046dda694bf40389 Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:51 +1300
Subject: [PATCH 026/230] dmaengine: ste_dma40: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20201027215252.25820-10-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ste_dma40.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c
index 77ab1f4730be..4256e55bbf25 100644
--- a/drivers/dma/ste_dma40.c
+++ b/drivers/dma/ste_dma40.c
@@ -1643,13 +1643,12 @@ static irqreturn_t d40_handle_interrupt(int irq, void *data)
 	u32 row;
 	long chan = -1;
 	struct d40_chan *d40c;
-	unsigned long flags;
 	struct d40_base *base = data;
 	u32 *regs = base->regs_interrupt;
 	struct d40_interrupt_lookup *il = base->gen_dmac.il;
 	u32 il_size = base->gen_dmac.il_size;
 
-	spin_lock_irqsave(&base->interrupt_lock, flags);
+	spin_lock(&base->interrupt_lock);
 
 	/* Read interrupt status of both logical and physical channels */
 	for (i = 0; i < il_size; i++)
@@ -1694,7 +1693,7 @@ static irqreturn_t d40_handle_interrupt(int irq, void *data)
 		spin_unlock(&d40c->lock);
 	}
 
-	spin_unlock_irqrestore(&base->interrupt_lock, flags);
+	spin_unlock(&base->interrupt_lock);
 
 	return IRQ_HANDLED;
 }

From 0e15ca5fe2240e9f1e4c408a29505b60dbdfaddd Mon Sep 17 00:00:00 2001
From: Barry Song <song.bao.hua@hisilicon.com>
Date: Wed, 28 Oct 2020 10:52:52 +1300
Subject: [PATCH 027/230] dmaengine: pxa_dma: remove redundant irqsave and
 irqrestore in hardIRQ

Running in hardIRQ, disabling IRQ is redundant since hardIRQ has disabled
IRQ. This patch removes the irqsave and irqstore to save some instruction
cycles.

Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Acked-by: Robert Jarzmik <robert.jarzmik@free.fr>
Cc: Daniel Mack <daniel@zonque.org>
Cc: Haojian Zhuang <haojian.zhuang@gmail.com>
Link: https://lore.kernel.org/r/20201027215252.25820-11-song.bao.hua@hisilicon.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/pxa_dma.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c
index 349fb312c872..4a2a796e348c 100644
--- a/drivers/dma/pxa_dma.c
+++ b/drivers/dma/pxa_dma.c
@@ -606,7 +606,6 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
 	struct pxad_chan *chan = phy->vchan;
 	struct virt_dma_desc *vd, *tmp;
 	unsigned int dcsr;
-	unsigned long flags;
 	bool vd_completed;
 	dma_cookie_t last_started = 0;
 
@@ -616,7 +615,7 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
 	if (dcsr & PXA_DCSR_RUN)
 		return IRQ_NONE;
 
-	spin_lock_irqsave(&chan->vc.lock, flags);
+	spin_lock(&chan->vc.lock);
 	list_for_each_entry_safe(vd, tmp, &chan->vc.desc_issued, node) {
 		vd_completed = is_desc_completed(vd);
 		dev_dbg(&chan->vc.chan.dev->device,
@@ -658,7 +657,7 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id)
 			pxad_launch_chan(chan, to_pxad_sw_desc(vd));
 		}
 	}
-	spin_unlock_irqrestore(&chan->vc.lock, flags);
+	spin_unlock(&chan->vc.lock);
 	wake_up(&chan->wq_state);
 
 	return IRQ_HANDLED;

From 3821e232eb3b7591f07ce4c389313ab55ebee372 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:15 -0500
Subject: [PATCH 028/230] xprtrdma: Replace dprintk call sites in ERR_CHUNK
 path

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 82 ++++++++++++++++++++++++++++++++++
 net/sunrpc/xprtrdma/rpc_rdma.c | 13 ++----
 2 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index bf1065772228..d5e66428e27e 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -1128,6 +1128,88 @@ DEFINE_REPLY_EVENT(xprtrdma_reply_rqst);
 DEFINE_REPLY_EVENT(xprtrdma_reply_short);
 DEFINE_REPLY_EVENT(xprtrdma_reply_hdr);
 
+TRACE_EVENT(xprtrdma_err_vers,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		__be32 *min,
+		__be32 *max
+	),
+
+	TP_ARGS(rqst, min, max),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, min)
+		__field(u32, max)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->min = be32_to_cpup(min);
+		__entry->max = be32_to_cpup(max);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x versions=[%u, %u]",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->min, __entry->max
+	)
+);
+
+TRACE_EVENT(xprtrdma_err_chunk,
+	TP_PROTO(
+		const struct rpc_rqst *rqst
+	),
+
+	TP_ARGS(rqst),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x",
+		__entry->task_id, __entry->client_id, __entry->xid
+	)
+);
+
+TRACE_EVENT(xprtrdma_err_unrecognized,
+	TP_PROTO(
+		const struct rpc_rqst *rqst,
+		__be32 *procedure
+	),
+
+	TP_ARGS(rqst, procedure),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, xid)
+		__field(u32, procedure)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = rqst->rq_task->tk_pid;
+		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
+		__entry->procedure = be32_to_cpup(procedure);
+	),
+
+	TP_printk("task:%u@%u xid=0x%08x procedure=%u",
+		__entry->task_id, __entry->client_id, __entry->xid,
+		__entry->procedure
+	)
+);
+
 TRACE_EVENT(xprtrdma_fixup,
 	TP_PROTO(
 		const struct rpc_rqst *rqst,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 0f5120c7668f..c178f93aa40b 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1322,20 +1322,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
 		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
 		if (!p)
 			break;
-		dprintk("RPC:       %s: server reports "
-			"version error (%u-%u), xid %08x\n", __func__,
-			be32_to_cpup(p), be32_to_cpu(*(p + 1)),
-			be32_to_cpu(rep->rr_xid));
+		trace_xprtrdma_err_vers(rqst, p, p + 1);
 		break;
 	case err_chunk:
-		dprintk("RPC:       %s: server reports "
-			"header decoding error, xid %08x\n", __func__,
-			be32_to_cpu(rep->rr_xid));
+		trace_xprtrdma_err_chunk(rqst);
 		break;
 	default:
-		dprintk("RPC:       %s: server reports "
-			"unrecognized error %d, xid %08x\n", __func__,
-			be32_to_cpup(p), be32_to_cpu(rep->rr_xid));
+		trace_xprtrdma_err_unrecognized(rqst, p);
 	}
 
 	return -EIO;

From af5865d2783958294179da56a4d073a9630b3068 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:21 -0500
Subject: [PATCH 029/230] xprtrdma: Introduce Receive completion IDs

Set up a completion ID in each rpcrdma_rep. The ID is used to match
an incoming Receive completion to a transport and to a previous
ib_post_recv().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  | 46 ++++++---------------------------
 net/sunrpc/xprtrdma/verbs.c     |  6 ++++-
 net/sunrpc/xprtrdma/xprt_rdma.h |  5 ++++
 3 files changed, 18 insertions(+), 39 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index d5e66428e27e..1c91c8e721e7 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -771,15 +771,17 @@ TRACE_EVENT(xprtrdma_post_recv,
 	TP_ARGS(rep),
 
 	TP_STRUCT__entry(
-		__field(const void *, rep)
+		__field(u32, cq_id)
+		__field(int, completion_id)
 	),
 
 	TP_fast_assign(
-		__entry->rep = rep;
+		__entry->cq_id = rep->rr_cid.ci_queue_id;
+		__entry->completion_id = rep->rr_cid.ci_completion_id;
 	),
 
-	TP_printk("rep=%p",
-		__entry->rep
+	TP_printk("cq.id=%d cid=%d",
+		__entry->cq_id, __entry->completion_id
 	)
 );
 
@@ -845,6 +847,8 @@ TRACE_EVENT(xprtrdma_post_linv,
  ** Completion events
  **/
 
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
+
 TRACE_EVENT(xprtrdma_wc_send,
 	TP_PROTO(
 		const struct rpcrdma_sendctx *sc,
@@ -876,40 +880,6 @@ TRACE_EVENT(xprtrdma_wc_send,
 	)
 );
 
-TRACE_EVENT(xprtrdma_wc_receive,
-	TP_PROTO(
-		const struct ib_wc *wc
-	),
-
-	TP_ARGS(wc),
-
-	TP_STRUCT__entry(
-		__field(const void *, rep)
-		__field(u32, byte_len)
-		__field(unsigned int, status)
-		__field(u32, vendor_err)
-	),
-
-	TP_fast_assign(
-		__entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep,
-					    rr_cqe);
-		__entry->status = wc->status;
-		if (wc->status) {
-			__entry->byte_len = 0;
-			__entry->vendor_err = wc->vendor_err;
-		} else {
-			__entry->byte_len = wc->byte_len;
-			__entry->vendor_err = 0;
-		}
-	),
-
-	TP_printk("rep=%p %u bytes: %s (%u/0x%x)",
-		__entry->rep, __entry->byte_len,
-		rdma_show_wc_status(__entry->status),
-		__entry->status, __entry->vendor_err
-	)
-);
-
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index ad6e2e4994ce..2c8d2801ec4f 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 	struct rpcrdma_xprt *r_xprt = cq->cq_context;
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_receive(wc);
+	trace_xprtrdma_wc_receive(wc, &rep->rr_cid);
 	--r_xprt->rx_ep->re_receive_count;
 	if (wc->status != IB_WC_SUCCESS)
 		goto out_flushed;
@@ -972,6 +972,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
 	if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
 		goto out_free_regbuf;
 
+	rep->rr_cid.ci_completion_id =
+		atomic_inc_return(&r_xprt->rx_ep->re_completion_ids);
+
 	xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
 		     rdmab_length(rep->rr_rdmabuf));
 	rep->rr_cqe.done = rpcrdma_wc_receive;
@@ -1411,6 +1414,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
 		if (!rep)
 			break;
 
+		rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id;
 		trace_xprtrdma_post_recv(rep);
 		rep->rr_recv_wr.next = wr;
 		wr = &rep->rr_recv_wr;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 43974ef39a50..b94940bc67aa 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -53,6 +53,7 @@
 #include <rdma/ib_verbs.h>		/* RDMA verbs api */
 
 #include <linux/sunrpc/clnt.h> 		/* rpc_xprt */
+#include <linux/sunrpc/rpc_rdma_cid.h> 	/* completion IDs */
 #include <linux/sunrpc/rpc_rdma.h> 	/* RPC/RDMA protocol */
 #include <linux/sunrpc/xprtrdma.h> 	/* xprt parameters */
 
@@ -93,6 +94,8 @@ struct rpcrdma_ep {
 	unsigned int		re_max_requests; /* depends on device */
 	unsigned int		re_inline_send;	/* negotiated */
 	unsigned int		re_inline_recv;	/* negotiated */
+
+	atomic_t		re_completion_ids;
 };
 
 /* Pre-allocate extra Work Requests for handling backward receives
@@ -180,6 +183,8 @@ enum {
 
 struct rpcrdma_rep {
 	struct ib_cqe		rr_cqe;
+	struct rpc_rdma_cid	rr_cid;
+
 	__be32			rr_xid;
 	__be32			rr_vers;
 	__be32			rr_proc;

From b2e7467f26d7813d98cbaad5e62b54960f2c071b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:26 -0500
Subject: [PATCH 030/230] xprtrdma: Introduce Send completion IDs

Set up a completion ID in each rpcrdma_req. The ID is used to match
an incoming Send completion to a transport and to a previous
ib_post_send().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  | 47 +++++++--------------------------
 net/sunrpc/xprtrdma/verbs.c     |  5 +++-
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 3 files changed, 14 insertions(+), 39 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 1c91c8e721e7..ab239f4f924e 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -735,8 +735,8 @@ TRACE_EVENT(xprtrdma_post_send,
 	TP_ARGS(req),
 
 	TP_STRUCT__entry(
-		__field(const void *, req)
-		__field(const void *, sc)
+		__field(u32, cq_id)
+		__field(int, completion_id)
 		__field(unsigned int, task_id)
 		__field(unsigned int, client_id)
 		__field(int, num_sge)
@@ -745,20 +745,21 @@ TRACE_EVENT(xprtrdma_post_send,
 
 	TP_fast_assign(
 		const struct rpc_rqst *rqst = &req->rl_slot;
+		const struct rpcrdma_sendctx *sc = req->rl_sendctx;
 
+		__entry->cq_id = sc->sc_cid.ci_queue_id;
+		__entry->completion_id = sc->sc_cid.ci_completion_id;
 		__entry->task_id = rqst->rq_task->tk_pid;
 		__entry->client_id = rqst->rq_task->tk_client ?
 				     rqst->rq_task->tk_client->cl_clid : -1;
-		__entry->req = req;
-		__entry->sc = req->rl_sendctx;
 		__entry->num_sge = req->rl_wr.num_sge;
 		__entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED;
 	),
 
-	TP_printk("task:%u@%u req=%p sc=%p (%d SGE%s) %s",
+	TP_printk("task:%u@%u cq.id=%u cid=%d (%d SGE%s) %s",
 		__entry->task_id, __entry->client_id,
-		__entry->req, __entry->sc, __entry->num_sge,
-		(__entry->num_sge == 1 ? "" : "s"),
+		__entry->cq_id, __entry->completion_id,
+		__entry->num_sge, (__entry->num_sge == 1 ? "" : "s"),
 		(__entry->signaled ? "signaled" : "")
 	)
 );
@@ -848,37 +849,7 @@ TRACE_EVENT(xprtrdma_post_linv,
  **/
 
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
-
-TRACE_EVENT(xprtrdma_wc_send,
-	TP_PROTO(
-		const struct rpcrdma_sendctx *sc,
-		const struct ib_wc *wc
-	),
-
-	TP_ARGS(sc, wc),
-
-	TP_STRUCT__entry(
-		__field(const void *, req)
-		__field(const void *, sc)
-		__field(unsigned int, unmap_count)
-		__field(unsigned int, status)
-		__field(unsigned int, vendor_err)
-	),
-
-	TP_fast_assign(
-		__entry->req = sc->sc_req;
-		__entry->sc = sc;
-		__entry->unmap_count = sc->sc_unmap_count;
-		__entry->status = wc->status;
-		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
-	),
-
-	TP_printk("req=%p sc=%p unmapped=%u: %s (%u/0x%x)",
-		__entry->req, __entry->sc, __entry->unmap_count,
-		rdma_show_wc_status(__entry->status),
-		__entry->status, __entry->vendor_err
-	)
-);
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
 
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
 DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 2c8d2801ec4f..63837b5d14e5 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 	struct rpcrdma_xprt *r_xprt = cq->cq_context;
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_send(sc, wc);
+	trace_xprtrdma_wc_send(wc, &sc->sc_cid);
 	rpcrdma_sendctx_put_locked(r_xprt, sc);
 	rpcrdma_flush_disconnect(r_xprt, wc);
 }
@@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
 		return NULL;
 
 	sc->sc_cqe.done = rpcrdma_wc_send;
+	sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id;
+	sc->sc_cid.ci_completion_id =
+		atomic_inc_return(&ep->re_completion_ids);
 	return sc;
 }
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b94940bc67aa..4eb8e32b9f4a 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -216,6 +216,7 @@ enum {
 struct rpcrdma_req;
 struct rpcrdma_sendctx {
 	struct ib_cqe		sc_cqe;
+	struct rpc_rdma_cid	sc_cid;
 	struct rpcrdma_req	*sc_req;
 	unsigned int		sc_unmap_count;
 	struct ib_sge		sc_sges[];

From 5ecef9c84366955f61e379140c1065d576c66ada Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:31 -0500
Subject: [PATCH 031/230] xprtrdma: Introduce FRWR completion IDs

Set up a completion ID in each rpcrdma_frwr. The ID is used to match
an incoming completion to a transport (CQ) and other MR-related
activity.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  | 44 +++------------------------------
 net/sunrpc/xprtrdma/frwr_ops.c  | 29 ++++++++++++++++------
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 3 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index ab239f4f924e..9e30f8aa3562 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -261,41 +261,6 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event,
 				),					\
 				TP_ARGS(task, mr, nsegs))
 
-DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
-	TP_PROTO(
-		const struct ib_wc *wc,
-		const struct rpcrdma_frwr *frwr
-	),
-
-	TP_ARGS(wc, frwr),
-
-	TP_STRUCT__entry(
-		__field(u32, mr_id)
-		__field(unsigned int, status)
-		__field(unsigned int, vendor_err)
-	),
-
-	TP_fast_assign(
-		__entry->mr_id = frwr->fr_mr->res.id;
-		__entry->status = wc->status;
-		__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
-	),
-
-	TP_printk(
-		"mr.id=%u: %s (%u/0x%x)",
-		__entry->mr_id, rdma_show_wc_status(__entry->status),
-		__entry->status, __entry->vendor_err
-	)
-);
-
-#define DEFINE_FRWR_DONE_EVENT(name)					\
-		DEFINE_EVENT(xprtrdma_frwr_done, name,			\
-				TP_PROTO(				\
-					const struct ib_wc *wc,		\
-					const struct rpcrdma_frwr *frwr	\
-				),					\
-				TP_ARGS(wc, frwr))
-
 TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL);
 TRACE_DEFINE_ENUM(DMA_TO_DEVICE);
 TRACE_DEFINE_ENUM(DMA_FROM_DEVICE);
@@ -850,11 +815,10 @@ TRACE_EVENT(xprtrdma_post_linv,
 
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive);
 DEFINE_COMPLETION_EVENT(xprtrdma_wc_send);
-
-DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg);
-DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li);
-DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake);
-DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_done);
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg);
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_li);
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake);
+DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done);
 
 TRACE_EVENT(xprtrdma_frwr_alloc,
 	TP_PROTO(
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 44888f5badef..2cc6862a52dc 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -363,12 +363,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
 		container_of(cqe, struct rpcrdma_frwr, fr_cqe);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_fastreg(wc, frwr);
+	trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid);
 	/* The MR will get recycled when the associated req is retransmitted */
 
 	rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
 
+static void frwr_cid_init(struct rpcrdma_ep *ep,
+			  struct rpcrdma_frwr *frwr)
+{
+	struct rpc_rdma_cid *cid = &frwr->fr_cid;
+
+	cid->ci_queue_id = ep->re_attr.send_cq->res.id;
+	cid->ci_completion_id = frwr->fr_mr->res.id;
+}
+
 /**
  * frwr_send - post Send WRs containing the RPC Call message
  * @r_xprt: controlling transport instance
@@ -385,6 +394,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
  */
 int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
 	struct ib_send_wr *post_wr;
 	struct rpcrdma_mr *mr;
 
@@ -395,6 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		frwr = &mr->frwr;
 
 		frwr->fr_cqe.done = frwr_wc_fastreg;
+		frwr_cid_init(ep, frwr);
 		frwr->fr_regwr.wr.next = post_wr;
 		frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe;
 		frwr->fr_regwr.wr.num_sge = 0;
@@ -404,7 +415,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		post_wr = &frwr->fr_regwr.wr;
 	}
 
-	return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
+	return ib_post_send(ep->re_id->qp, post_wr, NULL);
 }
 
 /**
@@ -448,7 +459,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_li(wc, frwr);
+	trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
 	__frwr_release_mr(wc, mr);
 
 	rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -469,7 +480,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 	struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_li_wake(wc, frwr);
+	trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
 	__frwr_release_mr(wc, mr);
 	complete(&frwr->fr_linv_done);
 
@@ -490,6 +501,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *first, **prev, *last;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
 	const struct ib_send_wr *bad_wr;
 	struct rpcrdma_frwr *frwr;
 	struct rpcrdma_mr *mr;
@@ -509,6 +521,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
 		frwr = &mr->frwr;
 		frwr->fr_cqe.done = frwr_wc_localinv;
+		frwr_cid_init(ep, frwr);
 		last = &frwr->fr_invwr;
 		last->next = NULL;
 		last->wr_cqe = &frwr->fr_cqe;
@@ -534,7 +547,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * unless re_id->qp is a valid pointer.
 	 */
 	bad_wr = NULL;
-	rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+	rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
 
 	/* The final LOCAL_INV WR in the chain is supposed to
 	 * do the wake. If it was never posted, the wake will
@@ -574,7 +587,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
 	struct rpcrdma_rep *rep = mr->mr_req->rl_reply;
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
-	trace_xprtrdma_wc_li_done(wc, frwr);
+	trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
 	__frwr_release_mr(wc, mr);
 
 	/* Ensure @rep is generated before __frwr_release_mr */
@@ -597,6 +610,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
 void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *first, *last, **prev;
+	struct rpcrdma_ep *ep = r_xprt->rx_ep;
 	const struct ib_send_wr *bad_wr;
 	struct rpcrdma_frwr *frwr;
 	struct rpcrdma_mr *mr;
@@ -614,6 +628,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
 		frwr = &mr->frwr;
 		frwr->fr_cqe.done = frwr_wc_localinv;
+		frwr_cid_init(ep, frwr);
 		last = &frwr->fr_invwr;
 		last->next = NULL;
 		last->wr_cqe = &frwr->fr_cqe;
@@ -639,7 +654,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * unless re_id->qp is a valid pointer.
 	 */
 	bad_wr = NULL;
-	rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
+	rc = ib_post_send(ep->re_id->qp, first, &bad_wr);
 	if (!rc)
 		return;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 4eb8e32b9f4a..cef9d0f2e2c8 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -231,6 +231,7 @@ struct rpcrdma_sendctx {
 struct rpcrdma_frwr {
 	struct ib_mr			*fr_mr;
 	struct ib_cqe			fr_cqe;
+	struct rpc_rdma_cid		fr_cid;
 	struct completion		fr_linv_done;
 	union {
 		struct ib_reg_wr	fr_regwr;

From 36a55edfc3d5b1c2735c088bcb06967de103f299 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:37 -0500
Subject: [PATCH 032/230] xprtrdma: Clean up trace_xprtrdma_post_linv

- Replace the display of kernel memory addresses
- Add "_err" to the end of its name to indicate that it's a
  tracepoint that fires only when there's an error

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 16 +++++++++-------
 net/sunrpc/xprtrdma/frwr_ops.c |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 9e30f8aa3562..b0750c0d2753 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -784,7 +784,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
 	)
 );
 
-TRACE_EVENT(xprtrdma_post_linv,
+TRACE_EVENT(xprtrdma_post_linv_err,
 	TP_PROTO(
 		const struct rpcrdma_req *req,
 		int status
@@ -793,19 +793,21 @@ TRACE_EVENT(xprtrdma_post_linv,
 	TP_ARGS(req, status),
 
 	TP_STRUCT__entry(
-		__field(const void *, req)
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
 		__field(int, status)
-		__field(u32, xid)
 	),
 
 	TP_fast_assign(
-		__entry->req = req;
+		const struct rpc_task *task = req->rl_slot.rq_task;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
 		__entry->status = status;
-		__entry->xid = be32_to_cpu(req->rl_slot.rq_xid);
 	),
 
-	TP_printk("req=%p xid=0x%08x status=%d",
-		__entry->req, __entry->xid, __entry->status
+	TP_printk("task:%u@%u status=%d",
+		__entry->task_id, __entry->client_id, __entry->status
 	)
 );
 
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 2cc6862a52dc..76322b1acf3d 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -560,7 +560,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
 	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
 	 */
-	trace_xprtrdma_post_linv(req, rc);
+	trace_xprtrdma_post_linv_err(req, rc);
 	while (bad_wr) {
 		frwr = container_of(bad_wr, struct rpcrdma_frwr,
 				    fr_invwr);
@@ -660,7 +660,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 
 	/* Recycle MRs in the LOCAL_INV chain that did not get posted.
 	 */
-	trace_xprtrdma_post_linv(req, rc);
+	trace_xprtrdma_post_linv_err(req, rc);
 	while (bad_wr) {
 		frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
 		mr = container_of(frwr, struct rpcrdma_mr, frwr);

From 3a9568fedccc6cf26c1a87621c3bfed7b7432119 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:42 -0500
Subject: [PATCH 033/230] xprtrdma: Clean up reply parsing error tracepoints

- Rename the tracepoints with the "_err" suffix to indicate these
  are rare error events
- Replace display of kernel memory addresses
- Tie the XID and error to a connection IP address instead

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 27 ++++++++++++++-------------
 net/sunrpc/xprtrdma/rpc_rdma.c | 10 +++++-----
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index b0750c0d2753..93d717d8139f 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class,
 				),					\
 				TP_ARGS(wc, cid))
 
-DECLARE_EVENT_CLASS(xprtrdma_reply_event,
+DECLARE_EVENT_CLASS(xprtrdma_reply_class,
 	TP_PROTO(
 		const struct rpcrdma_rep *rep
 	),
@@ -68,29 +68,30 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_event,
 	TP_ARGS(rep),
 
 	TP_STRUCT__entry(
-		__field(const void *, rep)
-		__field(const void *, r_xprt)
 		__field(u32, xid)
 		__field(u32, version)
 		__field(u32, proc)
+		__string(addr, rpcrdma_addrstr(rep->rr_rxprt))
+		__string(port, rpcrdma_portstr(rep->rr_rxprt))
 	),
 
 	TP_fast_assign(
-		__entry->rep = rep;
-		__entry->r_xprt = rep->rr_rxprt;
 		__entry->xid = be32_to_cpu(rep->rr_xid);
 		__entry->version = be32_to_cpu(rep->rr_vers);
 		__entry->proc = be32_to_cpu(rep->rr_proc);
+		__assign_str(addr, rpcrdma_addrstr(rep->rr_rxprt));
+		__assign_str(port, rpcrdma_portstr(rep->rr_rxprt));
 	),
 
-	TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u",
-		__entry->r_xprt, __entry->xid, __entry->rep,
-		__entry->version, __entry->proc
+	TP_printk("peer=[%s]:%s xid=0x%08x version=%u proc=%u",
+		__get_str(addr), __get_str(port),
+		__entry->xid, __entry->version, __entry->proc
 	)
 );
 
 #define DEFINE_REPLY_EVENT(name)					\
-		DEFINE_EVENT(xprtrdma_reply_event, name,		\
+		DEFINE_EVENT(xprtrdma_reply_class,			\
+				xprtrdma_reply_##name##_err,		\
 				TP_PROTO(				\
 					const struct rpcrdma_rep *rep	\
 				),					\
@@ -1030,10 +1031,10 @@ TRACE_EVENT(xprtrdma_defer_cmp,
 	)
 );
 
-DEFINE_REPLY_EVENT(xprtrdma_reply_vers);
-DEFINE_REPLY_EVENT(xprtrdma_reply_rqst);
-DEFINE_REPLY_EVENT(xprtrdma_reply_short);
-DEFINE_REPLY_EVENT(xprtrdma_reply_hdr);
+DEFINE_REPLY_EVENT(vers);
+DEFINE_REPLY_EVENT(rqst);
+DEFINE_REPLY_EVENT(short);
+DEFINE_REPLY_EVENT(hdr);
 
 TRACE_EVENT(xprtrdma_err_vers,
 	TP_PROTO(
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c178f93aa40b..29f847c8f609 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 /*
- * Copyright (c) 2014-2017 Oracle.  All rights reserved.
+ * Copyright (c) 2014-2020, Oracle and/or its affiliates.
  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -1369,7 +1369,7 @@ out:
 	return;
 
 out_badheader:
-	trace_xprtrdma_reply_hdr(rep);
+	trace_xprtrdma_reply_hdr_err(rep);
 	r_xprt->rx_stats.bad_reply_count++;
 	rqst->rq_task->tk_status = status;
 	status = 0;
@@ -1462,16 +1462,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	return;
 
 out_badversion:
-	trace_xprtrdma_reply_vers(rep);
+	trace_xprtrdma_reply_vers_err(rep);
 	goto out;
 
 out_norqst:
 	spin_unlock(&xprt->queue_lock);
-	trace_xprtrdma_reply_rqst(rep);
+	trace_xprtrdma_reply_rqst_err(rep);
 	goto out;
 
 out_shortreply:
-	trace_xprtrdma_reply_short(rep);
+	trace_xprtrdma_reply_short_err(rep);
 
 out:
 	rpcrdma_recv_buffer_put(rep);

From 03ffd92494a53dcc4b98c909ae1f6787d1fec646 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:47 -0500
Subject: [PATCH 034/230] xprtrdma: Clean up tracepoints in the reply path

Replace unnecessary display of kernel memory addresses.

Also, there are no longer any trace_xprtrdma_defer_cmp() call sites.
And remove the trace_xprtrdma_leaked_rep() tracepoint because there
doesn't seem to be an overwhelming need to have a tracepoint for
catching a software bug that has long since been fixed.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 66 ++--------------------------------
 net/sunrpc/xprtrdma/rpc_rdma.c |  6 ++--
 2 files changed, 5 insertions(+), 67 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 93d717d8139f..c28bf17e769b 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -974,17 +974,14 @@ TRACE_EVENT(xprtrdma_reply,
 	TP_PROTO(
 		const struct rpc_task *task,
 		const struct rpcrdma_rep *rep,
-		const struct rpcrdma_req *req,
 		unsigned int credits
 	),
 
-	TP_ARGS(task, rep, req, credits),
+	TP_ARGS(task, rep, credits),
 
 	TP_STRUCT__entry(
 		__field(unsigned int, task_id)
 		__field(unsigned int, client_id)
-		__field(const void *, rep)
-		__field(const void *, req)
 		__field(u32, xid)
 		__field(unsigned int, credits)
 	),
@@ -992,42 +989,13 @@ TRACE_EVENT(xprtrdma_reply,
 	TP_fast_assign(
 		__entry->task_id = task->tk_pid;
 		__entry->client_id = task->tk_client->cl_clid;
-		__entry->rep = rep;
-		__entry->req = req;
 		__entry->xid = be32_to_cpu(rep->rr_xid);
 		__entry->credits = credits;
 	),
 
-	TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p",
+	TP_printk("task:%u@%u xid=0x%08x credits=%u",
 		__entry->task_id, __entry->client_id, __entry->xid,
-		__entry->credits, __entry->rep, __entry->req
-	)
-);
-
-TRACE_EVENT(xprtrdma_defer_cmp,
-	TP_PROTO(
-		const struct rpcrdma_rep *rep
-	),
-
-	TP_ARGS(rep),
-
-	TP_STRUCT__entry(
-		__field(unsigned int, task_id)
-		__field(unsigned int, client_id)
-		__field(const void *, rep)
-		__field(u32, xid)
-	),
-
-	TP_fast_assign(
-		__entry->task_id = rep->rr_rqst->rq_task->tk_pid;
-		__entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid;
-		__entry->rep = rep;
-		__entry->xid = be32_to_cpu(rep->rr_xid);
-	),
-
-	TP_printk("task:%u@%u xid=0x%08x rep=%p",
-		__entry->task_id, __entry->client_id, __entry->xid,
-		__entry->rep
+		__entry->credits
 	)
 );
 
@@ -1212,34 +1180,6 @@ TRACE_EVENT(xprtrdma_cb_setup,
 DEFINE_CB_EVENT(xprtrdma_cb_call);
 DEFINE_CB_EVENT(xprtrdma_cb_reply);
 
-TRACE_EVENT(xprtrdma_leaked_rep,
-	TP_PROTO(
-		const struct rpc_rqst *rqst,
-		const struct rpcrdma_rep *rep
-	),
-
-	TP_ARGS(rqst, rep),
-
-	TP_STRUCT__entry(
-		__field(unsigned int, task_id)
-		__field(unsigned int, client_id)
-		__field(u32, xid)
-		__field(const void *, rep)
-	),
-
-	TP_fast_assign(
-		__entry->task_id = rqst->rq_task->tk_pid;
-		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
-		__entry->xid = be32_to_cpu(rqst->rq_xid);
-		__entry->rep = rep;
-	),
-
-	TP_printk("task:%u@%u xid=0x%08x rep=%p",
-		__entry->task_id, __entry->client_id, __entry->xid,
-		__entry->rep
-	)
-);
-
 /**
  ** Server-side RPC/RDMA events
  **/
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 29f847c8f609..8078559bdc31 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -1443,14 +1443,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	rpcrdma_post_recvs(r_xprt, false);
 
 	req = rpcr_to_rdmar(rqst);
-	if (req->rl_reply) {
-		trace_xprtrdma_leaked_rep(rqst, req->rl_reply);
+	if (unlikely(req->rl_reply))
 		rpcrdma_recv_buffer_put(req->rl_reply);
-	}
 	req->rl_reply = rep;
 	rep->rr_rqst = rqst;
 
-	trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
+	trace_xprtrdma_reply(rqst->rq_task, rep, credits);
 
 	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
 		frwr_reminv(rep, &req->rl_registered);

From d11e934606ef6ce37a4bcbe89f34faf37347abb1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:53 -0500
Subject: [PATCH 035/230] xprtrdma: Clean up xprtrdma callback tracepoints

- Replace displayed kernel memory addresses
- Tie the XID and event with the peer's IP address

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h    | 31 ++++++++++++++++---------------
 net/sunrpc/xprtrdma/backchannel.c |  6 +++---
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index c28bf17e769b..6bdbe1165270 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -313,38 +313,39 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
 				), \
 				TP_ARGS(mr))
 
-DECLARE_EVENT_CLASS(xprtrdma_cb_event,
+DECLARE_EVENT_CLASS(xprtrdma_callback_class,
 	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
 		const struct rpc_rqst *rqst
 	),
 
-	TP_ARGS(rqst),
+	TP_ARGS(r_xprt, rqst),
 
 	TP_STRUCT__entry(
-		__field(const void *, rqst)
-		__field(const void *, rep)
-		__field(const void *, req)
 		__field(u32, xid)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
 	),
 
 	TP_fast_assign(
-		__entry->rqst = rqst;
-		__entry->req = rpcr_to_rdmar(rqst);
-		__entry->rep = rpcr_to_rdmar(rqst)->rl_reply;
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
 	),
 
-	TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p",
-		__entry->xid, __entry->rqst, __entry->req, __entry->rep
+	TP_printk("peer=[%s]:%s xid=0x%08x",
+		__get_str(addr), __get_str(port), __entry->xid
 	)
 );
 
-#define DEFINE_CB_EVENT(name)						\
-		DEFINE_EVENT(xprtrdma_cb_event, name,			\
+#define DEFINE_CALLBACK_EVENT(name)					\
+		DEFINE_EVENT(xprtrdma_callback_class,			\
+				xprtrdma_cb_##name,			\
 				TP_PROTO(				\
+					const struct rpcrdma_xprt *r_xprt, \
 					const struct rpc_rqst *rqst	\
 				),					\
-				TP_ARGS(rqst))
+				TP_ARGS(r_xprt, rqst))
 
 /**
  ** Connection events
@@ -1177,8 +1178,8 @@ TRACE_EVENT(xprtrdma_cb_setup,
 	)
 );
 
-DEFINE_CB_EVENT(xprtrdma_cb_call);
-DEFINE_CB_EVENT(xprtrdma_cb_reply);
+DEFINE_CALLBACK_EVENT(call);
+DEFINE_CALLBACK_EVENT(reply);
 
 /**
  ** Server-side RPC/RDMA events
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index c92c1aac270a..946edf2db646 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (c) 2015 Oracle.  All rights reserved.
+ * Copyright (c) 2015-2020, Oracle and/or its affiliates.
  *
  * Support for backward direction RPCs on RPC/RDMA.
  */
@@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 				      &rqst->rq_snd_buf, rpcrdma_noch_pullup))
 		return -EIO;
 
-	trace_xprtrdma_cb_reply(rqst);
+	trace_xprtrdma_cb_reply(r_xprt, rqst);
 	return 0;
 }
 
@@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	 */
 	req = rpcr_to_rdmar(rqst);
 	req->rl_reply = rep;
-	trace_xprtrdma_cb_call(rqst);
+	trace_xprtrdma_cb_call(r_xprt, rqst);
 
 	/* Queue rqst for ULP's callback service */
 	bc_serv = xprt->bc_serv;

From 0307cdec7c3412d7665363ab0cd61fccf82bfb2d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:39:58 -0500
Subject: [PATCH 036/230] xprtrdma: Clean up trace_xprtrdma_nomrs()

- Rename it following the "_err" suffix convention
- Replace display of kernel memory addresses
- Tie MR exhaustion to a peer IP address, similar to the createmrs
   tracepoint

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 19 ++++++++++---------
 net/sunrpc/xprtrdma/rpc_rdma.c |  2 +-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 6bdbe1165270..4fcda2a25bb8 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -545,32 +545,33 @@ TRACE_EVENT(xprtrdma_mr_get,
 	)
 );
 
-TRACE_EVENT(xprtrdma_nomrs,
+TRACE_EVENT(xprtrdma_nomrs_err,
 	TP_PROTO(
+		const struct rpcrdma_xprt *r_xprt,
 		const struct rpcrdma_req *req
 	),
 
-	TP_ARGS(req),
+	TP_ARGS(r_xprt, req),
 
 	TP_STRUCT__entry(
-		__field(const void *, req)
 		__field(unsigned int, task_id)
 		__field(unsigned int, client_id)
-		__field(u32, xid)
+		__string(addr, rpcrdma_addrstr(r_xprt))
+		__string(port, rpcrdma_portstr(r_xprt))
 	),
 
 	TP_fast_assign(
 		const struct rpc_rqst *rqst = &req->rl_slot;
 
-		__entry->req = req;
 		__entry->task_id = rqst->rq_task->tk_pid;
 		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
-		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__assign_str(addr, rpcrdma_addrstr(r_xprt));
+		__assign_str(port, rpcrdma_portstr(r_xprt));
 	),
 
-	TP_printk("task:%u@%u xid=0x%08x req=%p",
-		__entry->task_id, __entry->client_id, __entry->xid,
-		__entry->req
+	TP_printk("peer=[%s]:%s task:%u@%u",
+		__get_str(addr), __get_str(port),
+		__entry->task_id, __entry->client_id
 	)
 );
 
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 8078559bdc31..f27eb2322b38 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -323,7 +323,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
 	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
 
 out_getmr_err:
-	trace_xprtrdma_nomrs(req);
+	trace_xprtrdma_nomrs_err(r_xprt, req);
 	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
 	rpcrdma_mrs_refresh(r_xprt);
 	return ERR_PTR(-EAGAIN);

From 7703db978d4cf7c51426183b7c0d03c039757a44 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:40:03 -0500
Subject: [PATCH 037/230] xprtrdma: Display the task ID when reporting MR
 events

Tie each MR event to the requesting rpc_task to make it easier to
follow MR ownership and control flow.

MR unmapping and recycling can happen in the background, after an
MR's mr_req field is stale, so set up a separate tracepoint class
for those events.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h | 96 ++++++++++++++++++++--------------
 net/sunrpc/xprtrdma/frwr_ops.c |  1 -
 net/sunrpc/xprtrdma/rpc_rdma.c |  1 -
 3 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 4fcda2a25bb8..166bbeef996c 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -274,7 +274,55 @@ TRACE_DEFINE_ENUM(DMA_NONE);
 				{ DMA_FROM_DEVICE, "FROM_DEVICE" },	\
 				{ DMA_NONE, "NONE" })
 
-DECLARE_EVENT_CLASS(xprtrdma_mr,
+DECLARE_EVENT_CLASS(xprtrdma_mr_class,
+	TP_PROTO(
+		const struct rpcrdma_mr *mr
+	),
+
+	TP_ARGS(mr),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, mr_id)
+		__field(int, nents)
+		__field(u32, handle)
+		__field(u32, length)
+		__field(u64, offset)
+		__field(u32, dir)
+	),
+
+	TP_fast_assign(
+		const struct rpcrdma_req *req = mr->mr_req;
+		const struct rpc_task *task = req->rl_slot.rq_task;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+		__entry->mr_id  = mr->frwr.fr_mr->res.id;
+		__entry->nents  = mr->mr_nents;
+		__entry->handle = mr->mr_handle;
+		__entry->length = mr->mr_length;
+		__entry->offset = mr->mr_offset;
+		__entry->dir    = mr->mr_dir;
+	),
+
+	TP_printk("task:%u@%u mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)",
+		__entry->task_id, __entry->client_id,
+		__entry->mr_id, __entry->nents, __entry->length,
+		(unsigned long long)__entry->offset, __entry->handle,
+		xprtrdma_show_direction(__entry->dir)
+	)
+);
+
+#define DEFINE_MR_EVENT(name)						\
+		DEFINE_EVENT(xprtrdma_mr_class,				\
+				xprtrdma_mr_##name,			\
+				TP_PROTO(				\
+					const struct rpcrdma_mr *mr	\
+				),					\
+				TP_ARGS(mr))
+
+DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class,
 	TP_PROTO(
 		const struct rpcrdma_mr *mr
 	),
@@ -306,11 +354,12 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
 	)
 );
 
-#define DEFINE_MR_EVENT(name) \
-		DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \
-				TP_PROTO( \
-					const struct rpcrdma_mr *mr \
-				), \
+#define DEFINE_ANON_MR_EVENT(name)					\
+		DEFINE_EVENT(xprtrdma_anonymous_mr_class,		\
+				xprtrdma_mr_##name,			\
+				TP_PROTO(				\
+					const struct rpcrdma_mr *mr	\
+				),					\
 				TP_ARGS(mr))
 
 DECLARE_EVENT_CLASS(xprtrdma_callback_class,
@@ -516,35 +565,6 @@ TRACE_EVENT(xprtrdma_createmrs,
 	)
 );
 
-TRACE_EVENT(xprtrdma_mr_get,
-	TP_PROTO(
-		const struct rpcrdma_req *req
-	),
-
-	TP_ARGS(req),
-
-	TP_STRUCT__entry(
-		__field(const void *, req)
-		__field(unsigned int, task_id)
-		__field(unsigned int, client_id)
-		__field(u32, xid)
-	),
-
-	TP_fast_assign(
-		const struct rpc_rqst *rqst = &req->rl_slot;
-
-		__entry->req = req;
-		__entry->task_id = rqst->rq_task->tk_pid;
-		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
-		__entry->xid = be32_to_cpu(rqst->rq_xid);
-	),
-
-	TP_printk("task:%u@%u xid=0x%08x req=%p",
-		__entry->task_id, __entry->client_id, __entry->xid,
-		__entry->req
-	)
-);
-
 TRACE_EVENT(xprtrdma_nomrs_err,
 	TP_PROTO(
 		const struct rpcrdma_xprt *r_xprt,
@@ -946,9 +966,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
 
 DEFINE_MR_EVENT(localinv);
 DEFINE_MR_EVENT(map);
-DEFINE_MR_EVENT(unmap);
-DEFINE_MR_EVENT(reminv);
-DEFINE_MR_EVENT(recycle);
+
+DEFINE_ANON_MR_EVENT(unmap);
+DEFINE_ANON_MR_EVENT(recycle);
 
 TRACE_EVENT(xprtrdma_dma_maperr,
 	TP_PROTO(
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 76322b1acf3d..cb2f92409c2f 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -431,7 +431,6 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 	list_for_each_entry(mr, mrs, mr_list)
 		if (mr->mr_handle == rep->rr_inv_rkey) {
 			list_del_init(&mr->mr_list);
-			trace_xprtrdma_mr_reminv(mr);
 			rpcrdma_mr_put(mr);
 			break;	/* only one invalidated MR per RPC */
 		}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index f27eb2322b38..9ed89872ec75 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -315,7 +315,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
 		*mr = rpcrdma_mr_get(r_xprt);
 		if (!*mr)
 			goto out_getmr_err;
-		trace_xprtrdma_mr_get(req);
 		(*mr)->mr_req = req;
 	}
 

From 8e24e191d44f49f08f857f0ebc6fe91961cd1a09 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:40:08 -0500
Subject: [PATCH 038/230] xprtrdma: Trace unmap_sync calls

->buf_free is called nearly once per RPC. Only rarely does
xprt_rdma_free() have to do anything, thus tracing every one of
these calls seems unnecessary. Instead, just throw a trace event
when that one occasional RPC still has MRs that need to be
released.

xprt_rdma_free() is further micro-optimized to reduce the amount of
work done in the common case.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/trace/events/rpcrdma.h  | 22 ++++++++++++++++++++++
 net/sunrpc/xprtrdma/transport.c |  7 ++++---
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 166bbeef996c..69e1caf7e882 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -1167,6 +1167,28 @@ TRACE_EVENT(xprtrdma_decode_seg,
 	)
 );
 
+TRACE_EVENT(xprtrdma_mrs_zap,
+	TP_PROTO(
+		const struct rpc_task *task
+	),
+
+	TP_ARGS(task),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+	),
+
+	TP_fast_assign(
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client->cl_clid;
+	),
+
+	TP_printk("task:%u@%u",
+		__entry->task_id, __entry->client_id
+	)
+);
+
 /**
  ** Callback events
  **/
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8915e42240d3..bb3ed3db6c0a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -599,11 +599,12 @@ static void
 xprt_rdma_free(struct rpc_task *task)
 {
 	struct rpc_rqst *rqst = task->tk_rqstp;
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 
-	if (!list_empty(&req->rl_registered))
-		frwr_unmap_sync(r_xprt, req);
+	if (unlikely(!list_empty(&req->rl_registered))) {
+		trace_xprtrdma_mrs_zap(task);
+		frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req);
+	}
 
 	/* XXX: If the RPC is completing because of a signal and
 	 * not because a reply was received, we ought to ensure

From ef2be5918ff57bea9cd1e63968c73b1358a765ca Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:40:14 -0500
Subject: [PATCH 039/230] xprtrdma: Move rpcrdma_mr_put()

Clean up: This function is now invoked only in frwr_ops.c. The move
enables deduplication of the trace_xprtrdma_mr_unmap() call site.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  | 41 ++++++++++++++++++++++-----------
 net/sunrpc/xprtrdma/verbs.c     | 19 ---------------
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 -
 3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index cb2f92409c2f..e93b3457b958 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
 	kfree(mr);
 }
 
-static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 {
-	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
-	trace_xprtrdma_mr_recycle(mr);
-
 	if (mr->mr_dir != DMA_NONE) {
 		trace_xprtrdma_mr_unmap(mr);
 		ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
 				mr->mr_sg, mr->mr_nents, mr->mr_dir);
 		mr->mr_dir = DMA_NONE;
 	}
+}
+
+static void frwr_mr_recycle(struct rpcrdma_mr *mr)
+{
+	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
+
+	trace_xprtrdma_mr_recycle(mr);
+
+	frwr_mr_unmap(r_xprt, mr);
 
 	spin_lock(&r_xprt->rx_buf.rb_lock);
 	list_del(&mr->mr_all);
@@ -86,6 +91,16 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
 	frwr_release_mr(mr);
 }
 
+static void frwr_mr_put(struct rpcrdma_mr *mr)
+{
+	frwr_mr_unmap(mr->mr_xprt, mr);
+
+	/* The MR is returned to the req's MR free list instead
+	 * of to the xprt's MR free list. No spinlock is needed.
+	 */
+	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+}
+
 /* frwr_reset - Place MRs back on the free list
  * @req: request to reset
  *
@@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req)
 	struct rpcrdma_mr *mr;
 
 	while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
-		rpcrdma_mr_put(mr);
+		frwr_mr_put(mr);
 }
 
 /**
@@ -431,17 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
 	list_for_each_entry(mr, mrs, mr_list)
 		if (mr->mr_handle == rep->rr_inv_rkey) {
 			list_del_init(&mr->mr_list);
-			rpcrdma_mr_put(mr);
+			frwr_mr_put(mr);
 			break;	/* only one invalidated MR per RPC */
 		}
 }
 
-static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
 {
 	if (wc->status != IB_WC_SUCCESS)
 		frwr_mr_recycle(mr);
 	else
-		rpcrdma_mr_put(mr);
+		frwr_mr_put(mr);
 }
 
 /**
@@ -459,7 +474,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_li(wc, &frwr->fr_cid);
-	__frwr_release_mr(wc, mr);
+	frwr_mr_done(wc, mr);
 
 	rpcrdma_flush_disconnect(cq->cq_context, wc);
 }
@@ -480,7 +495,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid);
-	__frwr_release_mr(wc, mr);
+	frwr_mr_done(wc, mr);
 	complete(&frwr->fr_linv_done);
 
 	rpcrdma_flush_disconnect(cq->cq_context, wc);
@@ -587,9 +602,9 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
 
 	/* WARNING: Only wr_cqe and status are reliable at this point */
 	trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid);
-	__frwr_release_mr(wc, mr);
+	frwr_mr_done(wc, mr);
 
-	/* Ensure @rep is generated before __frwr_release_mr */
+	/* Ensure @rep is generated before frwr_mr_done */
 	smp_rmb();
 	rpcrdma_complete_rqst(rep);
 
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 63837b5d14e5..ec912cf9c618 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1184,25 +1184,6 @@ rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt)
 	return mr;
 }
 
-/**
- * rpcrdma_mr_put - DMA unmap an MR and release it
- * @mr: MR to release
- *
- */
-void rpcrdma_mr_put(struct rpcrdma_mr *mr)
-{
-	struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
-
-	if (mr->mr_dir != DMA_NONE) {
-		trace_xprtrdma_mr_unmap(mr);
-		ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
-				mr->mr_sg, mr->mr_nents, mr->mr_dir);
-		mr->mr_dir = DMA_NONE;
-	}
-
-	rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
-}
-
 /**
  * rpcrdma_buffer_get - Get a request buffer
  * @buffers: Buffer pool from which to obtain a buffer
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index cef9d0f2e2c8..6a45bf241ec0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -473,7 +473,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt);
 
 struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_mr_put(struct rpcrdma_mr *mr);
 void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt);
 
 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);

From 7a03aeb66c410366acc5439ae2a341f110c4f845 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 9 Nov 2020 14:40:19 -0500
Subject: [PATCH 040/230] xprtrdma: Micro-optimize MR DMA-unmapping

Now that rpcrdma_ep is no longer part of rpcrdma_xprt, there are
four or five serial address dereferences needed to get to the
IB device needed for DMA unmapping.

Instead, let's use the same pattern that regbufs use: cache a
pointer to the device in the MR, and use that as the indication
that unmapping is necessary.

This also guarantees that the exact same device is used for DMA
mapping and unmapping, even if the r_xprt's ep has been replaced. I
don't think this can happen today, but future changes might break
this assumption.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/xprtrdma/frwr_ops.c  | 12 ++++++------
 net/sunrpc/xprtrdma/xprt_rdma.h |  1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index e93b3457b958..baca49fe83af 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -67,11 +67,11 @@ void frwr_release_mr(struct rpcrdma_mr *mr)
 
 static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 {
-	if (mr->mr_dir != DMA_NONE) {
+	if (mr->mr_device) {
 		trace_xprtrdma_mr_unmap(mr);
-		ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
-				mr->mr_sg, mr->mr_nents, mr->mr_dir);
-		mr->mr_dir = DMA_NONE;
+		ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents,
+				mr->mr_dir);
+		mr->mr_device = NULL;
 	}
 }
 
@@ -145,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
 
 	mr->mr_xprt = r_xprt;
 	mr->frwr.fr_mr = frmr;
-	mr->mr_dir = DMA_NONE;
+	mr->mr_device = NULL;
 	INIT_LIST_HEAD(&mr->mr_list);
 	init_completion(&mr->frwr.fr_linv_done);
 
@@ -330,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 				  mr->mr_dir);
 	if (!dma_nents)
 		goto out_dmamap_err;
+	mr->mr_device = ep->re_id->device;
 
 	ibmr = mr->frwr.fr_mr;
 	n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE);
@@ -356,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
 	return seg;
 
 out_dmamap_err:
-	mr->mr_dir = DMA_NONE;
 	trace_xprtrdma_frwr_sgerr(mr, i);
 	return ERR_PTR(-EIO);
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 6a45bf241ec0..94b28657aeeb 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -243,6 +243,7 @@ struct rpcrdma_req;
 struct rpcrdma_mr {
 	struct list_head	mr_list;
 	struct rpcrdma_req	*mr_req;
+	struct ib_device	*mr_device;
 	struct scatterlist	*mr_sg;
 	int			mr_nents;
 	enum dma_data_direction	mr_dir;

From 5b7b41cbf2f9b473ccd13f69337d7c26f4d138c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?=
 <zhouyanjie@wanyeetech.com>
Date: Sat, 7 Nov 2020 20:20:15 +0800
Subject: [PATCH 041/230] dt-bindings: dmaengine: Add JZ4775 bindings.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the dmaengine bindings for the JZ4775 SoC from Ingenic.

Signed-off-by: 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201107122016.89859-2-zhouyanjie@wanyeetech.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/dt-bindings/dma/jz4775-dma.h | 44 ++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 include/dt-bindings/dma/jz4775-dma.h

diff --git a/include/dt-bindings/dma/jz4775-dma.h b/include/dt-bindings/dma/jz4775-dma.h
new file mode 100644
index 000000000000..8d27e2c69dca
--- /dev/null
+++ b/include/dt-bindings/dma/jz4775-dma.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * This header provides macros for JZ4775 DMA bindings.
+ *
+ * Copyright (c) 2020 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
+ */
+
+#ifndef __DT_BINDINGS_DMA_JZ4775_DMA_H__
+#define __DT_BINDINGS_DMA_JZ4775_DMA_H__
+
+/*
+ * Request type numbers for the JZ4775 DMA controller (written to the DRTn
+ * register for the channel).
+ */
+#define JZ4775_DMA_I2S0_TX	0x6
+#define JZ4775_DMA_I2S0_RX	0x7
+#define JZ4775_DMA_AUTO		0x8
+#define JZ4775_DMA_SADC_RX	0x9
+#define JZ4775_DMA_UART3_TX	0x0e
+#define JZ4775_DMA_UART3_RX	0x0f
+#define JZ4775_DMA_UART2_TX	0x10
+#define JZ4775_DMA_UART2_RX	0x11
+#define JZ4775_DMA_UART1_TX	0x12
+#define JZ4775_DMA_UART1_RX	0x13
+#define JZ4775_DMA_UART0_TX	0x14
+#define JZ4775_DMA_UART0_RX	0x15
+#define JZ4775_DMA_SSI0_TX	0x16
+#define JZ4775_DMA_SSI0_RX	0x17
+#define JZ4775_DMA_MSC0_TX	0x1a
+#define JZ4775_DMA_MSC0_RX	0x1b
+#define JZ4775_DMA_MSC1_TX	0x1c
+#define JZ4775_DMA_MSC1_RX	0x1d
+#define JZ4775_DMA_MSC2_TX	0x1e
+#define JZ4775_DMA_MSC2_RX	0x1f
+#define JZ4775_DMA_PCM0_TX	0x20
+#define JZ4775_DMA_PCM0_RX	0x21
+#define JZ4775_DMA_SMB0_TX	0x24
+#define JZ4775_DMA_SMB0_RX	0x25
+#define JZ4775_DMA_SMB1_TX	0x26
+#define JZ4775_DMA_SMB1_RX	0x27
+#define JZ4775_DMA_SMB2_TX	0x28
+#define JZ4775_DMA_SMB2_RX	0x29
+
+#endif /* __DT_BINDINGS_DMA_JZ4775_DMA_H__ */

From 46d613fd8da84d41fb2adb2ccb97230979d38af8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E7=90=B0=E6=9D=B0=20=28Zhou=20Yanjie=29?=
 <zhouyanjie@wanyeetech.com>
Date: Sat, 7 Nov 2020 20:20:16 +0800
Subject: [PATCH 042/230] dt-bindings: dmaengine: Add X2000 bindings.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the dmaengine bindings for the X2000 SoC from Ingenic.

Signed-off-by: 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201107122016.89859-3-zhouyanjie@wanyeetech.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/dt-bindings/dma/x2000-dma.h | 54 +++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 include/dt-bindings/dma/x2000-dma.h

diff --git a/include/dt-bindings/dma/x2000-dma.h b/include/dt-bindings/dma/x2000-dma.h
new file mode 100644
index 000000000000..db2cd4830b00
--- /dev/null
+++ b/include/dt-bindings/dma/x2000-dma.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * This header provides macros for X2000 DMA bindings.
+ *
+ * Copyright (c) 2020 周琰杰 (Zhou Yanjie) <zhouyanjie@wanyeetech.com>
+ */
+
+#ifndef __DT_BINDINGS_DMA_X2000_DMA_H__
+#define __DT_BINDINGS_DMA_X2000_DMA_H__
+
+/*
+ * Request type numbers for the X2000 DMA controller (written to the DRTn
+ * register for the channel).
+ */
+#define X2000_DMA_AUTO		0x8
+#define X2000_DMA_UART5_TX	0xa
+#define X2000_DMA_UART5_RX	0xb
+#define X2000_DMA_UART4_TX	0xc
+#define X2000_DMA_UART4_RX	0xd
+#define X2000_DMA_UART3_TX	0xe
+#define X2000_DMA_UART3_RX	0xf
+#define X2000_DMA_UART2_TX	0x10
+#define X2000_DMA_UART2_RX	0x11
+#define X2000_DMA_UART1_TX	0x12
+#define X2000_DMA_UART1_RX	0x13
+#define X2000_DMA_UART0_TX	0x14
+#define X2000_DMA_UART0_RX	0x15
+#define X2000_DMA_SSI0_TX	0x16
+#define X2000_DMA_SSI0_RX	0x17
+#define X2000_DMA_SSI1_TX	0x18
+#define X2000_DMA_SSI1_RX	0x19
+#define X2000_DMA_I2C0_TX	0x24
+#define X2000_DMA_I2C0_RX	0x25
+#define X2000_DMA_I2C1_TX	0x26
+#define X2000_DMA_I2C1_RX	0x27
+#define X2000_DMA_I2C2_TX	0x28
+#define X2000_DMA_I2C2_RX	0x29
+#define X2000_DMA_I2C3_TX	0x2a
+#define X2000_DMA_I2C3_RX	0x2b
+#define X2000_DMA_I2C4_TX	0x2c
+#define X2000_DMA_I2C4_RX	0x2d
+#define X2000_DMA_I2C5_TX	0x2e
+#define X2000_DMA_I2C5_RX	0x2f
+#define X2000_DMA_UART6_TX	0x30
+#define X2000_DMA_UART6_RX	0x31
+#define X2000_DMA_UART7_TX	0x32
+#define X2000_DMA_UART7_RX	0x33
+#define X2000_DMA_UART8_TX	0x34
+#define X2000_DMA_UART8_RX	0x35
+#define X2000_DMA_UART9_TX	0x36
+#define X2000_DMA_UART9_RX	0x37
+#define X2000_DMA_SADC_RX	0x38
+
+#endif /* __DT_BINDINGS_DMA_X2000_DMA_H__ */

From 613ff7e19c5877d62118cb6612d4e336272620e7 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank@allwinnertech.com>
Date: Tue, 10 Nov 2020 14:26:38 +0800
Subject: [PATCH 043/230] dt-bindings: dma: allwinner,sun50i-a64-dma: Add A100
 compatible

Add a binding for A100's dma controller.

Signed-off-by: Yangtao Li <frank@allwinnertech.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/f15a18e9b8868e8853db1b5a3d1e411b0ac1c63a.1604988979.git.frank@allwinnertech.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml    | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
index 372679dbd216..b6e1ebfaf366 100644
--- a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
+++ b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml
@@ -21,6 +21,7 @@ properties:
   compatible:
     oneOf:
       - const: allwinner,sun50i-a64-dma
+      - const: allwinner,sun50i-a100-dma
       - const: allwinner,sun50i-h6-dma
       - items:
           - const: allwinner,sun8i-r40-dma
@@ -56,7 +57,9 @@ required:
 if:
   properties:
     compatible:
-      const: allwinner,sun50i-h6-dma
+      enum:
+        - allwinner,sun50i-a100-dma
+        - allwinner,sun50i-h6-dma
 
 then:
   properties:

From 07b552732edd1b09aecc0f57d479e5eccf11c295 Mon Sep 17 00:00:00 2001
From: Yangtao Li <frank@allwinnertech.com>
Date: Tue, 10 Nov 2020 14:28:02 +0800
Subject: [PATCH 044/230] dmaengine: sun6i: Add support for A100 DMA

The dma of a100 is similar to h6, with some minor changes to
support greater addressing capabilities.

Add support for it.

Signed-off-by: Yangtao Li <frank@allwinnertech.com>
Link: https://lore.kernel.org/r/719852c6a9a597bd2e82d01a268ca02b9dee826c.1604988979.git.frank@allwinnertech.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/sun6i-dma.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c
index f5f9c86c50bc..5cadd4d2b824 100644
--- a/drivers/dma/sun6i-dma.c
+++ b/drivers/dma/sun6i-dma.c
@@ -1173,6 +1173,30 @@ static struct sun6i_dma_config sun50i_a64_dma_cfg = {
 			     BIT(DMA_SLAVE_BUSWIDTH_8_BYTES),
 };
 
+/*
+ * TODO: Add support for more than 4g physical addressing.
+ *
+ * The A100 binding uses the number of dma channels from the
+ * device tree node.
+ */
+static struct sun6i_dma_config sun50i_a100_dma_cfg = {
+	.clock_autogate_enable = sun6i_enable_clock_autogate_h3,
+	.set_burst_length = sun6i_set_burst_length_h3,
+	.set_drq          = sun6i_set_drq_h6,
+	.set_mode         = sun6i_set_mode_h6,
+	.src_burst_lengths = BIT(1) | BIT(4) | BIT(8) | BIT(16),
+	.dst_burst_lengths = BIT(1) | BIT(4) | BIT(8) | BIT(16),
+	.src_addr_widths   = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) |
+			     BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) |
+			     BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
+			     BIT(DMA_SLAVE_BUSWIDTH_8_BYTES),
+	.dst_addr_widths   = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) |
+			     BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) |
+			     BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) |
+			     BIT(DMA_SLAVE_BUSWIDTH_8_BYTES),
+	.has_mbus_clk = true,
+};
+
 /*
  * The H6 binding uses the number of dma channels from the
  * device tree node.
@@ -1225,6 +1249,7 @@ static const struct of_device_id sun6i_dma_match[] = {
 	{ .compatible = "allwinner,sun8i-h3-dma", .data = &sun8i_h3_dma_cfg },
 	{ .compatible = "allwinner,sun8i-v3s-dma", .data = &sun8i_v3s_dma_cfg },
 	{ .compatible = "allwinner,sun50i-a64-dma", .data = &sun50i_a64_dma_cfg },
+	{ .compatible = "allwinner,sun50i-a100-dma", .data = &sun50i_a100_dma_cfg },
 	{ .compatible = "allwinner,sun50i-h6-dma", .data = &sun50i_h6_dma_cfg },
 	{ /* sentinel */ }
 };

From f74faa0ca3d56df7d135602bca80f6e39be9f4ad Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 16 Nov 2020 17:24:03 -0300
Subject: [PATCH 045/230] dmaengine: imx-sdma: Remove unused .id_table support

Since 5.10-rc1 i.MX is a devicetree-only platform and the existing
.id_table support in this driver was only useful for old non-devicetree
platforms.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20201116202403.29749-1-festevam@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 16b908c77db3..41ba21eea7c8 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -566,37 +566,6 @@ static struct sdma_driver_data sdma_imx8mq = {
 	.check_ratio = 1,
 };
 
-static const struct platform_device_id sdma_devtypes[] = {
-	{
-		.name = "imx25-sdma",
-		.driver_data = (unsigned long)&sdma_imx25,
-	}, {
-		.name = "imx31-sdma",
-		.driver_data = (unsigned long)&sdma_imx31,
-	}, {
-		.name = "imx35-sdma",
-		.driver_data = (unsigned long)&sdma_imx35,
-	}, {
-		.name = "imx51-sdma",
-		.driver_data = (unsigned long)&sdma_imx51,
-	}, {
-		.name = "imx53-sdma",
-		.driver_data = (unsigned long)&sdma_imx53,
-	}, {
-		.name = "imx6q-sdma",
-		.driver_data = (unsigned long)&sdma_imx6q,
-	}, {
-		.name = "imx7d-sdma",
-		.driver_data = (unsigned long)&sdma_imx7d,
-	}, {
-		.name = "imx8mq-sdma",
-		.driver_data = (unsigned long)&sdma_imx8mq,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(platform, sdma_devtypes);
-
 static const struct of_device_id sdma_dt_ids[] = {
 	{ .compatible = "fsl,imx6q-sdma", .data = &sdma_imx6q, },
 	{ .compatible = "fsl,imx53-sdma", .data = &sdma_imx53, },
@@ -1998,11 +1967,7 @@ static int sdma_probe(struct platform_device *pdev)
 	s32 *saddr_arr;
 	const struct sdma_driver_data *drvdata = NULL;
 
-	if (of_id)
-		drvdata = of_id->data;
-	else if (pdev->id_entry)
-		drvdata = (void *)pdev->id_entry->driver_data;
-
+	drvdata = of_id->data;
 	if (!drvdata) {
 		dev_err(&pdev->dev, "unable to find driver data\n");
 		return -EINVAL;
@@ -2211,7 +2176,6 @@ static struct platform_driver sdma_driver = {
 		.name	= "imx-sdma",
 		.of_match_table = sdma_dt_ids,
 	},
-	.id_table	= sdma_devtypes,
 	.remove		= sdma_remove,
 	.probe		= sdma_probe,
 };

From 5c9f8c2dbdbe53818bcde6aa6695e1331e5f841f Mon Sep 17 00:00:00 2001
From: Jonathan McDowell <noodles@earth.li>
Date: Sat, 14 Nov 2020 14:02:33 +0000
Subject: [PATCH 046/230] dmaengine: qcom: Add ADM driver

Add the DMA engine driver for the QCOM Application Data Mover (ADM) DMA
controller found in the MSM8x60 and IPQ/APQ8064 platforms.

The ADM supports both memory to memory transactions and memory
to/from peripheral device transactions.  The controller also provides
flow control capabilities for transactions to/from peripheral devices.

The initial release of this driver supports slave transfers to/from
peripherals and also incorporates CRCI (client rate control interface)
flow control.

The hardware only supports a 32 bit physical address, so specifying
!PHYS_ADDR_T_64BIT gives maximum COMPILE_TEST coverage without having to
spend effort on kludging things in the code that will never actually be
needed on real hardware.

Signed-off-by: Andy Gross <agross@codeaurora.org>
Signed-off-by: Thomas Pedersen <twp@codeaurora.org>
Signed-off-by: Jonathan McDowell <noodles@earth.li>
Link: https://lore.kernel.org/r/20201114140233.GM32650@earth.li
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/qcom/Kconfig    |  11 +
 drivers/dma/qcom/Makefile   |   1 +
 drivers/dma/qcom/qcom_adm.c | 903 ++++++++++++++++++++++++++++++++++++
 3 files changed, 915 insertions(+)
 create mode 100644 drivers/dma/qcom/qcom_adm.c

diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig
index 3bcb689162c6..0389d60d2604 100644
--- a/drivers/dma/qcom/Kconfig
+++ b/drivers/dma/qcom/Kconfig
@@ -1,4 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0-only
+config QCOM_ADM
+	tristate "Qualcomm ADM support"
+	depends on (ARCH_QCOM || COMPILE_TEST) && !PHYS_ADDR_T_64BIT
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Enable support for the Qualcomm Application Data Mover (ADM) DMA
+	  controller, as present on MSM8x60, APQ8064, and IPQ8064 devices.
+	  This controller provides DMA capabilities for both general purpose
+	  and on-chip peripheral devices.
+
 config QCOM_BAM_DMA
 	tristate "QCOM BAM DMA support"
 	depends on ARCH_QCOM || (COMPILE_TEST && OF && ARM)
diff --git a/drivers/dma/qcom/Makefile b/drivers/dma/qcom/Makefile
index 1ae92da88b0c..346e643fbb6d 100644
--- a/drivers/dma/qcom/Makefile
+++ b/drivers/dma/qcom/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QCOM_ADM) += qcom_adm.o
 obj-$(CONFIG_QCOM_BAM_DMA) += bam_dma.o
 obj-$(CONFIG_QCOM_HIDMA_MGMT) += hdma_mgmt.o
 hdma_mgmt-objs	 := hidma_mgmt.o hidma_mgmt_sys.o
diff --git a/drivers/dma/qcom/qcom_adm.c b/drivers/dma/qcom/qcom_adm.c
new file mode 100644
index 000000000000..9b6f8e050ecc
--- /dev/null
+++ b/drivers/dma/qcom/qcom_adm.c
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/reset.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+#include "../dmaengine.h"
+#include "../virt-dma.h"
+
+/* ADM registers - calculated from channel number and security domain */
+#define ADM_CHAN_MULTI			0x4
+#define ADM_CI_MULTI			0x4
+#define ADM_CRCI_MULTI			0x4
+#define ADM_EE_MULTI			0x800
+#define ADM_CHAN_OFFS(chan)		(ADM_CHAN_MULTI * (chan))
+#define ADM_EE_OFFS(ee)			(ADM_EE_MULTI * (ee))
+#define ADM_CHAN_EE_OFFS(chan, ee)	(ADM_CHAN_OFFS(chan) + ADM_EE_OFFS(ee))
+#define ADM_CHAN_OFFS(chan)		(ADM_CHAN_MULTI * (chan))
+#define ADM_CI_OFFS(ci)			(ADM_CHAN_OFF(ci))
+#define ADM_CH_CMD_PTR(chan, ee)	(ADM_CHAN_EE_OFFS(chan, ee))
+#define ADM_CH_RSLT(chan, ee)		(0x40 + ADM_CHAN_EE_OFFS(chan, ee))
+#define ADM_CH_FLUSH_STATE0(chan, ee)	(0x80 + ADM_CHAN_EE_OFFS(chan, ee))
+#define ADM_CH_STATUS_SD(chan, ee)	(0x200 + ADM_CHAN_EE_OFFS(chan, ee))
+#define ADM_CH_CONF(chan)		(0x240 + ADM_CHAN_OFFS(chan))
+#define ADM_CH_RSLT_CONF(chan, ee)	(0x300 + ADM_CHAN_EE_OFFS(chan, ee))
+#define ADM_SEC_DOMAIN_IRQ_STATUS(ee)	(0x380 + ADM_EE_OFFS(ee))
+#define ADM_CI_CONF(ci)			(0x390 + (ci) * ADM_CI_MULTI)
+#define ADM_GP_CTL			0x3d8
+#define ADM_CRCI_CTL(crci, ee)		(0x400 + (crci) * ADM_CRCI_MULTI + \
+						ADM_EE_OFFS(ee))
+
+/* channel status */
+#define ADM_CH_STATUS_VALID		BIT(1)
+
+/* channel result */
+#define ADM_CH_RSLT_VALID		BIT(31)
+#define ADM_CH_RSLT_ERR			BIT(3)
+#define ADM_CH_RSLT_FLUSH		BIT(2)
+#define ADM_CH_RSLT_TPD			BIT(1)
+
+/* channel conf */
+#define ADM_CH_CONF_SHADOW_EN		BIT(12)
+#define ADM_CH_CONF_MPU_DISABLE		BIT(11)
+#define ADM_CH_CONF_PERM_MPU_CONF	BIT(9)
+#define ADM_CH_CONF_FORCE_RSLT_EN	BIT(7)
+#define ADM_CH_CONF_SEC_DOMAIN(ee)	((((ee) & 0x3) << 4) | (((ee) & 0x4) << 11))
+
+/* channel result conf */
+#define ADM_CH_RSLT_CONF_FLUSH_EN	BIT(1)
+#define ADM_CH_RSLT_CONF_IRQ_EN		BIT(0)
+
+/* CRCI CTL */
+#define ADM_CRCI_CTL_MUX_SEL		BIT(18)
+#define ADM_CRCI_CTL_RST		BIT(17)
+
+/* CI configuration */
+#define ADM_CI_RANGE_END(x)		((x) << 24)
+#define ADM_CI_RANGE_START(x)		((x) << 16)
+#define ADM_CI_BURST_4_WORDS		BIT(2)
+#define ADM_CI_BURST_8_WORDS		BIT(3)
+
+/* GP CTL */
+#define ADM_GP_CTL_LP_EN		BIT(12)
+#define ADM_GP_CTL_LP_CNT(x)		((x) << 8)
+
+/* Command pointer list entry */
+#define ADM_CPLE_LP			BIT(31)
+#define ADM_CPLE_CMD_PTR_LIST		BIT(29)
+
+/* Command list entry */
+#define ADM_CMD_LC			BIT(31)
+#define ADM_CMD_DST_CRCI(n)		(((n) & 0xf) << 7)
+#define ADM_CMD_SRC_CRCI(n)		(((n) & 0xf) << 3)
+
+#define ADM_CMD_TYPE_SINGLE		0x0
+#define ADM_CMD_TYPE_BOX		0x3
+
+#define ADM_CRCI_MUX_SEL		BIT(4)
+#define ADM_DESC_ALIGN			8
+#define ADM_MAX_XFER			(SZ_64K - 1)
+#define ADM_MAX_ROWS			(SZ_64K - 1)
+#define ADM_MAX_CHANNELS		16
+
+struct adm_desc_hw_box {
+	u32 cmd;
+	u32 src_addr;
+	u32 dst_addr;
+	u32 row_len;
+	u32 num_rows;
+	u32 row_offset;
+};
+
+struct adm_desc_hw_single {
+	u32 cmd;
+	u32 src_addr;
+	u32 dst_addr;
+	u32 len;
+};
+
+struct adm_async_desc {
+	struct virt_dma_desc vd;
+	struct adm_device *adev;
+
+	size_t length;
+	enum dma_transfer_direction dir;
+	dma_addr_t dma_addr;
+	size_t dma_len;
+
+	void *cpl;
+	dma_addr_t cp_addr;
+	u32 crci;
+	u32 mux;
+	u32 blk_size;
+};
+
+struct adm_chan {
+	struct virt_dma_chan vc;
+	struct adm_device *adev;
+
+	/* parsed from DT */
+	u32 id;			/* channel id */
+
+	struct adm_async_desc *curr_txd;
+	struct dma_slave_config slave;
+	struct list_head node;
+
+	int error;
+	int initialized;
+};
+
+static inline struct adm_chan *to_adm_chan(struct dma_chan *common)
+{
+	return container_of(common, struct adm_chan, vc.chan);
+}
+
+struct adm_device {
+	void __iomem *regs;
+	struct device *dev;
+	struct dma_device common;
+	struct device_dma_parameters dma_parms;
+	struct adm_chan *channels;
+
+	u32 ee;
+
+	struct clk *core_clk;
+	struct clk *iface_clk;
+
+	struct reset_control *clk_reset;
+	struct reset_control *c0_reset;
+	struct reset_control *c1_reset;
+	struct reset_control *c2_reset;
+	int irq;
+};
+
+/**
+ * adm_free_chan - Frees dma resources associated with the specific channel
+ *
+ * Free all allocated descriptors associated with this channel
+ *
+ */
+static void adm_free_chan(struct dma_chan *chan)
+{
+	/* free all queued descriptors */
+	vchan_free_chan_resources(to_virt_chan(chan));
+}
+
+/**
+ * adm_get_blksize - Get block size from burst value
+ *
+ */
+static int adm_get_blksize(unsigned int burst)
+{
+	int ret;
+
+	switch (burst) {
+	case 16:
+	case 32:
+	case 64:
+	case 128:
+		ret = ffs(burst >> 4) - 1;
+		break;
+	case 192:
+		ret = 4;
+		break;
+	case 256:
+		ret = 5;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+/**
+ * adm_process_fc_descriptors - Process descriptors for flow controlled xfers
+ *
+ * @achan: ADM channel
+ * @desc: Descriptor memory pointer
+ * @sg: Scatterlist entry
+ * @crci: CRCI value
+ * @burst: Burst size of transaction
+ * @direction: DMA transfer direction
+ */
+static void *adm_process_fc_descriptors(struct adm_chan *achan, void *desc,
+					struct scatterlist *sg, u32 crci,
+					u32 burst,
+					enum dma_transfer_direction direction)
+{
+	struct adm_desc_hw_box *box_desc = NULL;
+	struct adm_desc_hw_single *single_desc;
+	u32 remainder = sg_dma_len(sg);
+	u32 rows, row_offset, crci_cmd;
+	u32 mem_addr = sg_dma_address(sg);
+	u32 *incr_addr = &mem_addr;
+	u32 *src, *dst;
+
+	if (direction == DMA_DEV_TO_MEM) {
+		crci_cmd = ADM_CMD_SRC_CRCI(crci);
+		row_offset = burst;
+		src = &achan->slave.src_addr;
+		dst = &mem_addr;
+	} else {
+		crci_cmd = ADM_CMD_DST_CRCI(crci);
+		row_offset = burst << 16;
+		src = &mem_addr;
+		dst = &achan->slave.dst_addr;
+	}
+
+	while (remainder >= burst) {
+		box_desc = desc;
+		box_desc->cmd = ADM_CMD_TYPE_BOX | crci_cmd;
+		box_desc->row_offset = row_offset;
+		box_desc->src_addr = *src;
+		box_desc->dst_addr = *dst;
+
+		rows = remainder / burst;
+		rows = min_t(u32, rows, ADM_MAX_ROWS);
+		box_desc->num_rows = rows << 16 | rows;
+		box_desc->row_len = burst << 16 | burst;
+
+		*incr_addr += burst * rows;
+		remainder -= burst * rows;
+		desc += sizeof(*box_desc);
+	}
+
+	/* if leftover bytes, do one single descriptor */
+	if (remainder) {
+		single_desc = desc;
+		single_desc->cmd = ADM_CMD_TYPE_SINGLE | crci_cmd;
+		single_desc->len = remainder;
+		single_desc->src_addr = *src;
+		single_desc->dst_addr = *dst;
+		desc += sizeof(*single_desc);
+
+		if (sg_is_last(sg))
+			single_desc->cmd |= ADM_CMD_LC;
+	} else {
+		if (box_desc && sg_is_last(sg))
+			box_desc->cmd |= ADM_CMD_LC;
+	}
+
+	return desc;
+}
+
+/**
+ * adm_process_non_fc_descriptors - Process descriptors for non-fc xfers
+ *
+ * @achan: ADM channel
+ * @desc: Descriptor memory pointer
+ * @sg: Scatterlist entry
+ * @direction: DMA transfer direction
+ */
+static void *adm_process_non_fc_descriptors(struct adm_chan *achan, void *desc,
+					    struct scatterlist *sg,
+					    enum dma_transfer_direction direction)
+{
+	struct adm_desc_hw_single *single_desc;
+	u32 remainder = sg_dma_len(sg);
+	u32 mem_addr = sg_dma_address(sg);
+	u32 *incr_addr = &mem_addr;
+	u32 *src, *dst;
+
+	if (direction == DMA_DEV_TO_MEM) {
+		src = &achan->slave.src_addr;
+		dst = &mem_addr;
+	} else {
+		src = &mem_addr;
+		dst = &achan->slave.dst_addr;
+	}
+
+	do {
+		single_desc = desc;
+		single_desc->cmd = ADM_CMD_TYPE_SINGLE;
+		single_desc->src_addr = *src;
+		single_desc->dst_addr = *dst;
+		single_desc->len = (remainder > ADM_MAX_XFER) ?
+				ADM_MAX_XFER : remainder;
+
+		remainder -= single_desc->len;
+		*incr_addr += single_desc->len;
+		desc += sizeof(*single_desc);
+	} while (remainder);
+
+	/* set last command if this is the end of the whole transaction */
+	if (sg_is_last(sg))
+		single_desc->cmd |= ADM_CMD_LC;
+
+	return desc;
+}
+
+/**
+ * adm_prep_slave_sg - Prep slave sg transaction
+ *
+ * @chan: dma channel
+ * @sgl: scatter gather list
+ * @sg_len: length of sg
+ * @direction: DMA transfer direction
+ * @flags: DMA flags
+ * @context: transfer context (unused)
+ */
+static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan,
+							 struct scatterlist *sgl,
+							 unsigned int sg_len,
+							 enum dma_transfer_direction direction,
+							 unsigned long flags,
+							 void *context)
+{
+	struct adm_chan *achan = to_adm_chan(chan);
+	struct adm_device *adev = achan->adev;
+	struct adm_async_desc *async_desc;
+	struct scatterlist *sg;
+	dma_addr_t cple_addr;
+	u32 i, burst;
+	u32 single_count = 0, box_count = 0, crci = 0;
+	void *desc;
+	u32 *cple;
+	int blk_size = 0;
+
+	if (!is_slave_direction(direction)) {
+		dev_err(adev->dev, "invalid dma direction\n");
+		return NULL;
+	}
+
+	/*
+	 * get burst value from slave configuration
+	 */
+	burst = (direction == DMA_MEM_TO_DEV) ?
+		achan->slave.dst_maxburst :
+		achan->slave.src_maxburst;
+
+	/* if using flow control, validate burst and crci values */
+	if (achan->slave.device_fc) {
+		blk_size = adm_get_blksize(burst);
+		if (blk_size < 0) {
+			dev_err(adev->dev, "invalid burst value: %d\n",
+				burst);
+			return ERR_PTR(-EINVAL);
+		}
+
+		crci = achan->slave.slave_id & 0xf;
+		if (!crci || achan->slave.slave_id > 0x1f) {
+			dev_err(adev->dev, "invalid crci value\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	/* iterate through sgs and compute allocation size of structures */
+	for_each_sg(sgl, sg, sg_len, i) {
+		if (achan->slave.device_fc) {
+			box_count += DIV_ROUND_UP(sg_dma_len(sg) / burst,
+						  ADM_MAX_ROWS);
+			if (sg_dma_len(sg) % burst)
+				single_count++;
+		} else {
+			single_count += DIV_ROUND_UP(sg_dma_len(sg),
+						     ADM_MAX_XFER);
+		}
+	}
+
+	async_desc = kzalloc(sizeof(*async_desc), GFP_NOWAIT);
+	if (!async_desc)
+		return ERR_PTR(-ENOMEM);
+
+	if (crci)
+		async_desc->mux = achan->slave.slave_id & ADM_CRCI_MUX_SEL ?
+					ADM_CRCI_CTL_MUX_SEL : 0;
+	async_desc->crci = crci;
+	async_desc->blk_size = blk_size;
+	async_desc->dma_len = single_count * sizeof(struct adm_desc_hw_single) +
+				box_count * sizeof(struct adm_desc_hw_box) +
+				sizeof(*cple) + 2 * ADM_DESC_ALIGN;
+
+	async_desc->cpl = kzalloc(async_desc->dma_len, GFP_NOWAIT);
+	if (!async_desc->cpl)
+		goto free;
+
+	async_desc->adev = adev;
+
+	/* both command list entry and descriptors must be 8 byte aligned */
+	cple = PTR_ALIGN(async_desc->cpl, ADM_DESC_ALIGN);
+	desc = PTR_ALIGN(cple + 1, ADM_DESC_ALIGN);
+
+	for_each_sg(sgl, sg, sg_len, i) {
+		async_desc->length += sg_dma_len(sg);
+
+		if (achan->slave.device_fc)
+			desc = adm_process_fc_descriptors(achan, desc, sg, crci,
+							  burst, direction);
+		else
+			desc = adm_process_non_fc_descriptors(achan, desc, sg,
+							      direction);
+	}
+
+	async_desc->dma_addr = dma_map_single(adev->dev, async_desc->cpl,
+					      async_desc->dma_len,
+					      DMA_TO_DEVICE);
+	if (dma_mapping_error(adev->dev, async_desc->dma_addr))
+		goto free;
+
+	cple_addr = async_desc->dma_addr + ((void *)cple - async_desc->cpl);
+
+	/* init cmd list */
+	dma_sync_single_for_cpu(adev->dev, cple_addr, sizeof(*cple),
+				DMA_TO_DEVICE);
+	*cple = ADM_CPLE_LP;
+	*cple |= (async_desc->dma_addr + ADM_DESC_ALIGN) >> 3;
+	dma_sync_single_for_device(adev->dev, cple_addr, sizeof(*cple),
+				   DMA_TO_DEVICE);
+
+	return vchan_tx_prep(&achan->vc, &async_desc->vd, flags);
+
+free:
+	kfree(async_desc);
+	return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * adm_terminate_all - terminate all transactions on a channel
+ * @achan: adm dma channel
+ *
+ * Dequeues and frees all transactions, aborts current transaction
+ * No callbacks are done
+ *
+ */
+static int adm_terminate_all(struct dma_chan *chan)
+{
+	struct adm_chan *achan = to_adm_chan(chan);
+	struct adm_device *adev = achan->adev;
+	unsigned long flags;
+	LIST_HEAD(head);
+
+	spin_lock_irqsave(&achan->vc.lock, flags);
+	vchan_get_all_descriptors(&achan->vc, &head);
+
+	/* send flush command to terminate current transaction */
+	writel_relaxed(0x0,
+		       adev->regs + ADM_CH_FLUSH_STATE0(achan->id, adev->ee));
+
+	spin_unlock_irqrestore(&achan->vc.lock, flags);
+
+	vchan_dma_desc_free_list(&achan->vc, &head);
+
+	return 0;
+}
+
+static int adm_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg)
+{
+	struct adm_chan *achan = to_adm_chan(chan);
+	unsigned long flag;
+
+	spin_lock_irqsave(&achan->vc.lock, flag);
+	memcpy(&achan->slave, cfg, sizeof(struct dma_slave_config));
+	spin_unlock_irqrestore(&achan->vc.lock, flag);
+
+	return 0;
+}
+
+/**
+ * adm_start_dma - start next transaction
+ * @achan - ADM dma channel
+ */
+static void adm_start_dma(struct adm_chan *achan)
+{
+	struct virt_dma_desc *vd = vchan_next_desc(&achan->vc);
+	struct adm_device *adev = achan->adev;
+	struct adm_async_desc *async_desc;
+
+	lockdep_assert_held(&achan->vc.lock);
+
+	if (!vd)
+		return;
+
+	list_del(&vd->node);
+
+	/* write next command list out to the CMD FIFO */
+	async_desc = container_of(vd, struct adm_async_desc, vd);
+	achan->curr_txd = async_desc;
+
+	/* reset channel error */
+	achan->error = 0;
+
+	if (!achan->initialized) {
+		/* enable interrupts */
+		writel(ADM_CH_CONF_SHADOW_EN |
+		       ADM_CH_CONF_PERM_MPU_CONF |
+		       ADM_CH_CONF_MPU_DISABLE |
+		       ADM_CH_CONF_SEC_DOMAIN(adev->ee),
+		       adev->regs + ADM_CH_CONF(achan->id));
+
+		writel(ADM_CH_RSLT_CONF_IRQ_EN | ADM_CH_RSLT_CONF_FLUSH_EN,
+		       adev->regs + ADM_CH_RSLT_CONF(achan->id, adev->ee));
+
+		achan->initialized = 1;
+	}
+
+	/* set the crci block size if this transaction requires CRCI */
+	if (async_desc->crci) {
+		writel(async_desc->mux | async_desc->blk_size,
+		       adev->regs + ADM_CRCI_CTL(async_desc->crci, adev->ee));
+	}
+
+	/* make sure IRQ enable doesn't get reordered */
+	wmb();
+
+	/* write next command list out to the CMD FIFO */
+	writel(ALIGN(async_desc->dma_addr, ADM_DESC_ALIGN) >> 3,
+	       adev->regs + ADM_CH_CMD_PTR(achan->id, adev->ee));
+}
+
+/**
+ * adm_dma_irq - irq handler for ADM controller
+ * @irq: IRQ of interrupt
+ * @data: callback data
+ *
+ * IRQ handler for the bam controller
+ */
+static irqreturn_t adm_dma_irq(int irq, void *data)
+{
+	struct adm_device *adev = data;
+	u32 srcs, i;
+	struct adm_async_desc *async_desc;
+	unsigned long flags;
+
+	srcs = readl_relaxed(adev->regs +
+			ADM_SEC_DOMAIN_IRQ_STATUS(adev->ee));
+
+	for (i = 0; i < ADM_MAX_CHANNELS; i++) {
+		struct adm_chan *achan = &adev->channels[i];
+		u32 status, result;
+
+		if (srcs & BIT(i)) {
+			status = readl_relaxed(adev->regs +
+					       ADM_CH_STATUS_SD(i, adev->ee));
+
+			/* if no result present, skip */
+			if (!(status & ADM_CH_STATUS_VALID))
+				continue;
+
+			result = readl_relaxed(adev->regs +
+				ADM_CH_RSLT(i, adev->ee));
+
+			/* no valid results, skip */
+			if (!(result & ADM_CH_RSLT_VALID))
+				continue;
+
+			/* flag error if transaction was flushed or failed */
+			if (result & (ADM_CH_RSLT_ERR | ADM_CH_RSLT_FLUSH))
+				achan->error = 1;
+
+			spin_lock_irqsave(&achan->vc.lock, flags);
+			async_desc = achan->curr_txd;
+
+			achan->curr_txd = NULL;
+
+			if (async_desc) {
+				vchan_cookie_complete(&async_desc->vd);
+
+				/* kick off next DMA */
+				adm_start_dma(achan);
+			}
+
+			spin_unlock_irqrestore(&achan->vc.lock, flags);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+/**
+ * adm_tx_status - returns status of transaction
+ * @chan: dma channel
+ * @cookie: transaction cookie
+ * @txstate: DMA transaction state
+ *
+ * Return status of dma transaction
+ */
+static enum dma_status adm_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
+				     struct dma_tx_state *txstate)
+{
+	struct adm_chan *achan = to_adm_chan(chan);
+	struct virt_dma_desc *vd;
+	enum dma_status ret;
+	unsigned long flags;
+	size_t residue = 0;
+
+	ret = dma_cookie_status(chan, cookie, txstate);
+	if (ret == DMA_COMPLETE || !txstate)
+		return ret;
+
+	spin_lock_irqsave(&achan->vc.lock, flags);
+
+	vd = vchan_find_desc(&achan->vc, cookie);
+	if (vd)
+		residue = container_of(vd, struct adm_async_desc, vd)->length;
+
+	spin_unlock_irqrestore(&achan->vc.lock, flags);
+
+	/*
+	 * residue is either the full length if it is in the issued list, or 0
+	 * if it is in progress.  We have no reliable way of determining
+	 * anything inbetween
+	 */
+	dma_set_residue(txstate, residue);
+
+	if (achan->error)
+		return DMA_ERROR;
+
+	return ret;
+}
+
+/**
+ * adm_issue_pending - starts pending transactions
+ * @chan: dma channel
+ *
+ * Issues all pending transactions and starts DMA
+ */
+static void adm_issue_pending(struct dma_chan *chan)
+{
+	struct adm_chan *achan = to_adm_chan(chan);
+	unsigned long flags;
+
+	spin_lock_irqsave(&achan->vc.lock, flags);
+
+	if (vchan_issue_pending(&achan->vc) && !achan->curr_txd)
+		adm_start_dma(achan);
+	spin_unlock_irqrestore(&achan->vc.lock, flags);
+}
+
+/**
+ * adm_dma_free_desc - free descriptor memory
+ * @vd: virtual descriptor
+ *
+ */
+static void adm_dma_free_desc(struct virt_dma_desc *vd)
+{
+	struct adm_async_desc *async_desc = container_of(vd,
+			struct adm_async_desc, vd);
+
+	dma_unmap_single(async_desc->adev->dev, async_desc->dma_addr,
+			 async_desc->dma_len, DMA_TO_DEVICE);
+	kfree(async_desc->cpl);
+	kfree(async_desc);
+}
+
+static void adm_channel_init(struct adm_device *adev, struct adm_chan *achan,
+			     u32 index)
+{
+	achan->id = index;
+	achan->adev = adev;
+
+	vchan_init(&achan->vc, &adev->common);
+	achan->vc.desc_free = adm_dma_free_desc;
+}
+
+static int adm_dma_probe(struct platform_device *pdev)
+{
+	struct adm_device *adev;
+	int ret;
+	u32 i;
+
+	adev = devm_kzalloc(&pdev->dev, sizeof(*adev), GFP_KERNEL);
+	if (!adev)
+		return -ENOMEM;
+
+	adev->dev = &pdev->dev;
+
+	adev->regs = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(adev->regs))
+		return PTR_ERR(adev->regs);
+
+	adev->irq = platform_get_irq(pdev, 0);
+	if (adev->irq < 0)
+		return adev->irq;
+
+	ret = of_property_read_u32(pdev->dev.of_node, "qcom,ee", &adev->ee);
+	if (ret) {
+		dev_err(adev->dev, "Execution environment unspecified\n");
+		return ret;
+	}
+
+	adev->core_clk = devm_clk_get(adev->dev, "core");
+	if (IS_ERR(adev->core_clk))
+		return PTR_ERR(adev->core_clk);
+
+	adev->iface_clk = devm_clk_get(adev->dev, "iface");
+	if (IS_ERR(adev->iface_clk))
+		return PTR_ERR(adev->iface_clk);
+
+	adev->clk_reset = devm_reset_control_get_exclusive(&pdev->dev, "clk");
+	if (IS_ERR(adev->clk_reset)) {
+		dev_err(adev->dev, "failed to get ADM0 reset\n");
+		return PTR_ERR(adev->clk_reset);
+	}
+
+	adev->c0_reset = devm_reset_control_get_exclusive(&pdev->dev, "c0");
+	if (IS_ERR(adev->c0_reset)) {
+		dev_err(adev->dev, "failed to get ADM0 C0 reset\n");
+		return PTR_ERR(adev->c0_reset);
+	}
+
+	adev->c1_reset = devm_reset_control_get_exclusive(&pdev->dev, "c1");
+	if (IS_ERR(adev->c1_reset)) {
+		dev_err(adev->dev, "failed to get ADM0 C1 reset\n");
+		return PTR_ERR(adev->c1_reset);
+	}
+
+	adev->c2_reset = devm_reset_control_get_exclusive(&pdev->dev, "c2");
+	if (IS_ERR(adev->c2_reset)) {
+		dev_err(adev->dev, "failed to get ADM0 C2 reset\n");
+		return PTR_ERR(adev->c2_reset);
+	}
+
+	ret = clk_prepare_enable(adev->core_clk);
+	if (ret) {
+		dev_err(adev->dev, "failed to prepare/enable core clock\n");
+		return ret;
+	}
+
+	ret = clk_prepare_enable(adev->iface_clk);
+	if (ret) {
+		dev_err(adev->dev, "failed to prepare/enable iface clock\n");
+		goto err_disable_core_clk;
+	}
+
+	reset_control_assert(adev->clk_reset);
+	reset_control_assert(adev->c0_reset);
+	reset_control_assert(adev->c1_reset);
+	reset_control_assert(adev->c2_reset);
+
+	udelay(2);
+
+	reset_control_deassert(adev->clk_reset);
+	reset_control_deassert(adev->c0_reset);
+	reset_control_deassert(adev->c1_reset);
+	reset_control_deassert(adev->c2_reset);
+
+	adev->channels = devm_kcalloc(adev->dev, ADM_MAX_CHANNELS,
+				      sizeof(*adev->channels), GFP_KERNEL);
+
+	if (!adev->channels) {
+		ret = -ENOMEM;
+		goto err_disable_clks;
+	}
+
+	/* allocate and initialize channels */
+	INIT_LIST_HEAD(&adev->common.channels);
+
+	for (i = 0; i < ADM_MAX_CHANNELS; i++)
+		adm_channel_init(adev, &adev->channels[i], i);
+
+	/* reset CRCIs */
+	for (i = 0; i < 16; i++)
+		writel(ADM_CRCI_CTL_RST, adev->regs +
+			ADM_CRCI_CTL(i, adev->ee));
+
+	/* configure client interfaces */
+	writel(ADM_CI_RANGE_START(0x40) | ADM_CI_RANGE_END(0xb0) |
+	       ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(0));
+	writel(ADM_CI_RANGE_START(0x2a) | ADM_CI_RANGE_END(0x2c) |
+	       ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(1));
+	writel(ADM_CI_RANGE_START(0x12) | ADM_CI_RANGE_END(0x28) |
+	       ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(2));
+	writel(ADM_GP_CTL_LP_EN | ADM_GP_CTL_LP_CNT(0xf),
+	       adev->regs + ADM_GP_CTL);
+
+	ret = devm_request_irq(adev->dev, adev->irq, adm_dma_irq,
+			       0, "adm_dma", adev);
+	if (ret)
+		goto err_disable_clks;
+
+	platform_set_drvdata(pdev, adev);
+
+	adev->common.dev = adev->dev;
+	adev->common.dev->dma_parms = &adev->dma_parms;
+
+	/* set capabilities */
+	dma_cap_zero(adev->common.cap_mask);
+	dma_cap_set(DMA_SLAVE, adev->common.cap_mask);
+	dma_cap_set(DMA_PRIVATE, adev->common.cap_mask);
+
+	/* initialize dmaengine apis */
+	adev->common.directions = BIT(DMA_DEV_TO_MEM | DMA_MEM_TO_DEV);
+	adev->common.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+	adev->common.src_addr_widths = DMA_SLAVE_BUSWIDTH_4_BYTES;
+	adev->common.dst_addr_widths = DMA_SLAVE_BUSWIDTH_4_BYTES;
+	adev->common.device_free_chan_resources = adm_free_chan;
+	adev->common.device_prep_slave_sg = adm_prep_slave_sg;
+	adev->common.device_issue_pending = adm_issue_pending;
+	adev->common.device_tx_status = adm_tx_status;
+	adev->common.device_terminate_all = adm_terminate_all;
+	adev->common.device_config = adm_slave_config;
+
+	ret = dma_async_device_register(&adev->common);
+	if (ret) {
+		dev_err(adev->dev, "failed to register dma async device\n");
+		goto err_disable_clks;
+	}
+
+	ret = of_dma_controller_register(pdev->dev.of_node,
+					 of_dma_xlate_by_chan_id,
+					 &adev->common);
+	if (ret)
+		goto err_unregister_dma;
+
+	return 0;
+
+err_unregister_dma:
+	dma_async_device_unregister(&adev->common);
+err_disable_clks:
+	clk_disable_unprepare(adev->iface_clk);
+err_disable_core_clk:
+	clk_disable_unprepare(adev->core_clk);
+
+	return ret;
+}
+
+static int adm_dma_remove(struct platform_device *pdev)
+{
+	struct adm_device *adev = platform_get_drvdata(pdev);
+	struct adm_chan *achan;
+	u32 i;
+
+	of_dma_controller_free(pdev->dev.of_node);
+	dma_async_device_unregister(&adev->common);
+
+	for (i = 0; i < ADM_MAX_CHANNELS; i++) {
+		achan = &adev->channels[i];
+
+		/* mask IRQs for this channel/EE pair */
+		writel(0, adev->regs + ADM_CH_RSLT_CONF(achan->id, adev->ee));
+
+		tasklet_kill(&adev->channels[i].vc.task);
+		adm_terminate_all(&adev->channels[i].vc.chan);
+	}
+
+	devm_free_irq(adev->dev, adev->irq, adev);
+
+	clk_disable_unprepare(adev->core_clk);
+	clk_disable_unprepare(adev->iface_clk);
+
+	return 0;
+}
+
+static const struct of_device_id adm_of_match[] = {
+	{ .compatible = "qcom,adm", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, adm_of_match);
+
+static struct platform_driver adm_dma_driver = {
+	.probe = adm_dma_probe,
+	.remove = adm_dma_remove,
+	.driver = {
+		.name = "adm-dma-engine",
+		.of_match_table = adm_of_match,
+	},
+};
+
+module_platform_driver(adm_dma_driver);
+
+MODULE_AUTHOR("Andy Gross <agross@codeaurora.org>");
+MODULE_DESCRIPTION("QCOM ADM DMA engine driver");
+MODULE_LICENSE("GPL v2");

From 678198f21135b0aaf9782049d0a6bbbeefff1804 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Mon, 9 Nov 2020 14:24:48 +0530
Subject: [PATCH 047/230] dt-bindings: dmaengine: Document qcom,gpi dma binding

Add devicetree binding documentation for GPI DMA controller
implemented on Qualcomm SoCs

Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201109085450.24843-2-vkoul@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/qcom,gpi.yaml     | 88 +++++++++++++++++++
 include/dt-bindings/dma/qcom-gpi.h            | 11 +++
 2 files changed, 99 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/qcom,gpi.yaml
 create mode 100644 include/dt-bindings/dma/qcom-gpi.h

diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml
new file mode 100644
index 000000000000..f8142adf9aea
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/qcom,gpi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Qualcomm Technologies Inc GPI DMA controller
+
+maintainers:
+  - Vinod Koul <vkoul@kernel.org>
+
+description: |
+  QCOM GPI DMA controller provides DMA capabilities for
+  peripheral buses such as I2C, UART, and SPI.
+
+allOf:
+  - $ref: "dma-controller.yaml#"
+
+properties:
+  compatible:
+    enum:
+      - qcom,sdm845-gpi-dma
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    description:
+      Interrupt lines for each GPI instance
+    maxItems: 13
+
+  "#dma-cells":
+    const: 3
+    description: >
+      DMA clients must use the format described in dma.txt, giving a phandle
+      to the DMA controller plus the following 3 integer cells:
+      - channel: if set to 0xffffffff, any available channel will be allocated
+        for the client. Otherwise, the exact channel specified will be used.
+      - seid: serial id of the client as defined in the SoC documentation.
+      - client: type of the client as defined in dt-bindings/dma/qcom-gpi.h
+
+  iommus:
+    maxItems: 1
+
+  dma-channels:
+    maximum: 31
+
+  dma-channel-mask:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - "#dma-cells"
+  - iommus
+  - dma-channels
+  - dma-channel-mask
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/dma/qcom-gpi.h>
+    gpi_dma0: dma-controller@800000 {
+        compatible = "qcom,gpi-dma";
+        #dma-cells = <3>;
+        reg = <0x00800000 0x60000>;
+        iommus = <&apps_smmu 0x0016 0x0>;
+        dma-channels = <13>;
+        dma-channel-mask = <0xfa>;
+        interrupts = <GIC_SPI 244 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 245 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 246 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 247 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 248 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 249 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 250 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 251 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 252 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 253 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 254 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 255 IRQ_TYPE_LEVEL_HIGH>,
+                     <GIC_SPI 256 IRQ_TYPE_LEVEL_HIGH>;
+    };
+
+...
diff --git a/include/dt-bindings/dma/qcom-gpi.h b/include/dt-bindings/dma/qcom-gpi.h
new file mode 100644
index 000000000000..ebda2a37f52a
--- /dev/null
+++ b/include/dt-bindings/dma/qcom-gpi.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/* Copyright (c) 2020, Linaro Ltd.  */
+
+#ifndef __DT_BINDINGS_DMA_QCOM_GPI_H__
+#define __DT_BINDINGS_DMA_QCOM_GPI_H__
+
+#define QCOM_GPI_SPI		1
+#define QCOM_GPI_UART		2
+#define QCOM_GPI_I2C		3
+
+#endif /* __DT_BINDINGS_DMA_QCOM_GPI_H__ */

From e7bbb7acabf47d74672e0e314bed4d452d2097b4 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Mon, 9 Nov 2020 14:24:49 +0530
Subject: [PATCH 048/230] dmaengine: add peripheral configuration

Some complex dmaengine controllers have capability to program the
peripheral device, so pass on the peripheral configuration as part of
dma_slave_config

Link: https://lore.kernel.org/r/20201109085450.24843-3-vkoul@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index dd357a747780..493a047ed0a2 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -418,6 +418,9 @@ enum dma_slave_buswidth {
  * @slave_id: Slave requester id. Only valid for slave channels. The dma
  * slave peripheral will have unique id as dma requester which need to be
  * pass as slave config.
+ * @peripheral_config: peripheral configuration for programming peripheral
+ * for dmaengine transfer
+ * @peripheral_size: peripheral configuration buffer size
  *
  * This struct is passed in as configuration data to a DMA engine
  * in order to set up a certain channel for DMA transport at runtime.
@@ -443,6 +446,8 @@ struct dma_slave_config {
 	u32 dst_port_window_size;
 	bool device_fc;
 	unsigned int slave_id;
+	void *peripheral_config;
+	size_t peripheral_size;
 };
 
 /**

From 5d0c3533a19f48e5e7e73806a3e4b29cd4364130 Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Mon, 9 Nov 2020 14:24:50 +0530
Subject: [PATCH 049/230] dmaengine: qcom: Add GPI dma driver

This controller provides DMAengine capabilities for a variety of peripheral
buses such as I2C, UART, and SPI. By using GPI dmaengine driver, bus
drivers can use a standardize interface that is protocol independent to
transfer data between memory and peripheral.

Link: https://lore.kernel.org/r/20201109085450.24843-4-vkoul@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/qcom/Kconfig         |   12 +
 drivers/dma/qcom/Makefile        |    1 +
 drivers/dma/qcom/gpi.c           | 2303 ++++++++++++++++++++++++++++++
 include/linux/dma/qcom-gpi-dma.h |   83 ++
 4 files changed, 2399 insertions(+)
 create mode 100644 drivers/dma/qcom/gpi.c
 create mode 100644 include/linux/dma/qcom-gpi-dma.h

diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig
index 0389d60d2604..365f94eb3b08 100644
--- a/drivers/dma/qcom/Kconfig
+++ b/drivers/dma/qcom/Kconfig
@@ -19,6 +19,18 @@ config QCOM_BAM_DMA
 	  Enable support for the QCOM BAM DMA controller.  This controller
 	  provides DMA capabilities for a variety of on-chip devices.
 
+config QCOM_GPI_DMA
+        tristate "Qualcomm Technologies GPI DMA support"
+        depends on ARCH_QCOM
+        select DMA_ENGINE
+        select DMA_VIRTUAL_CHANNELS
+        help
+          Enable support for the QCOM GPI DMA controller. This controller
+          provides DMA capabilities for a variety of peripheral buses such
+          as I2C, UART, and SPI. By using GPI dmaengine driver, bus drivers
+          can use a standardize interface that is protocol independent to
+          transfer data between DDR and peripheral.
+
 config QCOM_HIDMA_MGMT
 	tristate "Qualcomm Technologies HIDMA Management support"
 	select DMA_ENGINE
diff --git a/drivers/dma/qcom/Makefile b/drivers/dma/qcom/Makefile
index 346e643fbb6d..50f1e7014693 100644
--- a/drivers/dma/qcom/Makefile
+++ b/drivers/dma/qcom/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_QCOM_ADM) += qcom_adm.o
 obj-$(CONFIG_QCOM_BAM_DMA) += bam_dma.o
+obj-$(CONFIG_QCOM_GPI_DMA) += gpi.o
 obj-$(CONFIG_QCOM_HIDMA_MGMT) += hdma_mgmt.o
 hdma_mgmt-objs	 := hidma_mgmt.o hidma_mgmt_sys.o
 obj-$(CONFIG_QCOM_HIDMA) +=  hdma.o
diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c
new file mode 100644
index 000000000000..d2334f535de2
--- /dev/null
+++ b/drivers/dma/qcom/gpi.c
@@ -0,0 +1,2303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2017-2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2020, Linaro Limited
+ */
+
+#include <dt-bindings/dma/qcom-gpi.h>
+#include <linux/bitfield.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/module.h>
+#include <linux/of_dma.h>
+#include <linux/platform_device.h>
+#include <linux/dma/qcom-gpi-dma.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include "../dmaengine.h"
+#include "../virt-dma.h"
+
+#define TRE_TYPE_DMA		0x10
+#define TRE_TYPE_GO		0x20
+#define TRE_TYPE_CONFIG0	0x22
+
+/* TRE flags */
+#define TRE_FLAGS_CHAIN		BIT(0)
+#define TRE_FLAGS_IEOB		BIT(8)
+#define TRE_FLAGS_IEOT		BIT(9)
+#define TRE_FLAGS_BEI		BIT(10)
+#define TRE_FLAGS_LINK		BIT(11)
+#define TRE_FLAGS_TYPE		GENMASK(23, 16)
+
+/* SPI CONFIG0 WD0 */
+#define TRE_SPI_C0_WORD_SZ	GENMASK(4, 0)
+#define TRE_SPI_C0_LOOPBACK	BIT(8)
+#define TRE_SPI_C0_CS		BIT(11)
+#define TRE_SPI_C0_CPHA		BIT(12)
+#define TRE_SPI_C0_CPOL		BIT(13)
+#define TRE_SPI_C0_TX_PACK	BIT(24)
+#define TRE_SPI_C0_RX_PACK	BIT(25)
+
+/* CONFIG0 WD2 */
+#define TRE_C0_CLK_DIV		GENMASK(11, 0)
+#define TRE_C0_CLK_SRC		GENMASK(19, 16)
+
+/* SPI GO WD0 */
+#define TRE_SPI_GO_CMD		GENMASK(4, 0)
+#define TRE_SPI_GO_CS		GENMASK(10, 8)
+#define TRE_SPI_GO_FRAG		BIT(26)
+
+/* GO WD2 */
+#define TRE_RX_LEN		GENMASK(23, 0)
+
+/* I2C Config0 WD0 */
+#define TRE_I2C_C0_TLOW		GENMASK(7, 0)
+#define TRE_I2C_C0_THIGH	GENMASK(15, 8)
+#define TRE_I2C_C0_TCYL		GENMASK(23, 16)
+#define TRE_I2C_C0_TX_PACK	BIT(24)
+#define TRE_I2C_C0_RX_PACK      BIT(25)
+
+/* I2C GO WD0 */
+#define TRE_I2C_GO_CMD          GENMASK(4, 0)
+#define TRE_I2C_GO_ADDR		GENMASK(14, 8)
+#define TRE_I2C_GO_STRETCH	BIT(26)
+
+/* DMA TRE */
+#define TRE_DMA_LEN		GENMASK(23, 0)
+
+/* Register offsets from gpi-top */
+#define GPII_n_CH_k_CNTXT_0_OFFS(n, k)	(0x20000 + (0x4000 * (n)) + (0x80 * (k)))
+#define GPII_n_CH_k_CNTXT_0_EL_SIZE	GENMASK(31, 24)
+#define GPII_n_CH_k_CNTXT_0_CHSTATE	GENMASK(23, 20)
+#define GPII_n_CH_k_CNTXT_0_ERIDX	GENMASK(18, 14)
+#define GPII_n_CH_k_CNTXT_0_DIR		BIT(3)
+#define GPII_n_CH_k_CNTXT_0_PROTO	GENMASK(2, 0)
+
+#define GPII_n_CH_k_CNTXT_0(el_size, erindex, dir, chtype_proto)  \
+	(FIELD_PREP(GPII_n_CH_k_CNTXT_0_EL_SIZE, el_size)	| \
+	 FIELD_PREP(GPII_n_CH_k_CNTXT_0_ERIDX, erindex)		| \
+	 FIELD_PREP(GPII_n_CH_k_CNTXT_0_DIR, dir)		| \
+	 FIELD_PREP(GPII_n_CH_k_CNTXT_0_PROTO, chtype_proto))
+
+#define GPI_CHTYPE_DIR_IN	(0)
+#define GPI_CHTYPE_DIR_OUT	(1)
+
+#define GPI_CHTYPE_PROTO_GPI	(0x2)
+
+#define GPII_n_CH_k_DOORBELL_0_OFFS(n, k)	(0x22000 + (0x4000 * (n)) + (0x8 * (k)))
+#define GPII_n_CH_CMD_OFFS(n)			(0x23008 + (0x4000 * (n)))
+#define GPII_n_CH_CMD_OPCODE			GENMASK(31, 24)
+#define GPII_n_CH_CMD_CHID			GENMASK(7, 0)
+#define GPII_n_CH_CMD(opcode, chid)				 \
+		     (FIELD_PREP(GPII_n_CH_CMD_OPCODE, opcode) | \
+		      FIELD_PREP(GPII_n_CH_CMD_CHID, chid))
+
+#define GPII_n_CH_CMD_ALLOCATE		(0)
+#define GPII_n_CH_CMD_START		(1)
+#define GPII_n_CH_CMD_STOP		(2)
+#define GPII_n_CH_CMD_RESET		(9)
+#define GPII_n_CH_CMD_DE_ALLOC		(10)
+#define GPII_n_CH_CMD_UART_SW_STALE	(32)
+#define GPII_n_CH_CMD_UART_RFR_READY	(33)
+#define GPII_n_CH_CMD_UART_RFR_NOT_READY (34)
+
+/* EV Context Array */
+#define GPII_n_EV_CH_k_CNTXT_0_OFFS(n, k) (0x21000 + (0x4000 * (n)) + (0x80 * (k)))
+#define GPII_n_EV_k_CNTXT_0_EL_SIZE	GENMASK(31, 24)
+#define GPII_n_EV_k_CNTXT_0_CHSTATE	GENMASK(23, 20)
+#define GPII_n_EV_k_CNTXT_0_INTYPE	BIT(16)
+#define GPII_n_EV_k_CNTXT_0_CHTYPE	GENMASK(3, 0)
+
+#define GPII_n_EV_k_CNTXT_0(el_size, inttype, chtype)		\
+	(FIELD_PREP(GPII_n_EV_k_CNTXT_0_EL_SIZE, el_size) |	\
+	 FIELD_PREP(GPII_n_EV_k_CNTXT_0_INTYPE, inttype)  |	\
+	 FIELD_PREP(GPII_n_EV_k_CNTXT_0_CHTYPE, chtype))
+
+#define GPI_INTTYPE_IRQ		(1)
+#define GPI_CHTYPE_GPI_EV	(0x2)
+
+enum CNTXT_OFFS {
+	CNTXT_0_CONFIG = 0x0,
+	CNTXT_1_R_LENGTH = 0x4,
+	CNTXT_2_RING_BASE_LSB = 0x8,
+	CNTXT_3_RING_BASE_MSB = 0xC,
+	CNTXT_4_RING_RP_LSB = 0x10,
+	CNTXT_5_RING_RP_MSB = 0x14,
+	CNTXT_6_RING_WP_LSB = 0x18,
+	CNTXT_7_RING_WP_MSB = 0x1C,
+	CNTXT_8_RING_INT_MOD = 0x20,
+	CNTXT_9_RING_INTVEC = 0x24,
+	CNTXT_10_RING_MSI_LSB = 0x28,
+	CNTXT_11_RING_MSI_MSB = 0x2C,
+	CNTXT_12_RING_RP_UPDATE_LSB = 0x30,
+	CNTXT_13_RING_RP_UPDATE_MSB = 0x34,
+};
+
+#define GPII_n_EV_CH_k_DOORBELL_0_OFFS(n, k)	(0x22100 + (0x4000 * (n)) + (0x8 * (k)))
+#define GPII_n_EV_CH_CMD_OFFS(n)		(0x23010 + (0x4000 * (n)))
+#define GPII_n_EV_CMD_OPCODE			GENMASK(31, 24)
+#define GPII_n_EV_CMD_CHID			GENMASK(7, 0)
+#define GPII_n_EV_CMD(opcode, chid)				 \
+		     (FIELD_PREP(GPII_n_EV_CMD_OPCODE, opcode) | \
+		      FIELD_PREP(GPII_n_EV_CMD_CHID, chid))
+
+#define GPII_n_EV_CH_CMD_ALLOCATE		(0x00)
+#define GPII_n_EV_CH_CMD_RESET			(0x09)
+#define GPII_n_EV_CH_CMD_DE_ALLOC		(0x0A)
+
+#define GPII_n_CNTXT_TYPE_IRQ_OFFS(n)		(0x23080 + (0x4000 * (n)))
+
+/* mask type register */
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(n)	(0x23088 + (0x4000 * (n)))
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK		GENMASK(6, 0)
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL	BIT(6)
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB		BIT(3)
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB		BIT(2)
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL	BIT(1)
+#define GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL	BIT(0)
+
+#define GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(n)	(0x23090 + (0x4000 * (n)))
+#define GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(n)	(0x23094 + (0x4000 * (n)))
+
+/* Mask channel control interrupt register */
+#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(n)	(0x23098 + (0x4000 * (n)))
+#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK	GENMASK(1, 0)
+
+/* Mask event control interrupt register */
+#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(n)	(0x2309C + (0x4000 * (n)))
+#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK	BIT(0)
+
+#define GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(n)	(0x230A0 + (0x4000 * (n)))
+#define GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS(n)	(0x230A4 + (0x4000 * (n)))
+
+/* Mask event interrupt register */
+#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(n)	(0x230B8 + (0x4000 * (n)))
+#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK	BIT(0)
+
+#define GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(n)	(0x230C0 + (0x4000 * (n)))
+#define GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(n)	(0x23100 + (0x4000 * (n)))
+#define GPI_GLOB_IRQ_ERROR_INT_MSK		BIT(0)
+
+/* GPII specific Global - Enable bit register */
+#define GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(n)	(0x23108 + (0x4000 * (n)))
+#define GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(n)	(0x23110 + (0x4000 * (n)))
+#define GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(n)	(0x23118 + (0x4000 * (n)))
+
+/* GPII general interrupt - Enable bit register */
+#define GPII_n_CNTXT_GPII_IRQ_EN_OFFS(n)	(0x23120 + (0x4000 * (n)))
+#define GPII_n_CNTXT_GPII_IRQ_EN_BMSK		GENMASK(3, 0)
+
+#define GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(n)	(0x23128 + (0x4000 * (n)))
+
+/* GPII Interrupt Type register */
+#define GPII_n_CNTXT_INTSET_OFFS(n)		(0x23180 + (0x4000 * (n)))
+#define GPII_n_CNTXT_INTSET_BMSK		BIT(0)
+
+#define GPII_n_CNTXT_MSI_BASE_LSB_OFFS(n)	(0x23188 + (0x4000 * (n)))
+#define GPII_n_CNTXT_MSI_BASE_MSB_OFFS(n)	(0x2318C + (0x4000 * (n)))
+#define GPII_n_CNTXT_SCRATCH_0_OFFS(n)		(0x23400 + (0x4000 * (n)))
+#define GPII_n_CNTXT_SCRATCH_1_OFFS(n)		(0x23404 + (0x4000 * (n)))
+
+#define GPII_n_ERROR_LOG_OFFS(n)		(0x23200 + (0x4000 * (n)))
+
+/* QOS Registers */
+#define GPII_n_CH_k_QOS_OFFS(n, k)		(0x2005C + (0x4000 * (n)) + (0x80 * (k)))
+
+/* Scratch registers */
+#define GPII_n_CH_k_SCRATCH_0_OFFS(n, k)	(0x20060 + (0x4000 * (n)) + (0x80 * (k)))
+#define GPII_n_CH_k_SCRATCH_0_SEID		GENMASK(2, 0)
+#define GPII_n_CH_k_SCRATCH_0_PROTO		GENMASK(7, 4)
+#define GPII_n_CH_k_SCRATCH_0_PAIR		GENMASK(20, 16)
+#define GPII_n_CH_k_SCRATCH_0(pair, proto, seid)		\
+			     (FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PAIR, pair)	| \
+			      FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PROTO, proto)	| \
+			      FIELD_PREP(GPII_n_CH_k_SCRATCH_0_SEID, seid))
+#define GPII_n_CH_k_SCRATCH_1_OFFS(n, k)	(0x20064 + (0x4000 * (n)) + (0x80 * (k)))
+#define GPII_n_CH_k_SCRATCH_2_OFFS(n, k)	(0x20068 + (0x4000 * (n)) + (0x80 * (k)))
+#define GPII_n_CH_k_SCRATCH_3_OFFS(n, k)	(0x2006C + (0x4000 * (n)) + (0x80 * (k)))
+
+struct __packed gpi_tre {
+	u32 dword[4];
+};
+
+enum msm_gpi_tce_code {
+	MSM_GPI_TCE_SUCCESS = 1,
+	MSM_GPI_TCE_EOT = 2,
+	MSM_GPI_TCE_EOB = 4,
+	MSM_GPI_TCE_UNEXP_ERR = 16,
+};
+
+#define CMD_TIMEOUT_MS		(250)
+
+#define MAX_CHANNELS_PER_GPII	(2)
+#define GPI_TX_CHAN		(0)
+#define GPI_RX_CHAN		(1)
+#define STATE_IGNORE		(U32_MAX)
+#define EV_FACTOR		(2)
+#define REQ_OF_DMA_ARGS		(5) /* # of arguments required from client */
+#define CHAN_TRES		64
+
+struct __packed xfer_compl_event {
+	u64 ptr;
+	u32 length:24;
+	u8 code;
+	u16 status;
+	u8 type;
+	u8 chid;
+};
+
+struct __packed immediate_data_event {
+	u8 data_bytes[8];
+	u8 length:4;
+	u8 resvd:4;
+	u16 tre_index;
+	u8 code;
+	u16 status;
+	u8 type;
+	u8 chid;
+};
+
+struct __packed qup_notif_event {
+	u32 status;
+	u32 time;
+	u32 count:24;
+	u8 resvd;
+	u16 resvd1;
+	u8 type;
+	u8 chid;
+};
+
+struct __packed gpi_ere {
+	u32 dword[4];
+};
+
+enum GPI_EV_TYPE {
+	XFER_COMPLETE_EV_TYPE = 0x22,
+	IMMEDIATE_DATA_EV_TYPE = 0x30,
+	QUP_NOTIF_EV_TYPE = 0x31,
+	STALE_EV_TYPE = 0xFF,
+};
+
+union __packed gpi_event {
+	struct __packed xfer_compl_event xfer_compl_event;
+	struct __packed immediate_data_event immediate_data_event;
+	struct __packed qup_notif_event qup_notif_event;
+	struct __packed gpi_ere gpi_ere;
+};
+
+enum gpii_irq_settings {
+	DEFAULT_IRQ_SETTINGS,
+	MASK_IEOB_SETTINGS,
+};
+
+enum gpi_ev_state {
+	DEFAULT_EV_CH_STATE = 0,
+	EV_STATE_NOT_ALLOCATED = DEFAULT_EV_CH_STATE,
+	EV_STATE_ALLOCATED,
+	MAX_EV_STATES
+};
+
+static const char *const gpi_ev_state_str[MAX_EV_STATES] = {
+	[EV_STATE_NOT_ALLOCATED] = "NOT ALLOCATED",
+	[EV_STATE_ALLOCATED] = "ALLOCATED",
+};
+
+#define TO_GPI_EV_STATE_STR(_state) (((_state) >= MAX_EV_STATES) ? \
+				    "INVALID" : gpi_ev_state_str[(_state)])
+
+enum gpi_ch_state {
+	DEFAULT_CH_STATE = 0x0,
+	CH_STATE_NOT_ALLOCATED = DEFAULT_CH_STATE,
+	CH_STATE_ALLOCATED = 0x1,
+	CH_STATE_STARTED = 0x2,
+	CH_STATE_STOPPED = 0x3,
+	CH_STATE_STOP_IN_PROC = 0x4,
+	CH_STATE_ERROR = 0xf,
+	MAX_CH_STATES
+};
+
+enum gpi_cmd {
+	GPI_CH_CMD_BEGIN,
+	GPI_CH_CMD_ALLOCATE = GPI_CH_CMD_BEGIN,
+	GPI_CH_CMD_START,
+	GPI_CH_CMD_STOP,
+	GPI_CH_CMD_RESET,
+	GPI_CH_CMD_DE_ALLOC,
+	GPI_CH_CMD_UART_SW_STALE,
+	GPI_CH_CMD_UART_RFR_READY,
+	GPI_CH_CMD_UART_RFR_NOT_READY,
+	GPI_CH_CMD_END = GPI_CH_CMD_UART_RFR_NOT_READY,
+	GPI_EV_CMD_BEGIN,
+	GPI_EV_CMD_ALLOCATE = GPI_EV_CMD_BEGIN,
+	GPI_EV_CMD_RESET,
+	GPI_EV_CMD_DEALLOC,
+	GPI_EV_CMD_END = GPI_EV_CMD_DEALLOC,
+	GPI_MAX_CMD,
+};
+
+#define IS_CHAN_CMD(_cmd) ((_cmd) <= GPI_CH_CMD_END)
+
+static const char *const gpi_cmd_str[GPI_MAX_CMD] = {
+	[GPI_CH_CMD_ALLOCATE] = "CH ALLOCATE",
+	[GPI_CH_CMD_START] = "CH START",
+	[GPI_CH_CMD_STOP] = "CH STOP",
+	[GPI_CH_CMD_RESET] = "CH_RESET",
+	[GPI_CH_CMD_DE_ALLOC] = "DE ALLOC",
+	[GPI_CH_CMD_UART_SW_STALE] = "UART SW STALE",
+	[GPI_CH_CMD_UART_RFR_READY] = "UART RFR READY",
+	[GPI_CH_CMD_UART_RFR_NOT_READY] = "UART RFR NOT READY",
+	[GPI_EV_CMD_ALLOCATE] = "EV ALLOCATE",
+	[GPI_EV_CMD_RESET] = "EV RESET",
+	[GPI_EV_CMD_DEALLOC] = "EV DEALLOC",
+};
+
+#define TO_GPI_CMD_STR(_cmd) (((_cmd) >= GPI_MAX_CMD) ? "INVALID" : \
+				  gpi_cmd_str[(_cmd)])
+
+/*
+ * @DISABLE_STATE: no register access allowed
+ * @CONFIG_STATE:  client has configured the channel
+ * @PREP_HARDWARE: register access is allowed
+ *		   however, no processing EVENTS
+ * @ACTIVE_STATE: channels are fully operational
+ * @PREPARE_TERMINATE: graceful termination of channels
+ *		       register access is allowed
+ * @PAUSE_STATE: channels are active, but not processing any events
+ */
+enum gpi_pm_state {
+	DISABLE_STATE,
+	CONFIG_STATE,
+	PREPARE_HARDWARE,
+	ACTIVE_STATE,
+	PREPARE_TERMINATE,
+	PAUSE_STATE,
+	MAX_PM_STATE
+};
+
+#define REG_ACCESS_VALID(_pm_state) ((_pm_state) >= PREPARE_HARDWARE)
+
+static const char *const gpi_pm_state_str[MAX_PM_STATE] = {
+	[DISABLE_STATE] = "DISABLE",
+	[CONFIG_STATE] = "CONFIG",
+	[PREPARE_HARDWARE] = "PREPARE HARDWARE",
+	[ACTIVE_STATE] = "ACTIVE",
+	[PREPARE_TERMINATE] = "PREPARE TERMINATE",
+	[PAUSE_STATE] = "PAUSE",
+};
+
+#define TO_GPI_PM_STR(_state) (((_state) >= MAX_PM_STATE) ? \
+			      "INVALID" : gpi_pm_state_str[(_state)])
+
+static const struct {
+	enum gpi_cmd gpi_cmd;
+	u32 opcode;
+	u32 state;
+} gpi_cmd_info[GPI_MAX_CMD] = {
+	{
+		GPI_CH_CMD_ALLOCATE,
+		GPII_n_CH_CMD_ALLOCATE,
+		CH_STATE_ALLOCATED,
+	},
+	{
+		GPI_CH_CMD_START,
+		GPII_n_CH_CMD_START,
+		CH_STATE_STARTED,
+	},
+	{
+		GPI_CH_CMD_STOP,
+		GPII_n_CH_CMD_STOP,
+		CH_STATE_STOPPED,
+	},
+	{
+		GPI_CH_CMD_RESET,
+		GPII_n_CH_CMD_RESET,
+		CH_STATE_ALLOCATED,
+	},
+	{
+		GPI_CH_CMD_DE_ALLOC,
+		GPII_n_CH_CMD_DE_ALLOC,
+		CH_STATE_NOT_ALLOCATED,
+	},
+	{
+		GPI_CH_CMD_UART_SW_STALE,
+		GPII_n_CH_CMD_UART_SW_STALE,
+		STATE_IGNORE,
+	},
+	{
+		GPI_CH_CMD_UART_RFR_READY,
+		GPII_n_CH_CMD_UART_RFR_READY,
+		STATE_IGNORE,
+	},
+	{
+		GPI_CH_CMD_UART_RFR_NOT_READY,
+		GPII_n_CH_CMD_UART_RFR_NOT_READY,
+		STATE_IGNORE,
+	},
+	{
+		GPI_EV_CMD_ALLOCATE,
+		GPII_n_EV_CH_CMD_ALLOCATE,
+		EV_STATE_ALLOCATED,
+	},
+	{
+		GPI_EV_CMD_RESET,
+		GPII_n_EV_CH_CMD_RESET,
+		EV_STATE_ALLOCATED,
+	},
+	{
+		GPI_EV_CMD_DEALLOC,
+		GPII_n_EV_CH_CMD_DE_ALLOC,
+		EV_STATE_NOT_ALLOCATED,
+	},
+};
+
+struct gpi_ring {
+	void *pre_aligned;
+	size_t alloc_size;
+	phys_addr_t phys_addr;
+	dma_addr_t dma_handle;
+	void *base;
+	void *wp;
+	void *rp;
+	u32 len;
+	u32 el_size;
+	u32 elements;
+	bool configured;
+};
+
+struct gpi_dev {
+	struct dma_device dma_device;
+	struct device *dev;
+	struct resource *res;
+	void __iomem *regs;
+	void __iomem *ee_base; /*ee register base address*/
+	u32 max_gpii; /* maximum # of gpii instances available per gpi block */
+	u32 gpii_mask; /* gpii instances available for apps */
+	u32 ev_factor; /* ev ring length factor */
+	struct gpii *gpiis;
+};
+
+struct reg_info {
+	char *name;
+	u32 offset;
+	u32 val;
+};
+
+struct gchan {
+	struct virt_dma_chan vc;
+	u32 chid;
+	u32 seid;
+	u32 protocol;
+	struct gpii *gpii;
+	enum gpi_ch_state ch_state;
+	enum gpi_pm_state pm_state;
+	void __iomem *ch_cntxt_base_reg;
+	void __iomem *ch_cntxt_db_reg;
+	void __iomem *ch_cmd_reg;
+	u32 dir;
+	struct gpi_ring ch_ring;
+	void *config;
+};
+
+struct gpii {
+	u32 gpii_id;
+	struct gchan gchan[MAX_CHANNELS_PER_GPII];
+	struct gpi_dev *gpi_dev;
+	int irq;
+	void __iomem *regs; /* points to gpi top */
+	void __iomem *ev_cntxt_base_reg;
+	void __iomem *ev_cntxt_db_reg;
+	void __iomem *ev_ring_rp_lsb_reg;
+	void __iomem *ev_cmd_reg;
+	void __iomem *ieob_clr_reg;
+	struct mutex ctrl_lock;
+	enum gpi_ev_state ev_state;
+	bool configured_irq;
+	enum gpi_pm_state pm_state;
+	rwlock_t pm_lock;
+	struct gpi_ring ev_ring;
+	struct tasklet_struct ev_task; /* event processing tasklet */
+	struct completion cmd_completion;
+	enum gpi_cmd gpi_cmd;
+	u32 cntxt_type_irq_msk;
+	bool ieob_set;
+};
+
+#define MAX_TRE 3
+
+struct gpi_desc {
+	struct virt_dma_desc vd;
+	size_t len;
+	void *db; /* DB register to program */
+	struct gchan *gchan;
+	struct gpi_tre tre[MAX_TRE];
+	u32 num_tre;
+};
+
+static const u32 GPII_CHAN_DIR[MAX_CHANNELS_PER_GPII] = {
+	GPI_CHTYPE_DIR_OUT, GPI_CHTYPE_DIR_IN
+};
+
+static irqreturn_t gpi_handle_irq(int irq, void *data);
+static void gpi_ring_recycle_ev_element(struct gpi_ring *ring);
+static int gpi_ring_add_element(struct gpi_ring *ring, void **wp);
+static void gpi_process_events(struct gpii *gpii);
+
+static inline struct gchan *to_gchan(struct dma_chan *dma_chan)
+{
+	return container_of(dma_chan, struct gchan, vc.chan);
+}
+
+static inline struct gpi_desc *to_gpi_desc(struct virt_dma_desc *vd)
+{
+	return container_of(vd, struct gpi_desc, vd);
+}
+
+static inline phys_addr_t to_physical(const struct gpi_ring *const ring,
+				      void *addr)
+{
+	return ring->phys_addr + (addr - ring->base);
+}
+
+static inline void *to_virtual(const struct gpi_ring *const ring, phys_addr_t addr)
+{
+	return ring->base + (addr - ring->phys_addr);
+}
+
+static inline u32 gpi_read_reg(struct gpii *gpii, void __iomem *addr)
+{
+	return readl_relaxed(addr);
+}
+
+static inline void gpi_write_reg(struct gpii *gpii, void __iomem *addr, u32 val)
+{
+	writel_relaxed(val, addr);
+}
+
+/* gpi_write_reg_field - write to specific bit field */
+static inline void gpi_write_reg_field(struct gpii *gpii, void __iomem *addr,
+				       u32 mask, u32 shift, u32 val)
+{
+	u32 tmp = gpi_read_reg(gpii, addr);
+
+	tmp &= ~mask;
+	val = tmp | ((val << shift) & mask);
+	gpi_write_reg(gpii, addr, val);
+}
+
+static inline void
+gpi_update_reg(struct gpii *gpii, u32 offset, u32 mask, u32 val)
+{
+	void __iomem *addr = gpii->regs + offset;
+	u32 tmp = gpi_read_reg(gpii, addr);
+
+	tmp &= ~mask;
+	tmp |= u32_encode_bits(val, mask);
+
+	gpi_write_reg(gpii, addr, tmp);
+}
+
+static void gpi_disable_interrupts(struct gpii *gpii)
+{
+	gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0);
+	gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id),
+		       GPII_n_CNTXT_INTSET_BMSK, 0);
+
+	gpii->cntxt_type_irq_msk = 0;
+	devm_free_irq(gpii->gpi_dev->dev, gpii->irq, gpii);
+	gpii->configured_irq = false;
+}
+
+/* configure and enable interrupts */
+static int gpi_config_interrupts(struct gpii *gpii, enum gpii_irq_settings settings, bool mask)
+{
+	const u32 enable = (GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL |
+			      GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB |
+			      GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB |
+			      GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL |
+			      GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL);
+	int ret;
+
+	if (!gpii->configured_irq) {
+		ret = devm_request_irq(gpii->gpi_dev->dev, gpii->irq,
+				       gpi_handle_irq, IRQF_TRIGGER_HIGH,
+				       "gpi-dma", gpii);
+		if (ret < 0) {
+			dev_err(gpii->gpi_dev->dev, "error request irq:%d ret:%d\n",
+				gpii->irq, ret);
+			return ret;
+		}
+	}
+
+	if (settings == MASK_IEOB_SETTINGS) {
+		/*
+		 * GPII only uses one EV ring per gpii so we can globally
+		 * enable/disable IEOB interrupt
+		 */
+		if (mask)
+			gpii->cntxt_type_irq_msk |= GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB;
+		else
+			gpii->cntxt_type_irq_msk &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB);
+		gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, gpii->cntxt_type_irq_msk);
+	} else {
+		gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, enable);
+		gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK,
+			       GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK);
+		gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK,
+			       GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK);
+		gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK,
+			       GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK);
+		gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_GPII_IRQ_EN_BMSK,
+			       GPII_n_CNTXT_GPII_IRQ_EN_BMSK);
+		gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_GPII_IRQ_EN_BMSK, GPII_n_CNTXT_GPII_IRQ_EN_BMSK);
+		gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_LSB_OFFS(gpii->gpii_id), U32_MAX, 0);
+		gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_MSB_OFFS(gpii->gpii_id), U32_MAX, 0);
+		gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_0_OFFS(gpii->gpii_id), U32_MAX, 0);
+		gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_1_OFFS(gpii->gpii_id), U32_MAX, 0);
+		gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id),
+			       GPII_n_CNTXT_INTSET_BMSK, 1);
+		gpi_update_reg(gpii, GPII_n_ERROR_LOG_OFFS(gpii->gpii_id), U32_MAX, 0);
+
+		gpii->cntxt_type_irq_msk = enable;
+	}
+
+	gpii->configured_irq = true;
+	return 0;
+}
+
+/* Sends gpii event or channel command */
+static int gpi_send_cmd(struct gpii *gpii, struct gchan *gchan,
+			enum gpi_cmd gpi_cmd)
+{
+	u32 chid = MAX_CHANNELS_PER_GPII;
+	unsigned long timeout;
+	void __iomem *cmd_reg;
+	u32 cmd;
+
+	if (gpi_cmd >= GPI_MAX_CMD)
+		return -EINVAL;
+	if (IS_CHAN_CMD(gpi_cmd))
+		chid = gchan->chid;
+
+	dev_dbg(gpii->gpi_dev->dev,
+		"sending cmd: %s:%u\n", TO_GPI_CMD_STR(gpi_cmd), chid);
+
+	/* send opcode and wait for completion */
+	reinit_completion(&gpii->cmd_completion);
+	gpii->gpi_cmd = gpi_cmd;
+
+	cmd_reg = IS_CHAN_CMD(gpi_cmd) ? gchan->ch_cmd_reg : gpii->ev_cmd_reg;
+	cmd = IS_CHAN_CMD(gpi_cmd) ? GPII_n_CH_CMD(gpi_cmd_info[gpi_cmd].opcode, chid) :
+				     GPII_n_EV_CMD(gpi_cmd_info[gpi_cmd].opcode, 0);
+	gpi_write_reg(gpii, cmd_reg, cmd);
+	timeout = wait_for_completion_timeout(&gpii->cmd_completion,
+					      msecs_to_jiffies(CMD_TIMEOUT_MS));
+	if (!timeout) {
+		dev_err(gpii->gpi_dev->dev, "cmd: %s completion timeout:%u\n",
+			TO_GPI_CMD_STR(gpi_cmd), chid);
+		return -EIO;
+	}
+
+	/* confirm new ch state is correct , if the cmd is a state change cmd */
+	if (gpi_cmd_info[gpi_cmd].state == STATE_IGNORE)
+		return 0;
+
+	if (IS_CHAN_CMD(gpi_cmd) && gchan->ch_state == gpi_cmd_info[gpi_cmd].state)
+		return 0;
+
+	if (!IS_CHAN_CMD(gpi_cmd) && gpii->ev_state == gpi_cmd_info[gpi_cmd].state)
+		return 0;
+
+	return -EIO;
+}
+
+/* program transfer ring DB register */
+static inline void gpi_write_ch_db(struct gchan *gchan,
+				   struct gpi_ring *ring, void *wp)
+{
+	struct gpii *gpii = gchan->gpii;
+	phys_addr_t p_wp;
+
+	p_wp = to_physical(ring, wp);
+	gpi_write_reg(gpii, gchan->ch_cntxt_db_reg, p_wp);
+}
+
+/* program event ring DB register */
+static inline void gpi_write_ev_db(struct gpii *gpii,
+				   struct gpi_ring *ring, void *wp)
+{
+	phys_addr_t p_wp;
+
+	p_wp = ring->phys_addr + (wp - ring->base);
+	gpi_write_reg(gpii, gpii->ev_cntxt_db_reg, p_wp);
+}
+
+/* process transfer completion interrupt */
+static void gpi_process_ieob(struct gpii *gpii)
+{
+	gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0));
+
+	gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 0);
+	tasklet_hi_schedule(&gpii->ev_task);
+}
+
+/* process channel control interrupt */
+static void gpi_process_ch_ctrl_irq(struct gpii *gpii)
+{
+	u32 gpii_id = gpii->gpii_id;
+	u32 offset = GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(gpii_id);
+	u32 ch_irq = gpi_read_reg(gpii, gpii->regs + offset);
+	struct gchan *gchan;
+	u32 chid, state;
+
+	/* clear the status */
+	offset = GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(gpii_id);
+	gpi_write_reg(gpii, gpii->regs + offset, (u32)ch_irq);
+
+	for (chid = 0; chid < MAX_CHANNELS_PER_GPII; chid++) {
+		if (!(BIT(chid) & ch_irq))
+			continue;
+
+		gchan = &gpii->gchan[chid];
+		state = gpi_read_reg(gpii, gchan->ch_cntxt_base_reg +
+				     CNTXT_0_CONFIG);
+		state = FIELD_GET(GPII_n_CH_k_CNTXT_0_CHSTATE, state);
+
+		/*
+		 * CH_CMD_DEALLOC cmd always successful. However cmd does
+		 * not change hardware status. So overwriting software state
+		 * to default state.
+		 */
+		if (gpii->gpi_cmd == GPI_CH_CMD_DE_ALLOC)
+			state = DEFAULT_CH_STATE;
+		gchan->ch_state = state;
+
+		/*
+		 * Triggering complete all if ch_state is not a stop in process.
+		 * Stop in process is a transition state and we will wait for
+		 * stop interrupt before notifying.
+		 */
+		if (gchan->ch_state != CH_STATE_STOP_IN_PROC)
+			complete_all(&gpii->cmd_completion);
+	}
+}
+
+/* processing gpi general error interrupts */
+static void gpi_process_gen_err_irq(struct gpii *gpii)
+{
+	u32 gpii_id = gpii->gpii_id;
+	u32 offset = GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(gpii_id);
+	u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset);
+
+	/* clear the status */
+	dev_dbg(gpii->gpi_dev->dev, "irq_stts:0x%x\n", irq_stts);
+
+	/* Clear the register */
+	offset = GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(gpii_id);
+	gpi_write_reg(gpii, gpii->regs + offset, irq_stts);
+}
+
+/* processing gpi level error interrupts */
+static void gpi_process_glob_err_irq(struct gpii *gpii)
+{
+	u32 gpii_id = gpii->gpii_id;
+	u32 offset = GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(gpii_id);
+	u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset);
+
+	offset = GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(gpii_id);
+	gpi_write_reg(gpii, gpii->regs + offset, irq_stts);
+
+	/* only error interrupt should be set */
+	if (irq_stts & ~GPI_GLOB_IRQ_ERROR_INT_MSK) {
+		dev_err(gpii->gpi_dev->dev, "invalid error status:0x%x\n", irq_stts);
+		return;
+	}
+
+	offset = GPII_n_ERROR_LOG_OFFS(gpii_id);
+	gpi_write_reg(gpii, gpii->regs + offset, 0);
+}
+
+/* gpii interrupt handler */
+static irqreturn_t gpi_handle_irq(int irq, void *data)
+{
+	struct gpii *gpii = data;
+	u32 gpii_id = gpii->gpii_id;
+	u32 type, offset;
+	unsigned long flags;
+
+	read_lock_irqsave(&gpii->pm_lock, flags);
+
+	/*
+	 * States are out of sync to receive interrupt
+	 * while software state is in DISABLE state, bailing out.
+	 */
+	if (!REG_ACCESS_VALID(gpii->pm_state)) {
+		dev_err(gpii->gpi_dev->dev, "receive interrupt while in %s state\n",
+			TO_GPI_PM_STR(gpii->pm_state));
+		goto exit_irq;
+	}
+
+	offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id);
+	type = gpi_read_reg(gpii, gpii->regs + offset);
+
+	do {
+		/* global gpii error */
+		if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB) {
+			gpi_process_glob_err_irq(gpii);
+			type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB);
+		}
+
+		/* transfer complete interrupt */
+		if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB) {
+			gpi_process_ieob(gpii);
+			type &= ~GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB;
+		}
+
+		/* event control irq */
+		if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL) {
+			u32 ev_state;
+			u32 ev_ch_irq;
+
+			dev_dbg(gpii->gpi_dev->dev,
+				"processing EV CTRL interrupt\n");
+			offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(gpii_id);
+			ev_ch_irq = gpi_read_reg(gpii, gpii->regs + offset);
+
+			offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS
+				(gpii_id);
+			gpi_write_reg(gpii, gpii->regs + offset, ev_ch_irq);
+			ev_state = gpi_read_reg(gpii, gpii->ev_cntxt_base_reg +
+						CNTXT_0_CONFIG);
+			ev_state = FIELD_GET(GPII_n_EV_k_CNTXT_0_CHSTATE, ev_state);
+
+			/*
+			 * CMD EV_CMD_DEALLOC is always successful. However
+			 * cmd does not change hardware status. So overwriting
+			 * software state to default state.
+			 */
+			if (gpii->gpi_cmd == GPI_EV_CMD_DEALLOC)
+				ev_state = DEFAULT_EV_CH_STATE;
+
+			gpii->ev_state = ev_state;
+			dev_dbg(gpii->gpi_dev->dev, "setting EV state to %s\n",
+				TO_GPI_EV_STATE_STR(gpii->ev_state));
+			complete_all(&gpii->cmd_completion);
+			type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL);
+		}
+
+		/* channel control irq */
+		if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL) {
+			dev_dbg(gpii->gpi_dev->dev, "process CH CTRL interrupts\n");
+			gpi_process_ch_ctrl_irq(gpii);
+			type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL);
+		}
+
+		if (type) {
+			dev_err(gpii->gpi_dev->dev, "Unhandled interrupt status:0x%x\n", type);
+			gpi_process_gen_err_irq(gpii);
+			goto exit_irq;
+		}
+
+		offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id);
+		type = gpi_read_reg(gpii, gpii->regs + offset);
+	} while (type);
+
+exit_irq:
+	read_unlock_irqrestore(&gpii->pm_lock, flags);
+
+	return IRQ_HANDLED;
+}
+
+/* process DMA Immediate completion data events */
+static void gpi_process_imed_data_event(struct gchan *gchan,
+					struct immediate_data_event *imed_event)
+{
+	struct gpii *gpii = gchan->gpii;
+	struct gpi_ring *ch_ring = &gchan->ch_ring;
+	void *tre = ch_ring->base + (ch_ring->el_size * imed_event->tre_index);
+	struct dmaengine_result result;
+	struct gpi_desc *gpi_desc;
+	struct virt_dma_desc *vd;
+	unsigned long flags;
+	u32 chid;
+
+	/*
+	 * If channel not active don't process event
+	 */
+	if (gchan->pm_state != ACTIVE_STATE) {
+		dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n",
+			TO_GPI_PM_STR(gchan->pm_state));
+		return;
+	}
+
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	vd = vchan_next_desc(&gchan->vc);
+	if (!vd) {
+		struct gpi_ere *gpi_ere;
+		struct gpi_tre *gpi_tre;
+
+		spin_unlock_irqrestore(&gchan->vc.lock, flags);
+		dev_dbg(gpii->gpi_dev->dev, "event without a pending descriptor!\n");
+		gpi_ere = (struct gpi_ere *)imed_event;
+		dev_dbg(gpii->gpi_dev->dev,
+			"Event: %08x %08x %08x %08x\n",
+			gpi_ere->dword[0], gpi_ere->dword[1],
+			gpi_ere->dword[2], gpi_ere->dword[3]);
+		gpi_tre = tre;
+		dev_dbg(gpii->gpi_dev->dev,
+			"Pending TRE: %08x %08x %08x %08x\n",
+			gpi_tre->dword[0], gpi_tre->dword[1],
+			gpi_tre->dword[2], gpi_tre->dword[3]);
+		return;
+	}
+	gpi_desc = to_gpi_desc(vd);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+
+	/*
+	 * RP pointed by Event is to last TRE processed,
+	 * we need to update ring rp to tre + 1
+	 */
+	tre += ch_ring->el_size;
+	if (tre >= (ch_ring->base + ch_ring->len))
+		tre = ch_ring->base;
+	ch_ring->rp = tre;
+
+	/* make sure rp updates are immediately visible to all cores */
+	smp_wmb();
+
+	chid = imed_event->chid;
+	if (imed_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) {
+		if (chid == GPI_RX_CHAN)
+			goto gpi_free_desc;
+		else
+			return;
+	}
+
+	if (imed_event->code == MSM_GPI_TCE_UNEXP_ERR)
+		result.result = DMA_TRANS_ABORTED;
+	else
+		result.result = DMA_TRANS_NOERROR;
+	result.residue = gpi_desc->len - imed_event->length;
+
+	dma_cookie_complete(&vd->tx);
+	dmaengine_desc_get_callback_invoke(&vd->tx, &result);
+
+gpi_free_desc:
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	list_del(&vd->node);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+	kfree(gpi_desc);
+	gpi_desc = NULL;
+}
+
+/* processing transfer completion events */
+static void gpi_process_xfer_compl_event(struct gchan *gchan,
+					 struct xfer_compl_event *compl_event)
+{
+	struct gpii *gpii = gchan->gpii;
+	struct gpi_ring *ch_ring = &gchan->ch_ring;
+	void *ev_rp = to_virtual(ch_ring, compl_event->ptr);
+	struct virt_dma_desc *vd;
+	struct gpi_desc *gpi_desc;
+	struct dmaengine_result result;
+	unsigned long flags;
+	u32 chid;
+
+	/* only process events on active channel */
+	if (unlikely(gchan->pm_state != ACTIVE_STATE)) {
+		dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n",
+			TO_GPI_PM_STR(gchan->pm_state));
+		return;
+	}
+
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	vd = vchan_next_desc(&gchan->vc);
+	if (!vd) {
+		struct gpi_ere *gpi_ere;
+
+		spin_unlock_irqrestore(&gchan->vc.lock, flags);
+		dev_err(gpii->gpi_dev->dev, "Event without a pending descriptor!\n");
+		gpi_ere = (struct gpi_ere *)compl_event;
+		dev_err(gpii->gpi_dev->dev,
+			"Event: %08x %08x %08x %08x\n",
+			gpi_ere->dword[0], gpi_ere->dword[1],
+			gpi_ere->dword[2], gpi_ere->dword[3]);
+		return;
+	}
+
+	gpi_desc = to_gpi_desc(vd);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+
+	/*
+	 * RP pointed by Event is to last TRE processed,
+	 * we need to update ring rp to ev_rp + 1
+	 */
+	ev_rp += ch_ring->el_size;
+	if (ev_rp >= (ch_ring->base + ch_ring->len))
+		ev_rp = ch_ring->base;
+	ch_ring->rp = ev_rp;
+
+	/* update must be visible to other cores */
+	smp_wmb();
+
+	chid = compl_event->chid;
+	if (compl_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) {
+		if (chid == GPI_RX_CHAN)
+			goto gpi_free_desc;
+		else
+			return;
+	}
+
+	if (compl_event->code == MSM_GPI_TCE_UNEXP_ERR) {
+		dev_err(gpii->gpi_dev->dev, "Error in Transaction\n");
+		result.result = DMA_TRANS_ABORTED;
+	} else {
+		dev_dbg(gpii->gpi_dev->dev, "Transaction Success\n");
+		result.result = DMA_TRANS_NOERROR;
+	}
+	result.residue = gpi_desc->len - compl_event->length;
+	dev_dbg(gpii->gpi_dev->dev, "Residue %d\n", result.residue);
+
+	dma_cookie_complete(&vd->tx);
+	dmaengine_desc_get_callback_invoke(&vd->tx, &result);
+
+gpi_free_desc:
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	list_del(&vd->node);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+	kfree(gpi_desc);
+	gpi_desc = NULL;
+}
+
+/* process all events */
+static void gpi_process_events(struct gpii *gpii)
+{
+	struct gpi_ring *ev_ring = &gpii->ev_ring;
+	phys_addr_t cntxt_rp;
+	void *rp;
+	union gpi_event *gpi_event;
+	struct gchan *gchan;
+	u32 chid, type;
+
+	cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg);
+	rp = to_virtual(ev_ring, cntxt_rp);
+
+	do {
+		while (rp != ev_ring->rp) {
+			gpi_event = ev_ring->rp;
+			chid = gpi_event->xfer_compl_event.chid;
+			type = gpi_event->xfer_compl_event.type;
+
+			dev_dbg(gpii->gpi_dev->dev,
+				"Event: CHID:%u, type:%x %08x %08x %08x %08x\n",
+				chid, type, gpi_event->gpi_ere.dword[0],
+				gpi_event->gpi_ere.dword[1], gpi_event->gpi_ere.dword[2],
+				gpi_event->gpi_ere.dword[3]);
+
+			switch (type) {
+			case XFER_COMPLETE_EV_TYPE:
+				gchan = &gpii->gchan[chid];
+				gpi_process_xfer_compl_event(gchan,
+							     &gpi_event->xfer_compl_event);
+				break;
+			case STALE_EV_TYPE:
+				dev_dbg(gpii->gpi_dev->dev, "stale event, not processing\n");
+				break;
+			case IMMEDIATE_DATA_EV_TYPE:
+				gchan = &gpii->gchan[chid];
+				gpi_process_imed_data_event(gchan,
+							    &gpi_event->immediate_data_event);
+				break;
+			case QUP_NOTIF_EV_TYPE:
+				dev_dbg(gpii->gpi_dev->dev, "QUP_NOTIF_EV_TYPE\n");
+				break;
+			default:
+				dev_dbg(gpii->gpi_dev->dev,
+					"not supported event type:0x%x\n", type);
+			}
+			gpi_ring_recycle_ev_element(ev_ring);
+		}
+		gpi_write_ev_db(gpii, ev_ring, ev_ring->wp);
+
+		/* clear pending IEOB events */
+		gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0));
+
+		cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg);
+		rp = to_virtual(ev_ring, cntxt_rp);
+
+	} while (rp != ev_ring->rp);
+}
+
+/* processing events using tasklet */
+static void gpi_ev_tasklet(unsigned long data)
+{
+	struct gpii *gpii = (struct gpii *)data;
+
+	read_lock_bh(&gpii->pm_lock);
+	if (!REG_ACCESS_VALID(gpii->pm_state)) {
+		read_unlock_bh(&gpii->pm_lock);
+		dev_err(gpii->gpi_dev->dev, "not processing any events, pm_state:%s\n",
+			TO_GPI_PM_STR(gpii->pm_state));
+		return;
+	}
+
+	/* process the events */
+	gpi_process_events(gpii);
+
+	/* enable IEOB, switching back to interrupts */
+	gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 1);
+	read_unlock_bh(&gpii->pm_lock);
+}
+
+/* marks all pending events for the channel as stale */
+static void gpi_mark_stale_events(struct gchan *gchan)
+{
+	struct gpii *gpii = gchan->gpii;
+	struct gpi_ring *ev_ring = &gpii->ev_ring;
+	u32 cntxt_rp, local_rp;
+	void *ev_rp;
+
+	cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg);
+
+	ev_rp = ev_ring->rp;
+	local_rp = (u32)to_physical(ev_ring, ev_rp);
+	while (local_rp != cntxt_rp) {
+		union gpi_event *gpi_event = ev_rp;
+		u32 chid = gpi_event->xfer_compl_event.chid;
+
+		if (chid == gchan->chid)
+			gpi_event->xfer_compl_event.type = STALE_EV_TYPE;
+		ev_rp += ev_ring->el_size;
+		if (ev_rp >= (ev_ring->base + ev_ring->len))
+			ev_rp = ev_ring->base;
+		cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg);
+		local_rp = (u32)to_physical(ev_ring, ev_rp);
+	}
+}
+
+/* reset sw state and issue channel reset or de-alloc */
+static int gpi_reset_chan(struct gchan *gchan, enum gpi_cmd gpi_cmd)
+{
+	struct gpii *gpii = gchan->gpii;
+	struct gpi_ring *ch_ring = &gchan->ch_ring;
+	unsigned long flags;
+	LIST_HEAD(list);
+	int ret;
+
+	ret = gpi_send_cmd(gpii, gchan, gpi_cmd);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n",
+			TO_GPI_CMD_STR(gpi_cmd), ret);
+		return ret;
+	}
+
+	/* initialize the local ring ptrs */
+	ch_ring->rp = ch_ring->base;
+	ch_ring->wp = ch_ring->base;
+
+	/* visible to other cores */
+	smp_wmb();
+
+	/* check event ring for any stale events */
+	write_lock_irq(&gpii->pm_lock);
+	gpi_mark_stale_events(gchan);
+
+	/* remove all async descriptors */
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	vchan_get_all_descriptors(&gchan->vc, &list);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+	write_unlock_irq(&gpii->pm_lock);
+	vchan_dma_desc_free_list(&gchan->vc, &list);
+
+	return 0;
+}
+
+static int gpi_start_chan(struct gchan *gchan)
+{
+	struct gpii *gpii = gchan->gpii;
+	int ret;
+
+	ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_START);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n",
+			TO_GPI_CMD_STR(GPI_CH_CMD_START), ret);
+		return ret;
+	}
+
+	/* gpii CH is active now */
+	write_lock_irq(&gpii->pm_lock);
+	gchan->pm_state = ACTIVE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+
+	return 0;
+}
+
+static int gpi_stop_chan(struct gchan *gchan)
+{
+	struct gpii *gpii = gchan->gpii;
+	int ret;
+
+	ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_STOP);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n",
+			TO_GPI_CMD_STR(GPI_CH_CMD_STOP), ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* allocate and configure the transfer channel */
+static int gpi_alloc_chan(struct gchan *chan, bool send_alloc_cmd)
+{
+	struct gpii *gpii = chan->gpii;
+	struct gpi_ring *ring = &chan->ch_ring;
+	int ret;
+	u32 id = gpii->gpii_id;
+	u32 chid = chan->chid;
+	u32 pair_chid = !chid;
+
+	if (send_alloc_cmd) {
+		ret = gpi_send_cmd(gpii, chan, GPI_CH_CMD_ALLOCATE);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n",
+				TO_GPI_CMD_STR(GPI_CH_CMD_ALLOCATE), ret);
+			return ret;
+		}
+	}
+
+	gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_0_CONFIG,
+		      GPII_n_CH_k_CNTXT_0(ring->el_size, 0, chan->dir, GPI_CHTYPE_PROTO_GPI));
+	gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_1_R_LENGTH, ring->len);
+	gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_2_RING_BASE_LSB, ring->phys_addr);
+	gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_3_RING_BASE_MSB,
+		      upper_32_bits(ring->phys_addr));
+	gpi_write_reg(gpii, chan->ch_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB,
+		      upper_32_bits(ring->phys_addr));
+	gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_0_OFFS(id, chid),
+		      GPII_n_CH_k_SCRATCH_0(pair_chid, chan->protocol, chan->seid));
+	gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_1_OFFS(id, chid), 0);
+	gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_2_OFFS(id, chid), 0);
+	gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_3_OFFS(id, chid), 0);
+	gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_QOS_OFFS(id, chid), 1);
+
+	/* flush all the writes */
+	wmb();
+	return 0;
+}
+
+/* allocate and configure event ring */
+static int gpi_alloc_ev_chan(struct gpii *gpii)
+{
+	struct gpi_ring *ring = &gpii->ev_ring;
+	void __iomem *base = gpii->ev_cntxt_base_reg;
+	int ret;
+
+	ret = gpi_send_cmd(gpii, NULL, GPI_EV_CMD_ALLOCATE);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "error with cmd:%s ret:%d\n",
+			TO_GPI_CMD_STR(GPI_EV_CMD_ALLOCATE), ret);
+		return ret;
+	}
+
+	/* program event context */
+	gpi_write_reg(gpii, base + CNTXT_0_CONFIG,
+		      GPII_n_EV_k_CNTXT_0(ring->el_size, GPI_INTTYPE_IRQ, GPI_CHTYPE_GPI_EV));
+	gpi_write_reg(gpii, base + CNTXT_1_R_LENGTH, ring->len);
+	gpi_write_reg(gpii, base + CNTXT_2_RING_BASE_LSB, lower_32_bits(ring->phys_addr));
+	gpi_write_reg(gpii, base + CNTXT_3_RING_BASE_MSB, upper_32_bits(ring->phys_addr));
+	gpi_write_reg(gpii, gpii->ev_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB,
+		      upper_32_bits(ring->phys_addr));
+	gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0);
+	gpi_write_reg(gpii, base + CNTXT_10_RING_MSI_LSB, 0);
+	gpi_write_reg(gpii, base + CNTXT_11_RING_MSI_MSB, 0);
+	gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0);
+	gpi_write_reg(gpii, base + CNTXT_12_RING_RP_UPDATE_LSB, 0);
+	gpi_write_reg(gpii, base + CNTXT_13_RING_RP_UPDATE_MSB, 0);
+
+	/* add events to ring */
+	ring->wp = (ring->base + ring->len - ring->el_size);
+
+	/* flush all the writes */
+	wmb();
+
+	/* gpii is active now */
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = ACTIVE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+	gpi_write_ev_db(gpii, ring, ring->wp);
+
+	return 0;
+}
+
+/* calculate # of ERE/TRE available to queue */
+static int gpi_ring_num_elements_avail(const struct gpi_ring * const ring)
+{
+	int elements = 0;
+
+	if (ring->wp < ring->rp) {
+		elements = ((ring->rp - ring->wp) / ring->el_size) - 1;
+	} else {
+		elements = (ring->rp - ring->base) / ring->el_size;
+		elements += ((ring->base + ring->len - ring->wp) / ring->el_size) - 1;
+	}
+
+	return elements;
+}
+
+static int gpi_ring_add_element(struct gpi_ring *ring, void **wp)
+{
+	if (gpi_ring_num_elements_avail(ring) <= 0)
+		return -ENOMEM;
+
+	*wp = ring->wp;
+	ring->wp += ring->el_size;
+	if (ring->wp  >= (ring->base + ring->len))
+		ring->wp = ring->base;
+
+	/* visible to other cores */
+	smp_wmb();
+
+	return 0;
+}
+
+static void gpi_ring_recycle_ev_element(struct gpi_ring *ring)
+{
+	/* Update the WP */
+	ring->wp += ring->el_size;
+	if (ring->wp  >= (ring->base + ring->len))
+		ring->wp = ring->base;
+
+	/* Update the RP */
+	ring->rp += ring->el_size;
+	if (ring->rp  >= (ring->base + ring->len))
+		ring->rp = ring->base;
+
+	/* visible to other cores */
+	smp_wmb();
+}
+
+static void gpi_free_ring(struct gpi_ring *ring,
+			  struct gpii *gpii)
+{
+	dma_free_coherent(gpii->gpi_dev->dev, ring->alloc_size,
+			  ring->pre_aligned, ring->dma_handle);
+	memset(ring, 0, sizeof(*ring));
+}
+
+/* allocate memory for transfer and event rings */
+static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements,
+			  u32 el_size, struct gpii *gpii)
+{
+	u64 len = elements * el_size;
+	int bit;
+
+	/* ring len must be power of 2 */
+	bit = find_last_bit((unsigned long *)&len, 32);
+	if (((1 << bit) - 1) & len)
+		bit++;
+	len = 1 << bit;
+	ring->alloc_size = (len + (len - 1));
+	dev_dbg(gpii->gpi_dev->dev,
+		"#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n",
+		  elements, el_size, (elements * el_size), len,
+		  ring->alloc_size);
+
+	ring->pre_aligned = dma_alloc_coherent(gpii->gpi_dev->dev,
+					       ring->alloc_size,
+					       &ring->dma_handle, GFP_KERNEL);
+	if (!ring->pre_aligned) {
+		dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n",
+			ring->alloc_size);
+		return -ENOMEM;
+	}
+
+	/* align the physical mem */
+	ring->phys_addr = (ring->dma_handle + (len - 1)) & ~(len - 1);
+	ring->base = ring->pre_aligned + (ring->phys_addr - ring->dma_handle);
+	ring->rp = ring->base;
+	ring->wp = ring->base;
+	ring->len = len;
+	ring->el_size = el_size;
+	ring->elements = ring->len / ring->el_size;
+	memset(ring->base, 0, ring->len);
+	ring->configured = true;
+
+	/* update to other cores */
+	smp_wmb();
+
+	dev_dbg(gpii->gpi_dev->dev,
+		"phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n",
+		ring->dma_handle, ring->phys_addr, ring->len,
+		ring->el_size, ring->elements);
+
+	return 0;
+}
+
+/* copy tre into transfer ring */
+static void gpi_queue_xfer(struct gpii *gpii, struct gchan *gchan,
+			   struct gpi_tre *gpi_tre, void **wp)
+{
+	struct gpi_tre *ch_tre;
+	int ret;
+
+	/* get next tre location we can copy */
+	ret = gpi_ring_add_element(&gchan->ch_ring, (void **)&ch_tre);
+	if (unlikely(ret)) {
+		dev_err(gpii->gpi_dev->dev, "Error adding ring element to xfer ring\n");
+		return;
+	}
+
+	/* copy the tre info */
+	memcpy(ch_tre, gpi_tre, sizeof(*ch_tre));
+	*wp = ch_tre;
+}
+
+/* reset and restart transfer channel */
+static int gpi_terminate_all(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	int schid, echid, i;
+	int ret = 0;
+
+	mutex_lock(&gpii->ctrl_lock);
+
+	/*
+	 * treat both channels as a group if its protocol is not UART
+	 * STOP, RESET, or START needs to be in lockstep
+	 */
+	schid = (gchan->protocol == QCOM_GPI_UART) ? gchan->chid : 0;
+	echid = (gchan->protocol == QCOM_GPI_UART) ? schid + 1 : MAX_CHANNELS_PER_GPII;
+
+	/* stop the channel */
+	for (i = schid; i < echid; i++) {
+		gchan = &gpii->gchan[i];
+
+		/* disable ch state so no more TRE processing */
+		write_lock_irq(&gpii->pm_lock);
+		gchan->pm_state = PREPARE_TERMINATE;
+		write_unlock_irq(&gpii->pm_lock);
+
+		/* send command to Stop the channel */
+		ret = gpi_stop_chan(gchan);
+	}
+
+	/* reset the channels (clears any pending tre) */
+	for (i = schid; i < echid; i++) {
+		gchan = &gpii->gchan[i];
+
+		ret = gpi_reset_chan(gchan, GPI_CH_CMD_RESET);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error resetting channel ret:%d\n", ret);
+			goto terminate_exit;
+		}
+
+		/* reprogram channel CNTXT */
+		ret = gpi_alloc_chan(gchan, false);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error alloc_channel ret:%d\n", ret);
+			goto terminate_exit;
+		}
+	}
+
+	/* restart the channels */
+	for (i = schid; i < echid; i++) {
+		gchan = &gpii->gchan[i];
+
+		ret = gpi_start_chan(gchan);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error Starting Channel ret:%d\n", ret);
+			goto terminate_exit;
+		}
+	}
+
+terminate_exit:
+	mutex_unlock(&gpii->ctrl_lock);
+	return ret;
+}
+
+/* pause dma transfer for all channels */
+static int gpi_pause(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	int i, ret;
+
+	mutex_lock(&gpii->ctrl_lock);
+
+	/*
+	 * pause/resume are per gpii not per channel, so
+	 * client needs to call pause only once
+	 */
+	if (gpii->pm_state == PAUSE_STATE) {
+		dev_dbg(gpii->gpi_dev->dev, "channel is already paused\n");
+		mutex_unlock(&gpii->ctrl_lock);
+		return 0;
+	}
+
+	/* send stop command to stop the channels */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) {
+		ret = gpi_stop_chan(&gpii->gchan[i]);
+		if (ret) {
+			mutex_unlock(&gpii->ctrl_lock);
+			return ret;
+		}
+	}
+
+	disable_irq(gpii->irq);
+
+	/* Wait for threads to complete out */
+	tasklet_kill(&gpii->ev_task);
+
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = PAUSE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+	mutex_unlock(&gpii->ctrl_lock);
+
+	return 0;
+}
+
+/* resume dma transfer */
+static int gpi_resume(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	int i, ret;
+
+	mutex_lock(&gpii->ctrl_lock);
+	if (gpii->pm_state == ACTIVE_STATE) {
+		dev_dbg(gpii->gpi_dev->dev, "channel is already active\n");
+		mutex_unlock(&gpii->ctrl_lock);
+		return 0;
+	}
+
+	enable_irq(gpii->irq);
+
+	/* send start command to start the channels */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) {
+		ret = gpi_send_cmd(gpii, &gpii->gchan[i], GPI_CH_CMD_START);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error starting chan, ret:%d\n", ret);
+			mutex_unlock(&gpii->ctrl_lock);
+			return ret;
+		}
+	}
+
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = ACTIVE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+	mutex_unlock(&gpii->ctrl_lock);
+
+	return 0;
+}
+
+static void gpi_desc_free(struct virt_dma_desc *vd)
+{
+	struct gpi_desc *gpi_desc = to_gpi_desc(vd);
+
+	kfree(gpi_desc);
+	gpi_desc = NULL;
+}
+
+static int
+gpi_peripheral_config(struct dma_chan *chan, struct dma_slave_config *config)
+{
+	struct gchan *gchan = to_gchan(chan);
+
+	if (!config->peripheral_config)
+		return -EINVAL;
+
+	gchan->config = krealloc(gchan->config, config->peripheral_size, GFP_NOWAIT);
+	if (!gchan->config)
+		return -ENOMEM;
+
+	memcpy(gchan->config, config->peripheral_config, config->peripheral_size);
+
+	return 0;
+}
+
+static int gpi_create_i2c_tre(struct gchan *chan, struct gpi_desc *desc,
+			      struct scatterlist *sgl, enum dma_transfer_direction direction)
+{
+	struct gpi_i2c_config *i2c = chan->config;
+	struct device *dev = chan->gpii->gpi_dev->dev;
+	unsigned int tre_idx = 0;
+	dma_addr_t address;
+	struct gpi_tre *tre;
+	unsigned int i;
+
+	/* first create config tre if applicable */
+	if (i2c->set_config) {
+		tre = &desc->tre[tre_idx];
+		tre_idx++;
+
+		tre->dword[0] = u32_encode_bits(i2c->low_count, TRE_I2C_C0_TLOW);
+		tre->dword[0] |= u32_encode_bits(i2c->high_count, TRE_I2C_C0_THIGH);
+		tre->dword[0] |= u32_encode_bits(i2c->cycle_count, TRE_I2C_C0_TCYL);
+		tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_TX_PACK);
+		tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_RX_PACK);
+
+		tre->dword[1] = 0;
+
+		tre->dword[2] = u32_encode_bits(i2c->clk_div, TRE_C0_CLK_DIV);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE);
+		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN);
+	}
+
+	/* create the GO tre for Tx */
+	if (i2c->op == I2C_WRITE) {
+		tre = &desc->tre[tre_idx];
+		tre_idx++;
+
+		if (i2c->multi_msg)
+			tre->dword[0] = u32_encode_bits(I2C_READ, TRE_I2C_GO_CMD);
+		else
+			tre->dword[0] = u32_encode_bits(i2c->op, TRE_I2C_GO_CMD);
+
+		tre->dword[0] |= u32_encode_bits(i2c->addr, TRE_I2C_GO_ADDR);
+		tre->dword[0] |= u32_encode_bits(i2c->stretch, TRE_I2C_GO_STRETCH);
+
+		tre->dword[1] = 0;
+		tre->dword[2] = u32_encode_bits(i2c->rx_len, TRE_RX_LEN);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE);
+
+		if (i2c->multi_msg)
+			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_LINK);
+		else
+			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN);
+	}
+
+	if (i2c->op == I2C_READ || i2c->multi_msg == false) {
+		/* create the DMA TRE */
+		tre = &desc->tre[tre_idx];
+		tre_idx++;
+
+		address = sg_dma_address(sgl);
+		tre->dword[0] = lower_32_bits(address);
+		tre->dword[1] = upper_32_bits(address);
+
+		tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
+		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
+	};
+
+	for (i = 0; i < tre_idx; i++)
+		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
+			desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]);
+
+	return tre_idx;
+}
+
+static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc,
+			      struct scatterlist *sgl, enum dma_transfer_direction direction)
+{
+	struct gpi_spi_config *spi = chan->config;
+	struct device *dev = chan->gpii->gpi_dev->dev;
+	unsigned int tre_idx = 0;
+	dma_addr_t address;
+	struct gpi_tre *tre;
+	unsigned int i;
+
+	/* first create config tre if applicable */
+	if (direction == DMA_MEM_TO_DEV && spi->set_config) {
+		tre = &desc->tre[tre_idx];
+		tre_idx++;
+
+		tre->dword[0] = u32_encode_bits(spi->word_len, TRE_SPI_C0_WORD_SZ);
+		tre->dword[0] |= u32_encode_bits(spi->loopback_en, TRE_SPI_C0_LOOPBACK);
+		tre->dword[0] |= u32_encode_bits(spi->clock_pol_high, TRE_SPI_C0_CPOL);
+		tre->dword[0] |= u32_encode_bits(spi->data_pol_high, TRE_SPI_C0_CPHA);
+		tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_TX_PACK);
+		tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_RX_PACK);
+
+		tre->dword[1] = 0;
+
+		tre->dword[2] = u32_encode_bits(spi->clk_div, TRE_C0_CLK_DIV);
+		tre->dword[2] |= u32_encode_bits(spi->clk_src, TRE_C0_CLK_SRC);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE);
+		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN);
+	}
+
+	/* create the GO tre for Tx */
+	if (direction == DMA_MEM_TO_DEV) {
+		tre = &desc->tre[tre_idx];
+		tre_idx++;
+
+		tre->dword[0] = u32_encode_bits(spi->fragmentation, TRE_SPI_GO_FRAG);
+		tre->dword[0] |= u32_encode_bits(spi->cs, TRE_SPI_GO_CS);
+		tre->dword[0] |= u32_encode_bits(spi->cmd, TRE_SPI_GO_CMD);
+
+		tre->dword[1] = 0;
+
+		tre->dword[2] = u32_encode_bits(spi->rx_len, TRE_RX_LEN);
+
+		tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE);
+		if (spi->cmd == SPI_RX)
+			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOB);
+		else
+			tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN);
+	}
+
+	/* create the dma tre */
+	tre = &desc->tre[tre_idx];
+	tre_idx++;
+
+	address = sg_dma_address(sgl);
+	tre->dword[0] = lower_32_bits(address);
+	tre->dword[1] = upper_32_bits(address);
+
+	tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN);
+
+	tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE);
+	if (direction == DMA_MEM_TO_DEV)
+		tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT);
+
+	for (i = 0; i < tre_idx; i++)
+		dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0],
+			desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]);
+
+	return tre_idx;
+}
+
+/* copy tre into transfer ring */
+static struct dma_async_tx_descriptor *
+gpi_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
+		  unsigned int sg_len, enum dma_transfer_direction direction,
+		  unsigned long flags, void *context)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	struct device *dev = gpii->gpi_dev->dev;
+	struct gpi_ring *ch_ring = &gchan->ch_ring;
+	struct gpi_desc *gpi_desc;
+	u32 nr, nr_tre = 0;
+	u8 set_config;
+	int i;
+
+	gpii->ieob_set = false;
+	if (!is_slave_direction(direction)) {
+		dev_err(gpii->gpi_dev->dev, "invalid dma direction: %d\n", direction);
+		return NULL;
+	}
+
+	if (sg_len > 1) {
+		dev_err(dev, "Multi sg sent, we support only one atm: %d\n", sg_len);
+		return NULL;
+	}
+
+	nr_tre = 3;
+	set_config = *(u32 *)gchan->config;
+	if (!set_config)
+		nr_tre = 2;
+	if (direction == DMA_DEV_TO_MEM) /* rx */
+		nr_tre = 1;
+
+	/* calculate # of elements required & available */
+	nr = gpi_ring_num_elements_avail(ch_ring);
+	if (nr < nr_tre) {
+		dev_err(dev, "not enough space in ring, avail:%u required:%u\n", nr, nr_tre);
+		return NULL;
+	}
+
+	gpi_desc = kzalloc(sizeof(*gpi_desc), GFP_NOWAIT);
+	if (!gpi_desc)
+		return NULL;
+
+	/* create TREs for xfer */
+	if (gchan->protocol == QCOM_GPI_SPI) {
+		i = gpi_create_spi_tre(gchan, gpi_desc, sgl, direction);
+	} else if (gchan->protocol == QCOM_GPI_I2C) {
+		i = gpi_create_i2c_tre(gchan, gpi_desc, sgl, direction);
+	} else {
+		dev_err(dev, "invalid peripheral: %d\n", gchan->protocol);
+		kfree(gpi_desc);
+		return NULL;
+	}
+
+	/* set up the descriptor */
+	gpi_desc->gchan = gchan;
+	gpi_desc->len = sg_dma_len(sgl);
+	gpi_desc->num_tre  = i;
+
+	return vchan_tx_prep(&gchan->vc, &gpi_desc->vd, flags);
+}
+
+/* rings transfer ring db to being transfer */
+static void gpi_issue_pending(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	unsigned long flags, pm_lock_flags;
+	struct virt_dma_desc *vd = NULL;
+	struct gpi_desc *gpi_desc;
+	struct gpi_ring *ch_ring = &gchan->ch_ring;
+	void *tre, *wp = NULL;
+	int i;
+
+	read_lock_irqsave(&gpii->pm_lock, pm_lock_flags);
+
+	/* move all submitted discriptors to issued list */
+	spin_lock_irqsave(&gchan->vc.lock, flags);
+	if (vchan_issue_pending(&gchan->vc))
+		vd = list_last_entry(&gchan->vc.desc_issued,
+				     struct virt_dma_desc, node);
+	spin_unlock_irqrestore(&gchan->vc.lock, flags);
+
+	/* nothing to do list is empty */
+	if (!vd) {
+		read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags);
+		return;
+	}
+
+	gpi_desc = to_gpi_desc(vd);
+	for (i = 0; i < gpi_desc->num_tre; i++) {
+		tre = &gpi_desc->tre[i];
+		gpi_queue_xfer(gpii, gchan, tre, &wp);
+	}
+
+	gpi_desc->db = ch_ring->wp;
+	gpi_write_ch_db(gchan, &gchan->ch_ring, gpi_desc->db);
+	read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags);
+}
+
+static int gpi_ch_init(struct gchan *gchan)
+{
+	struct gpii *gpii = gchan->gpii;
+	const int ev_factor = gpii->gpi_dev->ev_factor;
+	u32 elements;
+	int i = 0, ret = 0;
+
+	gchan->pm_state = CONFIG_STATE;
+
+	/* check if both channels are configured before continue */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++)
+		if (gpii->gchan[i].pm_state != CONFIG_STATE)
+			goto exit_gpi_init;
+
+	/* protocol must be same for both channels */
+	if (gpii->gchan[0].protocol != gpii->gchan[1].protocol) {
+		dev_err(gpii->gpi_dev->dev, "protocol did not match protocol %u != %u\n",
+			gpii->gchan[0].protocol, gpii->gchan[1].protocol);
+		ret = -EINVAL;
+		goto exit_gpi_init;
+	}
+
+	/* allocate memory for event ring */
+	elements = CHAN_TRES << ev_factor;
+	ret = gpi_alloc_ring(&gpii->ev_ring, elements,
+			     sizeof(union gpi_event), gpii);
+	if (ret)
+		goto exit_gpi_init;
+
+	/* configure interrupts */
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = PREPARE_HARDWARE;
+	write_unlock_irq(&gpii->pm_lock);
+	ret = gpi_config_interrupts(gpii, DEFAULT_IRQ_SETTINGS, 0);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "error config. interrupts, ret:%d\n", ret);
+		goto error_config_int;
+	}
+
+	/* allocate event rings */
+	ret = gpi_alloc_ev_chan(gpii);
+	if (ret) {
+		dev_err(gpii->gpi_dev->dev, "error alloc_ev_chan:%d\n", ret);
+		goto error_alloc_ev_ring;
+	}
+
+	/* Allocate all channels */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) {
+		ret = gpi_alloc_chan(&gpii->gchan[i], true);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error allocating chan:%d\n", ret);
+			goto error_alloc_chan;
+		}
+	}
+
+	/* start channels  */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) {
+		ret = gpi_start_chan(&gpii->gchan[i]);
+		if (ret) {
+			dev_err(gpii->gpi_dev->dev, "Error start chan:%d\n", ret);
+			goto error_start_chan;
+		}
+	}
+	return ret;
+
+error_start_chan:
+	for (i = i - 1; i >= 0; i++) {
+		gpi_stop_chan(&gpii->gchan[i]);
+		gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET);
+	}
+	i = 2;
+error_alloc_chan:
+	for (i = i - 1; i >= 0; i--)
+		gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC);
+error_alloc_ev_ring:
+	gpi_disable_interrupts(gpii);
+error_config_int:
+	gpi_free_ring(&gpii->ev_ring, gpii);
+exit_gpi_init:
+	mutex_unlock(&gpii->ctrl_lock);
+	return ret;
+}
+
+/* release all channel resources */
+static void gpi_free_chan_resources(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	enum gpi_pm_state cur_state;
+	int ret, i;
+
+	mutex_lock(&gpii->ctrl_lock);
+
+	cur_state = gchan->pm_state;
+
+	/* disable ch state so no more TRE processing for this channel */
+	write_lock_irq(&gpii->pm_lock);
+	gchan->pm_state = PREPARE_TERMINATE;
+	write_unlock_irq(&gpii->pm_lock);
+
+	/* attempt to do graceful hardware shutdown */
+	if (cur_state == ACTIVE_STATE) {
+		gpi_stop_chan(gchan);
+
+		ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET);
+		if (ret)
+			dev_err(gpii->gpi_dev->dev, "error resetting channel:%d\n", ret);
+
+		gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC);
+	}
+
+	/* free all allocated memory */
+	gpi_free_ring(&gchan->ch_ring, gpii);
+	vchan_free_chan_resources(&gchan->vc);
+	kfree(gchan->config);
+
+	write_lock_irq(&gpii->pm_lock);
+	gchan->pm_state = DISABLE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+
+	/* if other rings are still active exit */
+	for (i = 0; i < MAX_CHANNELS_PER_GPII; i++)
+		if (gpii->gchan[i].ch_ring.configured)
+			goto exit_free;
+
+	/* deallocate EV Ring */
+	cur_state = gpii->pm_state;
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = PREPARE_TERMINATE;
+	write_unlock_irq(&gpii->pm_lock);
+
+	/* wait for threads to complete out */
+	tasklet_kill(&gpii->ev_task);
+
+	/* send command to de allocate event ring */
+	if (cur_state == ACTIVE_STATE)
+		gpi_send_cmd(gpii, NULL, GPI_EV_CMD_DEALLOC);
+
+	gpi_free_ring(&gpii->ev_ring, gpii);
+
+	/* disable interrupts */
+	if (cur_state == ACTIVE_STATE)
+		gpi_disable_interrupts(gpii);
+
+	/* set final state to disable */
+	write_lock_irq(&gpii->pm_lock);
+	gpii->pm_state = DISABLE_STATE;
+	write_unlock_irq(&gpii->pm_lock);
+
+exit_free:
+	mutex_unlock(&gpii->ctrl_lock);
+}
+
+/* allocate channel resources */
+static int gpi_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct gchan *gchan = to_gchan(chan);
+	struct gpii *gpii = gchan->gpii;
+	int ret;
+
+	mutex_lock(&gpii->ctrl_lock);
+
+	/* allocate memory for transfer ring */
+	ret = gpi_alloc_ring(&gchan->ch_ring, CHAN_TRES,
+			     sizeof(struct gpi_tre), gpii);
+	if (ret)
+		goto xfer_alloc_err;
+
+	ret = gpi_ch_init(gchan);
+
+	mutex_unlock(&gpii->ctrl_lock);
+
+	return ret;
+xfer_alloc_err:
+	mutex_unlock(&gpii->ctrl_lock);
+
+	return ret;
+}
+
+static int gpi_find_avail_gpii(struct gpi_dev *gpi_dev, u32 seid)
+{
+	struct gchan *tx_chan, *rx_chan;
+	unsigned int gpii;
+
+	/* check if same seid is already configured for another chid */
+	for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) {
+		if (!((1 << gpii) & gpi_dev->gpii_mask))
+			continue;
+
+		tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN];
+		rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN];
+
+		if (rx_chan->vc.chan.client_count && rx_chan->seid == seid)
+			return gpii;
+		if (tx_chan->vc.chan.client_count && tx_chan->seid == seid)
+			return gpii;
+	}
+
+	/* no channels configured with same seid, return next avail gpii */
+	for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) {
+		if (!((1 << gpii) & gpi_dev->gpii_mask))
+			continue;
+
+		tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN];
+		rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN];
+
+		/* check if gpii is configured */
+		if (tx_chan->vc.chan.client_count ||
+		    rx_chan->vc.chan.client_count)
+			continue;
+
+		/* found a free gpii */
+		return gpii;
+	}
+
+	/* no gpii instance available to use */
+	return -EIO;
+}
+
+/* gpi_of_dma_xlate: open client requested channel */
+static struct dma_chan *gpi_of_dma_xlate(struct of_phandle_args *args,
+					 struct of_dma *of_dma)
+{
+	struct gpi_dev *gpi_dev = (struct gpi_dev *)of_dma->of_dma_data;
+	u32 seid, chid;
+	int gpii;
+	struct gchan *gchan;
+
+	if (args->args_count < 3) {
+		dev_err(gpi_dev->dev, "gpii require minimum 2 args, client passed:%d args\n",
+			args->args_count);
+		return NULL;
+	}
+
+	chid = args->args[0];
+	if (chid >= MAX_CHANNELS_PER_GPII) {
+		dev_err(gpi_dev->dev, "gpii channel:%d not valid\n", chid);
+		return NULL;
+	}
+
+	seid = args->args[1];
+
+	/* find next available gpii to use */
+	gpii = gpi_find_avail_gpii(gpi_dev, seid);
+	if (gpii < 0) {
+		dev_err(gpi_dev->dev, "no available gpii instances\n");
+		return NULL;
+	}
+
+	gchan = &gpi_dev->gpiis[gpii].gchan[chid];
+	if (gchan->vc.chan.client_count) {
+		dev_err(gpi_dev->dev, "gpii:%d chid:%d seid:%d already configured\n",
+			gpii, chid, gchan->seid);
+		return NULL;
+	}
+
+	gchan->seid = seid;
+	gchan->protocol = args->args[2];
+
+	return dma_get_slave_channel(&gchan->vc.chan);
+}
+
+static int gpi_probe(struct platform_device *pdev)
+{
+	struct gpi_dev *gpi_dev;
+	unsigned int i;
+	int ret;
+
+	gpi_dev = devm_kzalloc(&pdev->dev, sizeof(*gpi_dev), GFP_KERNEL);
+	if (!gpi_dev)
+		return -ENOMEM;
+
+	gpi_dev->dev = &pdev->dev;
+	gpi_dev->res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	gpi_dev->regs = devm_ioremap_resource(gpi_dev->dev, gpi_dev->res);
+	if (IS_ERR(gpi_dev->regs))
+		return PTR_ERR(gpi_dev->regs);
+	gpi_dev->ee_base = gpi_dev->regs;
+
+	ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channels",
+				   &gpi_dev->max_gpii);
+	if (ret) {
+		dev_err(gpi_dev->dev, "missing 'max-no-gpii' DT node\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channel-mask",
+				   &gpi_dev->gpii_mask);
+	if (ret) {
+		dev_err(gpi_dev->dev, "missing 'gpii-mask' DT node\n");
+		return ret;
+	}
+
+	gpi_dev->ev_factor = EV_FACTOR;
+
+	ret = dma_set_mask(gpi_dev->dev, DMA_BIT_MASK(64));
+	if (ret) {
+		dev_err(gpi_dev->dev, "Error setting dma_mask to 64, ret:%d\n", ret);
+		return ret;
+	}
+
+	gpi_dev->gpiis = devm_kzalloc(gpi_dev->dev, sizeof(*gpi_dev->gpiis) *
+				      gpi_dev->max_gpii, GFP_KERNEL);
+	if (!gpi_dev->gpiis)
+		return -ENOMEM;
+
+	/* setup all the supported gpii */
+	INIT_LIST_HEAD(&gpi_dev->dma_device.channels);
+	for (i = 0; i < gpi_dev->max_gpii; i++) {
+		struct gpii *gpii = &gpi_dev->gpiis[i];
+		int chan;
+
+		if (!((1 << i) & gpi_dev->gpii_mask))
+			continue;
+
+		/* set up ev cntxt register map */
+		gpii->ev_cntxt_base_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_CNTXT_0_OFFS(i, 0);
+		gpii->ev_cntxt_db_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_DOORBELL_0_OFFS(i, 0);
+		gpii->ev_ring_rp_lsb_reg = gpii->ev_cntxt_base_reg + CNTXT_4_RING_RP_LSB;
+		gpii->ev_cmd_reg = gpi_dev->ee_base + GPII_n_EV_CH_CMD_OFFS(i);
+		gpii->ieob_clr_reg = gpi_dev->ee_base + GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(i);
+
+		/* set up irq */
+		ret = platform_get_irq(pdev, i);
+		if (ret < 0) {
+			dev_err(gpi_dev->dev, "platform_get_irq failed for %d:%d\n", i, ret);
+			return ret;
+		}
+		gpii->irq = ret;
+
+		/* set up channel specific register info */
+		for (chan = 0; chan < MAX_CHANNELS_PER_GPII; chan++) {
+			struct gchan *gchan = &gpii->gchan[chan];
+
+			/* set up ch cntxt register map */
+			gchan->ch_cntxt_base_reg = gpi_dev->ee_base +
+				GPII_n_CH_k_CNTXT_0_OFFS(i, chan);
+			gchan->ch_cntxt_db_reg = gpi_dev->ee_base +
+				GPII_n_CH_k_DOORBELL_0_OFFS(i, chan);
+			gchan->ch_cmd_reg = gpi_dev->ee_base + GPII_n_CH_CMD_OFFS(i);
+
+			/* vchan setup */
+			vchan_init(&gchan->vc, &gpi_dev->dma_device);
+			gchan->vc.desc_free = gpi_desc_free;
+			gchan->chid = chan;
+			gchan->gpii = gpii;
+			gchan->dir = GPII_CHAN_DIR[chan];
+		}
+		mutex_init(&gpii->ctrl_lock);
+		rwlock_init(&gpii->pm_lock);
+		tasklet_init(&gpii->ev_task, gpi_ev_tasklet,
+			     (unsigned long)gpii);
+		init_completion(&gpii->cmd_completion);
+		gpii->gpii_id = i;
+		gpii->regs = gpi_dev->ee_base;
+		gpii->gpi_dev = gpi_dev;
+	}
+
+	platform_set_drvdata(pdev, gpi_dev);
+
+	/* clear and Set capabilities */
+	dma_cap_zero(gpi_dev->dma_device.cap_mask);
+	dma_cap_set(DMA_SLAVE, gpi_dev->dma_device.cap_mask);
+
+	/* configure dmaengine apis */
+	gpi_dev->dma_device.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
+	gpi_dev->dma_device.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR;
+	gpi_dev->dma_device.src_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES;
+	gpi_dev->dma_device.dst_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES;
+	gpi_dev->dma_device.device_alloc_chan_resources = gpi_alloc_chan_resources;
+	gpi_dev->dma_device.device_free_chan_resources = gpi_free_chan_resources;
+	gpi_dev->dma_device.device_tx_status = dma_cookie_status;
+	gpi_dev->dma_device.device_issue_pending = gpi_issue_pending;
+	gpi_dev->dma_device.device_prep_slave_sg = gpi_prep_slave_sg;
+	gpi_dev->dma_device.device_config = gpi_peripheral_config;
+	gpi_dev->dma_device.device_terminate_all = gpi_terminate_all;
+	gpi_dev->dma_device.dev = gpi_dev->dev;
+	gpi_dev->dma_device.device_pause = gpi_pause;
+	gpi_dev->dma_device.device_resume = gpi_resume;
+
+	/* register with dmaengine framework */
+	ret = dma_async_device_register(&gpi_dev->dma_device);
+	if (ret) {
+		dev_err(gpi_dev->dev, "async_device_register failed ret:%d", ret);
+		return ret;
+	}
+
+	ret = of_dma_controller_register(gpi_dev->dev->of_node,
+					 gpi_of_dma_xlate, gpi_dev);
+	if (ret) {
+		dev_err(gpi_dev->dev, "of_dma_controller_reg failed ret:%d", ret);
+		return ret;
+	}
+
+	return ret;
+}
+
+static const struct of_device_id gpi_of_match[] = {
+	{ .compatible = "qcom,sdm845-gpi-dma" },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, gpi_of_match);
+
+static struct platform_driver gpi_driver = {
+	.probe = gpi_probe,
+	.driver = {
+		.name = KBUILD_MODNAME,
+		.of_match_table = gpi_of_match,
+	},
+};
+
+static int __init gpi_init(void)
+{
+	return platform_driver_register(&gpi_driver);
+}
+subsys_initcall(gpi_init)
+
+MODULE_DESCRIPTION("QCOM GPI DMA engine driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h
new file mode 100644
index 000000000000..f46dc3372f11
--- /dev/null
+++ b/include/linux/dma/qcom-gpi-dma.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2020, Linaro Limited
+ */
+
+#ifndef QCOM_GPI_DMA_H
+#define QCOM_GPI_DMA_H
+
+/**
+ * enum spi_transfer_cmd - spi transfer commands
+ */
+enum spi_transfer_cmd {
+	SPI_TX = 1,
+	SPI_RX,
+	SPI_DUPLEX,
+};
+
+/**
+ * struct gpi_spi_config - spi config for peripheral
+ *
+ * @loopback_en: spi loopback enable when set
+ * @clock_pol_high: clock polarity
+ * @data_pol_high: data polarity
+ * @pack_en: process tx/rx buffers as packed
+ * @word_len: spi word length
+ * @clk_div: source clock divider
+ * @clk_src: serial clock
+ * @cmd: spi cmd
+ * @fragmentation: keep CS assserted at end of sequence
+ * @cs: chip select toggle
+ * @set_config: set peripheral config
+ * @rx_len: receive length for buffer
+ */
+struct gpi_spi_config {
+	u8 set_config;
+	u8 loopback_en;
+	u8 clock_pol_high;
+	u8 data_pol_high;
+	u8 pack_en;
+	u8 word_len;
+	u8 fragmentation;
+	u8 cs;
+	u32 clk_div;
+	u32 clk_src;
+	enum spi_transfer_cmd cmd;
+	u32 rx_len;
+};
+
+enum i2c_op {
+	I2C_WRITE = 1,
+	I2C_READ,
+};
+
+/**
+ * struct gpi_i2c_config - i2c config for peripheral
+ *
+ * @pack_enable: process tx/rx buffers as packed
+ * @cycle_count: clock cycles to be sent
+ * @high_count: high period of clock
+ * @low_count: low period of clock
+ * @clk_div: source clock divider
+ * @addr: i2c bus address
+ * @stretch: stretch the clock at eot
+ * @set_config: set peripheral config
+ * @rx_len: receive length for buffer
+ * @op: i2c cmd
+ * @muli-msg: is part of multi i2c r-w msgs
+ */
+struct gpi_i2c_config {
+	u8 set_config;
+	u8 pack_enable;
+	u8 cycle_count;
+	u8 high_count;
+	u8 low_count;
+	u8 addr;
+	u8 stretch;
+	u16 clk_div;
+	u32 rx_len;
+	enum i2c_op op;
+	bool multi_msg;
+};
+
+#endif /* QCOM_GPI_DMA_H */

From 0ab785c894e618587e83bb67e8a8e96649868ad1 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Tue, 24 Nov 2020 11:34:05 -0300
Subject: [PATCH 050/230] dmaengine: imx-dma: Remove unused .id_table

Since 5.10-rc1 i.MX is a devicetree-only platform, so simplify the code
by removing the unused non-DT support.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20201124143405.2764-1-festevam@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-dma.c | 33 ++++-----------------------------
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
index 670db04b0757..7f116bbcfad2 100644
--- a/drivers/dma/imx-dma.c
+++ b/drivers/dma/imx-dma.c
@@ -191,32 +191,13 @@ struct imxdma_filter_data {
 	int			 request;
 };
 
-static const struct platform_device_id imx_dma_devtype[] = {
-	{
-		.name = "imx1-dma",
-		.driver_data = IMX1_DMA,
-	}, {
-		.name = "imx21-dma",
-		.driver_data = IMX21_DMA,
-	}, {
-		.name = "imx27-dma",
-		.driver_data = IMX27_DMA,
-	}, {
-		/* sentinel */
-	}
-};
-MODULE_DEVICE_TABLE(platform, imx_dma_devtype);
-
 static const struct of_device_id imx_dma_of_dev_id[] = {
 	{
-		.compatible = "fsl,imx1-dma",
-		.data = &imx_dma_devtype[IMX1_DMA],
+		.compatible = "fsl,imx1-dma", .data = (const void *)IMX1_DMA,
 	}, {
-		.compatible = "fsl,imx21-dma",
-		.data = &imx_dma_devtype[IMX21_DMA],
+		.compatible = "fsl,imx21-dma", .data = (const void *)IMX21_DMA,
 	}, {
-		.compatible = "fsl,imx27-dma",
-		.data = &imx_dma_devtype[IMX27_DMA],
+		.compatible = "fsl,imx27-dma", .data = (const void *)IMX27_DMA,
 	}, {
 		/* sentinel */
 	}
@@ -1056,20 +1037,15 @@ static int __init imxdma_probe(struct platform_device *pdev)
 {
 	struct imxdma_engine *imxdma;
 	struct resource *res;
-	const struct of_device_id *of_id;
 	int ret, i;
 	int irq, irq_err;
 
-	of_id = of_match_device(imx_dma_of_dev_id, &pdev->dev);
-	if (of_id)
-		pdev->id_entry = of_id->data;
-
 	imxdma = devm_kzalloc(&pdev->dev, sizeof(*imxdma), GFP_KERNEL);
 	if (!imxdma)
 		return -ENOMEM;
 
 	imxdma->dev = &pdev->dev;
-	imxdma->devtype = pdev->id_entry->driver_data;
+	imxdma->devtype = (enum imx_dma_type)of_device_get_match_data(&pdev->dev);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	imxdma->base = devm_ioremap_resource(&pdev->dev, res);
@@ -1263,7 +1239,6 @@ static struct platform_driver imxdma_driver = {
 		.name	= "imx-dma",
 		.of_match_table = imx_dma_of_dev_id,
 	},
-	.id_table	= imx_dma_devtype,
 	.remove		= imxdma_remove,
 };
 

From c95e6515a8c065862361f7e0e452978ade7f94ec Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1@huawei.com>
Date: Tue, 24 Nov 2020 09:08:13 +0800
Subject: [PATCH 051/230] dmaengine: mv_xor_v2: Fix error return code in
 mv_xor_v2_probe()

Return the corresponding error code when first_msi_entry() returns
NULL in mv_xor_v2_probe().

Fixes: 19a340b1a820430 ("dmaengine: mv_xor_v2: new driver")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Link: https://lore.kernel.org/r/20201124010813.1939095-1-chengzhihao1@huawei.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/mv_xor_v2.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c
index 2753a6b916f6..9b0d463f89bb 100644
--- a/drivers/dma/mv_xor_v2.c
+++ b/drivers/dma/mv_xor_v2.c
@@ -771,8 +771,10 @@ static int mv_xor_v2_probe(struct platform_device *pdev)
 		goto disable_clk;
 
 	msi_desc = first_msi_entry(&pdev->dev);
-	if (!msi_desc)
+	if (!msi_desc) {
+		ret = -ENODEV;
 		goto free_msi_irqs;
+	}
 	xor_dev->msi_desc = msi_desc;
 
 	ret = devm_request_irq(&pdev->dev, msi_desc->irq,

From cc2afb0d4c7cbba6743ed6d9564f0883cab6bae1 Mon Sep 17 00:00:00 2001
From: Fabio Estevam <festevam@gmail.com>
Date: Mon, 23 Nov 2020 16:30:51 -0300
Subject: [PATCH 052/230] dmaengine: mxs-dma: Remove the unused .id_table

The mxs-dma driver is only used by DT platforms and the .id_table
is unused.

Get rid of it to simplify the code.

Signed-off-by: Fabio Estevam <festevam@gmail.com>
Link: https://lore.kernel.org/r/20201123193051.17285-1-festevam@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/mxs-dma.c | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c
index 65f816b40c32..994fc4d2aca4 100644
--- a/drivers/dma/mxs-dma.c
+++ b/drivers/dma/mxs-dma.c
@@ -167,29 +167,11 @@ static struct mxs_dma_type mxs_dma_types[] = {
 	}
 };
 
-static const struct platform_device_id mxs_dma_ids[] = {
-	{
-		.name = "imx23-dma-apbh",
-		.driver_data = (kernel_ulong_t) &mxs_dma_types[0],
-	}, {
-		.name = "imx23-dma-apbx",
-		.driver_data = (kernel_ulong_t) &mxs_dma_types[1],
-	}, {
-		.name = "imx28-dma-apbh",
-		.driver_data = (kernel_ulong_t) &mxs_dma_types[2],
-	}, {
-		.name = "imx28-dma-apbx",
-		.driver_data = (kernel_ulong_t) &mxs_dma_types[3],
-	}, {
-		/* end of list */
-	}
-};
-
 static const struct of_device_id mxs_dma_dt_ids[] = {
-	{ .compatible = "fsl,imx23-dma-apbh", .data = &mxs_dma_ids[0], },
-	{ .compatible = "fsl,imx23-dma-apbx", .data = &mxs_dma_ids[1], },
-	{ .compatible = "fsl,imx28-dma-apbh", .data = &mxs_dma_ids[2], },
-	{ .compatible = "fsl,imx28-dma-apbx", .data = &mxs_dma_ids[3], },
+	{ .compatible = "fsl,imx23-dma-apbh", .data = &mxs_dma_types[0], },
+	{ .compatible = "fsl,imx23-dma-apbx", .data = &mxs_dma_types[1], },
+	{ .compatible = "fsl,imx28-dma-apbh", .data = &mxs_dma_types[2], },
+	{ .compatible = "fsl,imx28-dma-apbx", .data = &mxs_dma_types[3], },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, mxs_dma_dt_ids);
@@ -762,8 +744,6 @@ static struct dma_chan *mxs_dma_xlate(struct of_phandle_args *dma_spec,
 static int __init mxs_dma_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
-	const struct platform_device_id *id_entry;
-	const struct of_device_id *of_id;
 	const struct mxs_dma_type *dma_type;
 	struct mxs_dma_engine *mxs_dma;
 	struct resource *iores;
@@ -779,13 +759,7 @@ static int __init mxs_dma_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	of_id = of_match_device(mxs_dma_dt_ids, &pdev->dev);
-	if (of_id)
-		id_entry = of_id->data;
-	else
-		id_entry = platform_get_device_id(pdev);
-
-	dma_type = (struct mxs_dma_type *)id_entry->driver_data;
+	dma_type = (struct mxs_dma_type *)of_device_get_match_data(&pdev->dev);
 	mxs_dma->type = dma_type->type;
 	mxs_dma->dev_id = dma_type->id;
 
@@ -865,7 +839,6 @@ static struct platform_driver mxs_dma_driver = {
 		.name	= "mxs-dma",
 		.of_match_table = mxs_dma_dt_ids,
 	},
-	.id_table	= mxs_dma_ids,
 };
 
 static int __init mxs_dma_module_init(void)

From 255c2cc896b1009e7df4c4e32d9090fc22d28293 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:22:58 +0100
Subject: [PATCH 053/230] dmaengine: jz4780: drop of_match_ptr from
 of_device_id table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver can match only via the DT table so the table should be always
used and the of_match_ptr does not have any sense (this also allows ACPI
matching via PRP0001, even though it is not relevant here).  This fixes
compile warning (!CONFIG_OF on x86_64):

    drivers/dma/dma-jz4780.c:1031:34: warning:
        ‘jz4780_dma_dt_match’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-1-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dma-jz4780.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c
index a608efaa435f..612d353648cf 100644
--- a/drivers/dma/dma-jz4780.c
+++ b/drivers/dma/dma-jz4780.c
@@ -1044,7 +1044,7 @@ static struct platform_driver jz4780_dma_driver = {
 	.remove		= jz4780_dma_remove,
 	.driver	= {
 		.name	= "jz4780-dma",
-		.of_match_table = of_match_ptr(jz4780_dma_dt_match),
+		.of_match_table = jz4780_dma_dt_match,
 	},
 };
 

From 60b6122e86403fae58099dc08ecd319a86d03906 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:22:59 +0100
Subject: [PATCH 054/230] dmaengine: dw-axi-dmac: drop of_match_ptr from
 of_device_id table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver can match only via the DT table so the table should be always
used and the of_match_ptr does not have any sense (this also allows ACPI
matching via PRP0001, even though it is not relevant here).  This fixes
compile warning (!CONFIG_OF on x86_64):

    drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c:984:34: warning:
        ‘dw_dma_of_id_table’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-2-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c b/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c
index 14c1ac26f866..e164f3295f5d 100644
--- a/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c
+++ b/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c
@@ -992,7 +992,7 @@ static struct platform_driver dw_driver = {
 	.remove		= dw_remove,
 	.driver = {
 		.name	= KBUILD_MODNAME,
-		.of_match_table = of_match_ptr(dw_dma_of_id_table),
+		.of_match_table = dw_dma_of_id_table,
 		.pm = &dw_axi_dma_pm_ops,
 	},
 };

From 890bcd49d8a6d8cb943c8b844bdc1547c1806ad7 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:23:00 +0100
Subject: [PATCH 055/230] dmaengine: mv_xor: drop of_match_ptr from
 of_device_id table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver can match only via the DT table so the table should be always
used and the of_match_ptr does not have any sense (this also allows ACPI
matching via PRP0001, even though it is not relevant here).  This fixes
compile warning (!CONFIG_OF on x86_64):

    drivers/dma/mv_xor.c:1281:34: warning:
        ‘mv_xor_dt_ids’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-3-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/mv_xor.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index 00cd1335eeba..23b232b57518 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1455,7 +1455,7 @@ static struct platform_driver mv_xor_driver = {
 	.resume         = mv_xor_resume,
 	.driver		= {
 		.name	        = MV_XOR_NAME,
-		.of_match_table = of_match_ptr(mv_xor_dt_ids),
+		.of_match_table = mv_xor_dt_ids,
 	},
 };
 

From 75fa2d4218c7df7added689dc941ba7b52519b2e Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:23:01 +0100
Subject: [PATCH 056/230] dmaengine: sf: drop of_match_ptr from of_device_id
 table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver can match only via the DT table so the table should be always
used and the of_match_ptr does not have any sense (this also allows ACPI
matching via PRP0001, even though it is not relevant here).  This fixes
compile warning (!CONFIG_OF on x86_64):

    drivers/dma/sf-pdma/sf-pdma.c:576:34: warning:
        ‘sf_pdma_dt_ids’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-4-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/sf-pdma/sf-pdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/sf-pdma/sf-pdma.c b/drivers/dma/sf-pdma/sf-pdma.c
index 981ddcc42d70..c4c4e8575764 100644
--- a/drivers/dma/sf-pdma/sf-pdma.c
+++ b/drivers/dma/sf-pdma/sf-pdma.c
@@ -582,7 +582,7 @@ static struct platform_driver sf_pdma_driver = {
 	.remove		= sf_pdma_remove,
 	.driver		= {
 		.name	= "sf-pdma",
-		.of_match_table = of_match_ptr(sf_pdma_dt_ids),
+		.of_match_table = sf_pdma_dt_ids,
 	},
 };
 

From 25d39b590dac0b8230cc3663c9f8f1a3a21831fd Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:23:02 +0100
Subject: [PATCH 057/230] dmaengine: stm32: mark of_device_id table as maybe
 unused
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver uses a second of_device_id table in the probe()
function by passing it to of_match_node().  This code will be a no-op
for compile testing (!CONFIG_OF on x86_64):

    drivers/dma/stm32-dmamux.c:171:34: warning:
        ‘stm32_stm32dma_master_match’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-5-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-dmamux.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/stm32-dmamux.c b/drivers/dma/stm32-dmamux.c
index a10ccd964376..ef0d0555103d 100644
--- a/drivers/dma/stm32-dmamux.c
+++ b/drivers/dma/stm32-dmamux.c
@@ -168,7 +168,7 @@ error_chan_id:
 	return ERR_PTR(ret);
 }
 
-static const struct of_device_id stm32_stm32dma_master_match[] = {
+static const struct of_device_id stm32_stm32dma_master_match[] __maybe_unused = {
 	{ .compatible = "st,stm32-dma", },
 	{},
 };

From 5d051f37f49d5bf04dca15fadea3a90a6a6f0f15 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 20 Nov 2020 17:23:03 +0100
Subject: [PATCH 058/230] dmaengine: ti: drop of_match_ptr and mark
 of_device_id table as maybe unused
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The driver can match only via the DT table so the main table should be
always used and the of_match_ptr does not have any sense (this also
allows ACPI matching via PRP0001, even though it is not relevant here).

The secondary match of_device_id tables (passed to of_match_node) should
be marked as maybe unused to fix compile testing (!CONFIG_OF on x86_64)
warnings:

    drivers/dma/ti/dma-crossbar.c:125:34: warning:
        ‘ti_am335x_master_match’ defined but not used [-Wunused-const-variable=]
    drivers/dma/ti/dma-crossbar.c:22:34: warning:
        ‘ti_dma_xbar_match’ defined but not used [-Wunused-const-variable=]

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20201120162303.482126-6-krzk@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/dma-crossbar.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/ti/dma-crossbar.c b/drivers/dma/ti/dma-crossbar.c
index 4ba8fa5d9c36..71d24fc07c00 100644
--- a/drivers/dma/ti/dma-crossbar.c
+++ b/drivers/dma/ti/dma-crossbar.c
@@ -122,7 +122,7 @@ static void *ti_am335x_xbar_route_allocate(struct of_phandle_args *dma_spec,
 	return map;
 }
 
-static const struct of_device_id ti_am335x_master_match[] = {
+static const struct of_device_id ti_am335x_master_match[] __maybe_unused = {
 	{ .compatible = "ti,edma3-tpcc", },
 	{},
 };
@@ -292,7 +292,7 @@ static const u32 ti_dma_offset[] = {
 	[TI_XBAR_SDMA_OFFSET] = 1,
 };
 
-static const struct of_device_id ti_dra7_master_match[] = {
+static const struct of_device_id ti_dra7_master_match[] __maybe_unused = {
 	{
 		.compatible = "ti,omap4430-sdma",
 		.data = &ti_dma_offset[TI_XBAR_SDMA_OFFSET],
@@ -460,7 +460,7 @@ static int ti_dma_xbar_probe(struct platform_device *pdev)
 static struct platform_driver ti_dma_xbar_driver = {
 	.driver = {
 		.name = "ti-dma-crossbar",
-		.of_match_table = of_match_ptr(ti_dma_xbar_match),
+		.of_match_table = ti_dma_xbar_match,
 	},
 	.probe	= ti_dma_xbar_probe,
 };

From 9905f728b0bda737fe2c2afd7c24f3365a45cc7b Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 30 Nov 2020 10:28:04 +0000
Subject: [PATCH 059/230] mailbox: arm_mhu_db: Fix mhu_db_shutdown by replacing
 kfree with devm_kfree

The mhu_db_channel info is allocated per channel using devm_kzalloc from
mhu_db_mbox_xlate which gets called from mbox_request_channel. However
we are releasing the allocated mhu_db_channel info using plain kfree from
mhu_db_shutdown which is called from mbox_free_channel.

This leads to random crashes when the channel is freed like below one:

  Unable to handle kernel paging request at virtual address 0080000400000008
  [0080000400000008] address between user and kernel address ranges
  Internal error: Oops: 96000044 [#1] PREEMPT SMP
  Modules linked in: scmi_module(-)
  CPU: 1 PID: 2212 Comm: rmmod Not tainted 5.10.0-rc5 #31
  Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno
  	Development Platform, BIOS EDK II Nov 19 2020
  pstate: 20000085 (nzCv daIf -PAN -UAO -TCO BTYPE=--)
  pc : release_nodes+0x74/0x230
  lr : devres_release_all+0x40/0x68
  Call trace:
   release_nodes+0x74/0x230
   devres_release_all+0x40/0x68
   device_release_driver_internal+0x12c/0x1f8
   driver_detach+0x58/0xe8
   bus_remove_driver+0x64/0xe0
   driver_unregister+0x38/0x68
   platform_driver_unregister+0x1c/0x28
   scmi_driver_exit+0x38/0x44 [scmi_module]
   __arm64_sys_delete_module+0x188/0x260
   el0_svc_common.constprop.0+0x80/0x1a8
   do_el0_svc+0x2c/0x98
   el0_sync_handler+0x160/0x168
   el0_sync+0x174/0x180
  Code: 1400000d eb07009f 54000460 f9400486 (f90004a6)
  ---[ end trace c55ffd306c140233 ]---

Fix it by replacing kfree with devm_kfree as required.

Fixes: 7002ca237b21 ("mailbox: arm_mhu: Add ARM MHU doorbell driver")
Reported-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/arm_mhu_db.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mailbox/arm_mhu_db.c b/drivers/mailbox/arm_mhu_db.c
index 275efe4cca0c..8eb66c4ecf5b 100644
--- a/drivers/mailbox/arm_mhu_db.c
+++ b/drivers/mailbox/arm_mhu_db.c
@@ -180,7 +180,7 @@ static void mhu_db_shutdown(struct mbox_chan *chan)
 
 	/* Reset channel */
 	mhu_db_mbox_clear_irq(chan);
-	kfree(chan->con_priv);
+	devm_kfree(mbox->dev, chan->con_priv);
 	chan->con_priv = NULL;
 }
 

From d68f17291ccf86442c3d367ba83581b81541baae Mon Sep 17 00:00:00 2001
From: Martin Kaiser <martin@kaiser.cx>
Date: Sun, 1 Nov 2020 16:42:29 +0100
Subject: [PATCH 060/230] mailbox: stm32-ipcc: add COMPILE_TEST dependency

This allows compiling the driver on architectures where the hardware is not
available. Most other mailbox drivers support this as well.

Signed-off-by: Martin Kaiser <martin@kaiser.cx>
Reviewed-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index 05b1009e2820..abbf5d67ffa2 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -201,7 +201,7 @@ config BCM_FLEXRM_MBOX
 
 config STM32_IPCC
 	tristate "STM32 IPCC Mailbox"
-	depends on MACH_STM32MP157
+	depends on MACH_STM32MP157 || COMPILE_TEST
 	help
 	  Mailbox implementation for STMicroelectonics STM32 family chips
 	  with hardware for Inter-Processor Communication Controller (IPCC)

From 133af21f7c00a2bfc2b6c6aa7bfad4dc3eeb0401 Mon Sep 17 00:00:00 2001
From: Martin Kaiser <martin@kaiser.cx>
Date: Sun, 29 Nov 2020 19:52:27 +0100
Subject: [PATCH 061/230] mailbox: stm32-ipcc: remove duplicate error message

platform_get_irq_byname already prints an error message if the requested
irq was not found. Don't print another message in the driver.

Signed-off-by: Martin Kaiser <martin@kaiser.cx>
Reviewed-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/stm32-ipcc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/mailbox/stm32-ipcc.c b/drivers/mailbox/stm32-ipcc.c
index ef966887aa15..ab8fe56af948 100644
--- a/drivers/mailbox/stm32-ipcc.c
+++ b/drivers/mailbox/stm32-ipcc.c
@@ -257,9 +257,6 @@ static int stm32_ipcc_probe(struct platform_device *pdev)
 	for (i = 0; i < IPCC_IRQ_NUM; i++) {
 		ipcc->irqs[i] = platform_get_irq_byname(pdev, irq_name[i]);
 		if (ipcc->irqs[i] < 0) {
-			if (ipcc->irqs[i] != -EPROBE_DEFER)
-				dev_err(dev, "no IRQ specified %s\n",
-					irq_name[i]);
 			ret = ipcc->irqs[i];
 			goto err_clk;
 		}

From 017909281eb0f8a00445840a36c799d3aed00841 Mon Sep 17 00:00:00 2001
From: Martin Kaiser <martin@kaiser.cx>
Date: Sun, 29 Nov 2020 19:52:28 +0100
Subject: [PATCH 062/230] mailbox: stm32-ipcc: cast void pointers to unsigned
 long

Now that the driver can be enabled by COMPILE_TEST, we see warnings on
64bit platforms when void pointers are cast to unsigned int (and
vice versa).

warning: cast to smaller integer type 'unsigned int' from 'void *'
           unsigned int chan = (unsigned int)link->con_priv;
...
warning: cast to 'void *' from smaller integer type 'unsigned int'
           ipcc->controller.chans[i].con_priv = (void *)i;

Update these casts to use unsigned long variables, which are the same
size as pointers on all platforms.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Martin Kaiser <martin@kaiser.cx>
Reviewed-by: Fabien Dessenne <fabien.dessenne@st.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/stm32-ipcc.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/mailbox/stm32-ipcc.c b/drivers/mailbox/stm32-ipcc.c
index ab8fe56af948..b84e0587937c 100644
--- a/drivers/mailbox/stm32-ipcc.c
+++ b/drivers/mailbox/stm32-ipcc.c
@@ -144,11 +144,11 @@ static irqreturn_t stm32_ipcc_tx_irq(int irq, void *data)
 
 static int stm32_ipcc_send_data(struct mbox_chan *link, void *data)
 {
-	unsigned int chan = (unsigned int)link->con_priv;
+	unsigned long chan = (unsigned long)link->con_priv;
 	struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc,
 					       controller);
 
-	dev_dbg(ipcc->controller.dev, "%s: chan:%d\n", __func__, chan);
+	dev_dbg(ipcc->controller.dev, "%s: chan:%lu\n", __func__, chan);
 
 	/* set channel n occupied */
 	stm32_ipcc_set_bits(&ipcc->lock, ipcc->reg_proc + IPCC_XSCR,
@@ -163,7 +163,7 @@ static int stm32_ipcc_send_data(struct mbox_chan *link, void *data)
 
 static int stm32_ipcc_startup(struct mbox_chan *link)
 {
-	unsigned int chan = (unsigned int)link->con_priv;
+	unsigned long chan = (unsigned long)link->con_priv;
 	struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc,
 					       controller);
 	int ret;
@@ -183,7 +183,7 @@ static int stm32_ipcc_startup(struct mbox_chan *link)
 
 static void stm32_ipcc_shutdown(struct mbox_chan *link)
 {
-	unsigned int chan = (unsigned int)link->con_priv;
+	unsigned long chan = (unsigned long)link->con_priv;
 	struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc,
 					       controller);
 
@@ -206,7 +206,7 @@ static int stm32_ipcc_probe(struct platform_device *pdev)
 	struct device_node *np = dev->of_node;
 	struct stm32_ipcc *ipcc;
 	struct resource *res;
-	unsigned int i;
+	unsigned long i;
 	int ret;
 	u32 ip_ver;
 	static const char * const irq_name[] = {"rx", "tx"};
@@ -265,7 +265,7 @@ static int stm32_ipcc_probe(struct platform_device *pdev)
 						irq_thread[i], IRQF_ONESHOT,
 						dev_name(dev), ipcc);
 		if (ret) {
-			dev_err(dev, "failed to request irq %d (%d)\n", i, ret);
+			dev_err(dev, "failed to request irq %lu (%d)\n", i, ret);
 			goto err_clk;
 		}
 	}

From 0359af7ac318495432e5a06f671c80dc29274f18 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 30 Nov 2020 14:58:19 -0500
Subject: [PATCH 063/230] SUNRPC: Remove XDRBUF_SPARSE_PAGES flag in gss_proxy
 upcall

There's no need to defer allocation of pages for the receive buffer.

- This upcall is quite infrequent
- gssp_alloc_receive_pages() can allocate the pages with GFP_KERNEL,
  unlike the transport
- gssp_alloc_receive_pages() knows exactly how many pages are needed

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Olga Kornievskaia <aglo@umich.edu>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 net/sunrpc/auth_gss/gss_rpc_upcall.c | 15 ++++++++++-----
 net/sunrpc/auth_gss/gss_rpc_xdr.c    |  1 -
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index af9c7f43859c..d1c003a25b0f 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -200,7 +200,7 @@ static int gssp_call(struct net *net, struct rpc_message *msg)
 
 static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
 {
-	int i;
+	unsigned int i;
 
 	for (i = 0; i < arg->npages && arg->pages[i]; i++)
 		__free_page(arg->pages[i]);
@@ -210,14 +210,19 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg)
 
 static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg)
 {
+	unsigned int i;
+
 	arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE);
 	arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL);
-	/*
-	 * XXX: actual pages are allocated by xdr layer in
-	 * xdr_partial_copy_from_skb.
-	 */
 	if (!arg->pages)
 		return -ENOMEM;
+	for (i = 0; i < arg->npages; i++) {
+		arg->pages[i] = alloc_page(GFP_KERNEL);
+		if (!arg->pages[i]) {
+			gssp_free_receive_pages(arg);
+			return -ENOMEM;
+		}
+	}
 	return 0;
 }
 
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 2ff7b7083eba..44838f6ea25e 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -771,7 +771,6 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req,
 	xdr_inline_pages(&req->rq_rcv_buf,
 		PAGE_SIZE/2 /* pretty arbitrary */,
 		arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE);
-	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
 done:
 	if (err)
 		dprintk("RPC:       gssx_enc_accept_sec_context: %d\n", err);

From 5482e09a8840ee3bb1848d73d05301eafc0e6199 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 24 Nov 2020 19:15:18 -0500
Subject: [PATCH 064/230] NFS: Fix rpcrdma_inline_fixup() crash with new
 LISTXATTRS operation

By switching to an XFS-backed export, I am able to reproduce the
ibcomp worker crash on my client with xfstests generic/013.

For the failing LISTXATTRS operation, xdr_inline_pages() is called
with page_len=12 and buflen=128.

- When ->send_request() is called, rpcrdma_marshal_req() does not
  set up a Reply chunk because buflen is smaller than the inline
  threshold. Thus rpcrdma_convert_iovs() does not get invoked at
  all and the transport's XDRBUF_SPARSE_PAGES logic is not invoked
  on the receive buffer.

- During reply processing, rpcrdma_inline_fixup() tries to copy
  received data into rq_rcv_buf->pages because page_len is positive.
  But there are no receive pages because rpcrdma_marshal_req() never
  allocated them.

The result is that the ibcomp worker faults and dies. Sometimes that
causes a visible crash, and sometimes it results in a transport hang
without other symptoms.

RPC/RDMA's XDRBUF_SPARSE_PAGES support is not entirely correct, and
should eventually be fixed or replaced. However, my preference is
that upper-layer operations should explicitly allocate their receive
buffers (using GFP_KERNEL) when possible, rather than relying on
XDRBUF_SPARSE_PAGES.

Reported-by: Olga kornievskaia <kolga@netapp.com>
Suggested-by: Olga kornievskaia <kolga@netapp.com>
Fixes: c10a75145feb ("NFSv4.2: add the extended attribute proc functions.")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Olga kornievskaia <kolga@netapp.com>
Reviewed-by: Frank van der Linden <fllinden@amazon.com>
Tested-by: Olga kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs42proc.c | 21 +++++++++++++--------
 fs/nfs/nfs42xdr.c  |  1 -
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 2b2211d1234e..4fc61e3d098d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1241,12 +1241,13 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
 		.rpc_resp	= &res,
 	};
 	u32 xdrlen;
-	int ret, np;
+	int ret, np, i;
 
 
+	ret = -ENOMEM;
 	res.scratch = alloc_page(GFP_KERNEL);
 	if (!res.scratch)
-		return -ENOMEM;
+		goto out;
 
 	xdrlen = nfs42_listxattr_xdrsize(buflen);
 	if (xdrlen > server->lxasize)
@@ -1254,9 +1255,12 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
 	np = xdrlen / PAGE_SIZE + 1;
 
 	pages = kcalloc(np, sizeof(struct page *), GFP_KERNEL);
-	if (pages == NULL) {
-		__free_page(res.scratch);
-		return -ENOMEM;
+	if (!pages)
+		goto out_free_scratch;
+	for (i = 0; i < np; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free_pages;
 	}
 
 	arg.xattr_pages = pages;
@@ -1271,14 +1275,15 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf,
 		*eofp = res.eof;
 	}
 
+out_free_pages:
 	while (--np >= 0) {
 		if (pages[np])
 			__free_page(pages[np]);
 	}
-
-	__free_page(res.scratch);
 	kfree(pages);
-
+out_free_scratch:
+	__free_page(res.scratch);
+out:
 	return ret;
 
 }
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6e060a88f98c..8432bd6b95f0 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1528,7 +1528,6 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
 
 	rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count,
 	    hdr.replen);
-	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
 
 	encode_nops(&hdr);
 }

From bd75475c2fa161acec0017e030f6e8c01cb0d107 Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Mon, 23 Nov 2020 22:15:17 -0500
Subject: [PATCH 065/230] NFSv4.2: Fix 5 seconds delay when doing inter server
 copy

Since commit b4868b44c5628 ("NFSv4: Wait for stateid updates after
CLOSE/OPEN_DOWNGRADE"), every inter server copy operation suffers 5
seconds delay regardless of the size of the copy. The delay is from
nfs_set_open_stateid_locked when the check by nfs_stateid_is_sequential
fails because the seqid in both nfs4_state and nfs4_stateid are 0.

Fix __nfs42_ssc_open to delay setting of NFS_OPEN_STATE in nfs4_state,
until after the call to update_open_stateid, to indicate this is the 1st
open. This fix is part of a 2 patches, the other patch is the fix in the
source server to return the stateid for COPY_NOTIFY request with seqid 1
instead of 0.

Fixes: ce0887ac96d3 ("NFSD add nfs4 inter ssc to nfsd4_copy")
Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 fs/nfs/nfs4file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 9d354de613da..57b3821d975a 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -377,10 +377,10 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
 		goto out_stateowner;
 
 	set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags);
-	set_bit(NFS_OPEN_STATE, &ctx->state->flags);
 	memcpy(&ctx->state->open_stateid.other, &stateid->other,
 	       NFS4_STATEID_OTHER_SIZE);
 	update_open_stateid(ctx->state, stateid, NULL, filep->f_mode);
+	set_bit(NFS_OPEN_STATE, &ctx->state->flags);
 
 	nfs_file_set_open_context(filep, ctx);
 	put_nfs_open_context(ctx);

From 5f447cb88123857d69f0c1a665356688037c6ad3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 19 Oct 2020 12:40:59 -0400
Subject: [PATCH 066/230] NFSv3: Refactor nfs3_proc_lookup() to split out the
 dentry

We want to reuse the lookup code in NFSv3 in order to emulate the
NFSv4 lookupp operation.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs3proc.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2397ceedba8a..acbdf7496d31 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -154,14 +154,14 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 }
 
 static int
-nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
-		 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
-		 struct nfs4_label *label)
+__nfs3_proc_lookup(struct inode *dir, const char *name, size_t len,
+		   struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		   unsigned short task_flags)
 {
 	struct nfs3_diropargs	arg = {
 		.fh		= NFS_FH(dir),
-		.name		= dentry->d_name.name,
-		.len		= dentry->d_name.len
+		.name		= name,
+		.len		= len
 	};
 	struct nfs3_diropres	res = {
 		.fh		= fhandle,
@@ -173,17 +173,11 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
 		.rpc_resp	= &res,
 	};
 	int			status;
-	unsigned short task_flags = 0;
-
-	/* Is this is an attribute revalidation, subject to softreval? */
-	if (nfs_lookup_is_soft_revalidate(dentry))
-		task_flags |= RPC_TASK_TIMEOUT;
 
 	res.dir_attr = nfs_alloc_fattr();
 	if (res.dir_attr == NULL)
 		return -ENOMEM;
 
-	dprintk("NFS call  lookup %pd2\n", dentry);
 	nfs_fattr_init(fattr);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags);
 	nfs_refresh_inode(dir, res.dir_attr);
@@ -198,6 +192,23 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
 	return status;
 }
 
+static int
+nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
+		 struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+		 struct nfs4_label *label)
+{
+	unsigned short task_flags = 0;
+
+	/* Is this is an attribute revalidation, subject to softreval? */
+	if (nfs_lookup_is_soft_revalidate(dentry))
+		task_flags |= RPC_TASK_TIMEOUT;
+
+	dprintk("NFS call  lookup %pd2\n", dentry);
+	return __nfs3_proc_lookup(dir, dentry->d_name.name,
+				  dentry->d_name.len, fhandle, fattr,
+				  task_flags);
+}
+
 static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
 	struct nfs3_accessargs	arg = {

From 3c5e9a59faa6b1bb3110b961e695502c7ee8699b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 19 Oct 2020 12:45:53 -0400
Subject: [PATCH 067/230] NFSv3: Add emulation of the lookupp() operation

In order to use the open_by_filehandle() operations on NFSv3, we need
to be able to emulate lookupp() so that nfs_get_parent() can be used
to convert disconnected dentries into connected ones.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs3proc.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index acbdf7496d31..6b66b73a50eb 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -209,6 +209,20 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry,
 				  task_flags);
 }
 
+static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle,
+			     struct nfs_fattr *fattr, struct nfs4_label *label)
+{
+	const char dotdot[] = "..";
+	const size_t len = strlen(dotdot);
+	unsigned short task_flags = 0;
+
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+		task_flags |= RPC_TASK_TIMEOUT;
+
+	return __nfs3_proc_lookup(inode, dotdot, len, fhandle, fattr,
+				  task_flags);
+}
+
 static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry)
 {
 	struct nfs3_accessargs	arg = {
@@ -1015,6 +1029,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
 	.getattr	= nfs3_proc_getattr,
 	.setattr	= nfs3_proc_setattr,
 	.lookup		= nfs3_proc_lookup,
+	.lookupp	= nfs3_proc_lookupp,
 	.access		= nfs3_proc_access,
 	.readlink	= nfs3_proc_readlink,
 	.create		= nfs3_proc_create,

From 76998ebb91582812540f591f9e148daa0f08c76d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 20 Oct 2020 14:30:35 -0400
Subject: [PATCH 068/230] NFSv4: Observe the NFS_MOUNT_SOFTREVAL flag in
 _nfs4_proc_lookupp

We need to respect the NFS_MOUNT_SOFTREVAL flag in _nfs4_proc_lookupp,
by timing out if the server is unavailable.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9e0ca9b2b210..a5b9356bee6a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4397,6 +4397,10 @@ static int _nfs4_proc_lookupp(struct inode *inode,
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
+	unsigned short task_flags = 0;
+
+	if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
+		task_flags |= RPC_TASK_TIMEOUT;
 
 	args.bitmask = nfs4_bitmask(server, label);
 
@@ -4404,7 +4408,7 @@ static int _nfs4_proc_lookupp(struct inode *inode,
 
 	dprintk("NFS call  lookupp ino=0x%lx\n", inode->i_ino);
 	status = nfs4_call_sync(clnt, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
+				&res.seq_res, task_flags);
 	dprintk("NFS reply lookupp: %d\n", status);
 	return status;
 }

From e4c72201b6ec3173dfe13fa2e2335a3ad78d4921 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 22 Oct 2020 17:40:33 -0400
Subject: [PATCH 069/230] SUNRPC: rpc_wake_up() should wake up tasks in the
 correct order

Currently, we wake up the tasks by priority queue ordering, which means
that we ignore the batching that is supposed to help with QoS issues.

Fixes: c049f8ea9a0d ("SUNRPC: Remove the bh-safe lock requirement on the rpc_wait_queue->lock")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/sched.c | 67 +++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 31 deletions(-)

diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index f06d7c315017..cf702a5f7fe5 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -675,6 +675,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue)
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_next);
 
+/**
+ * rpc_wake_up_locked - wake up all rpc_tasks
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ *
+ */
+static void rpc_wake_up_locked(struct rpc_wait_queue *queue)
+{
+	struct rpc_task *task;
+
+	for (;;) {
+		task = __rpc_find_next_queued(queue);
+		if (task == NULL)
+			break;
+		rpc_wake_up_task_queue_locked(queue, task);
+	}
+}
+
 /**
  * rpc_wake_up - wake up all rpc_tasks
  * @queue: rpc_wait_queue on which the tasks are sleeping
@@ -683,26 +700,29 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next);
  */
 void rpc_wake_up(struct rpc_wait_queue *queue)
 {
-	struct list_head *head;
-
 	spin_lock(&queue->lock);
-	head = &queue->tasks[queue->maxpriority];
-	for (;;) {
-		while (!list_empty(head)) {
-			struct rpc_task *task;
-			task = list_first_entry(head,
-					struct rpc_task,
-					u.tk_wait.list);
-			rpc_wake_up_task_queue_locked(queue, task);
-		}
-		if (head == &queue->tasks[0])
-			break;
-		head--;
-	}
+	rpc_wake_up_locked(queue);
 	spin_unlock(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up);
 
+/**
+ * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value.
+ * @queue: rpc_wait_queue on which the tasks are sleeping
+ * @status: status value to set
+ */
+static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status)
+{
+	struct rpc_task *task;
+
+	for (;;) {
+		task = __rpc_find_next_queued(queue);
+		if (task == NULL)
+			break;
+		rpc_wake_up_task_queue_set_status_locked(queue, task, status);
+	}
+}
+
 /**
  * rpc_wake_up_status - wake up all rpc_tasks and set their status value.
  * @queue: rpc_wait_queue on which the tasks are sleeping
@@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up);
  */
 void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
 {
-	struct list_head *head;
-
 	spin_lock(&queue->lock);
-	head = &queue->tasks[queue->maxpriority];
-	for (;;) {
-		while (!list_empty(head)) {
-			struct rpc_task *task;
-			task = list_first_entry(head,
-					struct rpc_task,
-					u.tk_wait.list);
-			task->tk_status = status;
-			rpc_wake_up_task_queue_locked(queue, task);
-		}
-		if (head == &queue->tasks[0])
-			break;
-		head--;
-	}
+	rpc_wake_up_status_locked(queue, status);
 	spin_unlock(&queue->lock);
 }
 EXPORT_SYMBOL_GPL(rpc_wake_up_status);

From 05ad917561fca39a03338cb21fe9622f998b0f9c Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Fri, 6 Nov 2020 16:03:38 -0500
Subject: [PATCH 070/230] NFSv4.2: condition READDIR's mask for security label
 based on LSM state

Currently, the client will always ask for security_labels if the server
returns that it supports that feature regardless of any LSM modules
(such as Selinux) enforcing security policy. This adds performance
penalty to the READDIR operation.

Client adjusts superblock's support of the security_label based on
the server's support but also current client's configuration of the
LSM modules. Thus, prior to using the default bitmask in READDIR,
this patch checks the server's capabilities and then instructs
READDIR to remove FATTR4_WORD2_SECURITY_LABEL from the bitmask.

v5: fixing silly mistakes of the rushed v4
v4: simplifying logic
v3: changing label's initialization per Ondrej's comment
v2: dropping selinux hook and using the sb cap.

Suggested-by: Ondrej Mosnacek <omosnace@redhat.com>
Suggested-by: Scott Mayhew <smayhew@redhat.com>
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: 2b0143b5c986 ("VFS: normal filesystems (and lustre): d_inode() annotations")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a5b9356bee6a..66f1f4a5c74c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4965,12 +4965,12 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
 		u64 cookie, struct page **pages, unsigned int count, bool plus)
 {
 	struct inode		*dir = d_inode(dentry);
+	struct nfs_server	*server = NFS_SERVER(dir);
 	struct nfs4_readdir_arg args = {
 		.fh = NFS_FH(dir),
 		.pages = pages,
 		.pgbase = 0,
 		.count = count,
-		.bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask,
 		.plus = plus,
 	};
 	struct nfs4_readdir_res res;
@@ -4985,9 +4985,15 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
 	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__,
 			dentry,
 			(unsigned long long)cookie);
+	if (!(server->caps & NFS_CAP_SECURITY_LABEL))
+		args.bitmask = server->attr_bitmask_nl;
+	else
+		args.bitmask = server->attr_bitmask;
+
 	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
 	res.pgbase = args.pgbase;
-	status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
+	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
+			&res.seq_res, 0);
 	if (status >= 0) {
 		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
 		status += args.pgbase;

From 2e7a46417952ae480cb5091ed5ade73078630b40 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 09:56:18 -0500
Subject: [PATCH 071/230] NFS: Ensure contents of struct nfs_open_dir_context
 are consistent

Ensure that the contents of struct nfs_open_dir_context are consistent
by setting them under the file->f_lock from a private copy (that is
known to be consistent).

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 29 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4e011adaf967..67d8595cd6e5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -144,20 +144,23 @@ struct nfs_cache_array {
 	struct nfs_cache_array_entry array[];
 };
 
-typedef struct {
+typedef struct nfs_readdir_descriptor {
 	struct file	*file;
 	struct page	*page;
 	struct dir_context *ctx;
 	unsigned long	page_index;
-	u64		*dir_cookie;
+	u64		dir_cookie;
 	u64		last_cookie;
+	u64		dup_cookie;
 	loff_t		current_index;
 	loff_t		prev_index;
 
 	unsigned long	dir_verifier;
 	unsigned long	timestamp;
 	unsigned long	gencount;
+	unsigned long	attr_gencount;
 	unsigned int	cache_entry_index;
+	signed char duped;
 	bool plus;
 	bool eof;
 } nfs_readdir_descriptor_t;
@@ -273,7 +276,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 	}
 
 	index = (unsigned int)diff;
-	*desc->dir_cookie = array->array[index].cookie;
+	desc->dir_cookie = array->array[index].cookie;
 	desc->cache_entry_index = index;
 	return 0;
 out_eof:
@@ -298,33 +301,32 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 	int status = -EAGAIN;
 
 	for (i = 0; i < array->size; i++) {
-		if (array->array[i].cookie == *desc->dir_cookie) {
+		if (array->array[i].cookie == desc->dir_cookie) {
 			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
-			struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 			new_pos = desc->current_index + i;
-			if (ctx->attr_gencount != nfsi->attr_gencount ||
+			if (desc->attr_gencount != nfsi->attr_gencount ||
 			    !nfs_readdir_inode_mapping_valid(nfsi)) {
-				ctx->duped = 0;
-				ctx->attr_gencount = nfsi->attr_gencount;
+				desc->duped = 0;
+				desc->attr_gencount = nfsi->attr_gencount;
 			} else if (new_pos < desc->prev_index) {
-				if (ctx->duped > 0
-				    && ctx->dup_cookie == *desc->dir_cookie) {
+				if (desc->duped > 0
+				    && desc->dup_cookie == desc->dir_cookie) {
 					if (printk_ratelimit()) {
 						pr_notice("NFS: directory %pD2 contains a readdir loop."
 								"Please contact your server vendor.  "
 								"The file: %.*s has duplicate cookie %llu\n",
 								desc->file, array->array[i].string.len,
-								array->array[i].string.name, *desc->dir_cookie);
+								array->array[i].string.name, desc->dir_cookie);
 					}
 					status = -ELOOP;
 					goto out;
 				}
-				ctx->dup_cookie = *desc->dir_cookie;
-				ctx->duped = -1;
+				desc->dup_cookie = desc->dir_cookie;
+				desc->duped = -1;
 			}
 			if (nfs_readdir_use_cookie(desc->file))
-				desc->ctx->pos = *desc->dir_cookie;
+				desc->ctx->pos = desc->dir_cookie;
 			else
 				desc->ctx->pos = new_pos;
 			desc->prev_index = new_pos;
@@ -334,7 +336,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 	}
 	if (array->eof_index >= 0) {
 		status = -EBADCOOKIE;
-		if (*desc->dir_cookie == array->last_cookie)
+		if (desc->dir_cookie == array->last_cookie)
 			desc->eof = true;
 	}
 out:
@@ -349,7 +351,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 
 	array = kmap(desc->page);
 
-	if (*desc->dir_cookie == 0)
+	if (desc->dir_cookie == 0)
 		status = nfs_readdir_search_for_pos(array, desc);
 	else
 		status = nfs_readdir_search_for_cookie(array, desc);
@@ -801,7 +803,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 	int i = 0;
 	int res = 0;
 	struct nfs_cache_array *array = NULL;
-	struct nfs_open_dir_context *ctx = file->private_data;
 
 	array = kmap(desc->page);
 	for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -814,22 +815,22 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 			break;
 		}
 		if (i < (array->size-1))
-			*desc->dir_cookie = array->array[i+1].cookie;
+			desc->dir_cookie = array->array[i+1].cookie;
 		else
-			*desc->dir_cookie = array->last_cookie;
+			desc->dir_cookie = array->last_cookie;
 		if (nfs_readdir_use_cookie(file))
-			desc->ctx->pos = *desc->dir_cookie;
+			desc->ctx->pos = desc->dir_cookie;
 		else
 			desc->ctx->pos++;
-		if (ctx->duped != 0)
-			ctx->duped = 1;
+		if (desc->duped != 0)
+			desc->duped = 1;
 	}
 	if (array->eof_index >= 0)
 		desc->eof = true;
 
 	kunmap(desc->page);
 	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
-			(unsigned long long)*desc->dir_cookie, res);
+			(unsigned long long)desc->dir_cookie, res);
 	return res;
 }
 
@@ -851,10 +852,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	struct page	*page = NULL;
 	int		status;
 	struct inode *inode = file_inode(desc->file);
-	struct nfs_open_dir_context *ctx = desc->file->private_data;
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
-			(unsigned long long)*desc->dir_cookie);
+			(unsigned long long)desc->dir_cookie);
 
 	page = alloc_page(GFP_HIGHUSER);
 	if (!page) {
@@ -863,9 +863,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	}
 
 	desc->page_index = 0;
-	desc->last_cookie = *desc->dir_cookie;
+	desc->last_cookie = desc->dir_cookie;
 	desc->page = page;
-	ctx->duped = 0;
+	desc->duped = 0;
 
 	status = nfs_readdir_xdr_to_array(desc, page, inode);
 	if (status < 0)
@@ -894,7 +894,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	nfs_readdir_descriptor_t my_desc = {
 		.file = file,
 		.ctx = ctx,
-		.dir_cookie = &dir_ctx->dir_cookie,
 		.plus = nfs_use_readdirplus(inode, ctx),
 	},
 			*desc = &my_desc;
@@ -915,13 +914,20 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	if (res < 0)
 		goto out;
 
+	spin_lock(&file->f_lock);
+	desc->dir_cookie = dir_ctx->dir_cookie;
+	desc->dup_cookie = dir_ctx->dup_cookie;
+	desc->duped = dir_ctx->duped;
+	desc->attr_gencount = dir_ctx->attr_gencount;
+	spin_unlock(&file->f_lock);
+
 	do {
 		res = readdir_search_pagecache(desc);
 
 		if (res == -EBADCOOKIE) {
 			res = 0;
 			/* This means either end of directory */
-			if (*desc->dir_cookie && !desc->eof) {
+			if (desc->dir_cookie && !desc->eof) {
 				/* Or that the server has 'lost' a cookie */
 				res = uncached_readdir(desc);
 				if (res == 0)
@@ -946,6 +952,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		if (res < 0)
 			break;
 	} while (!desc->eof);
+
+	spin_lock(&file->f_lock);
+	dir_ctx->dir_cookie = desc->dir_cookie;
+	dir_ctx->dup_cookie = desc->dup_cookie;
+	dir_ctx->duped = desc->duped;
+	dir_ctx->attr_gencount = desc->attr_gencount;
+	spin_unlock(&file->f_lock);
+
 out:
 	if (res > 0)
 		res = 0;

From b1e21c97437f64d0a00f8fea1f9e64e77e0e4242 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 13:45:55 -0500
Subject: [PATCH 072/230] NFS: Clean up readdir struct nfs_cache_array

Since the 'eof_index' is only ever used as a flag, make it so.
Also add a flag to detect if the page has been completely filled.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 66 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 67d8595cd6e5..604ebe015387 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -138,9 +138,10 @@ struct nfs_cache_array_entry {
 };
 
 struct nfs_cache_array {
-	int size;
-	int eof_index;
 	u64 last_cookie;
+	unsigned int size;
+	unsigned char page_full : 1,
+		      page_is_eof : 1;
 	struct nfs_cache_array_entry array[];
 };
 
@@ -172,7 +173,6 @@ void nfs_readdir_init_array(struct page *page)
 
 	array = kmap_atomic(page);
 	memset(array, 0, sizeof(struct nfs_cache_array));
-	array->eof_index = -1;
 	kunmap_atomic(array);
 }
 
@@ -192,6 +192,17 @@ void nfs_readdir_clear_array(struct page *page)
 	kunmap_atomic(array);
 }
 
+static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
+{
+	array->page_is_eof = 1;
+	array->page_full = 1;
+}
+
+static bool nfs_readdir_array_is_full(struct nfs_cache_array *array)
+{
+	return array->page_full;
+}
+
 /*
  * the caller is responsible for freeing qstr.name
  * when called by nfs_readdir_add_to_array, the strings will be freed in
@@ -213,6 +224,23 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
 	return 0;
 }
 
+/*
+ * Check that the next array entry lies entirely within the page bounds
+ */
+static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
+{
+	struct nfs_cache_array_entry *cache_entry;
+
+	if (array->page_full)
+		return -ENOSPC;
+	cache_entry = &array->array[array->size + 1];
+	if ((char *)cache_entry - (char *)array > PAGE_SIZE) {
+		array->page_full = 1;
+		return -ENOSPC;
+	}
+	return 0;
+}
+
 static
 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 {
@@ -220,13 +248,11 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 	struct nfs_cache_array_entry *cache_entry;
 	int ret;
 
-	cache_entry = &array->array[array->size];
-
-	/* Check that this entry lies within the page bounds */
-	ret = -ENOSPC;
-	if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE)
+	ret = nfs_readdir_array_can_expand(array);
+	if (ret)
 		goto out;
 
+	cache_entry = &array->array[array->size];
 	cache_entry->cookie = entry->prev_cookie;
 	cache_entry->ino = entry->ino;
 	cache_entry->d_type = entry->d_type;
@@ -236,12 +262,21 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 	array->last_cookie = entry->cookie;
 	array->size++;
 	if (entry->eof != 0)
-		array->eof_index = array->size;
+		nfs_readdir_array_set_eof(array);
 out:
 	kunmap(page);
 	return ret;
 }
 
+static void nfs_readdir_page_set_eof(struct page *page)
+{
+	struct nfs_cache_array *array;
+
+	array = kmap_atomic(page);
+	nfs_readdir_array_set_eof(array);
+	kunmap_atomic(array);
+}
+
 static inline
 int is_32bit_api(void)
 {
@@ -270,7 +305,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
 	if (diff < 0)
 		goto out_eof;
 	if (diff >= array->size) {
-		if (array->eof_index >= 0)
+		if (array->page_is_eof)
 			goto out_eof;
 		return -EAGAIN;
 	}
@@ -334,7 +369,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 			return 0;
 		}
 	}
-	if (array->eof_index >= 0) {
+	if (array->page_is_eof) {
 		status = -EBADCOOKIE;
 		if (desc->dir_cookie == array->last_cookie)
 			desc->eof = true;
@@ -566,7 +601,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	struct xdr_stream stream;
 	struct xdr_buf buf;
 	struct page *scratch;
-	struct nfs_cache_array *array;
 	unsigned int count = 0;
 	int status;
 
@@ -604,10 +638,8 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 
 out_nopages:
 	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
-		array = kmap(page);
-		array->eof_index = array->size;
+		nfs_readdir_page_set_eof(page);
 		status = 0;
-		kunmap(page);
 	}
 
 	put_page(scratch);
@@ -689,7 +721,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 				status = 0;
 			break;
 		}
-	} while (array->eof_index < 0);
+	} while (!nfs_readdir_array_is_full(array));
 
 	nfs_readdir_free_pages(pages, array_size);
 out_release_array:
@@ -825,7 +857,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 		if (desc->duped != 0)
 			desc->duped = 1;
 	}
-	if (array->eof_index >= 0)
+	if (array->page_is_eof)
 		desc->eof = true;
 
 	kunmap(desc->page);

From 972bcdf233096d36b2f3e02f34a80d0f073b6b05 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 17:15:43 -0500
Subject: [PATCH 073/230] NFS: Clean up nfs_readdir_page_filler()

Clean up handling of the case where there are no entries in the readdir
reply.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 604ebe015387..68acbde3f914 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -601,16 +601,12 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 	struct xdr_stream stream;
 	struct xdr_buf buf;
 	struct page *scratch;
-	unsigned int count = 0;
 	int status;
 
 	scratch = alloc_page(GFP_KERNEL);
 	if (scratch == NULL)
 		return -ENOMEM;
 
-	if (buflen == 0)
-		goto out_nopages;
-
 	xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
 	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
 
@@ -619,27 +615,27 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 			entry->label->len = NFS4_MAXLABELLEN;
 
 		status = xdr_decode(desc, entry, &stream);
-		if (status != 0) {
-			if (status == -EAGAIN)
-				status = 0;
+		if (status != 0)
 			break;
-		}
-
-		count++;
 
 		if (desc->plus)
 			nfs_prime_dcache(file_dentry(desc->file), entry,
 					desc->dir_verifier);
 
 		status = nfs_readdir_add_to_array(entry, page);
-		if (status != 0)
-			break;
-	} while (!entry->eof);
+	} while (!status && !entry->eof);
 
-out_nopages:
-	if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
-		nfs_readdir_page_set_eof(page);
+	switch (status) {
+	case -EBADCOOKIE:
+		if (entry->eof) {
+			nfs_readdir_page_set_eof(page);
+			status = 0;
+		}
+		break;
+	case -ENOSPC:
+	case -EAGAIN:
 		status = 0;
+		break;
 	}
 
 	put_page(scratch);
@@ -714,14 +710,15 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 
 		if (status < 0)
 			break;
+
 		pglen = status;
-		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
-		if (status < 0) {
-			if (status == -ENOSPC)
-				status = 0;
+		if (pglen == 0) {
+			nfs_readdir_page_set_eof(page);
 			break;
 		}
-	} while (!nfs_readdir_array_is_full(array));
+
+		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+	} while (!status && !nfs_readdir_array_is_full(array));
 
 	nfs_readdir_free_pages(pages, array_size);
 out_release_array:

From 1f1d4aa4e4bcb4721d3c51f4c07dda790b6accd9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 12:34:43 -0500
Subject: [PATCH 074/230] NFS: Clean up directory array handling

Refactor to use pagecache_get_page() so that we can fill the page
in multiple stages.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 140 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 62 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 68acbde3f914..842f69120a01 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -149,7 +149,7 @@ typedef struct nfs_readdir_descriptor {
 	struct file	*file;
 	struct page	*page;
 	struct dir_context *ctx;
-	unsigned long	page_index;
+	pgoff_t		page_index;
 	u64		dir_cookie;
 	u64		last_cookie;
 	u64		dup_cookie;
@@ -166,13 +166,18 @@ typedef struct nfs_readdir_descriptor {
 	bool eof;
 } nfs_readdir_descriptor_t;
 
-static
-void nfs_readdir_init_array(struct page *page)
+static void nfs_readdir_array_init(struct nfs_cache_array *array)
+{
+	memset(array, 0, sizeof(struct nfs_cache_array));
+}
+
+static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
 {
 	struct nfs_cache_array *array;
 
 	array = kmap_atomic(page);
-	memset(array, 0, sizeof(struct nfs_cache_array));
+	nfs_readdir_array_init(array);
+	array->last_cookie = last_cookie;
 	kunmap_atomic(array);
 }
 
@@ -188,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page)
 	array = kmap_atomic(page);
 	for (i = 0; i < array->size; i++)
 		kfree(array->array[i].string.name);
-	array->size = 0;
+	nfs_readdir_array_init(array);
 	kunmap_atomic(array);
 }
 
@@ -268,6 +273,44 @@ out:
 	return ret;
 }
 
+static struct page *nfs_readdir_page_get_locked(struct address_space *mapping,
+						pgoff_t index, u64 last_cookie)
+{
+	struct page *page;
+
+	page = grab_cache_page(mapping, index);
+	if (page && !PageUptodate(page)) {
+		nfs_readdir_page_init_array(page, last_cookie);
+		if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0)
+			nfs_zap_mapping(mapping->host, mapping);
+		SetPageUptodate(page);
+	}
+
+	return page;
+}
+
+static u64 nfs_readdir_page_last_cookie(struct page *page)
+{
+	struct nfs_cache_array *array;
+	u64 ret;
+
+	array = kmap_atomic(page);
+	ret = array->last_cookie;
+	kunmap_atomic(array);
+	return ret;
+}
+
+static bool nfs_readdir_page_needs_filling(struct page *page)
+{
+	struct nfs_cache_array *array;
+	bool ret;
+
+	array = kmap_atomic(page);
+	ret = !nfs_readdir_array_is_full(array);
+	kunmap_atomic(array);
+	return ret;
+}
+
 static void nfs_readdir_page_set_eof(struct page *page)
 {
 	struct nfs_cache_array *array;
@@ -682,10 +725,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	int status = -ENOMEM;
 	unsigned int array_size = ARRAY_SIZE(pages);
 
-	nfs_readdir_init_array(page);
-
 	entry.prev_cookie = 0;
-	entry.cookie = desc->last_cookie;
+	entry.cookie = nfs_readdir_page_last_cookie(page);
 	entry.eof = 0;
 	entry.fh = nfs_alloc_fhandle();
 	entry.fattr = nfs_alloc_fattr();
@@ -730,48 +771,25 @@ out:
 	return status;
 }
 
-/*
- * Now we cache directories properly, by converting xdr information
- * to an array that can be used for lookups later.  This results in
- * fewer cache pages, since we can store more information on each page.
- * We only need to convert from xdr once so future lookups are much simpler
- */
-static
-int nfs_readdir_filler(void *data, struct page* page)
-{
-	nfs_readdir_descriptor_t *desc = data;
-	struct inode	*inode = file_inode(desc->file);
-	int ret;
-
-	ret = nfs_readdir_xdr_to_array(desc, page, inode);
-	if (ret < 0)
-		goto error;
-	SetPageUptodate(page);
-
-	if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
-		/* Should never happen */
-		nfs_zap_mapping(inode, inode->i_mapping);
-	}
-	unlock_page(page);
-	return 0;
- error:
-	nfs_readdir_clear_array(page);
-	unlock_page(page);
-	return ret;
-}
-
-static
-void cache_page_release(nfs_readdir_descriptor_t *desc)
+static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc)
 {
 	put_page(desc->page);
 	desc->page = NULL;
 }
 
-static
-struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
+static void
+nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc)
 {
-	return read_cache_page(desc->file->f_mapping, desc->page_index,
-			nfs_readdir_filler, desc);
+	unlock_page(desc->page);
+	nfs_readdir_page_put(desc);
+}
+
+static struct page *
+nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
+{
+	return nfs_readdir_page_get_locked(desc->file->f_mapping,
+					   desc->page_index,
+					   desc->last_cookie);
 }
 
 /*
@@ -785,23 +803,21 @@ int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int res;
 
-	desc->page = get_cache_page(desc);
-	if (IS_ERR(desc->page))
-		return PTR_ERR(desc->page);
-	res = lock_page_killable(desc->page);
-	if (res != 0)
-		goto error;
-	res = -EAGAIN;
-	if (desc->page->mapping != NULL) {
-		res = nfs_readdir_search_array(desc);
-		if (res == 0) {
-			nfsi->page_index = desc->page_index;
-			return 0;
-		}
+	desc->page = nfs_readdir_page_get_cached(desc);
+	if (!desc->page)
+		return -ENOMEM;
+	if (nfs_readdir_page_needs_filling(desc->page)) {
+		res = nfs_readdir_xdr_to_array(desc, desc->page, inode);
+		if (res < 0)
+			goto error;
+	}
+	res = nfs_readdir_search_array(desc);
+	if (res == 0) {
+		nfsi->page_index = desc->page_index;
+		return 0;
 	}
-	unlock_page(desc->page);
 error:
-	cache_page_release(desc);
+	nfs_readdir_page_unlock_and_put_cached(desc);
 	return res;
 }
 
@@ -896,6 +912,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	desc->page = page;
 	desc->duped = 0;
 
+	nfs_readdir_page_init_array(page, desc->dir_cookie);
 	status = nfs_readdir_xdr_to_array(desc, page, inode);
 	if (status < 0)
 		goto out_release;
@@ -904,7 +921,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 
  out_release:
 	nfs_readdir_clear_array(desc->page);
-	cache_page_release(desc);
+	nfs_readdir_page_put(desc);
  out:
 	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
 			__func__, status);
@@ -976,8 +993,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 			break;
 
 		res = nfs_do_filldir(desc);
-		unlock_page(desc->page);
-		cache_page_release(desc);
+		nfs_readdir_page_unlock_and_put_cached(desc);
 		if (res < 0)
 			break;
 	} while (!desc->eof);

From 3b2a09f127e025674945e82c1ec0c88d6740280e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 13:14:10 -0500
Subject: [PATCH 075/230] NFS: Don't discard readdir results

If a readdir call returns more data than we can fit into one page
cache page, then allocate a new one for that data rather than
discarding the data.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 842f69120a01..f7248145c333 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -320,6 +320,26 @@ static void nfs_readdir_page_set_eof(struct page *page)
 	kunmap_atomic(array);
 }
 
+static void nfs_readdir_page_unlock_and_put(struct page *page)
+{
+	unlock_page(page);
+	put_page(page);
+}
+
+static struct page *nfs_readdir_page_get_next(struct address_space *mapping,
+					      pgoff_t index, u64 cookie)
+{
+	struct page *page;
+
+	page = nfs_readdir_page_get_locked(mapping, index, cookie);
+	if (page) {
+		if (nfs_readdir_page_last_cookie(page) == cookie)
+			return page;
+		nfs_readdir_page_unlock_and_put(page);
+	}
+	return NULL;
+}
+
 static inline
 int is_32bit_api(void)
 {
@@ -637,13 +657,15 @@ out:
 }
 
 /* Perform conversion from xdr to cache array */
-static
-int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
-				struct page **xdr_pages, struct page *page, unsigned int buflen)
+static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
+				   struct nfs_entry *entry,
+				   struct page **xdr_pages,
+				   struct page *fillme, unsigned int buflen)
 {
+	struct address_space *mapping = desc->file->f_mapping;
 	struct xdr_stream stream;
 	struct xdr_buf buf;
-	struct page *scratch;
+	struct page *scratch, *new, *page = fillme;
 	int status;
 
 	scratch = alloc_page(GFP_KERNEL);
@@ -666,6 +688,19 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 					desc->dir_verifier);
 
 		status = nfs_readdir_add_to_array(entry, page);
+		if (status != -ENOSPC)
+			continue;
+
+		if (page->mapping != mapping)
+			break;
+		new = nfs_readdir_page_get_next(mapping, page->index + 1,
+						entry->prev_cookie);
+		if (!new)
+			break;
+		if (page != fillme)
+			nfs_readdir_page_unlock_and_put(page);
+		page = new;
+		status = nfs_readdir_add_to_array(entry, page);
 	} while (!status && !entry->eof);
 
 	switch (status) {
@@ -681,6 +716,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
 		break;
 	}
 
+	if (page != fillme)
+		nfs_readdir_page_unlock_and_put(page);
+
 	put_page(scratch);
 	return status;
 }

From e762a639816015a70bb1af8ea4baf54f4facb591 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 13:31:20 -0500
Subject: [PATCH 076/230] NFS: Remove unnecessary kmap in
 nfs_readdir_xdr_to_array()

The kmapped pointer is only used once per loop to check if we need to
exit.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f7248145c333..e8b0fcc1bc9e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -759,7 +759,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 	struct page *pages[NFS_MAX_READDIR_PAGES];
 	struct nfs_entry entry;
 	struct file	*file = desc->file;
-	struct nfs_cache_array *array;
 	int status = -ENOMEM;
 	unsigned int array_size = ARRAY_SIZE(pages);
 
@@ -778,11 +777,9 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		goto out;
 	}
 
-	array = kmap(page);
-
 	status = nfs_readdir_alloc_pages(pages, array_size);
 	if (status < 0)
-		goto out_release_array;
+		goto out_release_label;
 	do {
 		unsigned int pglen;
 		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
@@ -797,11 +794,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		}
 
 		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
-	} while (!status && !nfs_readdir_array_is_full(array));
+	} while (!status && nfs_readdir_page_needs_filling(page));
 
 	nfs_readdir_free_pages(pages, array_size);
-out_release_array:
-	kunmap(page);
+out_release_label:
 	nfs4_label_free(entry.label);
 out:
 	nfs_free_fattr(entry.fattr);

From ed09222d651dbd30e707f96180628229146b885c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 13:34:32 -0500
Subject: [PATCH 077/230] NFS: Replace kmap() with kmap_atomic() in
 nfs_readdir_search_array()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e8b0fcc1bc9e..b9001123ec84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -447,7 +447,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 	struct nfs_cache_array *array;
 	int status;
 
-	array = kmap(desc->page);
+	array = kmap_atomic(desc->page);
 
 	if (desc->dir_cookie == 0)
 		status = nfs_readdir_search_for_pos(array, desc);
@@ -459,7 +459,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 		desc->current_index += array->size;
 		desc->page_index++;
 	}
-	kunmap(desc->page);
+	kunmap_atomic(array);
 	return status;
 }
 

From a52a8a6adad99e1162c27f70cd6495626a48064d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 19:17:29 -0500
Subject: [PATCH 078/230] NFS: Simplify struct nfs_cache_array_entry

We don't need to store a hash, so replace struct qstr with a simple
const char pointer and length.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b9001123ec84..be0e2891fecc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,7 +133,8 @@ nfs_closedir(struct inode *inode, struct file *filp)
 struct nfs_cache_array_entry {
 	u64 cookie;
 	u64 ino;
-	struct qstr string;
+	const char *name;
+	unsigned int name_len;
 	unsigned char d_type;
 };
 
@@ -192,7 +193,7 @@ void nfs_readdir_clear_array(struct page *page)
 
 	array = kmap_atomic(page);
 	for (i = 0; i < array->size; i++)
-		kfree(array->array[i].string.name);
+		kfree(array->array[i].name);
 	nfs_readdir_array_init(array);
 	kunmap_atomic(array);
 }
@@ -213,20 +214,17 @@ static bool nfs_readdir_array_is_full(struct nfs_cache_array *array)
  * when called by nfs_readdir_add_to_array, the strings will be freed in
  * nfs_clear_readdir_array()
  */
-static
-int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len)
+static const char *nfs_readdir_copy_name(const char *name, unsigned int len)
 {
-	string->len = len;
-	string->name = kmemdup_nul(name, len, GFP_KERNEL);
-	if (string->name == NULL)
-		return -ENOMEM;
+	const char *ret = kmemdup_nul(name, len, GFP_KERNEL);
+
 	/*
 	 * Avoid a kmemleak false positive. The pointer to the name is stored
 	 * in a page cache page which kmemleak does not scan.
 	 */
-	kmemleak_not_leak(string->name);
-	string->hash = full_name_hash(NULL, name, len);
-	return 0;
+	if (ret != NULL)
+		kmemleak_not_leak(ret);
+	return ret;
 }
 
 /*
@@ -249,27 +247,34 @@ static int nfs_readdir_array_can_expand(struct nfs_cache_array *array)
 static
 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 {
-	struct nfs_cache_array *array = kmap(page);
+	struct nfs_cache_array *array;
 	struct nfs_cache_array_entry *cache_entry;
+	const char *name;
 	int ret;
 
+	name = nfs_readdir_copy_name(entry->name, entry->len);
+	if (!name)
+		return -ENOMEM;
+
+	array = kmap_atomic(page);
 	ret = nfs_readdir_array_can_expand(array);
-	if (ret)
+	if (ret) {
+		kfree(name);
 		goto out;
+	}
 
 	cache_entry = &array->array[array->size];
 	cache_entry->cookie = entry->prev_cookie;
 	cache_entry->ino = entry->ino;
 	cache_entry->d_type = entry->d_type;
-	ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len);
-	if (ret)
-		goto out;
+	cache_entry->name_len = entry->len;
+	cache_entry->name = name;
 	array->last_cookie = entry->cookie;
 	array->size++;
 	if (entry->eof != 0)
 		nfs_readdir_array_set_eof(array);
 out:
-	kunmap(page);
+	kunmap_atomic(array);
 	return ret;
 }
 
@@ -413,9 +418,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
 					if (printk_ratelimit()) {
 						pr_notice("NFS: directory %pD2 contains a readdir loop."
 								"Please contact your server vendor.  "
-								"The file: %.*s has duplicate cookie %llu\n",
-								desc->file, array->array[i].string.len,
-								array->array[i].string.name, desc->dir_cookie);
+								"The file: %s has duplicate cookie %llu\n",
+								desc->file, array->array[i].name, desc->dir_cookie);
 					}
 					status = -ELOOP;
 					goto out;
@@ -888,7 +892,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 		struct nfs_cache_array_entry *ent;
 
 		ent = &array->array[i];
-		if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
+		if (!dir_emit(desc->ctx, ent->name, ent->name_len,
 		    nfs_compat_user_ino64(ent->ino), ent->d_type)) {
 			desc->eof = true;
 			break;

From 1a34c8c9a49ee10ccaf91091ddd98c25e4d567dd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 14:26:47 -0500
Subject: [PATCH 079/230] NFS: Support larger readdir buffers

Support readdir buffers of up to 1MB in size so that we can read
large directories using few RPC calls.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/client.c   |  4 ++--
 fs/nfs/dir.c      | 33 +++++++++++++++++++--------------
 fs/nfs/internal.h |  6 ------
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4b8cc93913f7..f6454ba53d05 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -781,8 +781,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-	if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES)
-		server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES;
+	if (server->dtsize > NFS_MAX_FILE_IO_SIZE)
+		server->dtsize = NFS_MAX_FILE_IO_SIZE;
 	if (server->dtsize > server->rsize)
 		server->dtsize = server->rsize;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index be0e2891fecc..438906dae083 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -727,44 +727,47 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 	return status;
 }
 
-static
-void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
+static void nfs_readdir_free_pages(struct page **pages, size_t npages)
 {
-	unsigned int i;
-	for (i = 0; i < npages; i++)
-		put_page(pages[i]);
+	while (npages--)
+		put_page(pages[npages]);
+	kfree(pages);
 }
 
 /*
  * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call
  * to nfs_readdir_free_pages()
  */
-static
-int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
+static struct page **nfs_readdir_alloc_pages(size_t npages)
 {
-	unsigned int i;
+	struct page **pages;
+	size_t i;
 
+	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return NULL;
 	for (i = 0; i < npages; i++) {
 		struct page *page = alloc_page(GFP_KERNEL);
 		if (page == NULL)
 			goto out_freepages;
 		pages[i] = page;
 	}
-	return 0;
+	return pages;
 
 out_freepages:
 	nfs_readdir_free_pages(pages, i);
-	return -ENOMEM;
+	return NULL;
 }
 
 static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
-	struct page *pages[NFS_MAX_READDIR_PAGES];
+	struct page **pages;
 	struct nfs_entry entry;
 	struct file	*file = desc->file;
+	size_t array_size;
+	size_t dtsize = NFS_SERVER(inode)->dtsize;
 	int status = -ENOMEM;
-	unsigned int array_size = ARRAY_SIZE(pages);
 
 	entry.prev_cookie = 0;
 	entry.cookie = nfs_readdir_page_last_cookie(page);
@@ -781,9 +784,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 		goto out;
 	}
 
-	status = nfs_readdir_alloc_pages(pages, array_size);
-	if (status < 0)
+	array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pages = nfs_readdir_alloc_pages(array_size);
+	if (!pages)
 		goto out_release_label;
+
 	do {
 		unsigned int pglen;
 		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 6673a77884d9..b840d0a91c9d 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -56,12 +56,6 @@ static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry)
 #define NFS_UNSPEC_RETRANS	(UINT_MAX)
 #define NFS_UNSPEC_TIMEO	(UINT_MAX)
 
-/*
- * Maximum number of pages that readdir can use for creating
- * a vmapped array of pages.
- */
-#define NFS_MAX_READDIR_PAGES 8
-
 struct nfs_client_initdata {
 	unsigned long init_flags;
 	const char *hostname;			/* Hostname of the server */

From 93b8959a0a8cf1b1a493efee9e8328681e111862 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 15:24:41 -0500
Subject: [PATCH 080/230] NFS: More readdir cleanups

Remove the redundant caching of the credential in struct
nfs_open_dir_context.
Pass the buffer size as an argument to nfs_readdir_xdr_filler().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c           | 25 +++++++++++--------------
 include/linux/nfs_fs.h |  1 -
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 438906dae083..bc366bd8e8f3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = {
 	.freepage = nfs_readdir_clear_array,
 };
 
-static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred)
+static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
@@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
 		ctx->attr_gencount = nfsi->attr_gencount;
 		ctx->dir_cookie = 0;
 		ctx->dup_cookie = 0;
-		ctx->cred = get_cred(cred);
 		spin_lock(&dir->i_lock);
 		if (list_empty(&nfsi->open_files) &&
 		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont
 	spin_lock(&dir->i_lock);
 	list_del(&ctx->list);
 	spin_unlock(&dir->i_lock);
-	put_cred(ctx->cred);
 	kfree(ctx);
 }
 
@@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp)
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
 
-	ctx = alloc_nfs_open_dir_context(inode, current_cred());
+	ctx = alloc_nfs_open_dir_context(inode);
 	if (IS_ERR(ctx)) {
 		res = PTR_ERR(ctx);
 		goto out;
@@ -468,12 +466,12 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
 }
 
 /* Fill a page with xdr information before transferring to the cache page */
-static
-int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
-			struct nfs_entry *entry, struct file *file, struct inode *inode)
+static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
+				  u64 cookie, struct page **pages,
+				  size_t bufsize)
 {
-	struct nfs_open_dir_context *ctx = file->private_data;
-	const struct cred *cred = ctx->cred;
+	struct file *file = desc->file;
+	struct inode *inode = file_inode(file);
 	unsigned long	timestamp, gencount;
 	int		error;
 
@@ -481,8 +479,8 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
 	desc->dir_verifier = nfs_save_change_attribute(inode);
-	error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages,
-					  NFS_SERVER(inode)->dtsize, desc->plus);
+	error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred,
+					  cookie, pages, bufsize, desc->plus);
 	if (error < 0) {
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
@@ -764,7 +762,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 {
 	struct page **pages;
 	struct nfs_entry entry;
-	struct file	*file = desc->file;
 	size_t array_size;
 	size_t dtsize = NFS_SERVER(inode)->dtsize;
 	int status = -ENOMEM;
@@ -791,8 +788,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 
 	do {
 		unsigned int pglen;
-		status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode);
-
+		status = nfs_readdir_xdr_filler(desc, entry.cookie,
+						pages, dtsize);
 		if (status < 0)
 			break;
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index a2c6455ea3fa..dd6b463dda80 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -88,7 +88,6 @@ struct nfs_open_context {
 
 struct nfs_open_dir_context {
 	struct list_head list;
-	const struct cred *cred;
 	unsigned long attr_gencount;
 	__u64 dir_cookie;
 	__u64 dup_cookie;

From dbeaf8c984ca689c2c0966c41bd78dee178b5dfe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 1 Nov 2020 18:20:03 -0500
Subject: [PATCH 081/230] NFS: nfs_do_filldir() does not return a value

Clean up nfs_do_filldir().

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bc366bd8e8f3..48856cee10de 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -881,13 +881,11 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
-static 
-int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
+static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
 {
 	struct file	*file = desc->file;
-	int i = 0;
-	int res = 0;
-	struct nfs_cache_array *array = NULL;
+	struct nfs_cache_array *array;
+	unsigned int i = 0;
 
 	array = kmap(desc->page);
 	for (i = desc->cache_entry_index; i < array->size; i++) {
@@ -914,9 +912,8 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
 		desc->eof = true;
 
 	kunmap(desc->page);
-	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
-			(unsigned long long)desc->dir_cookie, res);
-	return res;
+	dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
+			(unsigned long long)desc->dir_cookie);
 }
 
 /*
@@ -957,7 +954,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
 	if (status < 0)
 		goto out_release;
 
-	status = nfs_do_filldir(desc);
+	nfs_do_filldir(desc);
 
  out_release:
 	nfs_readdir_clear_array(desc->page);
@@ -1032,10 +1029,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 		if (res < 0)
 			break;
 
-		res = nfs_do_filldir(desc);
+		nfs_do_filldir(desc);
 		nfs_readdir_page_unlock_and_put_cached(desc);
-		if (res < 0)
-			break;
 	} while (!desc->eof);
 
 	spin_lock(&file->f_lock);
@@ -1046,8 +1041,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	spin_unlock(&file->f_lock);
 
 out:
-	if (res > 0)
-		res = 0;
 	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
 	return res;
 }

From 6b75cf9e309d18664f964889ac026096ba0d1919 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 2 Nov 2020 08:55:03 -0500
Subject: [PATCH 082/230] NFS: Reduce readdir stack usage

The descriptor and the struct nfs_entry are both large structures,
so don't allocate them from the stack.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 58 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 48856cee10de..c3af54640f6c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -761,23 +761,24 @@ static
 int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
 {
 	struct page **pages;
-	struct nfs_entry entry;
+	struct nfs_entry *entry;
 	size_t array_size;
 	size_t dtsize = NFS_SERVER(inode)->dtsize;
 	int status = -ENOMEM;
 
-	entry.prev_cookie = 0;
-	entry.cookie = nfs_readdir_page_last_cookie(page);
-	entry.eof = 0;
-	entry.fh = nfs_alloc_fhandle();
-	entry.fattr = nfs_alloc_fattr();
-	entry.server = NFS_SERVER(inode);
-	if (entry.fh == NULL || entry.fattr == NULL)
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+	entry->cookie = nfs_readdir_page_last_cookie(page);
+	entry->fh = nfs_alloc_fhandle();
+	entry->fattr = nfs_alloc_fattr();
+	entry->server = NFS_SERVER(inode);
+	if (entry->fh == NULL || entry->fattr == NULL)
 		goto out;
 
-	entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
-	if (IS_ERR(entry.label)) {
-		status = PTR_ERR(entry.label);
+	entry->label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT);
+	if (IS_ERR(entry->label)) {
+		status = PTR_ERR(entry->label);
 		goto out;
 	}
 
@@ -788,7 +789,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 
 	do {
 		unsigned int pglen;
-		status = nfs_readdir_xdr_filler(desc, entry.cookie,
+		status = nfs_readdir_xdr_filler(desc, entry->cookie,
 						pages, dtsize);
 		if (status < 0)
 			break;
@@ -799,15 +800,16 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
 			break;
 		}
 
-		status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+		status = nfs_readdir_page_filler(desc, entry, pages, page, pglen);
 	} while (!status && nfs_readdir_page_needs_filling(page));
 
 	nfs_readdir_free_pages(pages, array_size);
 out_release_label:
-	nfs4_label_free(entry.label);
+	nfs4_label_free(entry->label);
 out:
-	nfs_free_fattr(entry.fattr);
-	nfs_free_fhandle(entry.fh);
+	nfs_free_fattr(entry->fattr);
+	nfs_free_fhandle(entry->fh);
+	kfree(entry);
 	return status;
 }
 
@@ -974,13 +976,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	struct dentry	*dentry = file_dentry(file);
 	struct inode	*inode = d_inode(dentry);
 	struct nfs_open_dir_context *dir_ctx = file->private_data;
-	nfs_readdir_descriptor_t my_desc = {
-		.file = file,
-		.ctx = ctx,
-		.plus = nfs_use_readdirplus(inode, ctx),
-	},
-			*desc = &my_desc;
-	int res = 0;
+	struct nfs_readdir_descriptor *desc;
+	int res;
 
 	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
 			file, (long long)ctx->pos);
@@ -992,10 +989,19 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	 * to either find the entry with the appropriate number or
 	 * revalidate the cookie.
 	 */
-	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+	if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) {
 		res = nfs_revalidate_mapping(inode, file->f_mapping);
-	if (res < 0)
+		if (res < 0)
+			goto out;
+	}
+
+	res = -ENOMEM;
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	if (!desc)
 		goto out;
+	desc->file = file;
+	desc->ctx = ctx;
+	desc->plus = nfs_use_readdirplus(inode, ctx);
 
 	spin_lock(&file->f_lock);
 	desc->dir_cookie = dir_ctx->dir_cookie;
@@ -1040,6 +1046,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	dir_ctx->attr_gencount = desc->attr_gencount;
 	spin_unlock(&file->f_lock);
 
+	kfree(desc);
+
 out:
 	dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
 	return res;

From 6c981eff23b894ce429281dc45a5589359eef2c1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 3 Nov 2020 07:42:04 -0500
Subject: [PATCH 083/230] NFS: Cleanup to remove nfs_readdir_descriptor_t
 typedef

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c3af54640f6c..b226f6f3ae96 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -144,7 +144,7 @@ struct nfs_cache_array {
 	struct nfs_cache_array_entry array[];
 };
 
-typedef struct nfs_readdir_descriptor {
+struct nfs_readdir_descriptor {
 	struct file	*file;
 	struct page	*page;
 	struct dir_context *ctx;
@@ -163,7 +163,7 @@ typedef struct nfs_readdir_descriptor {
 	signed char duped;
 	bool plus;
 	bool eof;
-} nfs_readdir_descriptor_t;
+};
 
 static void nfs_readdir_array_init(struct nfs_cache_array *array)
 {
@@ -362,8 +362,8 @@ bool nfs_readdir_use_cookie(const struct file *filp)
 	return true;
 }
 
-static
-int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+static int nfs_readdir_search_for_pos(struct nfs_cache_array *array,
+				      struct nfs_readdir_descriptor *desc)
 {
 	loff_t diff = desc->ctx->pos - desc->current_index;
 	unsigned int index;
@@ -394,8 +394,8 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
 	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
 }
 
-static
-int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
+static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
+					 struct nfs_readdir_descriptor *desc)
 {
 	int i;
 	loff_t new_pos;
@@ -443,8 +443,7 @@ out:
 	return status;
 }
 
-static
-int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
+static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
 {
 	struct nfs_cache_array *array;
 	int status;
@@ -497,7 +496,7 @@ error:
 	return error;
 }
 
-static int xdr_decode(nfs_readdir_descriptor_t *desc,
+static int xdr_decode(struct nfs_readdir_descriptor *desc,
 		      struct nfs_entry *entry, struct xdr_stream *xdr)
 {
 	struct inode *inode = file_inode(desc->file);
@@ -757,8 +756,8 @@ out_freepages:
 	return NULL;
 }
 
-static
-int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
+static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
+				    struct page *page, struct inode *inode)
 {
 	struct page **pages;
 	struct nfs_entry *entry;
@@ -838,8 +837,7 @@ nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc)
  * Returns 0 if desc->dir_cookie was found on page desc->page_index
  * and locks the page to prevent removal from the page cache.
  */
-static
-int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
+static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 {
 	struct inode *inode = file_inode(desc->file);
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -864,8 +862,7 @@ error:
 }
 
 /* Search for desc->dir_cookie from the beginning of the page cache */
-static inline
-int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
+static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 {
 	int res;
 
@@ -930,8 +927,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
  *	 we should already have a complete representation of the
  *	 directory in the page cache by the time we get here.
  */
-static inline
-int uncached_readdir(nfs_readdir_descriptor_t *desc)
+static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 {
 	struct page	*page = NULL;
 	int		status;

From 82e22a5e6245873779db1607d3b0fec6f9ca07d0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 2 Nov 2020 17:34:23 -0500
Subject: [PATCH 084/230] NFS: Allow the NFS generic code to pass in a verifier
 to readdir

If we're ever going to allow support for servers that use the readdir
verifier, then that use needs to be managed by the middle layers as
those need to be able to reject cookies from other verifiers.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c            | 23 ++++++++++++++++++-----
 fs/nfs/nfs3proc.c       | 35 +++++++++++++++++------------------
 fs/nfs/nfs4proc.c       | 36 +++++++++++++++++-------------------
 fs/nfs/proc.c           | 18 +++++++++---------
 include/linux/nfs_xdr.h | 17 +++++++++++++++--
 5 files changed, 76 insertions(+), 53 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b226f6f3ae96..3ee0668a9719 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -469,8 +469,20 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
 				  u64 cookie, struct page **pages,
 				  size_t bufsize)
 {
-	struct file *file = desc->file;
-	struct inode *inode = file_inode(file);
+	struct inode *inode = file_inode(desc->file);
+	__be32 verf_res[2];
+	struct nfs_readdir_arg arg = {
+		.dentry = file_dentry(desc->file),
+		.cred = desc->file->f_cred,
+		.verf = NFS_I(inode)->cookieverf,
+		.cookie = cookie,
+		.pages = pages,
+		.page_len = bufsize,
+		.plus = desc->plus,
+	};
+	struct nfs_readdir_res res = {
+		.verf = verf_res,
+	};
 	unsigned long	timestamp, gencount;
 	int		error;
 
@@ -478,20 +490,21 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
 	timestamp = jiffies;
 	gencount = nfs_inc_attr_generation_counter();
 	desc->dir_verifier = nfs_save_change_attribute(inode);
-	error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred,
-					  cookie, pages, bufsize, desc->plus);
+	error = NFS_PROTO(inode)->readdir(&arg, &res);
 	if (error < 0) {
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
 			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
-			desc->plus = false;
+			desc->plus = arg.plus = false;
 			goto again;
 		}
 		goto error;
 	}
 	desc->timestamp = timestamp;
 	desc->gencount = gencount;
+	memcpy(NFS_I(inode)->cookieverf, res.verf,
+	       sizeof(NFS_I(inode)->cookieverf));
 error:
 	return error;
 }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 6b66b73a50eb..5c4e23abc345 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -662,37 +662,36 @@ out:
  * Also note that this implementation handles both plain readdir and
  * readdirplus.
  */
-static int
-nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred,
-		  u64 cookie, struct page **pages, unsigned int count, bool plus)
+static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg,
+			     struct nfs_readdir_res *nr_res)
 {
-	struct inode		*dir = d_inode(dentry);
-	__be32			*verf = NFS_I(dir)->cookieverf;
+	struct inode		*dir = d_inode(nr_arg->dentry);
 	struct nfs3_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
-		.cookie		= cookie,
-		.verf		= {verf[0], verf[1]},
-		.plus		= plus,
-		.count		= count,
-		.pages		= pages
+		.cookie		= nr_arg->cookie,
+		.plus		= nr_arg->plus,
+		.count		= nr_arg->page_len,
+		.pages		= nr_arg->pages
 	};
 	struct nfs3_readdirres	res = {
-		.verf		= verf,
-		.plus		= plus
+		.verf		= nr_res->verf,
+		.plus		= nr_arg->plus,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs3_procedures[NFS3PROC_READDIR],
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
-		.rpc_cred	= cred,
+		.rpc_cred	= nr_arg->cred,
 	};
 	int status = -ENOMEM;
 
-	if (plus)
+	if (nr_arg->plus)
 		msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS];
+	if (arg.cookie)
+		memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf));
 
-	dprintk("NFS call  readdir%s %d\n",
-			plus? "plus" : "", (unsigned int) cookie);
+	dprintk("NFS call  readdir%s %llu\n", nr_arg->plus ? "plus" : "",
+		(unsigned long long)nr_arg->cookie);
 
 	res.dir_attr = nfs_alloc_fattr();
 	if (res.dir_attr == NULL)
@@ -705,8 +704,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred,
 
 	nfs_free_fattr(res.dir_attr);
 out:
-	dprintk("NFS reply readdir%s: %d\n",
-			plus? "plus" : "", status);
+	dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "",
+		status);
 	return status;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 66f1f4a5c74c..adcaba68eaed 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4961,41 +4961,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 	return err;
 }
 
-static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
-		u64 cookie, struct page **pages, unsigned int count, bool plus)
+static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
+			      struct nfs_readdir_res *nr_res)
 {
-	struct inode		*dir = d_inode(dentry);
+	struct inode		*dir = d_inode(nr_arg->dentry);
 	struct nfs_server	*server = NFS_SERVER(dir);
 	struct nfs4_readdir_arg args = {
 		.fh = NFS_FH(dir),
-		.pages = pages,
+		.pages = nr_arg->pages,
 		.pgbase = 0,
-		.count = count,
-		.plus = plus,
+		.count = nr_arg->page_len,
+		.plus = nr_arg->plus,
 	};
 	struct nfs4_readdir_res res;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR],
 		.rpc_argp = &args,
 		.rpc_resp = &res,
-		.rpc_cred = cred,
+		.rpc_cred = nr_arg->cred,
 	};
 	int			status;
 
-	dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__,
-			dentry,
-			(unsigned long long)cookie);
+	dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__,
+		nr_arg->dentry, (unsigned long long)nr_arg->cookie);
 	if (!(server->caps & NFS_CAP_SECURITY_LABEL))
 		args.bitmask = server->attr_bitmask_nl;
 	else
 		args.bitmask = server->attr_bitmask;
 
-	nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args);
+	nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args);
 	res.pgbase = args.pgbase;
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
 			&res.seq_res, 0);
 	if (status >= 0) {
-		memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE);
+		memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE);
 		status += args.pgbase;
 	}
 
@@ -5005,19 +5004,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
 	return status;
 }
 
-static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred,
-		u64 cookie, struct page **pages, unsigned int count, bool plus)
+static int nfs4_proc_readdir(struct nfs_readdir_arg *arg,
+			     struct nfs_readdir_res *res)
 {
 	struct nfs4_exception exception = {
 		.interruptible = true,
 	};
 	int err;
 	do {
-		err = _nfs4_proc_readdir(dentry, cred, cookie,
-				pages, count, plus);
-		trace_nfs4_readdir(d_inode(dentry), err);
-		err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err,
-				&exception);
+		err = _nfs4_proc_readdir(arg, res);
+		trace_nfs4_readdir(d_inode(arg->dentry), err);
+		err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)),
+					    err, &exception);
 	} while (exception.retry);
 	return err;
 }
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 15c865cc837f..73ab7c59d3a7 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name)
  * sure it is syntactically correct; the entries itself are decoded
  * from nfs_readdir by calling the decode_entry function directly.
  */
-static int
-nfs_proc_readdir(struct dentry *dentry, const struct cred *cred,
-		 u64 cookie, struct page **pages, unsigned int count, bool plus)
+static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg,
+			    struct nfs_readdir_res *nr_res)
 {
-	struct inode		*dir = d_inode(dentry);
+	struct inode		*dir = d_inode(nr_arg->dentry);
 	struct nfs_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
-		.cookie		= cookie,
-		.count		= count,
-		.pages		= pages,
+		.cookie		= nr_arg->cookie,
+		.count		= nr_arg->page_len,
+		.pages		= nr_arg->pages,
 	};
 	struct rpc_message	msg = {
 		.rpc_proc	= &nfs_procedures[NFSPROC_READDIR],
 		.rpc_argp	= &arg,
-		.rpc_cred	= cred,
+		.rpc_cred	= nr_arg->cred,
 	};
 	int			status;
 
-	dprintk("NFS call  readdir %d\n", (unsigned int)cookie);
+	dprintk("NFS call  readdir %llu\n", (unsigned long long)nr_arg->cookie);
 	status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
+	nr_res->verf[0] = nr_res->verf[1] = 0;
 
 	nfs_invalidate_atime(dir);
 
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index d63cb862d58e..3327239fa2f9 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -750,6 +750,20 @@ struct nfs_entry {
 	struct nfs_server *	server;
 };
 
+struct nfs_readdir_arg {
+	struct dentry		*dentry;
+	const struct cred	*cred;
+	__be32			*verf;
+	u64			cookie;
+	struct page		**pages;
+	unsigned int		page_len;
+	bool			plus;
+};
+
+struct nfs_readdir_res {
+	__be32			*verf;
+};
+
 /*
  * The following types are for NFSv2 only.
  */
@@ -1744,8 +1758,7 @@ struct nfs_rpc_ops {
 			    unsigned int, struct iattr *);
 	int	(*mkdir)   (struct inode *, struct dentry *, struct iattr *);
 	int	(*rmdir)   (struct inode *, const struct qstr *);
-	int	(*readdir) (struct dentry *, const struct cred *,
-			    u64, struct page **, unsigned int, bool);
+	int	(*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *);
 	int	(*mknod)   (struct inode *, struct dentry *, struct iattr *,
 			    dev_t);
 	int	(*statfs)  (struct nfs_server *, struct nfs_fh *,

From 9fff59ed4c4d239125f8529a9971c46defd2e2b0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 2 Nov 2020 20:11:32 -0500
Subject: [PATCH 085/230] NFS: Handle NFS4ERR_NOT_SAME and NFSERR_BADCOOKIE
 from readdir calls

If the server returns NFS4ERR_NOT_SAME or tells us that the cookie is
bad in response to a READDIR call, then we should empty the page cache
so that we can fill it from scratch again.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c      | 24 ++++++++++++++++--------
 fs/nfs/nfs4proc.c |  2 ++
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3ee0668a9719..3b44bef3a1b4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -861,15 +861,21 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 		return -ENOMEM;
 	if (nfs_readdir_page_needs_filling(desc->page)) {
 		res = nfs_readdir_xdr_to_array(desc, desc->page, inode);
-		if (res < 0)
-			goto error;
+		if (res < 0) {
+			nfs_readdir_page_unlock_and_put_cached(desc);
+			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
+				invalidate_inode_pages2(desc->file->f_mapping);
+				desc->page_index = 0;
+				return -EAGAIN;
+			}
+			return res;
+		}
 	}
 	res = nfs_readdir_search_array(desc);
 	if (res == 0) {
 		nfsi->page_index = desc->page_index;
 		return 0;
 	}
-error:
 	nfs_readdir_page_unlock_and_put_cached(desc);
 	return res;
 }
@@ -879,12 +885,12 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 {
 	int res;
 
-	if (desc->page_index == 0) {
-		desc->current_index = 0;
-		desc->prev_index = 0;
-		desc->last_cookie = 0;
-	}
 	do {
+		if (desc->page_index == 0) {
+			desc->current_index = 0;
+			desc->prev_index = 0;
+			desc->last_cookie = 0;
+		}
 		res = find_and_lock_cache_page(desc);
 	} while (res == -EAGAIN);
 	return res;
@@ -1030,6 +1036,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 				res = uncached_readdir(desc);
 				if (res == 0)
 					continue;
+				if (res == -EBADCOOKIE || res == -ENOTSYNC)
+					res = 0;
 			}
 			break;
 		}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index adcaba68eaed..7ab40d0e6a74 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -184,6 +184,8 @@ static int nfs4_map_errors(int err)
 		return -EPROTONOSUPPORT;
 	case -NFS4ERR_FILE_OPEN:
 		return -EBUSY;
+	case -NFS4ERR_NOT_SAME:
+		return -ENOTSYNC;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
 				__func__, -err);

From b593c09f83a2732a0f0298c8f3468236a83cdd9f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 2 Nov 2020 20:06:12 -0500
Subject: [PATCH 086/230] NFS: Improve handling of directory verifiers

If the server insists on using the readdir verifiers in order to allow
cookies to expire, then we should ensure that we cache the verifier
with the cookie, so that we can return an error if the application
tries to use the expired cookie.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c           | 35 +++++++++++++++++++++++------------
 fs/nfs/inode.c         |  7 -------
 include/linux/nfs_fs.h |  8 +++++++-
 3 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3b44bef3a1b4..454377228167 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -155,6 +155,7 @@ struct nfs_readdir_descriptor {
 	loff_t		current_index;
 	loff_t		prev_index;
 
+	__be32		verf[NFS_DIR_VERIFIER_SIZE];
 	unsigned long	dir_verifier;
 	unsigned long	timestamp;
 	unsigned long	gencount;
@@ -466,15 +467,15 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc)
 
 /* Fill a page with xdr information before transferring to the cache page */
 static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
-				  u64 cookie, struct page **pages,
-				  size_t bufsize)
+				  __be32 *verf, u64 cookie,
+				  struct page **pages, size_t bufsize,
+				  __be32 *verf_res)
 {
 	struct inode *inode = file_inode(desc->file);
-	__be32 verf_res[2];
 	struct nfs_readdir_arg arg = {
 		.dentry = file_dentry(desc->file),
 		.cred = desc->file->f_cred,
-		.verf = NFS_I(inode)->cookieverf,
+		.verf = verf,
 		.cookie = cookie,
 		.pages = pages,
 		.page_len = bufsize,
@@ -503,8 +504,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc,
 	}
 	desc->timestamp = timestamp;
 	desc->gencount = gencount;
-	memcpy(NFS_I(inode)->cookieverf, res.verf,
-	       sizeof(NFS_I(inode)->cookieverf));
 error:
 	return error;
 }
@@ -770,11 +769,13 @@ out_freepages:
 }
 
 static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
-				    struct page *page, struct inode *inode)
+				    struct page *page, __be32 *verf_arg,
+				    __be32 *verf_res)
 {
 	struct page **pages;
 	struct nfs_entry *entry;
 	size_t array_size;
+	struct inode *inode = file_inode(desc->file);
 	size_t dtsize = NFS_SERVER(inode)->dtsize;
 	int status = -ENOMEM;
 
@@ -801,8 +802,9 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 
 	do {
 		unsigned int pglen;
-		status = nfs_readdir_xdr_filler(desc, entry->cookie,
-						pages, dtsize);
+		status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie,
+						pages, dtsize,
+						verf_res);
 		if (status < 0)
 			break;
 
@@ -854,13 +856,15 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 {
 	struct inode *inode = file_inode(desc->file);
 	struct nfs_inode *nfsi = NFS_I(inode);
+	__be32 verf[NFS_DIR_VERIFIER_SIZE];
 	int res;
 
 	desc->page = nfs_readdir_page_get_cached(desc);
 	if (!desc->page)
 		return -ENOMEM;
 	if (nfs_readdir_page_needs_filling(desc->page)) {
-		res = nfs_readdir_xdr_to_array(desc, desc->page, inode);
+		res = nfs_readdir_xdr_to_array(desc, desc->page,
+					       nfsi->cookieverf, verf);
 		if (res < 0) {
 			nfs_readdir_page_unlock_and_put_cached(desc);
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
@@ -870,6 +874,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 			}
 			return res;
 		}
+		memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf));
 	}
 	res = nfs_readdir_search_array(desc);
 	if (res == 0) {
@@ -902,6 +907,7 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
 {
 	struct file	*file = desc->file;
+	struct nfs_inode *nfsi = NFS_I(file_inode(file));
 	struct nfs_cache_array *array;
 	unsigned int i = 0;
 
@@ -915,6 +921,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
 			desc->eof = true;
 			break;
 		}
+		memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf));
 		if (i < (array->size-1))
 			desc->dir_cookie = array->array[i+1].cookie;
 		else
@@ -949,8 +956,8 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
 static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 {
 	struct page	*page = NULL;
+	__be32		verf[NFS_DIR_VERIFIER_SIZE];
 	int		status;
-	struct inode *inode = file_inode(desc->file);
 
 	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
 			(unsigned long long)desc->dir_cookie);
@@ -967,7 +974,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 	desc->duped = 0;
 
 	nfs_readdir_page_init_array(page, desc->dir_cookie);
-	status = nfs_readdir_xdr_to_array(desc, page, inode);
+	status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf);
 	if (status < 0)
 		goto out_release;
 
@@ -1023,6 +1030,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	desc->dup_cookie = dir_ctx->dup_cookie;
 	desc->duped = dir_ctx->duped;
 	desc->attr_gencount = dir_ctx->attr_gencount;
+	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
 	spin_unlock(&file->f_lock);
 
 	do {
@@ -1061,6 +1069,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	dir_ctx->dup_cookie = desc->dup_cookie;
 	dir_ctx->duped = desc->duped;
 	dir_ctx->attr_gencount = desc->attr_gencount;
+	memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
 	spin_unlock(&file->f_lock);
 
 	kfree(desc);
@@ -1101,6 +1110,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 			dir_ctx->dir_cookie = offset;
 		else
 			dir_ctx->dir_cookie = 0;
+		if (offset == 0)
+			memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
 		dir_ctx->duped = 0;
 	}
 	spin_unlock(&filp->f_lock);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index aa6493905bbe..9b765a900b28 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -229,7 +229,6 @@ static void nfs_zap_caches_locked(struct inode *inode)
 	nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 	nfsi->attrtimeo_timestamp = jiffies;
 
-	memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
 		nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
 					| NFS_INO_INVALID_DATA
@@ -1237,7 +1236,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
 
 static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
-	struct nfs_inode *nfsi = NFS_I(inode);
 	int ret;
 
 	if (mapping->nrpages != 0) {
@@ -1250,11 +1248,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 		if (ret < 0)
 			return ret;
 	}
-	if (S_ISDIR(inode->i_mode)) {
-		spin_lock(&inode->i_lock);
-		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
-		spin_unlock(&inode->i_lock);
-	}
 	nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 	nfs_fscache_wait_on_invalidate(inode);
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index dd6b463dda80..681ed98e4ba8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -45,6 +45,11 @@
  */
 #define NFS_RPC_SWAPFLAGS		(RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS)
 
+/*
+ * Size of the NFS directory verifier
+ */
+#define NFS_DIR_VERIFIER_SIZE		2
+
 /*
  * NFSv3/v4 Access mode cache entry
  */
@@ -89,6 +94,7 @@ struct nfs_open_context {
 struct nfs_open_dir_context {
 	struct list_head list;
 	unsigned long attr_gencount;
+	__be32	verf[NFS_DIR_VERIFIER_SIZE];
 	__u64 dir_cookie;
 	__u64 dup_cookie;
 	signed char duped;
@@ -156,7 +162,7 @@ struct nfs_inode {
 	 * This is the cookie verifier used for NFSv3 readdir
 	 * operations
 	 */
-	__be32			cookieverf[2];
+	__be32			cookieverf[NFS_DIR_VERIFIER_SIZE];
 
 	atomic_long_t		nrequests;
 	struct nfs_mds_commit_info commit_info;

From 762567b7c798afd08c22811ecfc66885a2b50f91 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 4 Nov 2020 08:32:19 -0500
Subject: [PATCH 087/230] NFS: Optimisations for monotonically increasing
 readdir cookies

If the server is handing out monotonically increasing readdir cookie values,
then we can optimise away searches through pages that contain cookies that
lie outside our search range.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 454377228167..b6c3501e8f61 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,7 +140,8 @@ struct nfs_cache_array {
 	u64 last_cookie;
 	unsigned int size;
 	unsigned char page_full : 1,
-		      page_is_eof : 1;
+		      page_is_eof : 1,
+		      cookies_are_ordered : 1;
 	struct nfs_cache_array_entry array[];
 };
 
@@ -178,6 +179,7 @@ static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie)
 	array = kmap_atomic(page);
 	nfs_readdir_array_init(array);
 	array->last_cookie = last_cookie;
+	array->cookies_are_ordered = 1;
 	kunmap_atomic(array);
 }
 
@@ -269,6 +271,8 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 	cache_entry->name_len = entry->len;
 	cache_entry->name = name;
 	array->last_cookie = entry->cookie;
+	if (array->last_cookie <= cache_entry->cookie)
+		array->cookies_are_ordered = 0;
 	array->size++;
 	if (entry->eof != 0)
 		nfs_readdir_array_set_eof(array);
@@ -395,6 +399,19 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
 	return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
 }
 
+static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array,
+					      u64 cookie)
+{
+	if (!array->cookies_are_ordered)
+		return true;
+	/* Optimisation for monotonically increasing cookies */
+	if (cookie >= array->last_cookie)
+		return false;
+	if (array->size && cookie < array->array[0].cookie)
+		return false;
+	return true;
+}
+
 static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 					 struct nfs_readdir_descriptor *desc)
 {
@@ -402,6 +419,9 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 	loff_t new_pos;
 	int status = -EAGAIN;
 
+	if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie))
+		goto check_eof;
+
 	for (i = 0; i < array->size; i++) {
 		if (array->array[i].cookie == desc->dir_cookie) {
 			struct nfs_inode *nfsi = NFS_I(file_inode(desc->file));
@@ -435,6 +455,7 @@ static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array,
 			return 0;
 		}
 	}
+check_eof:
 	if (array->page_is_eof) {
 		status = -EBADCOOKIE;
 		if (desc->dir_cookie == array->last_cookie)

From 35df59d3ef693292840a61cdb04b39d8c9412f4e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 6 Nov 2020 20:38:47 -0500
Subject: [PATCH 088/230] NFS: Reduce number of RPC calls when doing uncached
 readdir

If we're doing uncached readdir, allocate multiple pages in order to
try to avoid duplicate RPC calls for the same getdents() call.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 105 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 69 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b6c3501e8f61..93c6f22cdc62 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -199,6 +199,23 @@ void nfs_readdir_clear_array(struct page *page)
 	kunmap_atomic(array);
 }
 
+static struct page *
+nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags)
+{
+	struct page *page = alloc_page(gfp_flags);
+	if (page)
+		nfs_readdir_page_init_array(page, last_cookie);
+	return page;
+}
+
+static void nfs_readdir_page_array_free(struct page *page)
+{
+	if (page) {
+		nfs_readdir_clear_array(page);
+		put_page(page);
+	}
+}
+
 static void nfs_readdir_array_set_eof(struct nfs_cache_array *array)
 {
 	array->page_is_eof = 1;
@@ -694,12 +711,14 @@ out:
 static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 				   struct nfs_entry *entry,
 				   struct page **xdr_pages,
-				   struct page *fillme, unsigned int buflen)
+				   unsigned int buflen,
+				   struct page **arrays,
+				   size_t narrays)
 {
 	struct address_space *mapping = desc->file->f_mapping;
 	struct xdr_stream stream;
 	struct xdr_buf buf;
-	struct page *scratch, *new, *page = fillme;
+	struct page *scratch, *new, *page = *arrays;
 	int status;
 
 	scratch = alloc_page(GFP_KERNEL);
@@ -725,15 +744,25 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 		if (status != -ENOSPC)
 			continue;
 
-		if (page->mapping != mapping)
-			break;
-		new = nfs_readdir_page_get_next(mapping, page->index + 1,
-						entry->prev_cookie);
-		if (!new)
-			break;
-		if (page != fillme)
-			nfs_readdir_page_unlock_and_put(page);
-		page = new;
+		if (page->mapping != mapping) {
+			if (!--narrays)
+				break;
+			new = nfs_readdir_page_array_alloc(entry->prev_cookie,
+							   GFP_KERNEL);
+			if (!new)
+				break;
+			arrays++;
+			*arrays = page = new;
+		} else {
+			new = nfs_readdir_page_get_next(mapping,
+							page->index + 1,
+							entry->prev_cookie);
+			if (!new)
+				break;
+			if (page != *arrays)
+				nfs_readdir_page_unlock_and_put(page);
+			page = new;
+		}
 		status = nfs_readdir_add_to_array(entry, page);
 	} while (!status && !entry->eof);
 
@@ -750,7 +779,7 @@ static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc,
 		break;
 	}
 
-	if (page != fillme)
+	if (page != *arrays)
 		nfs_readdir_page_unlock_and_put(page);
 
 	put_page(scratch);
@@ -790,10 +819,11 @@ out_freepages:
 }
 
 static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
-				    struct page *page, __be32 *verf_arg,
-				    __be32 *verf_res)
+				    __be32 *verf_arg, __be32 *verf_res,
+				    struct page **arrays, size_t narrays)
 {
 	struct page **pages;
+	struct page *page = *arrays;
 	struct nfs_entry *entry;
 	size_t array_size;
 	struct inode *inode = file_inode(desc->file);
@@ -835,7 +865,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
 			break;
 		}
 
-		status = nfs_readdir_page_filler(desc, entry, pages, page, pglen);
+		status = nfs_readdir_page_filler(desc, entry, pages, pglen,
+						 arrays, narrays);
 	} while (!status && nfs_readdir_page_needs_filling(page));
 
 	nfs_readdir_free_pages(pages, array_size);
@@ -884,8 +915,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	if (!desc->page)
 		return -ENOMEM;
 	if (nfs_readdir_page_needs_filling(desc->page)) {
-		res = nfs_readdir_xdr_to_array(desc, desc->page,
-					       nfsi->cookieverf, verf);
+		res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf,
+					       &desc->page, 1);
 		if (res < 0) {
 			nfs_readdir_page_unlock_and_put_cached(desc);
 			if (res == -EBADCOOKIE || res == -ENOTSYNC) {
@@ -976,37 +1007,39 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc)
  */
 static int uncached_readdir(struct nfs_readdir_descriptor *desc)
 {
-	struct page	*page = NULL;
+	struct page	**arrays;
+	size_t		i, sz = 512;
 	__be32		verf[NFS_DIR_VERIFIER_SIZE];
-	int		status;
+	int		status = -ENOMEM;
 
-	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
+	dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n",
 			(unsigned long long)desc->dir_cookie);
 
-	page = alloc_page(GFP_HIGHUSER);
-	if (!page) {
-		status = -ENOMEM;
+	arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL);
+	if (!arrays)
+		goto out;
+	arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL);
+	if (!arrays[0])
 		goto out;
-	}
 
 	desc->page_index = 0;
 	desc->last_cookie = desc->dir_cookie;
-	desc->page = page;
 	desc->duped = 0;
 
-	nfs_readdir_page_init_array(page, desc->dir_cookie);
-	status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf);
-	if (status < 0)
-		goto out_release;
+	status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
 
-	nfs_do_filldir(desc);
+	for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
+		desc->page = arrays[i];
+		nfs_do_filldir(desc);
+	}
+	desc->page = NULL;
 
- out_release:
-	nfs_readdir_clear_array(desc->page);
-	nfs_readdir_page_put(desc);
- out:
-	dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
-			__func__, status);
+
+	for (i = 0; i < sz && arrays[i]; i++)
+		nfs_readdir_page_array_free(arrays[i]);
+out:
+	kfree(arrays);
+	dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
 	return status;
 }
 

From 794092c57f89c2c833da00f82f38a0afcb4033bc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 6 Nov 2020 20:47:05 -0500
Subject: [PATCH 089/230] NFS: Do uncached readdir when we're seeking a cookie
 in an empty page cache

If the directory is changing, causing the page cache to get invalidated
while we are listing the contents, then the NFS client is currently forced
to read in the entire directory contents from scratch, because it needs
to perform a linear search for the readdir cookie. While this is not
an issue for small directories, it does not scale to directories with
millions of entries.
In order to be able to deal with large directories that are changing,
add a heuristic to ensure that if the page cache is empty, and we are
searching for a cookie that is not the zero cookie, we just default to
performing uncached readdir.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Benjamin Coddington <bcodding@redhat.com>
Tested-by: Dave Wysochanski <dwysocha@redhat.com>
---
 fs/nfs/dir.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 93c6f22cdc62..3f70697729d8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -937,11 +937,28 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
 	return res;
 }
 
+static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc)
+{
+	struct address_space *mapping = desc->file->f_mapping;
+	struct inode *dir = file_inode(desc->file);
+	unsigned int dtsize = NFS_SERVER(dir)->dtsize;
+	loff_t size = i_size_read(dir);
+
+	/*
+	 * Default to uncached readdir if the page cache is empty, and
+	 * we're looking for a non-zero cookie in a large directory.
+	 */
+	return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize;
+}
+
 /* Search for desc->dir_cookie from the beginning of the page cache */
 static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc)
 {
 	int res;
 
+	if (nfs_readdir_dont_search_cache(desc))
+		return -EBADCOOKIE;
+
 	do {
 		if (desc->page_index == 0) {
 			desc->current_index = 0;

From d5aa6b22e2258f05317313ecc02efbb988ed6d38 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 6 Nov 2020 16:33:38 -0500
Subject: [PATCH 090/230] SUNRPC: xprt_load_transport() needs to support the
 netid "rdma6"

According to RFC5666, the correct netid for an IPv6 addressed RDMA
transport is "rdma6", which we've supported as a mount option since
Linux-4.7. The problem is when we try to load the module "xprtrdma6",
that will fail, since there is no modulealias of that name.

Fixes: 181342c5ebe8 ("xprtrdma: Add rdma6 option to support NFS/RDMA IPv6")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h     |  1 +
 net/sunrpc/xprt.c               | 65 +++++++++++++++++++++++++--------
 net/sunrpc/xprtrdma/module.c    |  1 +
 net/sunrpc/xprtrdma/transport.c |  1 +
 net/sunrpc/xprtsock.c           |  4 ++
 5 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a603d48d2b2c..3ac5037d1c3d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -330,6 +330,7 @@ struct xprt_class {
 	struct rpc_xprt *	(*setup)(struct xprt_create *);
 	struct module		*owner;
 	char			name[32];
+	const char *		netid[];
 };
 
 /*
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f6c17e75f20e..57f09ea3ef2a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -151,31 +151,64 @@ out:
 }
 EXPORT_SYMBOL_GPL(xprt_unregister_transport);
 
+static void
+xprt_class_release(const struct xprt_class *t)
+{
+	module_put(t->owner);
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid_locked(const char *netid)
+{
+	const struct xprt_class *t;
+	unsigned int i;
+
+	list_for_each_entry(t, &xprt_list, list) {
+		for (i = 0; t->netid[i][0] != '\0'; i++) {
+			if (strcmp(t->netid[i], netid) != 0)
+				continue;
+			if (!try_module_get(t->owner))
+				continue;
+			return t;
+		}
+	}
+	return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_netid(const char *netid)
+{
+	const struct xprt_class *t;
+
+	spin_lock(&xprt_list_lock);
+	t = xprt_class_find_by_netid_locked(netid);
+	if (!t) {
+		spin_unlock(&xprt_list_lock);
+		request_module("rpc%s", netid);
+		spin_lock(&xprt_list_lock);
+		t = xprt_class_find_by_netid_locked(netid);
+	}
+	spin_unlock(&xprt_list_lock);
+	return t;
+}
+
 /**
  * xprt_load_transport - load a transport implementation
- * @transport_name: transport to load
+ * @netid: transport to load
  *
  * Returns:
  * 0:		transport successfully loaded
  * -ENOENT:	transport module not available
  */
-int xprt_load_transport(const char *transport_name)
+int xprt_load_transport(const char *netid)
 {
-	struct xprt_class *t;
-	int result;
+	const struct xprt_class *t;
 
-	result = 0;
-	spin_lock(&xprt_list_lock);
-	list_for_each_entry(t, &xprt_list, list) {
-		if (strcmp(t->name, transport_name) == 0) {
-			spin_unlock(&xprt_list_lock);
-			goto out;
-		}
-	}
-	spin_unlock(&xprt_list_lock);
-	result = request_module("xprt%s", transport_name);
-out:
-	return result;
+	t = xprt_class_find_by_netid(netid);
+	if (!t)
+		return -ENOENT;
+	xprt_class_release(t);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(xprt_load_transport);
 
diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c
index 620327c01302..45c5b41ac8dc 100644
--- a/net/sunrpc/xprtrdma/module.c
+++ b/net/sunrpc/xprtrdma/module.c
@@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport");
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_ALIAS("svcrdma");
 MODULE_ALIAS("xprtrdma");
+MODULE_ALIAS("rpcrdma6");
 
 static void __exit rpc_rdma_cleanup(void)
 {
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8915e42240d3..035060c05fd5 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -768,6 +768,7 @@ static struct xprt_class xprt_rdma = {
 	.owner			= THIS_MODULE,
 	.ident			= XPRT_TRANSPORT_RDMA,
 	.setup			= xprt_setup_rdma,
+	.netid			= { "rdma", "rdma6", "" },
 };
 
 void xprt_rdma_cleanup(void)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 7090bbee0ec5..c93ff70da3f9 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -3059,6 +3059,7 @@ static struct xprt_class	xs_local_transport = {
 	.owner		= THIS_MODULE,
 	.ident		= XPRT_TRANSPORT_LOCAL,
 	.setup		= xs_setup_local,
+	.netid		= { "" },
 };
 
 static struct xprt_class	xs_udp_transport = {
@@ -3067,6 +3068,7 @@ static struct xprt_class	xs_udp_transport = {
 	.owner		= THIS_MODULE,
 	.ident		= XPRT_TRANSPORT_UDP,
 	.setup		= xs_setup_udp,
+	.netid		= { "udp", "udp6", "" },
 };
 
 static struct xprt_class	xs_tcp_transport = {
@@ -3075,6 +3077,7 @@ static struct xprt_class	xs_tcp_transport = {
 	.owner		= THIS_MODULE,
 	.ident		= XPRT_TRANSPORT_TCP,
 	.setup		= xs_setup_tcp,
+	.netid		= { "tcp", "tcp6", "" },
 };
 
 static struct xprt_class	xs_bc_tcp_transport = {
@@ -3083,6 +3086,7 @@ static struct xprt_class	xs_bc_tcp_transport = {
 	.owner		= THIS_MODULE,
 	.ident		= XPRT_TRANSPORT_BC_TCP,
 	.setup		= xs_setup_bc_tcp,
+	.netid		= { "" },
 };
 
 /**

From 9bccd264611b5345d85138dc7fd55bdeb9e6942e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 12:58:22 -0500
Subject: [PATCH 091/230] SUNRPC: Close a race with transport setup and module
 put

After we've looked up the transport module, we need to ensure it can't
go away until we've finished running the transport setup code.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprt.c | 44 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 11 deletions(-)

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 57f09ea3ef2a..bf490d0c98c6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -157,6 +157,32 @@ xprt_class_release(const struct xprt_class *t)
 	module_put(t->owner);
 }
 
+static const struct xprt_class *
+xprt_class_find_by_ident_locked(int ident)
+{
+	const struct xprt_class *t;
+
+	list_for_each_entry(t, &xprt_list, list) {
+		if (t->ident != ident)
+			continue;
+		if (!try_module_get(t->owner))
+			continue;
+		return t;
+	}
+	return NULL;
+}
+
+static const struct xprt_class *
+xprt_class_find_by_ident(int ident)
+{
+	const struct xprt_class *t;
+
+	spin_lock(&xprt_list_lock);
+	t = xprt_class_find_by_ident_locked(ident);
+	spin_unlock(&xprt_list_lock);
+	return t;
+}
+
 static const struct xprt_class *
 xprt_class_find_by_netid_locked(const char *netid)
 {
@@ -1929,21 +1955,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
 struct rpc_xprt *xprt_create_transport(struct xprt_create *args)
 {
 	struct rpc_xprt	*xprt;
-	struct xprt_class *t;
+	const struct xprt_class *t;
 
-	spin_lock(&xprt_list_lock);
-	list_for_each_entry(t, &xprt_list, list) {
-		if (t->ident == args->ident) {
-			spin_unlock(&xprt_list_lock);
-			goto found;
-		}
+	t = xprt_class_find_by_ident(args->ident);
+	if (!t) {
+		dprintk("RPC: transport (%d) not supported\n", args->ident);
+		return ERR_PTR(-EIO);
 	}
-	spin_unlock(&xprt_list_lock);
-	dprintk("RPC: transport (%d) not supported\n", args->ident);
-	return ERR_PTR(-EIO);
 
-found:
 	xprt = t->setup(args);
+	xprt_class_release(t);
+
 	if (IS_ERR(xprt))
 		goto out;
 	if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT)

From 1fc5f13186440973e1aa1d85aa263326756af431 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 09:41:21 -0500
Subject: [PATCH 092/230] SUNRPC: Add a helper to return the transport
 identifier given a netid

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  1 +
 net/sunrpc/xprt.c           | 31 ++++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 3ac5037d1c3d..f7b75c72f80e 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -386,6 +386,7 @@ xprt_disable_swap(struct rpc_xprt *xprt)
 int			xprt_register_transport(struct xprt_class *type);
 int			xprt_unregister_transport(struct xprt_class *type);
 int			xprt_load_transport(const char *);
+int			xprt_find_transport_ident(const char *);
 void			xprt_wait_for_reply_request_def(struct rpc_task *task);
 void			xprt_wait_for_reply_request_rtt(struct rpc_task *task);
 void			xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index bf490d0c98c6..23452f57d369 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -218,6 +218,28 @@ xprt_class_find_by_netid(const char *netid)
 	return t;
 }
 
+/**
+ * xprt_find_transport_ident - convert a netid into a transport identifier
+ * @netid: transport to load
+ *
+ * Returns:
+ * > 0:		transport identifier
+ * -ENOENT:	transport module not available
+ */
+int xprt_find_transport_ident(const char *netid)
+{
+	const struct xprt_class *t;
+	int ret;
+
+	t = xprt_class_find_by_netid(netid);
+	if (!t)
+		return -ENOENT;
+	ret = t->ident;
+	xprt_class_release(t);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xprt_find_transport_ident);
+
 /**
  * xprt_load_transport - load a transport implementation
  * @netid: transport to load
@@ -228,13 +250,8 @@ xprt_class_find_by_netid(const char *netid)
  */
 int xprt_load_transport(const char *netid)
 {
-	const struct xprt_class *t;
-
-	t = xprt_class_find_by_netid(netid);
-	if (!t)
-		return -ENOENT;
-	xprt_class_release(t);
-	return 0;
+	int ret = xprt_find_transport_ident(netid);
+	return ret < 0 ? ret : 0;
 }
 EXPORT_SYMBOL_GPL(xprt_load_transport);
 

From 1c3695d0bb3869fedcde4538d831003519576ece Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 10:30:35 -0500
Subject: [PATCH 093/230] NFS: Switch mount code to use
 xprt_find_transport_ident()

Switch the mount code to use xprt_find_transport_ident() and to check
the results before allowing the mount to proceed.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/fs_context.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 29ec8b09a52d..06894bcdea2d 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -510,13 +510,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 		ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP;
 		break;
 	case Opt_tcp:
-		ctx->flags |= NFS_MOUNT_TCP;
-		ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-		break;
 	case Opt_rdma:
 		ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */
-		ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
-		xprt_load_transport(param->key);
+		ret = xprt_find_transport_ident(param->key);
+		if (ret < 0)
+			goto out_bad_transport;
+		ctx->nfs_server.protocol = ret;
 		break;
 	case Opt_acl:
 		if (result.negated)
@@ -670,11 +669,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 		case Opt_xprt_rdma:
 			/* vector side protocols to TCP */
 			ctx->flags |= NFS_MOUNT_TCP;
-			ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
-			xprt_load_transport(param->string);
+			ret = xprt_find_transport_ident(param->string);
+			if (ret < 0)
+				goto out_bad_transport;
+			ctx->nfs_server.protocol = ret;
 			break;
 		default:
-			return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
+			goto out_bad_transport;
 		}
 
 		ctx->protofamily = protofamily;
@@ -697,7 +698,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 			break;
 		case Opt_xprt_rdma: /* not used for side protocols */
 		default:
-			return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
+			goto out_bad_transport;
 		}
 		ctx->mountfamily = mountfamily;
 		break;
@@ -787,6 +788,8 @@ out_invalid_address:
 	return nfs_invalf(fc, "NFS: Bad IP address specified");
 out_of_bounds:
 	return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key);
+out_bad_transport:
+	return nfs_invalf(fc, "NFS: Unrecognized transport protocol");
 }
 
 /*

From c87b056e58e71ba7a3f603700618f8da9742aa29 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 10:32:14 -0500
Subject: [PATCH 094/230] SUNRPC: Remove unused function xprt_load_transport()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xprt.h |  1 -
 net/sunrpc/xprt.c           | 15 ---------------
 2 files changed, 16 deletions(-)

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index f7b75c72f80e..d2e97ee802af 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -385,7 +385,6 @@ xprt_disable_swap(struct rpc_xprt *xprt)
  */
 int			xprt_register_transport(struct xprt_class *type);
 int			xprt_unregister_transport(struct xprt_class *type);
-int			xprt_load_transport(const char *);
 int			xprt_find_transport_ident(const char *);
 void			xprt_wait_for_reply_request_def(struct rpc_task *task);
 void			xprt_wait_for_reply_request_rtt(struct rpc_task *task);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 23452f57d369..691ccf8049a4 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -240,21 +240,6 @@ int xprt_find_transport_ident(const char *netid)
 }
 EXPORT_SYMBOL_GPL(xprt_find_transport_ident);
 
-/**
- * xprt_load_transport - load a transport implementation
- * @netid: transport to load
- *
- * Returns:
- * 0:		transport successfully loaded
- * -ENOENT:	transport module not available
- */
-int xprt_load_transport(const char *netid)
-{
-	int ret = xprt_find_transport_ident(netid);
-	return ret < 0 ? ret : 0;
-}
-EXPORT_SYMBOL_GPL(xprt_load_transport);
-
 static void xprt_clear_locked(struct rpc_xprt *xprt)
 {
 	xprt->snd_task = NULL;

From a12f996d3413ae41b6c0952013cd7a11396e14eb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 6 Nov 2020 16:58:53 -0500
Subject: [PATCH 095/230] NFSv4/pNFS: Use connections to a DS that are all of
 the same protocol family

If the pNFS metadata server advertises multiple addresses for the same
data server, we should try to connect to just one protocol family and
transport type on the assumption that homogeneity will improve performance.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pnfs_nfs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 679767ac258d..ee6d003db844 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -860,6 +860,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
 				.addrlen = da->da_addrlen,
 				.servername = clp->cl_hostname,
 			};
+
+			if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+				continue;
 			/* Add this address as an alias */
 			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
 					rpc_clnt_test_and_add_xprt, NULL);
@@ -913,17 +916,19 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 			};
 			struct nfs4_add_xprt_data xprtdata = {
 				.clp = clp,
-				.cred = nfs4_get_clid_cred(clp),
 			};
 			struct rpc_add_xprt_test rpcdata = {
 				.add_xprt_test = clp->cl_mvops->session_trunk,
 				.data = &xprtdata,
 			};
 
+			if (da->da_addr.ss_family != clp->cl_addr.ss_family)
+				continue;
 			/**
 			* Test this address for session trunking and
 			* add as an alias
 			*/
+			xprtdata.cred = nfs4_get_clid_cred(clp),
 			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
 					  rpc_clnt_setup_test_and_add_xprt,
 					  &rpcdata);

From 190c75a31fe65e311f3628bd97bd58cff50d221f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 09:21:48 -0500
Subject: [PATCH 096/230] pNFS: Add helpers for allocation/free of struct
 nfs4_pnfs_ds_addr

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pnfs_nfs.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index ee6d003db844..c3c04a763985 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -661,6 +661,20 @@ _data_server_lookup_locked(const struct list_head *dsaddrs)
 	return NULL;
 }
 
+static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags)
+{
+	struct nfs4_pnfs_ds_addr *da = kzalloc(sizeof(*da), gfp_flags);
+	if (da)
+		INIT_LIST_HEAD(&da->da_node);
+	return da;
+}
+
+static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da)
+{
+	kfree(da->da_remotestr);
+	kfree(da);
+}
+
 static void destroy_ds(struct nfs4_pnfs_ds *ds)
 {
 	struct nfs4_pnfs_ds_addr *da;
@@ -676,8 +690,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
 				      struct nfs4_pnfs_ds_addr,
 				      da_node);
 		list_del_init(&da->da_node);
-		kfree(da->da_remotestr);
-		kfree(da);
+		nfs4_pnfs_ds_addr_free(da);
 	}
 
 	kfree(ds->ds_remotestr);
@@ -1094,12 +1107,10 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 	}
 	*portstr = '\0';
 
-	da = kzalloc(sizeof(*da), gfp_flags);
+	da = nfs4_pnfs_ds_addr_alloc(gfp_flags);
 	if (unlikely(!da))
 		goto out_free_buf;
 
-	INIT_LIST_HEAD(&da->da_node);
-
 	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
 		      sizeof(da->da_addr))) {
 		dprintk("%s: error parsing address %s\n", __func__, buf);

From 4be78d26810bc506769773ca2f81b4de65f43fd5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 6 Nov 2020 16:10:52 -0500
Subject: [PATCH 097/230] NFSv4/pNFS: Store the transport type in struct
 nfs4_pnfs_ds_addr

We want to enable RDMA and UDP as valid transport methods if a
GETDEVICEINFO call specifies it. Do so by adding a parser for the
netid that translates it to an appropriate argument for the RPC
transport layer.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pnfs.h     |  2 ++
 fs/nfs/pnfs_nfs.c | 34 +++++++++++++++++++---------------
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 2661c44c62db..f618c49697bb 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -51,6 +51,8 @@ struct nfs4_pnfs_ds_addr {
 	size_t			da_addrlen;
 	struct list_head	da_node;  /* nfs4_pnfs_dev_hlist dev_dslist */
 	char			*da_remotestr;	/* human readable addr+port */
+	const char		*da_netid;
+	int			da_transport;
 };
 
 struct nfs4_pnfs_ds {
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index c3c04a763985..7a97643acf3f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -672,6 +672,7 @@ static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags)
 static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da)
 {
 	kfree(da->da_remotestr);
+	kfree(da->da_netid);
 	kfree(da);
 }
 
@@ -867,13 +868,15 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
 
 		if (!IS_ERR(clp)) {
 			struct xprt_create xprt_args = {
-				.ident = XPRT_TRANSPORT_TCP,
+				.ident = da->da_transport,
 				.net = clp->cl_net,
 				.dstaddr = (struct sockaddr *)&da->da_addr,
 				.addrlen = da->da_addrlen,
 				.servername = clp->cl_hostname,
 			};
 
+			if (da->da_transport != clp->cl_proto)
+				continue;
 			if (da->da_addr.ss_family != clp->cl_addr.ss_family)
 				continue;
 			/* Add this address as an alias */
@@ -883,7 +886,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
 		}
 		clp = get_v3_ds_connect(mds_srv,
 				(struct sockaddr *)&da->da_addr,
-				da->da_addrlen, IPPROTO_TCP,
+				da->da_addrlen, da->da_transport,
 				timeo, retrans);
 		if (IS_ERR(clp))
 			continue;
@@ -921,7 +924,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 
 		if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
 			struct xprt_create xprt_args = {
-				.ident = XPRT_TRANSPORT_TCP,
+				.ident = da->da_transport,
 				.net = clp->cl_net,
 				.dstaddr = (struct sockaddr *)&da->da_addr,
 				.addrlen = da->da_addrlen,
@@ -935,6 +938,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 				.data = &xprtdata,
 			};
 
+			if (da->da_transport != clp->cl_proto)
+				continue;
 			if (da->da_addr.ss_family != clp->cl_addr.ss_family)
 				continue;
 			/**
@@ -950,8 +955,9 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 		} else {
 			clp = nfs4_set_ds_client(mds_srv,
 						(struct sockaddr *)&da->da_addr,
-						da->da_addrlen, IPPROTO_TCP,
-						timeo, retrans, minor_version);
+						da->da_addrlen,
+						da->da_transport, timeo,
+						retrans, minor_version);
 			if (IS_ERR(clp))
 				continue;
 
@@ -1042,8 +1048,8 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 	int nlen, rlen;
 	int tmp[2];
 	__be32 *p;
-	char *netid, *match_netid;
-	size_t len, match_netid_len;
+	char *netid;
+	size_t len;
 	char *startsep = "";
 	char *endsep = "";
 
@@ -1125,15 +1131,11 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 	case AF_INET:
 		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
 		da->da_addrlen = sizeof(struct sockaddr_in);
-		match_netid = "tcp";
-		match_netid_len = 3;
 		break;
 
 	case AF_INET6:
 		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
 		da->da_addrlen = sizeof(struct sockaddr_in6);
-		match_netid = "tcp6";
-		match_netid_len = 4;
 		startsep = "[";
 		endsep = "]";
 		break;
@@ -1144,12 +1146,15 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 		goto out_free_da;
 	}
 
-	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
-		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
-			__func__, netid, match_netid);
+	da->da_transport = xprt_find_transport_ident(netid);
+	if (da->da_transport < 0) {
+		dprintk("%s: ERROR: unknown r_netid \"%s\"\n",
+			__func__, netid);
 		goto out_free_da;
 	}
 
+	da->da_netid = netid;
+
 	/* save human readable address */
 	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
 	da->da_remotestr = kzalloc(len, gfp_flags);
@@ -1161,7 +1166,6 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 
 	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
 	kfree(buf);
-	kfree(netid);
 	return da;
 
 out_free_da:

From 9a7016319e1e7c6348a960931182f5f71b95f24e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 9 Nov 2020 15:42:03 -0500
Subject: [PATCH 098/230] pNFS/flexfiles: Fix up layoutstats reporting for
 non-TCP transports

Ensure that we report the correct netid when using UDP or RDMA
transports to the DSes.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 24bf5797f88a..b7c26836b2cb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2284,7 +2284,6 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
 	struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
 	char portbuf[RPCBIND_MAXUADDRPLEN];
 	char addrbuf[RPCBIND_MAXUADDRLEN];
-	char *netid;
 	unsigned short port;
 	int len, netid_len;
 	__be32 *p;
@@ -2294,18 +2293,13 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
 		if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
 			return;
 		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
-		netid = "tcp";
-		netid_len = 3;
 		break;
 	case AF_INET6:
 		if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
 			return;
 		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
-		netid = "tcp6";
-		netid_len = 4;
 		break;
 	default:
-		/* we only support tcp and tcp6 */
 		WARN_ON_ONCE(1);
 		return;
 	}
@@ -2313,8 +2307,9 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
 	snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
 	len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
 
+	netid_len = strlen(da->da_netid);
 	p = xdr_reserve_space(xdr, 4 + netid_len);
-	xdr_encode_opaque(p, netid, netid_len);
+	xdr_encode_opaque(p, da->da_netid, netid_len);
 
 	p = xdr_reserve_space(xdr, 4 + len);
 	xdr_encode_opaque(p, addrbuf, len);

From 4aceaaea5eccd32bc40c6c76b262489b2f53ca8d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 10 Nov 2020 10:56:53 -0500
Subject: [PATCH 099/230] SUNRPC: Fix up open coded kmemdup_nul()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 71e03b930b70..b1cda6d85ded 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1942,10 +1942,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
 
 	ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
 	if (ret > 0) {
-		char *s = kmalloc(ret + 1, gfp_flags);
+		char *s = kmemdup_nul(p, ret, gfp_flags);
 		if (s != NULL) {
-			memcpy(s, p, ret);
-			s[ret] = '\0';
 			*str = s;
 			return strlen(s);
 		}

From 988998134996a397a47cf758627def5f20dc1e88 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 9 Nov 2020 16:06:15 -0500
Subject: [PATCH 100/230] pNFS: Clean up open coded xdr string decoding

Use the existing xdr_stream_decode_string_dup() to safely decode into
kmalloced strings.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/pnfs_nfs.c | 43 +++++++------------------------------------
 1 file changed, 7 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7a97643acf3f..2efcfdd348a1 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -1045,9 +1045,8 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 	struct nfs4_pnfs_ds_addr *da = NULL;
 	char *buf, *portstr;
 	__be16 port;
-	int nlen, rlen;
+	ssize_t nlen, rlen;
 	int tmp[2];
-	__be32 *p;
 	char *netid;
 	size_t len;
 	char *startsep = "";
@@ -1055,45 +1054,17 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
 
 
 	/* r_netid */
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
+	nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ,
+					    gfp_flags);
+	if (unlikely(nlen < 0))
 		goto out_err;
-	nlen = be32_to_cpup(p++);
-
-	p = xdr_inline_decode(xdr, nlen);
-	if (unlikely(!p))
-		goto out_err;
-
-	netid = kmalloc(nlen+1, gfp_flags);
-	if (unlikely(!netid))
-		goto out_err;
-
-	netid[nlen] = '\0';
-	memcpy(netid, p, nlen);
 
 	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
-		goto out_free_netid;
-	rlen = be32_to_cpup(p);
-
-	p = xdr_inline_decode(xdr, rlen);
-	if (unlikely(!p))
-		goto out_free_netid;
-
 	/* port is ".ABC.DEF", 8 chars max */
-	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
-		dprintk("%s: Invalid address, length %d\n", __func__,
-			rlen);
+	rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN +
+					    IPV6_SCOPE_ID_LEN + 8, gfp_flags);
+	if (unlikely(rlen < 0))
 		goto out_free_netid;
-	}
-	buf = kmalloc(rlen + 1, gfp_flags);
-	if (!buf) {
-		dprintk("%s: Not enough memory\n", __func__);
-		goto out_free_netid;
-	}
-	buf[rlen] = '\0';
-	memcpy(buf, p, rlen);
 
 	/* replace port '.' with '-' */
 	portstr = strrchr(buf, '.');

From 046e5ccb4198b990190e11fb52fd9cfd264402eb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 13 Nov 2020 21:42:16 -0500
Subject: [PATCH 101/230] NFSv4: Fix the alignment of page data in the
 getdeviceinfo reply

We can fit the device_addr4 opaque data padding in the pages.

Fixes: cf500bac8fd4 ("SUNRPC: Introduce rpc_prepare_reply_pages()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4xdr.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c6dbfcae7517..c16b93df1bc1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3009,15 +3009,19 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
+	uint32_t replen;
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
+
+	replen = hdr.replen + op_decode_hdr_maxsz;
+
 	encode_getdeviceinfo(xdr, args, &hdr);
 
-	/* set up reply kvec. Subtract notification bitmap max size (2)
-	 * so that notification bitmap is put in xdr_buf tail */
+	/* set up reply kvec. device_addr4 opaque data is read into the
+	 * pages */
 	rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
-				args->pdev->pglen, hdr.replen - 2);
+				args->pdev->pglen, replen + 2 + 1);
 	encode_nops(&hdr);
 }
 

From 2b1f83d108bd35d12d8a833298d2a033f9121aac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 14:01:50 -0500
Subject: [PATCH 102/230] SUNRPC: Fix up typo in xdr_init_decode()

We already know that the head buffer and page are empty, so if there is
any data, it is in the tail.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index b1cda6d85ded..bc7a622016ee 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1060,7 +1060,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
 	else if (buf->page_len != 0)
 		xdr_set_page_base(xdr, 0, buf->len);
 	else
-		xdr_set_iov(xdr, buf->head, buf->len);
+		xdr_set_iov(xdr, buf->tail, buf->len);
 	if (p != NULL && p > xdr->p && xdr->end >= p) {
 		xdr->nwords -= p - xdr->p;
 		xdr->p = p;

From 8d86e373b0ef52d091ced9583ffbb33ad2771576 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 14:50:43 -0500
Subject: [PATCH 103/230] SUNRPC: Clean up helpers xdr_set_iov() and
 xdr_set_page_base()

Allow xdr_set_iov() to set a base so that we can use it to set the
cursor to a specific position in the kvec buffer.

If the new base overflows the kvec/pages buffer in either xdr_set_iov()
or xdr_set_page_base(), then truncate it so that we point to the end of
the buffer.

Finally, change both function to return the number of bytes remaining to
read in their buffers.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index bc7a622016ee..394297ec1cb9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -970,19 +970,22 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
 }
 EXPORT_SYMBOL_GPL(xdr_write_pages);
 
-static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
-		unsigned int len)
+static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
+				unsigned int base, unsigned int len)
 {
 	if (len > iov->iov_len)
 		len = iov->iov_len;
-	xdr->p = (__be32*)iov->iov_base;
+	if (unlikely(base > len))
+		base = len;
+	xdr->p = (__be32*)(iov->iov_base + base);
 	xdr->end = (__be32*)(iov->iov_base + len);
 	xdr->iov = iov;
 	xdr->page_ptr = NULL;
+	return len - base;
 }
 
-static int xdr_set_page_base(struct xdr_stream *xdr,
-		unsigned int base, unsigned int len)
+static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
+				      unsigned int base, unsigned int len)
 {
 	unsigned int pgnr;
 	unsigned int maxlen;
@@ -991,9 +994,11 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
 	void *kaddr;
 
 	maxlen = xdr->buf->page_len;
-	if (base >= maxlen)
-		return -EINVAL;
-	maxlen -= base;
+	if (base >= maxlen) {
+		base = maxlen;
+		maxlen = 0;
+	} else
+		maxlen -= base;
 	if (len > maxlen)
 		len = maxlen;
 
@@ -1011,14 +1016,14 @@ static int xdr_set_page_base(struct xdr_stream *xdr,
 		pgend = PAGE_SIZE;
 	xdr->end = (__be32*)(kaddr + pgend);
 	xdr->iov = NULL;
-	return 0;
+	return len;
 }
 
 static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
 			 unsigned int len)
 {
-	if (xdr_set_page_base(xdr, base, len) < 0)
-		xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
+	if (xdr_set_page_base(xdr, base, len) == 0)
+		xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr));
 }
 
 static void xdr_set_next_page(struct xdr_stream *xdr)
@@ -1055,12 +1060,9 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
 	xdr->scratch.iov_base = NULL;
 	xdr->scratch.iov_len = 0;
 	xdr->nwords = XDR_QUADLEN(buf->len);
-	if (buf->head[0].iov_len != 0)
-		xdr_set_iov(xdr, buf->head, buf->len);
-	else if (buf->page_len != 0)
-		xdr_set_page_base(xdr, 0, buf->len);
-	else
-		xdr_set_iov(xdr, buf->tail, buf->len);
+	if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 &&
+	    xdr_set_page_base(xdr, 0, buf->len) == 0)
+		xdr_set_iov(xdr, buf->tail, 0, buf->len);
 	if (p != NULL && p > xdr->p && xdr->end >= p) {
 		xdr->nwords -= p - xdr->p;
 		xdr->p = p;

From 1d97316692f708de755655ac1cfd704d7a55843f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 20 Nov 2020 16:31:03 -0500
Subject: [PATCH 104/230] SUNRPC: Fix up xdr_read_pages() to take arbitrary
 object lengths

Fix up xdr_read_pages() so that it can handle object lengths that are
larger than the page length, by simply aligning to the next object in
the buffer tail.
The function will continue to return the length of the truncate object
data that actually fit into the pages.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 394297ec1cb9..3ce0a5daa9eb 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1219,44 +1219,33 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 }
 
 /**
- * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
+ * xdr_read_pages - align page-based XDR data to current pointer position
  * @xdr: pointer to xdr_stream struct
  * @len: number of bytes of page data
  *
  * Moves data beyond the current pointer position from the XDR head[] buffer
- * into the page list. Any data that lies beyond current position + "len"
- * bytes is moved into the XDR tail[].
+ * into the page list. Any data that lies beyond current position + @len
+ * bytes is moved into the XDR tail[]. The xdr_stream current position is
+ * then advanced past that data to align to the next XDR object in the tail.
  *
  * Returns the number of XDR encoded bytes now contained in the pages
  */
 unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 {
-	struct xdr_buf *buf = xdr->buf;
-	struct kvec *iov;
-	unsigned int nwords;
-	unsigned int end;
-	unsigned int padding;
+	unsigned int nwords = XDR_QUADLEN(len);
+	unsigned int base, end, pglen;
 
-	len = xdr_align_pages(xdr, len);
-	if (len == 0)
+	pglen = xdr_align_pages(xdr, nwords << 2);
+	if (pglen == 0)
 		return 0;
-	nwords = XDR_QUADLEN(len);
-	padding = (nwords << 2) - len;
-	xdr->iov = iov = buf->tail;
-	/* Compute remaining message length.  */
-	end = ((xdr->nwords - nwords) << 2) + padding;
-	if (end > iov->iov_len)
-		end = iov->iov_len;
 
-	/*
-	 * Position current pointer at beginning of tail, and
-	 * set remaining message length.
-	 */
-	xdr->p = (__be32 *)((char *)iov->iov_base + padding);
-	xdr->end = (__be32 *)((char *)iov->iov_base + end);
-	xdr->page_ptr = NULL;
-	xdr->nwords = XDR_QUADLEN(end - padding);
-	return len;
+	xdr->nwords -= nwords;
+	base = (nwords << 2) - pglen;
+	end = xdr_stream_remaining(xdr) - pglen;
+
+	if (xdr_set_iov(xdr, xdr->buf->tail, base, end) == 0)
+		xdr->nwords = 0;
+	return len <= pglen ? len : pglen;
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
 

From 9ed5af268e88f6e5b65376be98d652b37cb20d7b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 20:46:18 -0500
Subject: [PATCH 105/230] SUNRPC: Clean up the handling of page padding in
 rpc_prepare_reply_pages()

rpc_prepare_reply_pages() currently expects the 'hdrsize' argument to
contain the length of the data that we expect to want placed in the head
kvec plus a count of 1 word of padding that is placed after the page data.
This is very confusing when trying to read the code, and sometimes leads
to callers adding an arbitrary value of '1' just in order to satisfy the
requirement (whether or not the page data actually needs such padding).

This patch aims to clarify the code by changing the 'hdrsize' argument
to remove that 1 word of padding. This means we need to subtract the
padding from all the existing callers.

Fixes: 02ef04e432ba ("NFS: Account for XDR pad of buf->pages")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs2xdr.c  | 19 ++++++++++---------
 fs/nfs/nfs3xdr.c  | 29 ++++++++++++++++-------------
 fs/nfs/nfs4xdr.c  | 36 +++++++++++++++++++-----------------
 net/sunrpc/clnt.c |  5 +----
 net/sunrpc/xdr.c  |  3 ---
 5 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f6676af37d5d..7fba7711e6b3 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -34,6 +34,7 @@
  * Declare the space requirements for NFS arguments and replies as
  * number of 32bit-words
  */
+#define NFS_pagepad_sz		(1) /* Page padding */
 #define NFS_fhandle_sz		(8)
 #define NFS_sattr_sz		(8)
 #define NFS_filename_sz		(1+(NFS2_MAXNAMLEN>>2))
@@ -56,11 +57,11 @@
 
 #define NFS_attrstat_sz		(1+NFS_fattr_sz)
 #define NFS_diropres_sz		(1+NFS_fhandle_sz+NFS_fattr_sz)
-#define NFS_readlinkres_sz	(2+1)
-#define NFS_readres_sz		(1+NFS_fattr_sz+1+1)
+#define NFS_readlinkres_sz	(2+NFS_pagepad_sz)
+#define NFS_readres_sz		(1+NFS_fattr_sz+1+NFS_pagepad_sz)
 #define NFS_writeres_sz         (NFS_attrstat_sz)
 #define NFS_stat_sz		(1)
-#define NFS_readdirres_sz	(1+1)
+#define NFS_readdirres_sz	(1+NFS_pagepad_sz)
 #define NFS_statfsres_sz	(1+NFS_info_sz)
 
 static int nfs_stat_to_errno(enum nfs_stat);
@@ -592,8 +593,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
 	const struct nfs_readlinkargs *args = data;
 
 	encode_fhandle(xdr, args->fh);
-	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->pglen, NFS_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+				NFS_readlinkres_sz - NFS_pagepad_sz);
 }
 
 /*
@@ -628,8 +629,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
 	const struct nfs_pgio_args *args = data;
 
 	encode_readargs(xdr, args);
-	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->count, NFS_readres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count,
+				NFS_readres_sz - NFS_pagepad_sz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 }
 
@@ -786,8 +787,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req,
 	const struct nfs_readdirargs *args = data;
 
 	encode_readdirargs(xdr, args);
-	rpc_prepare_reply_pages(req, args->pages, 0,
-				args->count, NFS_readdirres_sz);
+	rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+				NFS_readdirres_sz - NFS_pagepad_sz);
 }
 
 /*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 69971f6c840d..ca10072644ff 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -33,6 +33,7 @@
  * Declare the space requirements for NFS arguments and replies as
  * number of 32bit-words
  */
+#define NFS3_pagepad_sz		(1) /* Page padding */
 #define NFS3_fhandle_sz		(1+16)
 #define NFS3_fh_sz		(NFS3_fhandle_sz)	/* shorthand */
 #define NFS3_sattr_sz		(15)
@@ -69,13 +70,13 @@
 #define NFS3_removeres_sz	(NFS3_setattrres_sz)
 #define NFS3_lookupres_sz	(1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz))
 #define NFS3_accessres_sz	(1+NFS3_post_op_attr_sz+1)
-#define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1+1)
-#define NFS3_readres_sz		(1+NFS3_post_op_attr_sz+3+1)
+#define NFS3_readlinkres_sz	(1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz)
+#define NFS3_readres_sz		(1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz)
 #define NFS3_writeres_sz	(1+NFS3_wcc_data_sz+4)
 #define NFS3_createres_sz	(1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
 #define NFS3_renameres_sz	(1+(2 * NFS3_wcc_data_sz))
 #define NFS3_linkres_sz		(1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz)
-#define NFS3_readdirres_sz	(1+NFS3_post_op_attr_sz+2+1)
+#define NFS3_readdirres_sz	(1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz)
 #define NFS3_fsstatres_sz	(1+NFS3_post_op_attr_sz+13)
 #define NFS3_fsinfores_sz	(1+NFS3_post_op_attr_sz+12)
 #define NFS3_pathconfres_sz	(1+NFS3_post_op_attr_sz+6)
@@ -85,7 +86,8 @@
 #define ACL3_setaclargs_sz	(NFS3_fh_sz+1+ \
 				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+ \
-				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1)
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\
+				NFS3_pagepad_sz)
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
 static int nfs3_stat_to_errno(enum nfs_stat);
@@ -909,8 +911,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
 	const struct nfs3_readlinkargs *args = data;
 
 	encode_nfs_fh3(xdr, args->fh);
-	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->pglen, NFS3_readlinkres_sz);
+	rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen,
+				NFS3_readlinkres_sz - NFS3_pagepad_sz);
 }
 
 /*
@@ -939,7 +941,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
 				   const void *data)
 {
 	const struct nfs_pgio_args *args = data;
-	unsigned int replen = args->replen ? args->replen : NFS3_readres_sz;
+	unsigned int replen = args->replen ? args->replen :
+					     NFS3_readres_sz - NFS3_pagepad_sz;
 
 	encode_read3args(xdr, args);
 	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
@@ -1239,8 +1242,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req,
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdir3args(xdr, args);
-	rpc_prepare_reply_pages(req, args->pages, 0,
-				args->count, NFS3_readdirres_sz);
+	rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+				NFS3_readdirres_sz - NFS3_pagepad_sz);
 }
 
 /*
@@ -1281,8 +1284,8 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req,
 	const struct nfs3_readdirargs *args = data;
 
 	encode_readdirplus3args(xdr, args);
-	rpc_prepare_reply_pages(req, args->pages, 0,
-				args->count, NFS3_readdirres_sz);
+	rpc_prepare_reply_pages(req, args->pages, 0, args->count,
+				NFS3_readdirres_sz - NFS3_pagepad_sz);
 }
 
 /*
@@ -1328,7 +1331,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req,
 	if (args->mask & (NFS_ACL | NFS_DFACL)) {
 		rpc_prepare_reply_pages(req, args->pages, 0,
 					NFSACL_MAXPAGES << PAGE_SHIFT,
-					ACL3_getaclres_sz);
+					ACL3_getaclres_sz - NFS3_pagepad_sz);
 		req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
 	}
 }
@@ -1648,7 +1651,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
 	result->op_status = status;
 	if (status != NFS3_OK)
 		goto out_status;
-	result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2);
+	result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2);
 	error = decode_read3resok(xdr, result);
 out:
 	return error;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c16b93df1bc1..3899ef3047f4 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -84,6 +84,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 /* lock,open owner id:
  * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
  */
+#define pagepad_maxsz		(1)
 #define open_owner_id_maxsz	(1 + 2 + 1 + 1 + 2)
 #define lock_owner_id_maxsz	(1 + 1 + 4)
 #define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
@@ -215,14 +216,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 				 nfs4_fattr_bitmap_maxsz)
 #define encode_read_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 3)
-#define decode_read_maxsz	(op_decode_hdr_maxsz + 2 + 1)
+#define decode_read_maxsz	(op_decode_hdr_maxsz + 2 + pagepad_maxsz)
 #define encode_readdir_maxsz	(op_encode_hdr_maxsz + \
 				 2 + encode_verifier_maxsz + 5 + \
 				nfs4_label_maxsz)
 #define decode_readdir_maxsz	(op_decode_hdr_maxsz + \
-				 decode_verifier_maxsz + 1)
+				 decode_verifier_maxsz + pagepad_maxsz)
 #define encode_readlink_maxsz	(op_encode_hdr_maxsz)
-#define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1 + 1)
+#define decode_readlink_maxsz	(op_decode_hdr_maxsz + 1 + pagepad_maxsz)
 #define encode_write_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 4)
 #define decode_write_maxsz	(op_decode_hdr_maxsz + \
@@ -284,14 +285,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 #define decode_delegreturn_maxsz (op_decode_hdr_maxsz)
 #define encode_getacl_maxsz	(encode_getattr_maxsz)
 #define decode_getacl_maxsz	(op_decode_hdr_maxsz + \
-				 nfs4_fattr_bitmap_maxsz + 1 + 1)
+				 nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz)
 #define encode_setacl_maxsz	(op_encode_hdr_maxsz + \
 				 encode_stateid_maxsz + 3)
 #define decode_setacl_maxsz	(decode_setattr_maxsz)
 #define encode_fs_locations_maxsz \
 				(encode_getattr_maxsz)
 #define decode_fs_locations_maxsz \
-				(1)
+				(pagepad_maxsz)
 #define encode_secinfo_maxsz	(op_encode_hdr_maxsz + nfs4_name_maxsz)
 #define decode_secinfo_maxsz	(op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4))
 
@@ -393,12 +394,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 				  /* devaddr4 payload is read into page */ \
 				1 /* notification bitmap length */ + \
 				1 /* notification bitmap, word 0 */ + \
-				1 /* possible XDR padding */)
+				pagepad_maxsz /* possible XDR padding */)
 #define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
 				encode_stateid_maxsz)
 #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
 				decode_stateid_maxsz + \
-				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1)
+				XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
+				pagepad_maxsz)
 #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz +          \
 				2 /* offset */ + \
 				2 /* length */ + \
@@ -2342,7 +2344,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 		encode_layoutget(xdr, args->lg_args, &hdr);
 		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
 					args->lg_args->layout.pglen,
-					hdr.replen);
+					hdr.replen - pagepad_maxsz);
 	}
 	encode_nops(&hdr);
 }
@@ -2388,7 +2390,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req,
 		encode_layoutget(xdr, args->lg_args, &hdr);
 		rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0,
 					args->lg_args->layout.pglen,
-					hdr.replen);
+					hdr.replen - pagepad_maxsz);
 	}
 	encode_nops(&hdr);
 }
@@ -2499,7 +2501,7 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_readlink(xdr, args, req, &hdr);
 
 	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->pglen, hdr.replen);
+				args->pglen, hdr.replen - pagepad_maxsz);
 	encode_nops(&hdr);
 }
 
@@ -2520,7 +2522,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_readdir(xdr, args, req, &hdr);
 
 	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->count, hdr.replen);
+				args->count, hdr.replen - pagepad_maxsz);
 	encode_nops(&hdr);
 }
 
@@ -2541,7 +2543,7 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_read(xdr, args, &hdr);
 
 	rpc_prepare_reply_pages(req, args->pages, args->pgbase,
-				args->count, hdr.replen);
+				args->count, hdr.replen - pagepad_maxsz);
 	req->rq_rcv_buf.flags |= XDRBUF_READ;
 	encode_nops(&hdr);
 }
@@ -2588,7 +2590,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
 			ARRAY_SIZE(nfs4_acl_bitmap), &hdr);
 
 	rpc_prepare_reply_pages(req, args->acl_pages, 0,
-				args->acl_len, replen + 1);
+				args->acl_len, replen);
 	encode_nops(&hdr);
 }
 
@@ -2810,7 +2812,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
 	}
 
 	rpc_prepare_reply_pages(req, (struct page **)&args->page, 0,
-				PAGE_SIZE, replen + 1);
+				PAGE_SIZE, replen);
 	encode_nops(&hdr);
 }
 
@@ -3014,14 +3016,14 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 
-	replen = hdr.replen + op_decode_hdr_maxsz;
+	replen = hdr.replen + op_decode_hdr_maxsz + 2;
 
 	encode_getdeviceinfo(xdr, args, &hdr);
 
 	/* set up reply kvec. device_addr4 opaque data is read into the
 	 * pages */
 	rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase,
-				args->pdev->pglen, replen + 2 + 1);
+				args->pdev->pglen, replen);
 	encode_nops(&hdr);
 }
 
@@ -3043,7 +3045,7 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
 	encode_layoutget(xdr, args, &hdr);
 
 	rpc_prepare_reply_pages(req, args->layout.pages, 0,
-				args->layout.pglen, hdr.replen);
+				args->layout.pglen, hdr.replen - pagepad_maxsz);
 	encode_nops(&hdr);
 }
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3259120462ed..612f0a641f4c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages,
 			     unsigned int base, unsigned int len,
 			     unsigned int hdrsize)
 {
-	/* Subtract one to force an extra word of buffer space for the
-	 * payload's XDR pad to fall into the rcv_buf's tail iovec.
-	 */
-	hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1;
+	hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign;
 
 	xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len);
 	trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 3ce0a5daa9eb..5a450055469f 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -193,9 +193,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 
 	tail->iov_base = buf + offset;
 	tail->iov_len = buflen - offset;
-	if ((xdr->page_len & 3) == 0)
-		tail->iov_len -= sizeof(__be32);
-
 	xdr->buflen += len;
 }
 EXPORT_SYMBOL_GPL(xdr_inline_pages);

From 0279024f22705128c7139bd55af6981afe90e876 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 21:21:11 -0500
Subject: [PATCH 106/230] SUNRPC: Fix up xdr_set_page()

While we always want to align to the next page and/or the beginning of
the tail buffer when we call xdr_set_next_page(), the functions
xdr_align_data() and xdr_expand_hole() really want to align to the next
object in that next page or tail.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5a450055469f..ddd5cc2281ab 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1019,8 +1019,10 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
 static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
 			 unsigned int len)
 {
-	if (xdr_set_page_base(xdr, base, len) == 0)
-		xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr));
+	if (xdr_set_page_base(xdr, base, len) == 0) {
+		base -= xdr->buf->page_len;
+		xdr_set_iov(xdr, xdr->buf->tail, base, len);
+	}
 }
 
 static void xdr_set_next_page(struct xdr_stream *xdr)
@@ -1029,17 +1031,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
 
 	newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
 	newbase -= xdr->buf->page_base;
-
-	xdr_set_page(xdr, newbase, PAGE_SIZE);
+	if (newbase < xdr->buf->page_len)
+		xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr));
+	else
+		xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr));
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
 {
 	if (xdr->page_ptr != NULL)
 		xdr_set_next_page(xdr);
-	else if (xdr->iov == xdr->buf->head) {
-		xdr_set_page(xdr, 0, PAGE_SIZE);
-	}
+	else if (xdr->iov == xdr->buf->head)
+		xdr_set_page(xdr, 0, xdr_stream_remaining(xdr));
 	return xdr->p != xdr->end;
 }
 
@@ -1277,7 +1280,7 @@ uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length
 	}
 
 	xdr->nwords -= XDR_QUADLEN(length);
-	xdr_set_page(xdr, from + length, PAGE_SIZE);
+	xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr));
 	return length;
 }
 EXPORT_SYMBOL_GPL(xdr_align_data);
@@ -1314,7 +1317,7 @@ uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t lengt
 	_zero_pages(buf->pages, buf->page_base + offset, length);
 
 	buf->len += length - (from - offset) - truncated;
-	xdr_set_page(xdr, offset + length, PAGE_SIZE);
+	xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr));
 	return length;
 }
 EXPORT_SYMBOL_GPL(xdr_expand_hole);

From eee1f54964fe868da425fe52a03666377335de01 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 21:39:02 -0500
Subject: [PATCH 107/230] SUNRPC: Fix open coded xdr_stream_remaining()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index ddd5cc2281ab..c852d199c789 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1261,7 +1261,7 @@ uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length
 
 	xdr_realign_pages(xdr);
 	from = xdr_page_pos(xdr);
-	bytes = xdr->nwords << 2;
+	bytes = xdr_stream_remaining(xdr);
 	if (length < bytes)
 		bytes = length;
 
@@ -1298,7 +1298,7 @@ uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t lengt
 
 	xdr_realign_pages(xdr);
 	from = xdr_page_pos(xdr);
-	bytes = xdr->nwords << 2;
+	bytes = xdr_stream_remaining(xdr);
 
 	if (offset + length + bytes > buf->page_len) {
 		unsigned int shift = (offset + length + bytes) - buf->page_len;

From 17068466ad02d3ec07ab1b8f3f97928598affc9a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sat, 21 Nov 2020 21:41:08 -0500
Subject: [PATCH 108/230] NFSv4: Fix open coded xdr_stream_remaining()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4xdr.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3899ef3047f4..de69276cfd22 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5337,11 +5337,11 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		res->acl_len = attrlen;
 
 		/* Check for receive buffer overflow */
-		if (res->acl_len > (xdr->nwords << 2) ||
+		if (res->acl_len > xdr_stream_remaining(xdr) ||
 		    res->acl_len + res->acl_data_offset > xdr->buf->page_len) {
 			res->acl_flags |= NFS4_ACL_TRUNC;
-			dprintk("NFS: acl reply: attrlen %u > page_len %u\n",
-					attrlen, xdr->nwords << 2);
+			dprintk("NFS: acl reply: attrlen %u > page_len %zu\n",
+				attrlen, xdr_stream_remaining(xdr));
 		}
 	} else
 		status = -EOPNOTSUPP;

From b6d49ecd1081740b6e632366428b960461f8158b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 25 Nov 2020 12:06:14 -0500
Subject: [PATCH 109/230] NFSv4: Fix a pNFS layout related use-after-free race
 when freeing the inode

When returning the layout in nfs4_evict_inode(), we need to ensure that
the layout is actually done being freed before we can proceed to free the
inode itself.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4super.c |  2 +-
 fs/nfs/pnfs.c      | 33 +++++++++++++++++++++++++++++++--
 fs/nfs/pnfs.h      |  5 +++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 93f5c1678ec2..984cc42ee54d 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode)
 	nfs_inode_evict_delegation(inode);
 	/* Note that above delegreturn would trigger pnfs return-on-close */
 	pnfs_return_layout(inode);
-	pnfs_destroy_layout(NFS_I(inode));
+	pnfs_destroy_layout_final(NFS_I(inode));
 	/* First call standard NFS clear_inode() code */
 	nfs_clear_inode(inode);
 	nfs4_xattr_cache_zap(inode);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0e50b9d45c32..07f59dc8cb2e 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -294,6 +294,7 @@ void
 pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	struct inode *inode;
+	unsigned long i_state;
 
 	if (!lo)
 		return;
@@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 		if (!list_empty(&lo->plh_segs))
 			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
 		pnfs_detach_layout_hdr(lo);
+		i_state = inode->i_state;
 		spin_unlock(&inode->i_lock);
 		pnfs_free_layout_hdr(lo);
+		/* Notify pnfs_destroy_layout_final() that we're done */
+		if (i_state & (I_FREEING | I_CLEAR))
+			wake_up_var(lo);
 	}
 }
 
@@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me)
 	}
 }
 
-void
-pnfs_destroy_layout(struct nfs_inode *nfsi)
+static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 	struct pnfs_layout_hdr *lo;
 	LIST_HEAD(tmp_list);
@@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 		pnfs_put_layout_hdr(lo);
 	} else
 		spin_unlock(&nfsi->vfs_inode.i_lock);
+	return lo;
+}
+
+void pnfs_destroy_layout(struct nfs_inode *nfsi)
+{
+	__pnfs_destroy_layout(nfsi);
 }
 EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
 
+static bool pnfs_layout_removed(struct nfs_inode *nfsi,
+				struct pnfs_layout_hdr *lo)
+{
+	bool ret;
+
+	spin_lock(&nfsi->vfs_inode.i_lock);
+	ret = nfsi->layout != lo;
+	spin_unlock(&nfsi->vfs_inode.i_lock);
+	return ret;
+}
+
+void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+	struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
+
+	if (lo)
+		wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
+}
+
 static bool
 pnfs_layout_add_bulk_destroy_list(struct inode *inode,
 		struct list_head *layout_list)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index f618c49697bb..bbd3de1025f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -268,6 +268,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_layoutget_free(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
+void pnfs_destroy_layout_final(struct nfs_inode *);
 void pnfs_destroy_all_layouts(struct nfs_client *);
 int pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
 		struct nfs_fsid *fsid,
@@ -712,6 +713,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
 {
 }
 
+static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
+{
+}
+
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {

From d18a9d3fa0f27a47706fb67f1ee0f4d971587c4e Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Thu, 12 Nov 2020 02:09:51 -0800
Subject: [PATCH 110/230] NFS: NFSv2/NFSv3: Use cred from fs_context during
 mount

There was refactoring done to use the fs_context for mounting done in:
62a55d088cd87: NFS: Additional refactoring for fs_context conversion

This made it so that the net_ns is fetched from the fs_context (the netns
that fsopen is called in). This change also makes it so that the credential
fetched during fsopen is used as well as the net_ns.

NFS has already had a number of changes to prepare it for user namespaces:
1a58e8a0e5c1: NFS: Store the credential of the mount process in the nfs_server
264d948ce7d0: NFS: Convert NFSv3 to use the container user namespace
c207db2f5da5: NFS: Convert NFSv2 to use the container user namespace

Previously, different credentials could be used for creation of the
fs_context versus creation of the nfs_server, as FSCONFIG_CMD_CREATE did
the actual credential check, and that's where current_creds() were fetched.
This meant that the user namespace which fsopen was called in could be a
non-init user namespace. This still requires that the user that calls
FSCONFIG_CMD_CREATE has CAP_SYS_ADMIN in the init user ns.

This roughly allows a privileged user to mount on behalf of an unprivileged
usernamespace, by forking off and calling fsopen in the unprivileged user
namespace. It can then pass back that fsfd to the privileged process which
can configure the NFS mount, and then it can call FSCONFIG_CMD_CREATE
before switching back into the mount namespace of the container, and finish
up the mounting process and call fsmount and move_mount.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Tested-by: Alban Crequy <alban.crequy@gmail.com>
Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/client.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f6454ba53d05..ff5c4d0d6d13 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -571,7 +571,7 @@ static int nfs_start_lockd(struct nfs_server *server)
 					1 : 0,
 		.net		= clp->cl_net,
 		.nlmclnt_ops 	= clp->cl_nfs_mod->rpc_ops->nlmclnt_ops,
-		.cred		= current_cred(),
+		.cred		= server->cred,
 	};
 
 	if (nlm_init.nfs_version > 3)
@@ -985,7 +985,7 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	server->cred = get_cred(current_cred());
+	server->cred = get_cred(fc->cred);
 
 	error = -ENOMEM;
 	fattr = nfs_alloc_fattr();

From d3ff46fe693683cb9660e9b93e8c932cc8e0c1f8 Mon Sep 17 00:00:00 2001
From: Sargun Dhillon <sargun@sargun.me>
Date: Thu, 12 Nov 2020 02:09:52 -0800
Subject: [PATCH 111/230] NFSv4: Refactor to use user namespaces for nfs4idmap

In several patches work has been done to enable NFSv4 to use user
namespaces:
58002399da65: NFSv4: Convert the NFS client idmapper to use the container user namespace
3b7eb5e35d0f: NFS: When mounting, don't share filesystems between different user namespaces

Unfortunately, the userspace APIs were only such that the userspace facing
side of the filesystem (superblock s_user_ns) could be set to a non init
user namespace. This furthers the fs_context related refactoring, and
piggybacks on top of that logic, so the superblock user namespace, and the
NFS user namespace are the same.

Users can still use rpc.idmapd if they choose to, but there are complexities
with user namespaces and request-key that have yet to be addresssed.

Eventually, we will need to at least:
  * Come up with an upcall mechanism that can be triggered inside of the container,
    or safely triggered outside, with the requisite context to do the right
    mapping. * Handle whatever refactoring needs to be done in net/sunrpc.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Tested-by: Alban Crequy <alban.crequy@gmail.com>
Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index be7915c861ce..86acffe7335c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -1153,7 +1153,7 @@ struct nfs_server *nfs4_create_server(struct fs_context *fc)
 	if (!server)
 		return ERR_PTR(-ENOMEM);
 
-	server->cred = get_cred(current_cred());
+	server->cred = get_cred(fc->cred);
 
 	auth_probe = ctx->auth_info.flavor_len < 1;
 

From 35a6d396721e28ba161595b0fc9e8896c00399bb Mon Sep 17 00:00:00 2001
From: Fedor Tokarev <ftokarev@gmail.com>
Date: Thu, 15 Oct 2020 16:59:08 +0300
Subject: [PATCH 112/230] net: sunrpc: Fix 'snprintf' return value check in
 'do_xprt_debugfs'

'snprintf' returns the number of characters which would have been written
if enough space had been available, excluding the terminating null byte.
Thus, the return value of 'sizeof(buf)' means that the last character
has been dropped.

Signed-off-by: Fedor Tokarev <ftokarev@gmail.com>
Fixes: 2f34b8bfae19 ("SUNRPC: add links for all client xprts to debugfs")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/debugfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index fd9bca242724..56029e3af6ff 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n
 		return 0;
 	len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
 		       xprt->debugfs->d_name.name);
-	if (len > sizeof(name))
+	if (len >= sizeof(name))
 		return -1;
 	if (*nump == 0)
 		strcpy(link, "xprt");
 	else {
 		len = snprintf(link, sizeof(link), "xprt%d", *nump);
-		if (len > sizeof(link))
+		if (len >= sizeof(link))
 			return -1;
 	}
 	debugfs_create_symlink(link, clnt->cl_debugfs, name);

From 9b82d88d5976e5f2b8015d58913654856576ace5 Mon Sep 17 00:00:00 2001
From: Calum Mackay <calum.mackay@oracle.com>
Date: Wed, 28 Oct 2020 20:16:27 +0000
Subject: [PATCH 113/230] lockd: don't use interval-based rebinding over TCP

NLM uses an interval-based rebinding, i.e. it clears the transport's
binding under certain conditions if more than 60 seconds have elapsed
since the connection was last bound.

This rebinding is not necessary for an autobind RPC client over a
connection-oriented protocol like TCP.

It can also cause problems: it is possible for nlm_bind_host() to clear
XPRT_BOUND whilst a connection worker is in the middle of trying to
reconnect, after it had already been checked in xprt_connect().

When the connection worker notices that XPRT_BOUND has been cleared
under it, in xs_tcp_finish_connecting(), that results in:

	xs_tcp_setup_socket: connect returned unhandled error -107

Worse, it's possible that the two can get into lockstep, resulting in
the same behaviour repeated indefinitely, with the above error every
300 seconds, without ever recovering, and the connection never being
established. This has been seen in practice, with a large number of NLM
client tasks, following a server restart.

The existing callers of nlm_bind_host & nlm_rebind_host should not need
to force the rebind, for TCP, so restrict the interval-based rebinding
to UDP only.

For TCP, we will still rebind when needed, e.g. on timeout, and connection
error (including closure), since connection-related errors on an existing
connection, ECONNREFUSED when trying to connect, and rpc_check_timeout(),
already unconditionally clear XPRT_BOUND.

To avoid having to add the fix, and explanation, to both nlm_bind_host()
and nlm_rebind_host(), remove the duplicate code from the former, and
have it call the latter.

Drop the dprintk, which adds no value over a trace.

Signed-off-by: Calum Mackay <calum.mackay@oracle.com>
Fixes: 35f5a422ce1a ("SUNRPC: new interface to force an RPC rebind")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/lockd/host.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 0afb6d59bad0..771c289f6df7 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -439,12 +439,7 @@ nlm_bind_host(struct nlm_host *host)
 	 * RPC rebind is required
 	 */
 	if ((clnt = host->h_rpcclnt) != NULL) {
-		if (time_after_eq(jiffies, host->h_nextrebind)) {
-			rpc_force_rebind(clnt);
-			host->h_nextrebind = jiffies + NLM_HOST_REBIND;
-			dprintk("lockd: next rebind in %lu jiffies\n",
-					host->h_nextrebind - jiffies);
-		}
+		nlm_rebind_host(host);
 	} else {
 		unsigned long increment = nlmsvc_timeout;
 		struct rpc_timeout timeparms = {
@@ -494,13 +489,20 @@ nlm_bind_host(struct nlm_host *host)
 	return clnt;
 }
 
-/*
- * Force a portmap lookup of the remote lockd port
+/**
+ * nlm_rebind_host - If needed, force a portmap lookup of the peer's lockd port
+ * @host: NLM host handle for peer
+ *
+ * This is not needed when using a connection-oriented protocol, such as TCP.
+ * The existing autobind mechanism is sufficient to force a rebind when
+ * required, e.g. on connection state transitions.
  */
 void
 nlm_rebind_host(struct nlm_host *host)
 {
-	dprintk("lockd: rebind host %s\n", host->h_name);
+	if (host->h_proto != IPPROTO_UDP)
+		return;
+
 	if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) {
 		rpc_force_rebind(host->h_rpcclnt);
 		host->h_nextrebind = jiffies + NLM_HOST_REBIND;

From bf701b765eaa82dd164d65edc5747ec7288bb5c3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 27 Nov 2020 11:24:33 +1100
Subject: [PATCH 114/230] NFS: switch nfsiod to be an UNBOUND workqueue.

nfsiod is currently a concurrency-managed workqueue (CMWQ).
This means that workitems scheduled to nfsiod on a given CPU are queued
behind all other work items queued on any CMWQ on the same CPU.  This
can introduce unexpected latency.

Occaionally nfsiod can even cause excessive latency.  If the work item
to complete a CLOSE request calls the final iput() on an inode, the
address_space of that inode will be dismantled.  This takes time
proportional to the number of in-memory pages, which on a large host
working on large files (e.g..  5TB), can be a large number of pages
resulting in a noticable number of seconds.

We can avoid these latency problems by switching nfsiod to WQ_UNBOUND.
This causes each concurrent work item to gets a dedicated thread which
can be scheduled to an idle CPU.

There is precedent for this as several other filesystems use WQ_UNBOUND
workqueue for handling various async events.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: ada609ee2ac2 ("workqueue: use WQ_MEM_RECLAIM instead of WQ_RESCUER")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9b765a900b28..522aa10a1a3e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2173,7 +2173,7 @@ static int nfsiod_start(void)
 {
 	struct workqueue_struct *wq;
 	dprintk("RPC:       creating workqueue nfsiod\n");
-	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
+	wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
 	if (wq == NULL)
 		return -ENOMEM;
 	nfsiod_workqueue = wq;

From aa80be5043a6d87c84112c37afe5cf4aa5cb9e11 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 17 Nov 2020 15:32:05 +0530
Subject: [PATCH 115/230] dt-bindings: mailbox : arm,mhuv2: Add bindings

This patch adds device tree binding for ARM Message Handling Unit (MHU)
controller version 2.

Based on earlier work by Morten Borup Petersen.

Reviewed-by: Rob Herring <robh@kernel.org>
Co-developed-by: Tushar Khandelwal <tushar.khandelwal@arm.com>
Signed-off-by: Tushar Khandelwal <tushar.khandelwal@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 .../bindings/mailbox/arm,mhuv2.yaml           | 209 ++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml

diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml
new file mode 100644
index 000000000000..6608545ea66f
--- /dev/null
+++ b/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml
@@ -0,0 +1,209 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mailbox/arm,mhuv2.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM MHUv2 Mailbox Controller
+
+maintainers:
+  - Tushar Khandelwal <tushar.khandelwal@arm.com>
+  - Viresh Kumar <viresh.kumar@linaro.org>
+
+description: |
+  The Arm Message Handling Unit (MHU) Version 2 is a mailbox controller that has
+  between 1 and 124 channel windows (each 32-bit wide) to provide unidirectional
+  communication with remote processor(s), where the number of channel windows
+  are implementation dependent.
+
+  Given the unidirectional nature of the controller, an MHUv2 mailbox may only
+  be written to or read from. If a pair of MHU controllers is implemented
+  between two processing elements to provide bidirectional communication, these
+  must be specified as two separate mailboxes.
+
+  If the interrupts property is present in device tree node, then its treated as
+  a "receiver" mailbox, otherwise a "sender".
+
+  An MHU controller must be specified along with the supported transport
+  protocols. The transport protocols determine the method of data transmission
+  as well as the number of provided mailbox channels.
+
+  Following are the possible transport protocols.
+
+  - Data-transfer: Each transfer is made of one or more words, using one or more
+    channel windows.
+
+  - Doorbell: Each transfer is made up of single bit flag, using any one of the
+    bits in a channel window. A channel window can support up to 32 doorbells
+    and the entire window shall be used in doorbell protocol.  Optionally, data
+    may be transmitted through a shared memory region, wherein the MHU is used
+    strictly as an interrupt generation mechanism but that is out of the scope
+    of these bindings.
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+  properties:
+    compatible:
+      contains:
+        enum:
+          - arm,mhuv2-tx
+          - arm,mhuv2-rx
+  required:
+    - compatible
+
+properties:
+  compatible:
+    oneOf:
+      - description: Sender mode
+        items:
+          - const: arm,mhuv2-tx
+          - const: arm,primecell
+
+      - description: Receiver-mode
+        items:
+          - const: arm,mhuv2-rx
+          - const: arm,primecell
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    description: |
+      The MHUv2 controller always implements an interrupt in the "receiver"
+      mode, while the interrupt in the "sender" mode was not available in the
+      version MHUv2.0, but the later versions do have it.
+    maxItems: 1
+
+  clocks:
+    maxItems: 1
+
+  clock-names:
+    maxItems: 1
+
+  arm,mhuv2-protocols:
+    $ref: /schemas/types.yaml#/definitions/uint32-matrix
+    description: |
+      The MHUv2 controller may contain up to 124 channel windows (each 32-bit
+      wide). The hardware and the DT bindings allows any combination of those to
+      be used for various transport protocols.
+
+      This property allows a platform to describe how these channel windows are
+      used in various transport protocols. The entries in this property shall be
+      present as an array of tuples, where each tuple describes details about
+      one of the transport protocol being implemented over some channel
+      window(s).
+
+      The first field of a tuple signifies the transfer protocol, 0 is reserved
+      for doorbell protocol, and 1 is reserved for data-transfer protocol.
+      Using any other value in the first field of a tuple makes it invalid.
+
+      The second field of a tuple signifies the number of channel windows where
+      the protocol would be used and should be set to a non zero value. For
+      doorbell protocol this field signifies the number of 32-bit channel
+      windows that implement the doorbell protocol. For data-transfer protocol,
+      this field signifies the number of 32-bit channel windows that implement
+      the data-transfer protocol.
+
+      The total number of channel windows specified here shouldn't be more than
+      the ones implemented by the platform, though one can specify lesser number
+      of windows here than what the platform implements.
+
+      mhu: mailbox@2b1f0000 {
+          ...
+
+          arm,mhuv2-protocols = <0 2>, <1 1>, <1 5>, <1 7>;
+      }
+
+      The above example defines the protocols of an ARM MHUv2 mailbox
+      controller, where a total of 15 channel windows are used. The first two
+      windows are used in doorbell protocol (64 doorbells), followed by 1, 5 and
+      7 windows (separately) used in data-transfer protocol.
+
+    minItems: 1
+    maxItems: 124
+    items:
+      items:
+        - enum: [ 0, 1 ]
+        - minimum: 0
+          maximum: 124
+
+
+  '#mbox-cells':
+    description: |
+      It is always set to 2. The first argument in the consumers 'mboxes'
+      property represents the channel window group, which may be used in
+      doorbell, or data-transfer protocol, and the second argument (only
+      relevant in doorbell protocol, should be 0 otherwise) represents the
+      doorbell number within the 32 bit wide channel window.
+
+      From the example given above for arm,mhuv2-protocols, here is how a client
+      node can reference them.
+
+      mboxes = <&mhu 0 5>; // Channel Window Group 0, doorbell 5.
+      mboxes = <&mhu 1 7>; // Channel Window Group 1, doorbell 7.
+      mboxes = <&mhu 2 0>; // Channel Window Group 2, data transfer protocol with 1 window.
+      mboxes = <&mhu 3 0>; // Channel Window Group 3, data transfer protocol with 5 windows.
+      mboxes = <&mhu 4 0>; // Channel Window Group 4, data transfer protocol with 7 windows.
+
+    const: 2
+
+if:
+  # Interrupt is compulsory for receiver
+  properties:
+    compatible:
+      contains:
+        const: arm,mhuv2-rx
+then:
+  required:
+    - interrupts
+
+required:
+  - compatible
+  - reg
+  - '#mbox-cells'
+  - arm,mhuv2-protocols
+
+additionalProperties: false
+
+examples:
+  # Multiple transport protocols implemented by the mailbox controllers
+  - |
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        mhu_tx: mailbox@2b1f0000 {
+            #mbox-cells = <2>;
+            compatible = "arm,mhuv2-tx", "arm,primecell";
+            reg = <0 0x2b1f0000 0 0x1000>;
+            clocks = <&clock 0>;
+            clock-names = "apb_pclk";
+            interrupts = <0 45 4>;
+            arm,mhuv2-protocols = <1 5>, <1 2>, <1 5>, <1 7>, <0 2>;
+        };
+
+        mhu_rx: mailbox@2b1f1000 {
+            #mbox-cells = <2>;
+            compatible = "arm,mhuv2-rx", "arm,primecell";
+            reg = <0 0x2b1f1000 0 0x1000>;
+            clocks = <&clock 0>;
+            clock-names = "apb_pclk";
+            interrupts = <0 46 4>;
+            arm,mhuv2-protocols = <1 1>, <1 7>, <0 2>;
+        };
+
+        mhu_client: scb@2e000000 {
+            compatible = "fujitsu,mb86s70-scb-1.0";
+            reg = <0 0x2e000000 0 0x4000>;
+
+            mboxes =
+                     //data-transfer protocol with 5 windows, mhu-tx
+                     <&mhu_tx 2 0>,
+                     //data-transfer protocol with 7 windows, mhu-tx
+                     <&mhu_tx 3 0>,
+                     //doorbell protocol channel 4, doorbell 27, mhu-tx
+                     <&mhu_tx 4 27>,
+                     //data-transfer protocol with 1 window, mhu-rx
+                     <&mhu_rx 0 0>;
+        };
+    };

From 5a6338cce9f4133c478d3b10b300f96dd644379a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 17 Nov 2020 15:32:06 +0530
Subject: [PATCH 116/230] mailbox: arm_mhuv2: Add driver

This adds driver for the ARM MHUv2 (Message Handling Unit) mailbox
controller.

This is based on the accepted DT bindings of the controller and supports
combination of both transport protocols, i.e. doorbell and data-transfer.

Transmitting and receiving data through the mailbox framework is done
through struct arm_mhuv2_mbox_msg.

Based on the initial work done by Morten Borup Petersen from ARM.

Co-developed-by: Tushar Khandelwal <tushar.khandelwal@arm.com>
Signed-off-by: Tushar Khandelwal <tushar.khandelwal@arm.com>
Tested-by: Usama Arif <usama.arif@arm.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 MAINTAINERS                               |    9 +
 drivers/mailbox/Kconfig                   |    7 +
 drivers/mailbox/Makefile                  |    2 +
 drivers/mailbox/arm_mhuv2.c               | 1136 +++++++++++++++++++++
 include/linux/mailbox/arm_mhuv2_message.h |   20 +
 5 files changed, 1174 insertions(+)
 create mode 100644 drivers/mailbox/arm_mhuv2.c
 create mode 100644 include/linux/mailbox/arm_mhuv2_message.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 2daa6ee673f7..3917b7ef1da6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10418,6 +10418,15 @@ F:	drivers/mailbox/
 F:	include/linux/mailbox_client.h
 F:	include/linux/mailbox_controller.h
 
+MAILBOX ARM MHUv2
+M:	Viresh Kumar <viresh.kumar@linaro.org>
+M:	Tushar Khandelwal <Tushar.Khandelwal@arm.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	drivers/mailbox/arm_mhuv2.c
+F:	include/linux/mailbox/arm_mhuv2_message.h
+F:	Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml
+
 MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7
 M:	Michael Kerrisk <mtk.manpages@gmail.com>
 L:	linux-man@vger.kernel.org
diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index abbf5d67ffa2..f4abe3529acd 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -16,6 +16,13 @@ config ARM_MHU
 	  The controller has 3 mailbox channels, the last of which can be
 	  used in Secure mode only.
 
+config ARM_MHU_V2
+	tristate "ARM MHUv2 Mailbox"
+	depends on ARM_AMBA
+	help
+	  Say Y here if you want to build the ARM MHUv2 controller driver,
+	  which provides unidirectional mailboxes between processing elements.
+
 config IMX_MBOX
 	tristate "i.MX Mailbox"
 	depends on ARCH_MXC || COMPILE_TEST
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 2e06e02b2e03..7194fa92c787 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -7,6 +7,8 @@ obj-$(CONFIG_MAILBOX_TEST)	+= mailbox-test.o
 
 obj-$(CONFIG_ARM_MHU)	+= arm_mhu.o arm_mhu_db.o
 
+obj-$(CONFIG_ARM_MHU_V2)	+= arm_mhuv2.o
+
 obj-$(CONFIG_IMX_MBOX)	+= imx-mailbox.o
 
 obj-$(CONFIG_ARMADA_37XX_RWTM_MBOX)	+= armada-37xx-rwtm-mailbox.o
diff --git a/drivers/mailbox/arm_mhuv2.c b/drivers/mailbox/arm_mhuv2.c
new file mode 100644
index 000000000000..67fb10885bb4
--- /dev/null
+++ b/drivers/mailbox/arm_mhuv2.c
@@ -0,0 +1,1136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM Message Handling Unit Version 2 (MHUv2) driver.
+ *
+ * Copyright (C) 2020 ARM Ltd.
+ * Copyright (C) 2020 Linaro Ltd.
+ *
+ * An MHUv2 mailbox controller can provide up to 124 channel windows (each 32
+ * bit long) and the driver allows any combination of both the transport
+ * protocol modes: data-transfer and doorbell, to be used on those channel
+ * windows.
+ *
+ * The transport protocols should be specified in the device tree entry for the
+ * device. The transport protocols determine how the underlying hardware
+ * resources of the device are utilized when transmitting data. Refer to the
+ * device tree bindings of the ARM MHUv2 controller for more details.
+ *
+ * The number of registered mailbox channels is dependent on both the underlying
+ * hardware - mainly the number of channel windows implemented by the platform,
+ * as well as the selected transport protocols.
+ *
+ * The MHUv2 controller can work both as a sender and receiver, but the driver
+ * and the DT bindings support unidirectional transfers for better allocation of
+ * the channels. That is, this driver will be probed for two separate devices
+ * for each mailbox controller, a sender device and a receiver device.
+ */
+
+#include <linux/amba/bus.h>
+#include <linux/interrupt.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox/arm_mhuv2_message.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/spinlock.h>
+
+/* ====== MHUv2 Registers ====== */
+
+/* Maximum number of channel windows */
+#define MHUV2_CH_WN_MAX			124
+/* Number of combined interrupt status registers */
+#define MHUV2_CMB_INT_ST_REG_CNT	4
+#define MHUV2_STAT_BYTES		(sizeof(u32))
+#define MHUV2_STAT_BITS			(MHUV2_STAT_BYTES * __CHAR_BIT__)
+
+#define LSB_MASK(n)			((1 << (n * __CHAR_BIT__)) - 1)
+#define MHUV2_PROTOCOL_PROP		"arm,mhuv2-protocols"
+
+/* Register Message Handling Unit Configuration fields */
+struct mhu_cfg_t {
+	u32 num_ch : 7;
+	u32 pad : 25;
+} __packed;
+
+/* register Interrupt Status fields */
+struct int_st_t {
+	u32 nr2r : 1;
+	u32 r2nr : 1;
+	u32 pad : 30;
+} __packed;
+
+/* Register Interrupt Clear fields */
+struct int_clr_t {
+	u32 nr2r : 1;
+	u32 r2nr : 1;
+	u32 pad : 30;
+} __packed;
+
+/* Register Interrupt Enable fields */
+struct int_en_t {
+	u32 r2nr : 1;
+	u32 nr2r : 1;
+	u32 chcomb : 1;
+	u32 pad : 29;
+} __packed;
+
+/* Register Implementer Identification fields */
+struct iidr_t {
+	u32 implementer : 12;
+	u32 revision : 4;
+	u32 variant : 4;
+	u32 product_id : 12;
+} __packed;
+
+/* Register Architecture Identification Register fields */
+struct aidr_t {
+	u32 arch_minor_rev : 4;
+	u32 arch_major_rev : 4;
+	u32 pad : 24;
+} __packed;
+
+/* Sender Channel Window fields */
+struct mhu2_send_ch_wn_reg {
+	u32 stat;
+	u8 pad1[0x0C - 0x04];
+	u32 stat_set;
+	u32 int_st;
+	u32 int_clr;
+	u32 int_en;
+	u8 pad2[0x20 - 0x1C];
+} __packed;
+
+/* Sender frame register fields */
+struct mhu2_send_frame_reg {
+	struct mhu2_send_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX];
+	struct mhu_cfg_t mhu_cfg;
+	u32 resp_cfg;
+	u32 access_request;
+	u32 access_ready;
+	struct int_st_t int_st;
+	struct int_clr_t int_clr;
+	struct int_en_t int_en;
+	u32 reserved0;
+	u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT];
+	u8 pad[0xFC8 - 0xFB0];
+	struct iidr_t iidr;
+	struct aidr_t aidr;
+} __packed;
+
+/* Receiver Channel Window fields */
+struct mhu2_recv_ch_wn_reg {
+	u32 stat;
+	u32 stat_masked;
+	u32 stat_clear;
+	u8 reserved0[0x10 - 0x0C];
+	u32 mask;
+	u32 mask_set;
+	u32 mask_clear;
+	u8 pad[0x20 - 0x1C];
+} __packed;
+
+/* Receiver frame register fields */
+struct mhu2_recv_frame_reg {
+	struct mhu2_recv_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX];
+	struct mhu_cfg_t mhu_cfg;
+	u8 reserved0[0xF90 - 0xF84];
+	struct int_st_t int_st;
+	struct int_clr_t int_clr;
+	struct int_en_t int_en;
+	u32 pad;
+	u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT];
+	u8 reserved2[0xFC8 - 0xFB0];
+	struct iidr_t iidr;
+	struct aidr_t aidr;
+} __packed;
+
+
+/* ====== MHUv2 data structures ====== */
+
+enum mhuv2_transport_protocol {
+	DOORBELL = 0,
+	DATA_TRANSFER = 1
+};
+
+enum mhuv2_frame {
+	RECEIVER_FRAME,
+	SENDER_FRAME
+};
+
+/**
+ * struct mhuv2 - MHUv2 mailbox controller data
+ *
+ * @mbox:	Mailbox controller belonging to the MHU frame.
+ * @send/recv:	Base address of the register mapping region.
+ * @frame:	Frame type: RECEIVER_FRAME or SENDER_FRAME.
+ * @irq:	Interrupt.
+ * @windows:	Channel windows implemented by the platform.
+ * @minor:	Minor version of the controller.
+ * @length:	Length of the protocols array in bytes.
+ * @protocols:	Raw protocol information, derived from device tree.
+ * @doorbell_pending_lock: spinlock required for correct operation of Tx
+ *		interrupt for doorbells.
+ */
+struct mhuv2 {
+	struct mbox_controller mbox;
+	union {
+		struct mhu2_send_frame_reg __iomem *send;
+		struct mhu2_recv_frame_reg __iomem *recv;
+	};
+	enum mhuv2_frame frame;
+	unsigned int irq;
+	unsigned int windows;
+	unsigned int minor;
+	unsigned int length;
+	u32 *protocols;
+
+	spinlock_t doorbell_pending_lock;
+};
+
+#define mhu_from_mbox(_mbox) container_of(_mbox, struct mhuv2, mbox)
+
+/**
+ * struct mhuv2_protocol_ops - MHUv2 operations
+ *
+ * Each transport protocol must provide an implementation of the operations
+ * provided here.
+ *
+ * @rx_startup: Startup callback for receiver.
+ * @rx_shutdown: Shutdown callback for receiver.
+ * @read_data: Reads and clears newly available data.
+ * @tx_startup: Startup callback for receiver.
+ * @tx_shutdown: Shutdown callback for receiver.
+ * @last_tx_done: Report back if the last tx is completed or not.
+ * @send_data: Send data to the receiver.
+ */
+struct mhuv2_protocol_ops {
+	int (*rx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void (*rx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void *(*read_data)(struct mhuv2 *mhu, struct mbox_chan *chan);
+
+	void (*tx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	void (*tx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	int (*last_tx_done)(struct mhuv2 *mhu, struct mbox_chan *chan);
+	int (*send_data)(struct mhuv2 *mhu, struct mbox_chan *chan, void *arg);
+};
+
+/*
+ * MHUv2 mailbox channel's private information
+ *
+ * @ops:	protocol specific ops for the channel.
+ * @ch_wn_idx:	Channel window index allocated to the channel.
+ * @windows:	Total number of windows consumed by the channel, only relevant
+ *		in DATA_TRANSFER protocol.
+ * @doorbell:	Doorbell bit number within the ch_wn_idx window, only relevant
+ *		in DOORBELL protocol.
+ * @pending:	Flag indicating pending doorbell interrupt, only relevant in
+ *		DOORBELL protocol.
+ */
+struct mhuv2_mbox_chan_priv {
+	const struct mhuv2_protocol_ops *ops;
+	u32 ch_wn_idx;
+	union {
+		u32 windows;
+		struct {
+			u32 doorbell;
+			u32 pending;
+		};
+	};
+};
+
+/* Macro for reading a bitfield within a physically mapped packed struct */
+#define readl_relaxed_bitfield(_regptr, _field)				\
+	({								\
+		u32 _regval;						\
+		_regval = readl_relaxed((_regptr));			\
+		(*(typeof((_regptr)))(&_regval))._field;		\
+	})
+
+/* Macro for writing a bitfield within a physically mapped packed struct */
+#define writel_relaxed_bitfield(_value, _regptr, _field)		\
+	({								\
+		u32 _regval;						\
+		_regval = readl_relaxed(_regptr);			\
+		(*(typeof(_regptr))(&_regval))._field = _value;		\
+		writel_relaxed(_regval, _regptr);			\
+	})
+
+
+/* =================== Doorbell transport protocol operations =============== */
+
+static int mhuv2_doorbell_rx_startup(struct mhuv2 *mhu, struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	writel_relaxed(BIT(priv->doorbell),
+		       &mhu->recv->ch_wn[priv->ch_wn_idx].mask_clear);
+	return 0;
+}
+
+static void mhuv2_doorbell_rx_shutdown(struct mhuv2 *mhu,
+				       struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	writel_relaxed(BIT(priv->doorbell),
+		       &mhu->recv->ch_wn[priv->ch_wn_idx].mask_set);
+}
+
+static void *mhuv2_doorbell_read_data(struct mhuv2 *mhu, struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	writel_relaxed(BIT(priv->doorbell),
+		       &mhu->recv->ch_wn[priv->ch_wn_idx].stat_clear);
+	return NULL;
+}
+
+static int mhuv2_doorbell_last_tx_done(struct mhuv2 *mhu,
+				       struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	return !(readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat) &
+		 BIT(priv->doorbell));
+}
+
+static int mhuv2_doorbell_send_data(struct mhuv2 *mhu, struct mbox_chan *chan,
+				    void *arg)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mhu->doorbell_pending_lock, flags);
+
+	priv->pending = 1;
+	writel_relaxed(BIT(priv->doorbell),
+		       &mhu->send->ch_wn[priv->ch_wn_idx].stat_set);
+
+	spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags);
+
+	return 0;
+}
+
+static const struct mhuv2_protocol_ops mhuv2_doorbell_ops = {
+	.rx_startup = mhuv2_doorbell_rx_startup,
+	.rx_shutdown = mhuv2_doorbell_rx_shutdown,
+	.read_data = mhuv2_doorbell_read_data,
+	.last_tx_done = mhuv2_doorbell_last_tx_done,
+	.send_data = mhuv2_doorbell_send_data,
+};
+#define IS_PROTOCOL_DOORBELL(_priv) (_priv->ops == &mhuv2_doorbell_ops)
+
+/* ============= Data transfer transport protocol operations ================ */
+
+static int mhuv2_data_transfer_rx_startup(struct mhuv2 *mhu,
+					  struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int i = priv->ch_wn_idx + priv->windows - 1;
+
+	/*
+	 * The protocol mandates that all but the last status register must be
+	 * masked.
+	 */
+	writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_clear);
+	return 0;
+}
+
+static void mhuv2_data_transfer_rx_shutdown(struct mhuv2 *mhu,
+					    struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int i = priv->ch_wn_idx + priv->windows - 1;
+
+	writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set);
+}
+
+static void *mhuv2_data_transfer_read_data(struct mhuv2 *mhu,
+					   struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	const int windows = priv->windows;
+	struct arm_mhuv2_mbox_msg *msg;
+	u32 *data;
+	int i, idx;
+
+	msg = kzalloc(sizeof(*msg) + windows * MHUV2_STAT_BYTES, GFP_KERNEL);
+	if (!msg)
+		return ERR_PTR(-ENOMEM);
+
+	data = msg->data = msg + 1;
+	msg->len = windows * MHUV2_STAT_BYTES;
+
+	/*
+	 * Messages are expected in order of most significant word to least
+	 * significant word. Refer mhuv2_data_transfer_send_data() for more
+	 * details.
+	 *
+	 * We also need to read the stat register instead of stat_masked, as we
+	 * masked all but the last window.
+	 *
+	 * Last channel window must be cleared as the final operation. Upon
+	 * clearing the last channel window register, which is unmasked in
+	 * data-transfer protocol, the interrupt is de-asserted.
+	 */
+	for (i = 0; i < windows; i++) {
+		idx = priv->ch_wn_idx + i;
+		data[windows - 1 - i] = readl_relaxed(&mhu->recv->ch_wn[idx].stat);
+		writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[idx].stat_clear);
+	}
+
+	return msg;
+}
+
+static void mhuv2_data_transfer_tx_startup(struct mhuv2 *mhu,
+					   struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int i = priv->ch_wn_idx + priv->windows - 1;
+
+	/* Enable interrupts only for the last window */
+	if (mhu->minor) {
+		writel_relaxed(0x1, &mhu->send->ch_wn[i].int_clr);
+		writel_relaxed(0x1, &mhu->send->ch_wn[i].int_en);
+	}
+}
+
+static void mhuv2_data_transfer_tx_shutdown(struct mhuv2 *mhu,
+					    struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int i = priv->ch_wn_idx + priv->windows - 1;
+
+	if (mhu->minor)
+		writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en);
+}
+
+static int mhuv2_data_transfer_last_tx_done(struct mhuv2 *mhu,
+					    struct mbox_chan *chan)
+{
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int i = priv->ch_wn_idx + priv->windows - 1;
+
+	/* Just checking the last channel window should be enough */
+	return !readl_relaxed(&mhu->send->ch_wn[i].stat);
+}
+
+/*
+ * Message will be transmitted from most significant to least significant word.
+ * This is to allow for messages shorter than channel windows to still trigger
+ * the receiver interrupt which gets activated when the last stat register is
+ * written. As an example, a 6-word message is to be written on a 4-channel MHU
+ * connection: Registers marked with '*' are masked, and will not generate an
+ * interrupt on the receiver side once written.
+ *
+ * u32 *data =	[0x00000001], [0x00000002], [0x00000003], [0x00000004],
+ *		[0x00000005], [0x00000006]
+ *
+ * ROUND 1:
+ * stat reg		To write	Write sequence
+ * [ stat 3 ]	<-	[0x00000001]	4 <- triggers interrupt on receiver
+ * [ stat 2 ]	<-	[0x00000002]	3
+ * [ stat 1 ]	<-	[0x00000003]	2
+ * [ stat 0 ]	<-	[0x00000004]	1
+ *
+ * data += 4 // Increment data pointer by number of stat regs
+ *
+ * ROUND 2:
+ * stat reg		To write	Write sequence
+ * [ stat 3 ]	<-	[0x00000005]	2 <- triggers interrupt on receiver
+ * [ stat 2 ]	<-	[0x00000006]	1
+ * [ stat 1 ]	<-	[0x00000000]
+ * [ stat 0 ]	<-	[0x00000000]
+ */
+static int mhuv2_data_transfer_send_data(struct mhuv2 *mhu,
+					 struct mbox_chan *chan, void *arg)
+{
+	const struct arm_mhuv2_mbox_msg *msg = arg;
+	int bytes_left = msg->len, bytes_to_send, bytes_in_round, i;
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+	int windows = priv->windows;
+	u32 *data = msg->data, word;
+
+	while (bytes_left) {
+		if (!data[0]) {
+			dev_err(mhu->mbox.dev, "Data aligned at first window can't be zero to guarantee interrupt generation at receiver");
+			return -EINVAL;
+		}
+
+		while(!mhuv2_data_transfer_last_tx_done(mhu, chan))
+			continue;
+
+		bytes_in_round = min(bytes_left, (int)(windows * MHUV2_STAT_BYTES));
+
+		for (i = windows - 1; i >= 0; i--) {
+			/* Data less than windows can transfer ? */
+			if (unlikely(bytes_in_round <= i * MHUV2_STAT_BYTES))
+				continue;
+
+			word = data[i];
+			bytes_to_send = bytes_in_round & (MHUV2_STAT_BYTES - 1);
+			if (unlikely(bytes_to_send))
+				word &= LSB_MASK(bytes_to_send);
+			else
+				bytes_to_send = MHUV2_STAT_BYTES;
+
+			writel_relaxed(word, &mhu->send->ch_wn[priv->ch_wn_idx + windows - 1 - i].stat_set);
+			bytes_left -= bytes_to_send;
+			bytes_in_round -= bytes_to_send;
+		}
+
+		data += windows;
+	}
+
+	return 0;
+}
+
+static const struct mhuv2_protocol_ops mhuv2_data_transfer_ops = {
+	.rx_startup = mhuv2_data_transfer_rx_startup,
+	.rx_shutdown = mhuv2_data_transfer_rx_shutdown,
+	.read_data = mhuv2_data_transfer_read_data,
+	.tx_startup = mhuv2_data_transfer_tx_startup,
+	.tx_shutdown = mhuv2_data_transfer_tx_shutdown,
+	.last_tx_done = mhuv2_data_transfer_last_tx_done,
+	.send_data = mhuv2_data_transfer_send_data,
+};
+
+/* Interrupt handlers */
+
+static struct mbox_chan *get_irq_chan_comb(struct mhuv2 *mhu, u32 *reg)
+{
+	struct mbox_chan *chans = mhu->mbox.chans;
+	int channel = 0, i, offset = 0, windows, protocol, ch_wn;
+	u32 stat;
+
+	for (i = 0; i < MHUV2_CMB_INT_ST_REG_CNT; i++) {
+		stat = readl_relaxed(reg + i);
+		if (!stat)
+			continue;
+
+		ch_wn = i * MHUV2_STAT_BITS + __builtin_ctz(stat);
+
+		for (i = 0; i < mhu->length; i += 2) {
+			protocol = mhu->protocols[i];
+			windows = mhu->protocols[i + 1];
+
+			if (ch_wn >= offset + windows) {
+				if (protocol == DOORBELL)
+					channel += MHUV2_STAT_BITS * windows;
+				else
+					channel++;
+
+				offset += windows;
+				continue;
+			}
+
+			/* Return first chan of the window in doorbell mode */
+			if (protocol == DOORBELL)
+				channel += MHUV2_STAT_BITS * (ch_wn - offset);
+
+			return &chans[channel];
+		}
+	}
+
+	return ERR_PTR(-EIO);
+}
+
+static irqreturn_t mhuv2_sender_interrupt(int irq, void *data)
+{
+	struct mhuv2 *mhu = data;
+	struct device *dev = mhu->mbox.dev;
+	struct mhuv2_mbox_chan_priv *priv;
+	struct mbox_chan *chan;
+	unsigned long flags;
+	int i, found = 0;
+	u32 stat;
+
+	chan = get_irq_chan_comb(mhu, mhu->send->chcomb_int_st);
+	if (IS_ERR(chan)) {
+		dev_warn(dev, "Failed to find channel for the Tx interrupt\n");
+		return IRQ_NONE;
+	}
+	priv = chan->con_priv;
+
+	if (!IS_PROTOCOL_DOORBELL(priv)) {
+		writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx + priv->windows - 1].int_clr);
+
+		if (chan->cl) {
+			mbox_chan_txdone(chan, 0);
+			return IRQ_HANDLED;
+		}
+
+		dev_warn(dev, "Tx interrupt Received on channel (%u) not currently attached to a mailbox client\n",
+			 priv->ch_wn_idx);
+		return IRQ_NONE;
+	}
+
+	/* Clear the interrupt first, so we don't miss any doorbell later */
+	writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx].int_clr);
+
+	/*
+	 * In Doorbell mode, make sure no new transitions happen while the
+	 * interrupt handler is trying to find the finished doorbell tx
+	 * operations, else we may think few of the transfers were complete
+	 * before they actually were.
+	 */
+	spin_lock_irqsave(&mhu->doorbell_pending_lock, flags);
+
+	/*
+	 * In case of doorbell mode, the first channel of the window is returned
+	 * by get_irq_chan_comb(). Find all the pending channels here.
+	 */
+	stat = readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat);
+
+	for (i = 0; i < MHUV2_STAT_BITS; i++) {
+		priv = chan[i].con_priv;
+
+		/* Find cases where pending was 1, but stat's bit is cleared */
+		if (priv->pending ^ ((stat >> i) & 0x1)) {
+			BUG_ON(!priv->pending);
+
+			if (!chan->cl) {
+				dev_warn(dev, "Tx interrupt received on doorbell (%u : %u) channel not currently attached to a mailbox client\n",
+					 priv->ch_wn_idx, i);
+				continue;
+			}
+
+			mbox_chan_txdone(&chan[i], 0);
+			priv->pending = 0;
+			found++;
+		}
+	}
+
+	spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags);
+
+	if (!found) {
+		/*
+		 * We may have already processed the doorbell in the previous
+		 * iteration if the interrupt came right after we cleared it but
+		 * before we read the stat register.
+		 */
+		dev_dbg(dev, "Couldn't find the doorbell (%u) for the Tx interrupt interrupt\n",
+			priv->ch_wn_idx);
+		return IRQ_NONE;
+	}
+
+	return IRQ_HANDLED;
+}
+
+static struct mbox_chan *get_irq_chan_comb_rx(struct mhuv2 *mhu)
+{
+	struct mhuv2_mbox_chan_priv *priv;
+	struct mbox_chan *chan;
+	u32 stat;
+
+	chan = get_irq_chan_comb(mhu, mhu->recv->chcomb_int_st);
+	if (IS_ERR(chan))
+		return chan;
+
+	priv = chan->con_priv;
+	if (!IS_PROTOCOL_DOORBELL(priv))
+		return chan;
+
+	/*
+	 * In case of doorbell mode, the first channel of the window is returned
+	 * by the routine. Find the exact channel here.
+	 */
+	stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked);
+	BUG_ON(!stat);
+
+	return chan + __builtin_ctz(stat);
+}
+
+static struct mbox_chan *get_irq_chan_stat_rx(struct mhuv2 *mhu)
+{
+	struct mbox_chan *chans = mhu->mbox.chans;
+	struct mhuv2_mbox_chan_priv *priv;
+	u32 stat;
+	int i = 0;
+
+	while (i < mhu->mbox.num_chans) {
+		priv = chans[i].con_priv;
+		stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked);
+
+		if (stat) {
+			if (IS_PROTOCOL_DOORBELL(priv))
+				i += __builtin_ctz(stat);
+			return &chans[i];
+		}
+
+		i += IS_PROTOCOL_DOORBELL(priv) ? MHUV2_STAT_BITS : 1;
+	}
+
+	return ERR_PTR(-EIO);
+}
+
+static struct mbox_chan *get_irq_chan_rx(struct mhuv2 *mhu)
+{
+	if (!mhu->minor)
+		return get_irq_chan_stat_rx(mhu);
+
+	return get_irq_chan_comb_rx(mhu);
+}
+
+static irqreturn_t mhuv2_receiver_interrupt(int irq, void *arg)
+{
+	struct mhuv2 *mhu = arg;
+	struct mbox_chan *chan = get_irq_chan_rx(mhu);
+	struct device *dev = mhu->mbox.dev;
+	struct mhuv2_mbox_chan_priv *priv;
+	int ret = IRQ_NONE;
+	void *data;
+
+	if (IS_ERR(chan)) {
+		dev_warn(dev, "Failed to find channel for the rx interrupt\n");
+		return IRQ_NONE;
+	}
+	priv = chan->con_priv;
+
+	/* Read and clear the data first */
+	data = priv->ops->read_data(mhu, chan);
+
+	if (!chan->cl) {
+		dev_warn(dev, "Received data on channel (%u) not currently attached to a mailbox client\n",
+			 priv->ch_wn_idx);
+	} else if (IS_ERR(data)) {
+		dev_err(dev, "Failed to read data: %lu\n", PTR_ERR(data));
+	} else {
+		mbox_chan_received_data(chan, data);
+		ret = IRQ_HANDLED;
+	}
+
+	kfree(data);
+	return ret;
+}
+
+/* Sender and receiver ops */
+static bool mhuv2_sender_last_tx_done(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	return priv->ops->last_tx_done(mhu, chan);
+}
+
+static int mhuv2_sender_send_data(struct mbox_chan *chan, void *data)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if (!priv->ops->last_tx_done(mhu, chan))
+		return -EBUSY;
+
+	return priv->ops->send_data(mhu, chan, data);
+}
+
+static int mhuv2_sender_startup(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if (priv->ops->tx_startup)
+		priv->ops->tx_startup(mhu, chan);
+	return 0;
+}
+
+static void mhuv2_sender_shutdown(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if (priv->ops->tx_shutdown)
+		priv->ops->tx_shutdown(mhu, chan);
+}
+
+static const struct mbox_chan_ops mhuv2_sender_ops = {
+	.send_data = mhuv2_sender_send_data,
+	.startup = mhuv2_sender_startup,
+	.shutdown = mhuv2_sender_shutdown,
+	.last_tx_done = mhuv2_sender_last_tx_done,
+};
+
+static int mhuv2_receiver_startup(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	return priv->ops->rx_startup(mhu, chan);
+}
+
+static void mhuv2_receiver_shutdown(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	priv->ops->rx_shutdown(mhu, chan);
+}
+
+static int mhuv2_receiver_send_data(struct mbox_chan *chan, void *data)
+{
+	dev_err(chan->mbox->dev,
+		"Trying to transmit on a receiver MHU frame\n");
+	return -EIO;
+}
+
+static bool mhuv2_receiver_last_tx_done(struct mbox_chan *chan)
+{
+	dev_err(chan->mbox->dev, "Trying to Tx poll on a receiver MHU frame\n");
+	return true;
+}
+
+static const struct mbox_chan_ops mhuv2_receiver_ops = {
+	.send_data = mhuv2_receiver_send_data,
+	.startup = mhuv2_receiver_startup,
+	.shutdown = mhuv2_receiver_shutdown,
+	.last_tx_done = mhuv2_receiver_last_tx_done,
+};
+
+static struct mbox_chan *mhuv2_mbox_of_xlate(struct mbox_controller *mbox,
+					     const struct of_phandle_args *pa)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(mbox);
+	struct mbox_chan *chans = mbox->chans;
+	int channel = 0, i, offset, doorbell, protocol, windows;
+
+	if (pa->args_count != 2)
+		return ERR_PTR(-EINVAL);
+
+	offset = pa->args[0];
+	doorbell = pa->args[1];
+	if (doorbell >= MHUV2_STAT_BITS)
+		goto out;
+
+	for (i = 0; i < mhu->length; i += 2) {
+		protocol = mhu->protocols[i];
+		windows = mhu->protocols[i + 1];
+
+		if (protocol == DOORBELL) {
+			if (offset < windows)
+				return &chans[channel + MHUV2_STAT_BITS * offset + doorbell];
+
+			channel += MHUV2_STAT_BITS * windows;
+			offset -= windows;
+		} else {
+			if (offset == 0) {
+				if (doorbell)
+					goto out;
+
+				return &chans[channel];
+			}
+
+			channel++;
+			offset--;
+		}
+	}
+
+out:
+	dev_err(mbox->dev, "Couldn't xlate to a valid channel (%d: %d)\n",
+		pa->args[0], doorbell);
+	return ERR_PTR(-ENODEV);
+}
+
+static int mhuv2_verify_protocol(struct mhuv2 *mhu)
+{
+	struct device *dev = mhu->mbox.dev;
+	int protocol, windows, channels = 0, total_windows = 0, i;
+
+	for (i = 0; i < mhu->length; i += 2) {
+		protocol = mhu->protocols[i];
+		windows = mhu->protocols[i + 1];
+
+		if (!windows) {
+			dev_err(dev, "Window size can't be zero (%d)\n", i);
+			return -EINVAL;
+		}
+		total_windows += windows;
+
+		if (protocol == DOORBELL) {
+			channels += MHUV2_STAT_BITS * windows;
+		} else if (protocol == DATA_TRANSFER) {
+			channels++;
+		} else {
+			dev_err(dev, "Invalid protocol (%d) present in %s property at index %d\n",
+				protocol, MHUV2_PROTOCOL_PROP, i);
+			return -EINVAL;
+		}
+	}
+
+	if (total_windows > mhu->windows) {
+		dev_err(dev, "Channel windows can't be more than what's implemented by the hardware ( %d: %d)\n",
+			total_windows, mhu->windows);
+		return -EINVAL;
+	}
+
+	mhu->mbox.num_chans = channels;
+	return 0;
+}
+
+static int mhuv2_allocate_channels(struct mhuv2 *mhu)
+{
+	struct mbox_controller *mbox = &mhu->mbox;
+	struct mhuv2_mbox_chan_priv *priv;
+	struct device *dev = mbox->dev;
+	struct mbox_chan *chans;
+	int protocol, windows = 0, next_window = 0, i, j, k;
+
+	chans = devm_kcalloc(dev, mbox->num_chans, sizeof(*chans), GFP_KERNEL);
+	if (!chans)
+		return -ENOMEM;
+
+	mbox->chans = chans;
+
+	for (i = 0; i < mhu->length; i += 2) {
+		next_window += windows;
+
+		protocol = mhu->protocols[i];
+		windows = mhu->protocols[i + 1];
+
+		if (protocol == DATA_TRANSFER) {
+			priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL);
+			if (!priv)
+				return -ENOMEM;
+
+			priv->ch_wn_idx = next_window;
+			priv->ops = &mhuv2_data_transfer_ops;
+			priv->windows = windows;
+			chans++->con_priv = priv;
+			continue;
+		}
+
+		for (j = 0; j < windows; j++) {
+			for (k = 0; k < MHUV2_STAT_BITS; k++) {
+				priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL);
+				if (!priv)
+					return -ENOMEM;
+
+				priv->ch_wn_idx = next_window + j;
+				priv->ops = &mhuv2_doorbell_ops;
+				priv->doorbell = k;
+				chans++->con_priv = priv;
+			}
+
+			/*
+			 * Permanently enable interrupt as we can't
+			 * control it per doorbell.
+			 */
+			if (mhu->frame == SENDER_FRAME && mhu->minor)
+				writel_relaxed(0x1, &mhu->send->ch_wn[priv->ch_wn_idx].int_en);
+		}
+	}
+
+	/* Make sure we have initialized all channels */
+	BUG_ON(chans - mbox->chans != mbox->num_chans);
+
+	return 0;
+}
+
+static int mhuv2_parse_channels(struct mhuv2 *mhu)
+{
+	struct device *dev = mhu->mbox.dev;
+	const struct device_node *np = dev->of_node;
+	int ret, count;
+	u32 *protocols;
+
+	count = of_property_count_u32_elems(np, MHUV2_PROTOCOL_PROP);
+	if (count <= 0 || count % 2) {
+		dev_err(dev, "Invalid %s property (%d)\n", MHUV2_PROTOCOL_PROP,
+			count);
+		return -EINVAL;
+	}
+
+	protocols = devm_kmalloc_array(dev, count, sizeof(*protocols), GFP_KERNEL);
+	if (!protocols)
+		return -ENOMEM;
+
+	ret = of_property_read_u32_array(np, MHUV2_PROTOCOL_PROP, protocols, count);
+	if (ret) {
+		dev_err(dev, "Failed to read %s property: %d\n",
+			MHUV2_PROTOCOL_PROP, ret);
+		return ret;
+	}
+
+	mhu->protocols = protocols;
+	mhu->length = count;
+
+	ret = mhuv2_verify_protocol(mhu);
+	if (ret)
+		return ret;
+
+	return mhuv2_allocate_channels(mhu);
+}
+
+static int mhuv2_tx_init(struct amba_device *adev, struct mhuv2 *mhu,
+			 void __iomem *reg)
+{
+	struct device *dev = mhu->mbox.dev;
+	int ret, i;
+
+	mhu->frame = SENDER_FRAME;
+	mhu->mbox.ops = &mhuv2_sender_ops;
+	mhu->send = reg;
+
+	mhu->windows = readl_relaxed_bitfield(&mhu->send->mhu_cfg, num_ch);
+	mhu->minor = readl_relaxed_bitfield(&mhu->send->aidr, arch_minor_rev);
+
+	spin_lock_init(&mhu->doorbell_pending_lock);
+
+	/*
+	 * For minor version 1 and forward, tx interrupt is provided by
+	 * the controller.
+	 */
+	if (mhu->minor && adev->irq[0]) {
+		ret = devm_request_threaded_irq(dev, adev->irq[0], NULL,
+						mhuv2_sender_interrupt,
+						IRQF_ONESHOT, "mhuv2-tx", mhu);
+		if (ret) {
+			dev_err(dev, "Failed to request tx IRQ, fallback to polling mode: %d\n",
+				ret);
+		} else {
+			mhu->mbox.txdone_irq = true;
+			mhu->mbox.txdone_poll = false;
+			mhu->irq = adev->irq[0];
+
+			writel_relaxed_bitfield(1, &mhu->send->int_en, chcomb);
+
+			/* Disable all channel interrupts */
+			for (i = 0; i < mhu->windows; i++)
+				writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en);
+
+			goto out;
+		}
+	}
+
+	mhu->mbox.txdone_irq = false;
+	mhu->mbox.txdone_poll = true;
+	mhu->mbox.txpoll_period = 1;
+
+out:
+	/* Wait for receiver to be ready */
+	writel_relaxed(0x1, &mhu->send->access_request);
+	while (!readl_relaxed(&mhu->send->access_ready))
+		continue;
+
+	return 0;
+}
+
+static int mhuv2_rx_init(struct amba_device *adev, struct mhuv2 *mhu,
+			 void __iomem *reg)
+{
+	struct device *dev = mhu->mbox.dev;
+	int ret, i;
+
+	mhu->frame = RECEIVER_FRAME;
+	mhu->mbox.ops = &mhuv2_receiver_ops;
+	mhu->recv = reg;
+
+	mhu->windows = readl_relaxed_bitfield(&mhu->recv->mhu_cfg, num_ch);
+	mhu->minor = readl_relaxed_bitfield(&mhu->recv->aidr, arch_minor_rev);
+
+	mhu->irq = adev->irq[0];
+	if (!mhu->irq) {
+		dev_err(dev, "Missing receiver IRQ\n");
+		return -EINVAL;
+	}
+
+	ret = devm_request_threaded_irq(dev, mhu->irq, NULL,
+					mhuv2_receiver_interrupt, IRQF_ONESHOT,
+					"mhuv2-rx", mhu);
+	if (ret) {
+		dev_err(dev, "Failed to request rx IRQ\n");
+		return ret;
+	}
+
+	/* Mask all the channel windows */
+	for (i = 0; i < mhu->windows; i++)
+		writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set);
+
+	if (mhu->minor)
+		writel_relaxed_bitfield(1, &mhu->recv->int_en, chcomb);
+
+	return 0;
+}
+
+static int mhuv2_probe(struct amba_device *adev, const struct amba_id *id)
+{
+	struct device *dev = &adev->dev;
+	const struct device_node *np = dev->of_node;
+	struct mhuv2 *mhu;
+	void __iomem *reg;
+	int ret = -EINVAL;
+
+	reg = devm_of_iomap(dev, dev->of_node, 0, NULL);
+	if (!reg)
+		return -ENOMEM;
+
+	mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL);
+	if (!mhu)
+		return -ENOMEM;
+
+	mhu->mbox.dev = dev;
+	mhu->mbox.of_xlate = mhuv2_mbox_of_xlate;
+
+	if (of_device_is_compatible(np, "arm,mhuv2-tx"))
+		ret = mhuv2_tx_init(adev, mhu, reg);
+	else if (of_device_is_compatible(np, "arm,mhuv2-rx"))
+		ret = mhuv2_rx_init(adev, mhu, reg);
+	else
+		dev_err(dev, "Invalid compatible property\n");
+
+	if (ret)
+		return ret;
+
+	/* Channel windows can't be 0 */
+	BUG_ON(!mhu->windows);
+
+	ret = mhuv2_parse_channels(mhu);
+	if (ret)
+		return ret;
+
+	amba_set_drvdata(adev, mhu);
+
+	ret = devm_mbox_controller_register(dev, &mhu->mbox);
+	if (ret)
+		dev_err(dev, "failed to register ARM MHUv2 driver %d\n", ret);
+
+	return ret;
+}
+
+static int mhuv2_remove(struct amba_device *adev)
+{
+	struct mhuv2 *mhu = amba_get_drvdata(adev);
+
+	if (mhu->frame == SENDER_FRAME)
+		writel_relaxed(0x0, &mhu->send->access_request);
+
+	return 0;
+}
+
+static struct amba_id mhuv2_ids[] = {
+	{
+		/* 2.0 */
+		.id = 0xbb0d1,
+		.mask = 0xfffff,
+	},
+	{
+		/* 2.1 */
+		.id = 0xbb076,
+		.mask = 0xfffff,
+	},
+	{ 0, 0 },
+};
+MODULE_DEVICE_TABLE(amba, mhuv2_ids);
+
+static struct amba_driver mhuv2_driver = {
+	.drv = {
+		.name	= "arm-mhuv2",
+	},
+	.id_table	= mhuv2_ids,
+	.probe		= mhuv2_probe,
+	.remove		= mhuv2_remove,
+};
+module_amba_driver(mhuv2_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ARM MHUv2 Driver");
+MODULE_AUTHOR("Viresh Kumar <viresh.kumar@linaro.org>");
+MODULE_AUTHOR("Tushar Khandelwal <tushar.khandelwal@arm.com>");
diff --git a/include/linux/mailbox/arm_mhuv2_message.h b/include/linux/mailbox/arm_mhuv2_message.h
new file mode 100644
index 000000000000..821b9d96daa4
--- /dev/null
+++ b/include/linux/mailbox/arm_mhuv2_message.h
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM MHUv2 Mailbox Message
+ *
+ * Copyright (C) 2020 Arm Ltd.
+ * Copyright (C) 2020 Linaro Ltd.
+ */
+
+#ifndef _LINUX_ARM_MHUV2_MESSAGE_H_
+#define _LINUX_ARM_MHUV2_MESSAGE_H_
+
+#include <linux/types.h>
+
+/* Data structure for data-transfer protocol */
+struct arm_mhuv2_mbox_msg {
+	void *data;
+	size_t len;
+};
+
+#endif /* _LINUX_ARM_MHUV2_MESSAGE_H_ */

From 92de5fa2dc39c3fba0704f7ac914e7f02eb732f2 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Fri, 13 Nov 2020 15:55:05 -0700
Subject: [PATCH 117/230] dmaengine: idxd: add ATS disable knob for work queues

With the DSA spec 1.1 update, a knob to disable ATS for individually is
introduced. Add enabling code to allow a system admin to make the
configuration through sysfs.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/160530810593.1288392.2561048329116529566.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../ABI/stable/sysfs-driver-dma-idxd          |  7 ++++
 drivers/dma/idxd/device.c                     |  4 +++
 drivers/dma/idxd/idxd.h                       |  1 +
 drivers/dma/idxd/registers.h                  |  5 +--
 drivers/dma/idxd/sysfs.c                      | 34 +++++++++++++++++++
 5 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd
index 5ea81ffd3c1a..55285c136cf0 100644
--- a/Documentation/ABI/stable/sysfs-driver-dma-idxd
+++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd
@@ -204,6 +204,13 @@ Contact:	dmaengine@vger.kernel.org
 Description:	The max batch size for this workqueue. Cannot exceed device
 		max batch size. Configurable parameter.
 
+What:		/sys/bus/dsa/devices/wq<m>.<n>/ats_disable
+Date:		Nov 13, 2020
+KernelVersion:	5.11.0
+Contact:	dmaengine@vger.kernel.org
+Description:	Indicate whether ATS disable is turned on for the workqueue.
+		0 indicates ATS is on, and 1 indicates ATS is off for the workqueue.
+
 What:           /sys/bus/dsa/devices/engine<m>.<n>/group_id
 Date:           Oct 25, 2019
 KernelVersion:  5.6.0
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index d6f551dcbcb6..b75f9a09666e 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -354,6 +354,7 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq)
 	wq->group = NULL;
 	wq->threshold = 0;
 	wq->priority = 0;
+	wq->ats_dis = 0;
 	clear_bit(WQ_FLAG_DEDICATED, &wq->flags);
 	memset(wq->name, 0, WQ_NAME_SIZE);
 
@@ -631,6 +632,9 @@ static int idxd_wq_config_write(struct idxd_wq *wq)
 	    test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags))
 		wq->wqcfg->bof = 1;
 
+	if (idxd->hw.wq_cap.wq_ats_support)
+		wq->wqcfg->wq_ats_disable = wq->ats_dis;
+
 	/* bytes 12-15 */
 	wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes);
 	wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size);
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 7e54209c433a..149934f8d097 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -123,6 +123,7 @@ struct idxd_wq {
 	char name[WQ_NAME_SIZE + 1];
 	u64 max_xfer_bytes;
 	u32 max_batch_size;
+	bool ats_dis;
 };
 
 struct idxd_engine {
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index d29a58ee2651..0cdc5405bc53 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -47,7 +47,7 @@ union wq_cap_reg {
 		u64 rsvd:20;
 		u64 shared_mode:1;
 		u64 dedicated_mode:1;
-		u64 rsvd2:1;
+		u64 wq_ats_support:1;
 		u64 priority:1;
 		u64 occupancy:1;
 		u64 occupancy_int:1;
@@ -303,7 +303,8 @@ union wqcfg {
 		/* bytes 8-11 */
 		u32 mode:1;	/* shared or dedicated */
 		u32 bof:1;	/* block on fault */
-		u32 rsvd2:2;
+		u32 wq_ats_disable:1;
+		u32 rsvd2:1;
 		u32 priority:4;
 		u32 pasid:20;
 		u32 pasid_en:1;
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 6d292eb79bf3..3af83f1fd36e 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -1261,6 +1261,39 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu
 static struct device_attribute dev_attr_wq_max_batch_size =
 		__ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store);
 
+static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+
+	return sprintf(buf, "%u\n", wq->ats_dis);
+}
+
+static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr,
+				    const char *buf, size_t count)
+{
+	struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev);
+	struct idxd_device *idxd = wq->idxd;
+	bool ats_dis;
+	int rc;
+
+	if (wq->state != IDXD_WQ_DISABLED)
+		return -EPERM;
+
+	if (!idxd->hw.wq_cap.wq_ats_support)
+		return -EOPNOTSUPP;
+
+	rc = kstrtobool(buf, &ats_dis);
+	if (rc < 0)
+		return rc;
+
+	wq->ats_dis = ats_dis;
+
+	return count;
+}
+
+static struct device_attribute dev_attr_wq_ats_disable =
+		__ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store);
+
 static struct attribute *idxd_wq_attributes[] = {
 	&dev_attr_wq_clients.attr,
 	&dev_attr_wq_state.attr,
@@ -1275,6 +1308,7 @@ static struct attribute *idxd_wq_attributes[] = {
 	&dev_attr_wq_cdev_minor.attr,
 	&dev_attr_wq_max_transfer_size.attr,
 	&dev_attr_wq_max_batch_size.attr,
+	&dev_attr_wq_ats_disable.attr,
 	NULL,
 };
 

From fa94a951bf3553fee66e2c879b97da97bcf00589 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 3 Dec 2020 12:04:51 -0500
Subject: [PATCH 118/230] NFSv4.2: Fix up the get/listxattr calls to
 rpc_prepare_reply_pages()

Ensure that both getxattr and listxattr page array are correctly
aligned, and that getxattr correctly accounts for the page padding word.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 8432bd6b95f0..103978ff76c9 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -191,7 +191,7 @@
 
 #define encode_getxattr_maxsz   (op_encode_hdr_maxsz + 1 + \
 				 nfs4_xattr_name_maxsz)
-#define decode_getxattr_maxsz   (op_decode_hdr_maxsz + 1 + 1)
+#define decode_getxattr_maxsz   (op_decode_hdr_maxsz + 1 + pagepad_maxsz)
 #define encode_setxattr_maxsz   (op_encode_hdr_maxsz + \
 				 1 + nfs4_xattr_name_maxsz + 1)
 #define decode_setxattr_maxsz   (op_decode_hdr_maxsz + decode_change_info_maxsz)
@@ -1476,17 +1476,18 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
+	uint32_t replen;
 	size_t plen;
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
+	replen = hdr.replen + op_decode_hdr_maxsz + 1;
 	encode_getxattr(xdr, args->xattr_name, &hdr);
 
 	plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
 
-	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen,
-	    hdr.replen);
+	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, replen);
 	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
 
 	encode_nops(&hdr);
@@ -1520,14 +1521,15 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req,
 	struct compound_hdr hdr = {
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
+	uint32_t replen;
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
+	replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1;
 	encode_listxattrs(xdr, args, &hdr);
 
-	rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count,
-	    hdr.replen);
+	rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen);
 
 	encode_nops(&hdr);
 }

From 4facce4c12638c5636cbe9d4b8a4a879bebe8570 Mon Sep 17 00:00:00 2001
From: Jonathan McDowell <noodles@earth.li>
Date: Thu, 26 Nov 2020 18:46:02 +0000
Subject: [PATCH 119/230] dmaengine: qcom: Fix ADM driver kerneldoc markup

Update the kerneldoc function headers to fix build warnings:

drivers/dma/qcom/qcom_adm.c:180: warning: Function parameter or member 'chan' not described in 'adm_free_chan'
drivers/dma/qcom/qcom_adm.c:190: warning: Function parameter or member 'burst' not described in 'adm_get_blksize'
drivers/dma/qcom/qcom_adm.c:466: warning: Function parameter or member 'chan' not described in 'adm_terminate_all'
drivers/dma/qcom/qcom_adm.c:466: warning: Excess function parameter 'achan' description in 'adm_terminate_all'
drivers/dma/qcom/qcom_adm.c:503: warning: Function parameter or member 'achan' not described in 'adm_start_dma'

Signed-off-by: Jonathan McDowell <noodles@earth.li>
Link: https://lore.kernel.org/r/20201126184602.GA1008@earth.li
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/qcom/qcom_adm.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/qcom/qcom_adm.c b/drivers/dma/qcom/qcom_adm.c
index 9b6f8e050ecc..ee78bed8d60d 100644
--- a/drivers/dma/qcom/qcom_adm.c
+++ b/drivers/dma/qcom/qcom_adm.c
@@ -173,8 +173,9 @@ struct adm_device {
 /**
  * adm_free_chan - Frees dma resources associated with the specific channel
  *
- * Free all allocated descriptors associated with this channel
+ * @chan: dma channel
  *
+ * Free all allocated descriptors associated with this channel
  */
 static void adm_free_chan(struct dma_chan *chan)
 {
@@ -185,6 +186,7 @@ static void adm_free_chan(struct dma_chan *chan)
 /**
  * adm_get_blksize - Get block size from burst value
  *
+ * @burst: Burst size of transaction
  */
 static int adm_get_blksize(unsigned int burst)
 {
@@ -456,7 +458,7 @@ free:
 
 /**
  * adm_terminate_all - terminate all transactions on a channel
- * @achan: adm dma channel
+ * @chan: dma channel
  *
  * Dequeues and frees all transactions, aborts current transaction
  * No callbacks are done
@@ -497,7 +499,7 @@ static int adm_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg)
 
 /**
  * adm_start_dma - start next transaction
- * @achan - ADM dma channel
+ * @achan: ADM dma channel
  */
 static void adm_start_dma(struct adm_chan *achan)
 {

From 51b69c9679de9bcb45b846807d75bab7ce9c6fda Mon Sep 17 00:00:00 2001
From: Fabien Parent <fparent@baylibre.com>
Date: Wed, 9 Dec 2020 12:47:35 +0100
Subject: [PATCH 120/230] dt-bindings: dma: mtk-apdma: add bindings for MT8516
 SOC

Add bindings to APDMA for MT8516 SoC. MT8516 is compatible with MT6577.

Signed-off-by: Fabien Parent <fparent@baylibre.com>
Reviewed-by: Matthias Brugger <matthias.bgg@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201209114736.70625-1-fparent@baylibre.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
index 2117db0ce4f2..fef9c1eeb264 100644
--- a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
+++ b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
@@ -4,6 +4,7 @@ Required properties:
 - compatible should contain:
   * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
   * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
+  * "mediatek,mt8516-uart-dma", "mediatek,mt6577" for MT8516 SoC
 
 - reg: The base address of the APDMA register bank.
 

From f25b463883a8a2d1b7303a63339c0d589fc94f1e Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 17 Nov 2020 13:39:14 -0700
Subject: [PATCH 121/230] dmaengine: idxd: add IAX configuration support in the
 IDXD driver

Add support to allow configuration of Intel Analytics Accelerator (IAX) in
addition to the Intel Data Streaming Accelerator (DSA). The IAX hardware
has the same configuration interface as DSA. The main difference
is the type of operations it performs. We can support the DSA and
IAX devices on the same driver with some tweaks.

IAX has a 64B completion record that needs to be 64B aligned, as opposed to
a 32B completion record that is 32B aligned for DSA. IAX also does not
support token management.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/160564555488.1834439.4261958859935360473.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/idxd/cdev.c      |  1 +
 drivers/dma/idxd/device.c    | 37 +++++++++++++----
 drivers/dma/idxd/idxd.h      | 24 +++++++++--
 drivers/dma/idxd/init.c      | 14 +++++++
 drivers/dma/idxd/registers.h |  1 +
 drivers/dma/idxd/submit.c    |  2 +-
 drivers/dma/idxd/sysfs.c     | 46 +++++++++++++++++++--
 include/uapi/linux/idxd.h    | 79 ++++++++++++++++++++++++++++++++++++
 8 files changed, 187 insertions(+), 17 deletions(-)

diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c
index 010b820d8f74..0db9b82ed8cf 100644
--- a/drivers/dma/idxd/cdev.c
+++ b/drivers/dma/idxd/cdev.c
@@ -28,6 +28,7 @@ struct idxd_cdev_context {
  */
 static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = {
 	{ .name = "dsa" },
+	{ .name = "iax" }
 };
 
 struct idxd_user_context {
diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c
index b75f9a09666e..47ff8e387172 100644
--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -131,6 +131,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
 	struct idxd_device *idxd = wq->idxd;
 	struct device *dev = &idxd->pdev->dev;
 	int rc, num_descs, i;
+	int align;
+	u64 tmp;
 
 	if (wq->type != IDXD_WQT_KERNEL)
 		return 0;
@@ -142,14 +144,27 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
 	if (rc < 0)
 		return rc;
 
-	wq->compls_size = num_descs * sizeof(struct dsa_completion_record);
-	wq->compls = dma_alloc_coherent(dev, wq->compls_size,
-					&wq->compls_addr, GFP_KERNEL);
-	if (!wq->compls) {
+	if (idxd->type == IDXD_TYPE_DSA)
+		align = 32;
+	else if (idxd->type == IDXD_TYPE_IAX)
+		align = 64;
+	else
+		return -ENODEV;
+
+	wq->compls_size = num_descs * idxd->compl_size + align;
+	wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size,
+					    &wq->compls_addr_raw, GFP_KERNEL);
+	if (!wq->compls_raw) {
 		rc = -ENOMEM;
 		goto fail_alloc_compls;
 	}
 
+	/* Adjust alignment */
+	wq->compls_addr = (wq->compls_addr_raw + (align - 1)) & ~(align - 1);
+	tmp = (u64)wq->compls_raw;
+	tmp = (tmp + (align - 1)) & ~(align - 1);
+	wq->compls = (struct dsa_completion_record *)tmp;
+
 	rc = alloc_descs(wq, num_descs);
 	if (rc < 0)
 		goto fail_alloc_descs;
@@ -163,9 +178,11 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
 		struct idxd_desc *desc = wq->descs[i];
 
 		desc->hw = wq->hw_descs[i];
-		desc->completion = &wq->compls[i];
-		desc->compl_dma  = wq->compls_addr +
-			sizeof(struct dsa_completion_record) * i;
+		if (idxd->type == IDXD_TYPE_DSA)
+			desc->completion = &wq->compls[i];
+		else if (idxd->type == IDXD_TYPE_IAX)
+			desc->iax_completion = &wq->iax_compls[i];
+		desc->compl_dma = wq->compls_addr + idxd->compl_size * i;
 		desc->id = i;
 		desc->wq = wq;
 		desc->cpu = -1;
@@ -178,7 +195,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq)
  fail_sbitmap_init:
 	free_descs(wq);
  fail_alloc_descs:
-	dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr);
+	dma_free_coherent(dev, wq->compls_size, wq->compls_raw,
+			  wq->compls_addr_raw);
  fail_alloc_compls:
 	free_hw_descs(wq);
 	return rc;
@@ -193,7 +211,8 @@ void idxd_wq_free_resources(struct idxd_wq *wq)
 
 	free_hw_descs(wq);
 	free_descs(wq);
-	dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr);
+	dma_free_coherent(dev, wq->compls_size, wq->compls_raw,
+			  wq->compls_addr_raw);
 	sbitmap_queue_free(&wq->sbq);
 }
 
diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h
index 149934f8d097..5a50e91c71bf 100644
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -20,7 +20,8 @@ extern struct kmem_cache *idxd_desc_pool;
 enum idxd_type {
 	IDXD_TYPE_UNKNOWN = -1,
 	IDXD_TYPE_DSA = 0,
-	IDXD_TYPE_MAX
+	IDXD_TYPE_IAX,
+	IDXD_TYPE_MAX,
 };
 
 #define IDXD_NAME_SIZE		128
@@ -114,8 +115,13 @@ struct idxd_wq {
 	u32 vec_ptr;		/* interrupt steering */
 	struct dsa_hw_desc **hw_descs;
 	int num_descs;
-	struct dsa_completion_record *compls;
+	union {
+		struct dsa_completion_record *compls;
+		struct iax_completion_record *iax_compls;
+	};
+	void *compls_raw;
 	dma_addr_t compls_addr;
+	dma_addr_t compls_addr_raw;
 	int compls_size;
 	struct idxd_desc **descs;
 	struct sbitmap_queue sbq;
@@ -196,6 +202,7 @@ struct idxd_device {
 	int token_limit;
 	int nr_tokens;		/* non-reserved tokens */
 	unsigned int wqcfg_size;
+	int compl_size;
 
 	union sw_err_reg sw_err;
 	wait_queue_head_t cmd_waitq;
@@ -210,9 +217,15 @@ struct idxd_device {
 
 /* IDXD software descriptor */
 struct idxd_desc {
-	struct dsa_hw_desc *hw;
+	union {
+		struct dsa_hw_desc *hw;
+		struct iax_hw_desc *iax_hw;
+	};
 	dma_addr_t desc_dma;
-	struct dsa_completion_record *completion;
+	union {
+		struct dsa_completion_record *completion;
+		struct iax_completion_record *iax_completion;
+	};
 	dma_addr_t compl_dma;
 	struct dma_async_tx_descriptor txd;
 	struct llist_node llnode;
@@ -226,6 +239,7 @@ struct idxd_desc {
 #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev)
 
 extern struct bus_type dsa_bus_type;
+extern struct bus_type iax_bus_type;
 
 extern bool support_enqcmd;
 
@@ -271,6 +285,8 @@ static inline void idxd_set_type(struct idxd_device *idxd)
 
 	if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0)
 		idxd->type = IDXD_TYPE_DSA;
+	else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0)
+		idxd->type = IDXD_TYPE_IAX;
 	else
 		idxd->type = IDXD_TYPE_UNKNOWN;
 }
diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c
index 45b0eac640c3..2c051e07c34c 100644
--- a/drivers/dma/idxd/init.c
+++ b/drivers/dma/idxd/init.c
@@ -36,12 +36,16 @@ static struct mutex idxd_idr_lock;
 static struct pci_device_id idxd_pci_tbl[] = {
 	/* DSA ver 1.0 platforms */
 	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) },
+
+	/* IAX ver 1.0 platforms */
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) },
 	{ 0, }
 };
 MODULE_DEVICE_TABLE(pci, idxd_pci_tbl);
 
 static char *idxd_name[] = {
 	"dsa",
+	"iax"
 };
 
 const char *idxd_get_dev_name(struct idxd_device *idxd)
@@ -377,6 +381,14 @@ static int idxd_probe(struct idxd_device *idxd)
 	return rc;
 }
 
+static void idxd_type_init(struct idxd_device *idxd)
+{
+	if (idxd->type == IDXD_TYPE_DSA)
+		idxd->compl_size = sizeof(struct dsa_completion_record);
+	else if (idxd->type == IDXD_TYPE_IAX)
+		idxd->compl_size = sizeof(struct iax_completion_record);
+}
+
 static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct device *dev = &pdev->dev;
@@ -412,6 +424,8 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	idxd_set_type(idxd);
 
+	idxd_type_init(idxd);
+
 	dev_dbg(dev, "Set PCI master\n");
 	pci_set_master(pdev);
 	pci_set_drvdata(pdev, idxd);
diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 0cdc5405bc53..23c41fe52215 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -5,6 +5,7 @@
 
 /* PCI Config */
 #define PCI_DEVICE_ID_INTEL_DSA_SPR0	0x0b25
+#define PCI_DEVICE_ID_INTEL_IAX_SPR0	0x0cfe
 
 #define IDXD_MMIO_BAR		0
 #define IDXD_WQ_BAR		2
diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c
index efca5d8468a6..0ff64eeb84be 100644
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -15,7 +15,7 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu)
 
 	desc = wq->descs[idx];
 	memset(desc->hw, 0, sizeof(struct dsa_hw_desc));
-	memset(desc->completion, 0, sizeof(struct dsa_completion_record));
+	memset(desc->completion, 0, idxd->compl_size);
 	desc->cpu = cpu;
 
 	if (device_pasid_enabled(idxd))
diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c
index 3af83f1fd36e..266423a2cabc 100644
--- a/drivers/dma/idxd/sysfs.c
+++ b/drivers/dma/idxd/sysfs.c
@@ -41,14 +41,24 @@ static struct device_type dsa_device_type = {
 	.release = idxd_conf_device_release,
 };
 
+static struct device_type iax_device_type = {
+	.name = "iax",
+	.release = idxd_conf_device_release,
+};
+
 static inline bool is_dsa_dev(struct device *dev)
 {
 	return dev ? dev->type == &dsa_device_type : false;
 }
 
+static inline bool is_iax_dev(struct device *dev)
+{
+	return dev ? dev->type == &iax_device_type : false;
+}
+
 static inline bool is_idxd_dev(struct device *dev)
 {
-	return is_dsa_dev(dev);
+	return is_dsa_dev(dev) || is_iax_dev(dev);
 }
 
 static inline bool is_idxd_wq_dev(struct device *dev)
@@ -359,8 +369,17 @@ struct bus_type dsa_bus_type = {
 	.shutdown = idxd_config_bus_shutdown,
 };
 
+struct bus_type iax_bus_type = {
+	.name = "iax",
+	.match = idxd_config_bus_match,
+	.probe = idxd_config_bus_probe,
+	.remove = idxd_config_bus_remove,
+	.shutdown = idxd_config_bus_shutdown,
+};
+
 static struct bus_type *idxd_bus_types[] = {
-	&dsa_bus_type
+	&dsa_bus_type,
+	&iax_bus_type
 };
 
 static struct idxd_device_driver dsa_drv = {
@@ -372,8 +391,18 @@ static struct idxd_device_driver dsa_drv = {
 	},
 };
 
+static struct idxd_device_driver iax_drv = {
+	.drv = {
+		.name = "iax",
+		.bus = &iax_bus_type,
+		.owner = THIS_MODULE,
+		.mod_name = KBUILD_MODNAME,
+	},
+};
+
 static struct idxd_device_driver *idxd_drvs[] = {
-	&dsa_drv
+	&dsa_drv,
+	&iax_drv
 };
 
 struct bus_type *idxd_get_bus_type(struct idxd_device *idxd)
@@ -385,6 +414,8 @@ static struct device_type *idxd_get_device_type(struct idxd_device *idxd)
 {
 	if (idxd->type == IDXD_TYPE_DSA)
 		return &dsa_device_type;
+	else if (idxd->type == IDXD_TYPE_IAX)
+		return &iax_device_type;
 	else
 		return NULL;
 }
@@ -525,6 +556,9 @@ static ssize_t group_tokens_reserved_store(struct device *dev,
 	if (rc < 0)
 		return -EINVAL;
 
+	if (idxd->type == IDXD_TYPE_IAX)
+		return -EOPNOTSUPP;
+
 	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
 		return -EPERM;
 
@@ -570,6 +604,9 @@ static ssize_t group_tokens_allowed_store(struct device *dev,
 	if (rc < 0)
 		return -EINVAL;
 
+	if (idxd->type == IDXD_TYPE_IAX)
+		return -EOPNOTSUPP;
+
 	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
 		return -EPERM;
 
@@ -612,6 +649,9 @@ static ssize_t group_use_token_limit_store(struct device *dev,
 	if (rc < 0)
 		return -EINVAL;
 
+	if (idxd->type == IDXD_TYPE_IAX)
+		return -EOPNOTSUPP;
+
 	if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags))
 		return -EPERM;
 
diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index fdcdfe414223..236d437947bc 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -26,6 +26,9 @@
 #define IDXD_OP_FLAG_DRDBK	0x4000
 #define IDXD_OP_FLAG_DSTS	0x8000
 
+/* IAX */
+#define IDXD_OP_FLAG_RD_SRC2_AECS	0x010000
+
 /* Opcode */
 enum dsa_opcode {
 	DSA_OPCODE_NOOP = 0,
@@ -47,6 +50,14 @@ enum dsa_opcode {
 	DSA_OPCODE_CFLUSH = 0x20,
 };
 
+enum iax_opcode {
+	IAX_OPCODE_NOOP = 0,
+	IAX_OPCODE_DRAIN = 2,
+	IAX_OPCODE_MEMMOVE,
+	IAX_OPCODE_DECOMPRESS = 0x42,
+	IAX_OPCODE_COMPRESS,
+};
+
 /* Completion record status */
 enum dsa_completion_status {
 	DSA_COMP_NONE = 0,
@@ -80,6 +91,33 @@ enum dsa_completion_status {
 	DSA_COMP_TRANSLATION_FAIL,
 };
 
+enum iax_completion_status {
+	IAX_COMP_NONE = 0,
+	IAX_COMP_SUCCESS,
+	IAX_COMP_PAGE_FAULT_IR = 0x04,
+	IAX_COMP_OUTBUF_OVERFLOW,
+	IAX_COMP_BAD_OPCODE = 0x10,
+	IAX_COMP_INVALID_FLAGS,
+	IAX_COMP_NOZERO_RESERVE,
+	IAX_COMP_INVALID_SIZE,
+	IAX_COMP_OVERLAP_BUFFERS = 0x16,
+	IAX_COMP_INT_HANDLE_INVAL = 0x19,
+	IAX_COMP_CRA_XLAT,
+	IAX_COMP_CRA_ALIGN,
+	IAX_COMP_ADDR_ALIGN,
+	IAX_COMP_PRIV_BAD,
+	IAX_COMP_TRAFFIC_CLASS_CONF,
+	IAX_COMP_PFAULT_RDBA,
+	IAX_COMP_HW_ERR1,
+	IAX_COMP_HW_ERR_DRB,
+	IAX_COMP_TRANSLATION_FAIL,
+	IAX_COMP_PRS_TIMEOUT,
+	IAX_COMP_WATCHDOG,
+	IAX_COMP_INVALID_COMP_FLAG = 0x30,
+	IAX_COMP_INVALID_FILTER_FLAG,
+	IAX_COMP_INVALID_NUM_ELEMS = 0x33,
+};
+
 #define DSA_COMP_STATUS_MASK		0x7f
 #define DSA_COMP_STATUS_WRITE		0x80
 
@@ -163,6 +201,28 @@ struct dsa_hw_desc {
 	};
 } __attribute__((packed));
 
+struct iax_hw_desc {
+	uint32_t        pasid:20;
+	uint32_t        rsvd:11;
+	uint32_t        priv:1;
+	uint32_t        flags:24;
+	uint32_t        opcode:8;
+	uint64_t        completion_addr;
+	uint64_t        src1_addr;
+	uint64_t        dst_addr;
+	uint32_t        src1_size;
+	uint16_t        int_handle;
+	union {
+		uint16_t        compr_flags;
+		uint16_t        decompr_flags;
+	};
+	uint64_t        src2_addr;
+	uint32_t        max_dst_size;
+	uint32_t        src2_size;
+	uint32_t	filter_flags;
+	uint32_t	num_inputs;
+} __attribute__((packed));
+
 struct dsa_raw_desc {
 	uint64_t	field[8];
 } __attribute__((packed));
@@ -223,4 +283,23 @@ struct dsa_raw_completion_record {
 	uint64_t	field[4];
 } __attribute__((packed));
 
+struct iax_completion_record {
+	volatile uint8_t        status;
+	uint8_t                 error_code;
+	uint16_t                rsvd;
+	uint32_t                bytes_completed;
+	uint64_t                fault_addr;
+	uint32_t                invalid_flags;
+	uint32_t                rsvd2;
+	uint32_t                output_size;
+	uint8_t                 output_bits;
+	uint8_t                 rsvd3;
+	uint16_t                rsvd4;
+	uint64_t                rsvd5[4];
+} __attribute__((packed));
+
+struct iax_raw_completion_record {
+	uint64_t	field[8];
+} __attribute__((packed));
+
 #endif

From 4421fe533296e070359573ab6d320d74f73c80b9 Mon Sep 17 00:00:00 2001
From: Parth Y Shah <sparth1292@gmail.com>
Date: Mon, 7 Dec 2020 12:03:40 +0530
Subject: [PATCH 122/230] dmaengine: bam_dma: fix return of bam_dma_irq()

While performing suspend/resume, we were getting below kernel crash.

[   54.541672] [FTS][Info]gesture suspend...
[   54.605256] [FTS][Error][GESTURE]Enter into gesture(suspend) failed!
[   54.605256]
[   58.345850] irq event 10: bogus return value fffffff3
......

[   58.345966] [<ffff0000080830f0>] el1_irq+0xb0/0x124
[   58.345971] [<ffff000008085360>] arch_cpu_idle+0x10/0x18
[   58.345975] [<ffff0000081077f4>] do_idle+0x1ac/0x1e0
[   58.345979] [<ffff0000081079c8>] cpu_startup_entry+0x20/0x28
[   58.345983] [<ffff000008a80ed0>] rest_init+0xd0/0xdc
[   58.345988] [<ffff0000091c0b48>] start_kernel+0x390/0x3a4
[   58.345990] handlers:
[   58.345994] [<ffff0000085120d0>] bam_dma_irq

The reason for the crash we found is, bam_dma_irq() was returning
negative value when the device resumes in some conditions.

In addition, the irq handler should have one of the below return values.

IRQ_NONE            interrupt was not from this device or was not handled
IRQ_HANDLED         interrupt was handled by this device
IRQ_WAKE_THREAD     handler requests to wake the handler thread

Therefore, to resolve this crash, we have changed the return value to
IRQ_NONE.

Signed-off-by: Parth Y Shah <sparth1292@gmail.com>
Link: https://lore.kernel.org/r/1607322820-7450-1-git-send-email-sparth1292@gmail.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/qcom/bam_dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 4eeb8bb27279..d5773d474d8f 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -875,7 +875,7 @@ static irqreturn_t bam_dma_irq(int irq, void *data)
 
 	ret = bam_pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
-		return ret;
+		return IRQ_NONE;
 
 	if (srcs & BAM_IRQ) {
 		clr_mask = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_STTS));

From a44d9d72453ea6b064380d4835e712e574e58d9b Mon Sep 17 00:00:00 2001
From: Amelie Delaunay <amelie.delaunay@st.com>
Date: Fri, 20 Nov 2020 15:33:17 +0100
Subject: [PATCH 123/230] dmaengine: stm32-dma: rework irq handler to manage
 error before xfer events

To better understand error that can be detected by the DMA controller,
manage the error flags before the transfer flags.
This way, it is possible to know if the FIFO error flag is set for an
over/underrun condition or a FIFO level error.
When a FIFO over/underrun condition occurs, the data is not lost because
peripheral request is not acknowledged by the stream until the over/
underrun condition is cleared. If this acknowledge takes too much time,
the peripheral itself may detect an over/underrun condition of its internal
buffer and data might be lost.
That's why in case the FIFO error flag is set, we check if the channel is
disabled or not, and if a Transfer Complete flag is set, which means that
the channel is disabled because of the end of transfer.
Because channel is disabled by hardware either by a FIFO level error, or by
an end of transfer.

Signed-off-by: Amelie Delaunay <amelie.delaunay@st.com>
Link: https://lore.kernel.org/r/20201120143320.30367-2-amelie.delaunay@st.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-dma.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index d0055d2f0b9a..55a6bd381219 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -648,21 +648,12 @@ static irqreturn_t stm32_dma_chan_irq(int irq, void *devid)
 	scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
 	sfcr = stm32_dma_read(dmadev, STM32_DMA_SFCR(chan->id));
 
-	if (status & STM32_DMA_TCI) {
-		stm32_dma_irq_clear(chan, STM32_DMA_TCI);
-		if (scr & STM32_DMA_SCR_TCIE)
-			stm32_dma_handle_chan_done(chan);
-		status &= ~STM32_DMA_TCI;
-	}
-	if (status & STM32_DMA_HTI) {
-		stm32_dma_irq_clear(chan, STM32_DMA_HTI);
-		status &= ~STM32_DMA_HTI;
-	}
 	if (status & STM32_DMA_FEI) {
 		stm32_dma_irq_clear(chan, STM32_DMA_FEI);
 		status &= ~STM32_DMA_FEI;
 		if (sfcr & STM32_DMA_SFCR_FEIE) {
-			if (!(scr & STM32_DMA_SCR_EN))
+			if (!(scr & STM32_DMA_SCR_EN) &&
+			    !(status & STM32_DMA_TCI))
 				dev_err(chan2dev(chan), "FIFO Error\n");
 			else
 				dev_dbg(chan2dev(chan), "FIFO over/underrun\n");
@@ -674,6 +665,19 @@ static irqreturn_t stm32_dma_chan_irq(int irq, void *devid)
 		if (sfcr & STM32_DMA_SCR_DMEIE)
 			dev_dbg(chan2dev(chan), "Direct mode overrun\n");
 	}
+
+	if (status & STM32_DMA_TCI) {
+		stm32_dma_irq_clear(chan, STM32_DMA_TCI);
+		if (scr & STM32_DMA_SCR_TCIE)
+			stm32_dma_handle_chan_done(chan);
+		status &= ~STM32_DMA_TCI;
+	}
+
+	if (status & STM32_DMA_HTI) {
+		stm32_dma_irq_clear(chan, STM32_DMA_HTI);
+		status &= ~STM32_DMA_HTI;
+	}
+
 	if (status) {
 		stm32_dma_irq_clear(chan, status);
 		dev_err(chan2dev(chan), "DMA error: status=0x%08x\n", status);

From 5d4d4dfbda18063231a95dea28fdeab148f23301 Mon Sep 17 00:00:00 2001
From: Amelie Delaunay <amelie.delaunay@st.com>
Date: Fri, 20 Nov 2020 15:33:18 +0100
Subject: [PATCH 124/230] dmaengine: stm32-dma: clean channel configuration
 when channel is freed

When dma_channel_release is called, it means that the channel won't be used
anymore with the configuration it had. To ensure a future client can safely
use the channel after it has been released, clean the configuration done
when channel was requested.

Signed-off-by: Amelie Delaunay <amelie.delaunay@st.com>
Link: https://lore.kernel.org/r/20201120143320.30367-3-amelie.delaunay@st.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-dma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 55a6bd381219..62501e5d9e9d 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -1220,6 +1220,8 @@ static void stm32_dma_free_chan_resources(struct dma_chan *c)
 	pm_runtime_put(dmadev->ddev.dev);
 
 	vchan_free_chan_resources(to_virt_chan(c));
+	stm32_dma_clear_reg(&chan->chan_reg);
+	chan->threshold = 0;
 }
 
 static void stm32_dma_desc_free(struct virt_dma_desc *vdesc)

From e0ebdbdcb42a66f49b7587dc50cc6f528ec55cad Mon Sep 17 00:00:00 2001
From: Amelie Delaunay <amelie.delaunay@st.com>
Date: Fri, 20 Nov 2020 15:33:19 +0100
Subject: [PATCH 125/230] dmaengine: stm32-dma: take address into account when
 computing max width

DMA_SxPAR or DMA_SxM0AR/M1AR registers have to be aligned on PSIZE or MSIZE
respectively. This means that bus width needs to be forced to 1 byte when
computed width is not aligned with address.

Signed-off-by: Amelie Delaunay <amelie.delaunay@st.com>
Link: https://lore.kernel.org/r/20201120143320.30367-4-amelie.delaunay@st.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-dma.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 62501e5d9e9d..f54ecb123a52 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -264,9 +264,11 @@ static int stm32_dma_get_width(struct stm32_dma_chan *chan,
 }
 
 static enum dma_slave_buswidth stm32_dma_get_max_width(u32 buf_len,
+						       dma_addr_t buf_addr,
 						       u32 threshold)
 {
 	enum dma_slave_buswidth max_width;
+	u64 addr = buf_addr;
 
 	if (threshold == STM32_DMA_FIFO_THRESHOLD_FULL)
 		max_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
@@ -277,6 +279,9 @@ static enum dma_slave_buswidth stm32_dma_get_max_width(u32 buf_len,
 	       max_width > DMA_SLAVE_BUSWIDTH_1_BYTE)
 		max_width = max_width >> 1;
 
+	if (do_div(addr, max_width))
+		max_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
+
 	return max_width;
 }
 
@@ -707,7 +712,7 @@ static void stm32_dma_issue_pending(struct dma_chan *c)
 static int stm32_dma_set_xfer_param(struct stm32_dma_chan *chan,
 				    enum dma_transfer_direction direction,
 				    enum dma_slave_buswidth *buswidth,
-				    u32 buf_len)
+				    u32 buf_len, dma_addr_t buf_addr)
 {
 	enum dma_slave_buswidth src_addr_width, dst_addr_width;
 	int src_bus_width, dst_bus_width;
@@ -739,7 +744,8 @@ static int stm32_dma_set_xfer_param(struct stm32_dma_chan *chan,
 			return dst_burst_size;
 
 		/* Set memory data size */
-		src_addr_width = stm32_dma_get_max_width(buf_len, fifoth);
+		src_addr_width = stm32_dma_get_max_width(buf_len, buf_addr,
+							 fifoth);
 		chan->mem_width = src_addr_width;
 		src_bus_width = stm32_dma_get_width(chan, src_addr_width);
 		if (src_bus_width < 0)
@@ -788,7 +794,8 @@ static int stm32_dma_set_xfer_param(struct stm32_dma_chan *chan,
 			return src_burst_size;
 
 		/* Set memory data size */
-		dst_addr_width = stm32_dma_get_max_width(buf_len, fifoth);
+		dst_addr_width = stm32_dma_get_max_width(buf_len, buf_addr,
+							 fifoth);
 		chan->mem_width = dst_addr_width;
 		dst_bus_width = stm32_dma_get_width(chan, dst_addr_width);
 		if (dst_bus_width < 0)
@@ -876,7 +883,8 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_slave_sg(
 
 	for_each_sg(sgl, sg, sg_len, i) {
 		ret = stm32_dma_set_xfer_param(chan, direction, &buswidth,
-					       sg_dma_len(sg));
+					       sg_dma_len(sg),
+					       sg_dma_address(sg));
 		if (ret < 0)
 			goto err;
 
@@ -944,7 +952,8 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_dma_cyclic(
 		return NULL;
 	}
 
-	ret = stm32_dma_set_xfer_param(chan, direction, &buswidth, period_len);
+	ret = stm32_dma_set_xfer_param(chan, direction, &buswidth, period_len,
+				       buf_addr);
 	if (ret < 0)
 		return NULL;
 

From 1d3dd68749b9f4a4da272f39608d03b4bae0b69f Mon Sep 17 00:00:00 2001
From: Amelie Delaunay <amelie.delaunay@st.com>
Date: Fri, 20 Nov 2020 15:33:20 +0100
Subject: [PATCH 126/230] dmaengine: stm32-mdma: rework interrupt handler

To avoid multiple entries in MDMA interrupt handler for each flag&interrupt
enable, manage all flags set at once.

Signed-off-by: Amelie Delaunay <amelie.delaunay@st.com>
Link: https://lore.kernel.org/r/20201120143320.30367-5-amelie.delaunay@st.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/stm32-mdma.c | 64 +++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c
index 29e5e30524bb..e4637ec786d3 100644
--- a/drivers/dma/stm32-mdma.c
+++ b/drivers/dma/stm32-mdma.c
@@ -1346,7 +1346,7 @@ static irqreturn_t stm32_mdma_irq_handler(int irq, void *devid)
 {
 	struct stm32_mdma_device *dmadev = devid;
 	struct stm32_mdma_chan *chan = devid;
-	u32 reg, id, ien, status, flag;
+	u32 reg, id, ccr, ien, status;
 
 	/* Find out which channel generates the interrupt */
 	status = readl_relaxed(dmadev->base + STM32_MDMA_GISR0);
@@ -1368,67 +1368,71 @@ static irqreturn_t stm32_mdma_irq_handler(int irq, void *devid)
 
 	chan = &dmadev->chan[id];
 	if (!chan) {
-		dev_dbg(mdma2dev(dmadev), "MDMA channel not initialized\n");
-		goto exit;
+		dev_warn(mdma2dev(dmadev), "MDMA channel not initialized\n");
+		return IRQ_NONE;
 	}
 
 	/* Handle interrupt for the channel */
 	spin_lock(&chan->vchan.lock);
-	status = stm32_mdma_read(dmadev, STM32_MDMA_CISR(chan->id));
-	ien = stm32_mdma_read(dmadev, STM32_MDMA_CCR(chan->id));
-	ien &= STM32_MDMA_CCR_IRQ_MASK;
-	ien >>= 1;
+	status = stm32_mdma_read(dmadev, STM32_MDMA_CISR(id));
+	/* Mask Channel ReQuest Active bit which can be set in case of MEM2MEM */
+	status &= ~STM32_MDMA_CISR_CRQA;
+	ccr = stm32_mdma_read(dmadev, STM32_MDMA_CCR(id));
+	ien = (ccr & STM32_MDMA_CCR_IRQ_MASK) >> 1;
 
 	if (!(status & ien)) {
 		spin_unlock(&chan->vchan.lock);
-		dev_dbg(chan2dev(chan),
-			"spurious it (status=0x%04x, ien=0x%04x)\n",
-			status, ien);
+		dev_warn(chan2dev(chan),
+			 "spurious it (status=0x%04x, ien=0x%04x)\n",
+			 status, ien);
 		return IRQ_NONE;
 	}
 
-	flag = __ffs(status & ien);
-	reg = STM32_MDMA_CIFCR(chan->id);
+	reg = STM32_MDMA_CIFCR(id);
 
-	switch (1 << flag) {
-	case STM32_MDMA_CISR_TEIF:
-		id = chan->id;
-		status = readl_relaxed(dmadev->base + STM32_MDMA_CESR(id));
-		dev_err(chan2dev(chan), "Transfer Err: stat=0x%08x\n", status);
+	if (status & STM32_MDMA_CISR_TEIF) {
+		dev_err(chan2dev(chan), "Transfer Err: stat=0x%08x\n",
+			readl_relaxed(dmadev->base + STM32_MDMA_CESR(id)));
 		stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CTEIF);
-		break;
+		status &= ~STM32_MDMA_CISR_TEIF;
+	}
 
-	case STM32_MDMA_CISR_CTCIF:
+	if (status & STM32_MDMA_CISR_CTCIF) {
 		stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CCTCIF);
+		status &= ~STM32_MDMA_CISR_CTCIF;
 		stm32_mdma_xfer_end(chan);
-		break;
+	}
 
-	case STM32_MDMA_CISR_BRTIF:
+	if (status & STM32_MDMA_CISR_BRTIF) {
 		stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CBRTIF);
-		break;
+		status &= ~STM32_MDMA_CISR_BRTIF;
+	}
 
-	case STM32_MDMA_CISR_BTIF:
+	if (status & STM32_MDMA_CISR_BTIF) {
 		stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CBTIF);
+		status &= ~STM32_MDMA_CISR_BTIF;
 		chan->curr_hwdesc++;
 		if (chan->desc && chan->desc->cyclic) {
 			if (chan->curr_hwdesc == chan->desc->count)
 				chan->curr_hwdesc = 0;
 			vchan_cyclic_callback(&chan->desc->vdesc);
 		}
-		break;
+	}
 
-	case STM32_MDMA_CISR_TCIF:
+	if (status & STM32_MDMA_CISR_TCIF) {
 		stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CLTCIF);
-		break;
+		status &= ~STM32_MDMA_CISR_TCIF;
+	}
 
-	default:
-		dev_err(chan2dev(chan), "it %d unhandled (status=0x%04x)\n",
-			1 << flag, status);
+	if (status) {
+		stm32_mdma_set_bits(dmadev, reg, status);
+		dev_err(chan2dev(chan), "DMA error: status=0x%08x\n", status);
+		if (!(ccr & STM32_MDMA_CCR_EN))
+			dev_err(chan2dev(chan), "chan disabled by HW\n");
 	}
 
 	spin_unlock(&chan->vchan.lock);
 
-exit:
 	return IRQ_HANDLED;
 }
 

From e2de925bbfe321ba0588c99f577c59386ab1f428 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:21 +0200
Subject: [PATCH 127/230] dmaengine: ti: k3-udma: Correct normal channel offset
 when uchan_cnt is not 0

According to different sections of the TRM, the hchan_cnt of CAP3 includes
the number of uchan in UDMA, thus the start offset of the normal channels
are hchan_cnt.

Fixes: daf4ad0499aa4 ("dmaengine: ti: k3-udma: Query throughput level information from hardware")
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-2-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index e508280b3d70..0e8426dd18a7 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -3199,8 +3199,7 @@ static int udma_setup_resources(struct udma_dev *ud)
 	} else if (UDMA_CAP3_UCHAN_CNT(cap3)) {
 		ud->tpl_levels = 3;
 		ud->tpl_start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3);
-		ud->tpl_start_idx[0] = ud->tpl_start_idx[1] +
-				       UDMA_CAP3_HCHAN_CNT(cap3);
+		ud->tpl_start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
 	} else if (UDMA_CAP3_HCHAN_CNT(cap3)) {
 		ud->tpl_levels = 2;
 		ud->tpl_start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);

From 5e1cb1cb0f9fe670900d736822a7dbcd7c11dbba Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:22 +0200
Subject: [PATCH 128/230] dmaengine: ti: k3-udma: Wait for peer teardown
 completion if supported

Set the TDTYPE if it is supported on the platform (j721e) which will cause
UDMAP to wait for the remote peer to finish the teardown before returning
the teardown completed message.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Tested-by: Keerthy <j-keerthy@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-3-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 0e8426dd18a7..eee43757e774 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -86,6 +86,7 @@ struct udma_rchan {
 
 #define UDMA_FLAG_PDMA_ACC32		BIT(0)
 #define UDMA_FLAG_PDMA_BURST		BIT(1)
+#define UDMA_FLAG_TDTYPE		BIT(2)
 
 struct udma_match_data {
 	u32 psil_base;
@@ -1589,6 +1590,13 @@ static int udma_tisci_tx_channel_config(struct udma_chan *uc)
 	req_tx.tx_fetch_size = fetch_size >> 2;
 	req_tx.txcq_qnum = tc_ring;
 	req_tx.tx_atype = uc->config.atype;
+	if (uc->config.ep_type == PSIL_EP_PDMA_XY &&
+	    ud->match_data->flags & UDMA_FLAG_TDTYPE) {
+		/* wait for peer to complete the teardown for PDMAs */
+		req_tx.valid_params |=
+				TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID;
+		req_tx.tx_tdtype = 1;
+	}
 
 	ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx);
 	if (ret)
@@ -3105,14 +3113,14 @@ static struct udma_match_data am654_mcu_data = {
 static struct udma_match_data j721e_main_data = {
 	.psil_base = 0x1000,
 	.enable_memcpy_support = true,
-	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST,
+	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
 	.statictr_z_mask = GENMASK(23, 0),
 };
 
 static struct udma_match_data j721e_mcu_data = {
 	.psil_base = 0x6000,
 	.enable_memcpy_support = false, /* MEM_TO_MEM is slow via MCU UDMA */
-	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST,
+	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
 	.statictr_z_mask = GENMASK(23, 0),
 };
 

From 1609c15a20b8e0c1adc2a26ad81fd7e2b968f81a Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:23 +0200
Subject: [PATCH 129/230] dmaengine: ti: k3-udma: Add support for second
 resource range from sysfw

Resource allocation via sysfw can use up to two ranges per resource subtype
to support more complex resource assignment, mainly for DMA channels.

Take the second range also into consideration when setting up the maps for
available resources.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-4-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 55 ++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index eee43757e774..b89afa602532 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -3174,12 +3174,22 @@ static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud)
 	return 0;
 }
 
+static void udma_mark_resource_ranges(struct udma_dev *ud, unsigned long *map,
+				      struct ti_sci_resource_desc *rm_desc,
+				      char *name)
+{
+	bitmap_clear(map, rm_desc->start, rm_desc->num);
+	bitmap_clear(map, rm_desc->start_sec, rm_desc->num_sec);
+	dev_dbg(ud->dev, "ti_sci resource range for %s: %d:%d | %d:%d\n", name,
+		rm_desc->start, rm_desc->num, rm_desc->start_sec,
+		rm_desc->num_sec);
+}
+
 static int udma_setup_resources(struct udma_dev *ud)
 {
 	struct device *dev = ud->dev;
 	int ch_count, ret, i, j;
 	u32 cap2, cap3;
-	struct ti_sci_resource_desc *rm_desc;
 	struct ti_sci_resource *rm_res, irq_res;
 	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
 	static const char * const range_names[] = { "ti,sci-rm-range-tchan",
@@ -3264,13 +3274,9 @@ static int udma_setup_resources(struct udma_dev *ud)
 		bitmap_zero(ud->tchan_map, ud->tchan_cnt);
 	} else {
 		bitmap_fill(ud->tchan_map, ud->tchan_cnt);
-		for (i = 0; i < rm_res->sets; i++) {
-			rm_desc = &rm_res->desc[i];
-			bitmap_clear(ud->tchan_map, rm_desc->start,
-				     rm_desc->num);
-			dev_dbg(dev, "ti-sci-res: tchan: %d:%d\n",
-				rm_desc->start, rm_desc->num);
-		}
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->tchan_map,
+						  &rm_res->desc[i], "tchan");
 	}
 	irq_res.sets = rm_res->sets;
 
@@ -3280,13 +3286,9 @@ static int udma_setup_resources(struct udma_dev *ud)
 		bitmap_zero(ud->rchan_map, ud->rchan_cnt);
 	} else {
 		bitmap_fill(ud->rchan_map, ud->rchan_cnt);
-		for (i = 0; i < rm_res->sets; i++) {
-			rm_desc = &rm_res->desc[i];
-			bitmap_clear(ud->rchan_map, rm_desc->start,
-				     rm_desc->num);
-			dev_dbg(dev, "ti-sci-res: rchan: %d:%d\n",
-				rm_desc->start, rm_desc->num);
-		}
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->rchan_map,
+						  &rm_res->desc[i], "rchan");
 	}
 
 	irq_res.sets += rm_res->sets;
@@ -3295,12 +3297,21 @@ static int udma_setup_resources(struct udma_dev *ud)
 	for (i = 0; i < rm_res->sets; i++) {
 		irq_res.desc[i].start = rm_res->desc[i].start;
 		irq_res.desc[i].num = rm_res->desc[i].num;
+		irq_res.desc[i].start_sec = rm_res->desc[i].start_sec;
+		irq_res.desc[i].num_sec = rm_res->desc[i].num_sec;
 	}
 	rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN];
 	for (j = 0; j < rm_res->sets; j++, i++) {
-		irq_res.desc[i].start = rm_res->desc[j].start +
+		if (rm_res->desc[j].num) {
+			irq_res.desc[i].start = rm_res->desc[j].start +
 					ud->soc_data->rchan_oes_offset;
-		irq_res.desc[i].num = rm_res->desc[j].num;
+			irq_res.desc[i].num = rm_res->desc[j].num;
+		}
+		if (rm_res->desc[j].num_sec) {
+			irq_res.desc[i].start_sec = rm_res->desc[j].start_sec +
+					ud->soc_data->rchan_oes_offset;
+			irq_res.desc[i].num_sec = rm_res->desc[j].num_sec;
+		}
 	}
 	ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res);
 	kfree(irq_res.desc);
@@ -3316,13 +3327,9 @@ static int udma_setup_resources(struct udma_dev *ud)
 		bitmap_clear(ud->rflow_gp_map, ud->rchan_cnt,
 			     ud->rflow_cnt - ud->rchan_cnt);
 	} else {
-		for (i = 0; i < rm_res->sets; i++) {
-			rm_desc = &rm_res->desc[i];
-			bitmap_clear(ud->rflow_gp_map, rm_desc->start,
-				     rm_desc->num);
-			dev_dbg(dev, "ti-sci-res: rflow: %d:%d\n",
-				rm_desc->start, rm_desc->num);
-		}
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->rflow_gp_map,
+						  &rm_res->desc[i], "gp-rflow");
 	}
 
 	ch_count -= bitmap_weight(ud->tchan_map, ud->tchan_cnt);

From 426506a7e0f1902268c3edbdc7e5475624a9d18b Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:24 +0200
Subject: [PATCH 130/230] dmaengine: ti: k3-udma-glue: Add function to get
 device pointer for DMA API

Glue layer users should use the device of the DMA for DMA mapping and
allocations as it is the DMA which accesses to descriptors and buffers,
not the clients

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-5-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-glue.c    | 14 ++++++++++++++
 drivers/dma/ti/k3-udma-private.c |  6 ++++++
 drivers/dma/ti/k3-udma.h         |  1 +
 include/linux/dma/k3-udma-glue.h |  4 ++++
 4 files changed, 25 insertions(+)

diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c
index dfb65e382ab9..29d1524d1916 100644
--- a/drivers/dma/ti/k3-udma-glue.c
+++ b/drivers/dma/ti/k3-udma-glue.c
@@ -493,6 +493,13 @@ int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn)
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq);
 
+struct device *
+	k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn)
+{
+	return xudma_get_device(tx_chn->common.udmax);
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device);
+
 static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 {
 	const struct udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm;
@@ -1201,3 +1208,10 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn,
 	return flow->virq;
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq);
+
+struct device *
+	k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn)
+{
+	return xudma_get_device(rx_chn->common.udmax);
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device);
diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c
index aa24e554f7b4..8ff7a264be03 100644
--- a/drivers/dma/ti/k3-udma-private.c
+++ b/drivers/dma/ti/k3-udma-private.c
@@ -50,6 +50,12 @@ struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property)
 }
 EXPORT_SYMBOL(of_xudma_dev_get);
 
+struct device *xudma_get_device(struct udma_dev *ud)
+{
+	return ud->dev;
+}
+EXPORT_SYMBOL(xudma_get_device);
+
 u32 xudma_dev_get_psil_base(struct udma_dev *ud)
 {
 	return ud->psil_base;
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index 09c4529e013d..d1cace0cb43b 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -112,6 +112,7 @@ int xudma_navss_psil_unpair(struct udma_dev *ud, u32 src_thread,
 			    u32 dst_thread);
 
 struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property);
+struct device *xudma_get_device(struct udma_dev *ud);
 void xudma_dev_put(struct udma_dev *ud);
 u32 xudma_dev_get_psil_base(struct udma_dev *ud);
 struct udma_tisci_rm *xudma_dev_get_tisci_rm(struct udma_dev *ud);
diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h
index 5eb34ad973a7..d7c12f31377c 100644
--- a/include/linux/dma/k3-udma-glue.h
+++ b/include/linux/dma/k3-udma-glue.h
@@ -41,6 +41,8 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn,
 u32 k3_udma_glue_tx_get_hdesc_size(struct k3_udma_glue_tx_channel *tx_chn);
 u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn);
 int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn);
+struct device *
+	k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn);
 
 enum {
 	K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0,
@@ -130,5 +132,7 @@ int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn,
 				u32 flow_idx);
 int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn,
 				 u32 flow_idx);
+struct device *
+	k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn);
 
 #endif /* K3_UDMA_GLUE_H_ */

From aa8a4c4edad0bed7aaf3a7cfcae9fa555d847955 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:25 +0200
Subject: [PATCH 131/230] dmaengine: ti: k3-udma-glue: Get the ringacc from
 udma_dev

If of_xudma_dev_get() returns with the valid udma_dev then the driver
already got the ringacc, there is no need to execute
of_k3_ringacc_get_by_phandle() for each channel via the glue layer.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-6-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-glue.c    | 6 +-----
 drivers/dma/ti/k3-udma-private.c | 6 ++++++
 drivers/dma/ti/k3-udma.h         | 1 +
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c
index 29d1524d1916..8a8988be4175 100644
--- a/drivers/dma/ti/k3-udma-glue.c
+++ b/drivers/dma/ti/k3-udma-glue.c
@@ -86,15 +86,11 @@ struct k3_udma_glue_rx_channel {
 static int of_k3_udma_glue_parse(struct device_node *udmax_np,
 				 struct k3_udma_glue_common *common)
 {
-	common->ringacc = of_k3_ringacc_get_by_phandle(udmax_np,
-						       "ti,ringacc");
-	if (IS_ERR(common->ringacc))
-		return PTR_ERR(common->ringacc);
-
 	common->udmax = of_xudma_dev_get(udmax_np, NULL);
 	if (IS_ERR(common->udmax))
 		return PTR_ERR(common->udmax);
 
+	common->ringacc = xudma_get_ringacc(common->udmax);
 	common->tisci_rm = xudma_dev_get_tisci_rm(common->udmax);
 
 	return 0;
diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c
index 8ff7a264be03..346a4dd9640a 100644
--- a/drivers/dma/ti/k3-udma-private.c
+++ b/drivers/dma/ti/k3-udma-private.c
@@ -56,6 +56,12 @@ struct device *xudma_get_device(struct udma_dev *ud)
 }
 EXPORT_SYMBOL(xudma_get_device);
 
+struct k3_ringacc *xudma_get_ringacc(struct udma_dev *ud)
+{
+	return ud->ringacc;
+}
+EXPORT_SYMBOL(xudma_get_ringacc);
+
 u32 xudma_dev_get_psil_base(struct udma_dev *ud)
 {
 	return ud->psil_base;
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index d1cace0cb43b..b4334b1b7b14 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -113,6 +113,7 @@ int xudma_navss_psil_unpair(struct udma_dev *ud, u32 src_thread,
 
 struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property);
 struct device *xudma_get_device(struct udma_dev *ud);
+struct k3_ringacc *xudma_get_ringacc(struct udma_dev *ud);
 void xudma_dev_put(struct udma_dev *ud);
 u32 xudma_dev_get_psil_base(struct udma_dev *ud);
 struct udma_tisci_rm *xudma_dev_get_tisci_rm(struct udma_dev *ud);

From d553e2ab0137ae489b41824b1e8283053c363ed1 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:26 +0200
Subject: [PATCH 132/230] dmaengine: ti: k3-udma-glue: Configure the dma_dev
 for rings

Rings in RING mode should be using the DMA device for DMA API as in this
mode the ringacc will not access the ring memory in any ways, but the DMA
is.

Fix up the ring configuration and set the dma_dev unconditionally and let
the ringacc driver to select the correct device to use for DMA API.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-7-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-glue.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c
index 8a8988be4175..e6ebcd98c02a 100644
--- a/drivers/dma/ti/k3-udma-glue.c
+++ b/drivers/dma/ti/k3-udma-glue.c
@@ -276,6 +276,10 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev,
 		goto err;
 	}
 
+	/* Set the dma_dev for the rings to be configured */
+	cfg->tx_cfg.dma_dev = k3_udma_glue_tx_get_dma_device(tx_chn);
+	cfg->txcq_cfg.dma_dev = cfg->tx_cfg.dma_dev;
+
 	ret = k3_ringacc_ring_cfg(tx_chn->ringtx, &cfg->tx_cfg);
 	if (ret) {
 		dev_err(dev, "Failed to cfg ringtx %d\n", ret);
@@ -591,6 +595,10 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn,
 		goto err_rflow_put;
 	}
 
+	/* Set the dma_dev for the rings to be configured */
+	flow_cfg->rx_cfg.dma_dev = k3_udma_glue_rx_get_dma_device(rx_chn);
+	flow_cfg->rxfdq_cfg.dma_dev = flow_cfg->rx_cfg.dma_dev;
+
 	ret = k3_ringacc_ring_cfg(flow->ringrx, &flow_cfg->rx_cfg);
 	if (ret) {
 		dev_err(dev, "Failed to cfg ringrx %d\n", ret);

From 4f910c035f38053ac8eb63a672c78862c535cd0f Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:27 +0200
Subject: [PATCH 133/230] dmaengine: of-dma: Add support for optional router
 configuration callback

Additional configuration for the DMA event router might be needed for a
channel which can not be done during device_alloc_chan_resources callback
since the router information is not yet present for the drivers.

If there is a need for additional configuration for the channel if DMA
router is in use, then the driver can implement the device_router_config
callback.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-8-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/of-dma.c      | 10 ++++++++++
 include/linux/dmaengine.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c
index 8a4f608904b9..ec00b20ae8e4 100644
--- a/drivers/dma/of-dma.c
+++ b/drivers/dma/of-dma.c
@@ -75,8 +75,18 @@ static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec,
 		ofdma->dma_router->route_free(ofdma->dma_router->dev,
 					      route_data);
 	} else {
+		int ret = 0;
+
 		chan->router = ofdma->dma_router;
 		chan->route_data = route_data;
+
+		if (chan->device->device_router_config)
+			ret = chan->device->device_router_config(chan);
+
+		if (ret) {
+			dma_release_channel(chan);
+			chan = ERR_PTR(ret);
+		}
 	}
 
 	/*
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 493a047ed0a2..aed44888cad3 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -805,6 +805,7 @@ struct dma_filter {
  *	by tx_status
  * @device_alloc_chan_resources: allocate resources and return the
  *	number of allocated descriptors
+ * @device_router_config: optional callback for DMA router configuration
  * @device_free_chan_resources: release DMA channel's resources
  * @device_prep_dma_memcpy: prepares a memcpy operation
  * @device_prep_dma_xor: prepares a xor operation
@@ -879,6 +880,7 @@ struct dma_device {
 	enum dma_residue_granularity residue_granularity;
 
 	int (*device_alloc_chan_resources)(struct dma_chan *chan);
+	int (*device_router_config)(struct dma_chan *chan);
 	void (*device_free_chan_resources)(struct dma_chan *chan);
 
 	struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)(

From ab650ef6d548153862119e1bf3bf267510707f48 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:28 +0200
Subject: [PATCH 134/230] dmaengine: Add support for per channel coherency
 handling

If the DMA device supports per channel coherency configuration (a channel
can be configured to have coherent or not coherent view) then a single
device (the DMA controller's device) can not be used for dma_api for all
channels as channels can have different coherency.

Introduce custom_dma_mapping flag for the dma_chan and a new helper to get
the device pointer to be used for dma_api for the given channel.

Client drivers should be updated to be able to support per channel
coherency by:

- dma_map_single(chan->device->dev, ptr, size, DMA_TO_DEVICE);
+ struct device *dma_dev = dmaengine_get_dma_device(chan);
+
+ dma_map_single(dma_dev, ptr, size, DMA_TO_DEVICE);

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-9-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dmaengine.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index aed44888cad3..68130f5f599e 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -357,11 +357,14 @@ struct dma_chan {
  * @chan: driver channel device
  * @device: sysfs device
  * @dev_id: parent dma_device dev_id
+ * @chan_dma_dev: The channel is using custom/different dma-mapping
+ * compared to the parent dma_device
  */
 struct dma_chan_dev {
 	struct dma_chan *chan;
 	struct device device;
 	int dev_id;
+	bool chan_dma_dev;
 };
 
 /**
@@ -1618,4 +1621,13 @@ dmaengine_get_direction_text(enum dma_transfer_direction dir)
 		return "invalid";
 	}
 }
+
+static inline struct device *dmaengine_get_dma_device(struct dma_chan *chan)
+{
+	if (chan->dev->chan_dma_dev)
+		return &chan->dev->device;
+
+	return chan->device->dev;
+}
+
 #endif /* DMAENGINE_H */

From f082c6df970e6e9aa97af35e826fec824007fbae Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:29 +0200
Subject: [PATCH 135/230] dmaengine: doc: client: Update for
 dmaengine_get_dma_device() usage

Client drivers should use the dmaengine_get_dma_device(chan) to get the
device pointer which should be used for DMA API for allocations and
mapping.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-10-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 Documentation/driver-api/dmaengine/client.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst
index 09a3f66dcd26..bfd057b21a00 100644
--- a/Documentation/driver-api/dmaengine/client.rst
+++ b/Documentation/driver-api/dmaengine/client.rst
@@ -120,7 +120,9 @@ The details of these operations are:
 
   .. code-block:: c
 
-     nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len);
+     struct device *dma_dev = dmaengine_get_dma_device(chan);
+
+     nr_sg = dma_map_sg(dma_dev, sgl, sg_len);
 	if (nr_sg == 0)
 		/* error */
 

From adc0f941f9a8fae7e79b249b0abcdbf86017ea53 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:30 +0200
Subject: [PATCH 136/230] dmaengine: dmatest: Use dmaengine_get_dma_device

By using the dmaengine_get_dma_device() to get the device for
dma_api use, the dmatest can support per channel coherency if it is
supported by the DMA controller.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-11-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/dmatest.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index a3a172173e34..f696246f57fd 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -573,6 +573,7 @@ static int dmatest_func(void *data)
 	struct dmatest_params	*params;
 	struct dma_chan		*chan;
 	struct dma_device	*dev;
+	struct device		*dma_dev;
 	unsigned int		error_count;
 	unsigned int		failed_tests = 0;
 	unsigned int		total_tests = 0;
@@ -606,6 +607,8 @@ static int dmatest_func(void *data)
 	params = &info->params;
 	chan = thread->chan;
 	dev = chan->device;
+	dma_dev = dmaengine_get_dma_device(chan);
+
 	src = &thread->src;
 	dst = &thread->dst;
 	if (thread->type == DMA_MEMCPY) {
@@ -730,7 +733,7 @@ static int dmatest_func(void *data)
 			filltime = ktime_add(filltime, diff);
 		}
 
-		um = dmaengine_get_unmap_data(dev->dev, src->cnt + dst->cnt,
+		um = dmaengine_get_unmap_data(dma_dev, src->cnt + dst->cnt,
 					      GFP_KERNEL);
 		if (!um) {
 			failed_tests++;
@@ -745,10 +748,10 @@ static int dmatest_func(void *data)
 			struct page *pg = virt_to_page(buf);
 			unsigned long pg_off = offset_in_page(buf);
 
-			um->addr[i] = dma_map_page(dev->dev, pg, pg_off,
+			um->addr[i] = dma_map_page(dma_dev, pg, pg_off,
 						   um->len, DMA_TO_DEVICE);
 			srcs[i] = um->addr[i] + src->off;
-			ret = dma_mapping_error(dev->dev, um->addr[i]);
+			ret = dma_mapping_error(dma_dev, um->addr[i]);
 			if (ret) {
 				result("src mapping error", total_tests,
 				       src->off, dst->off, len, ret);
@@ -763,9 +766,9 @@ static int dmatest_func(void *data)
 			struct page *pg = virt_to_page(buf);
 			unsigned long pg_off = offset_in_page(buf);
 
-			dsts[i] = dma_map_page(dev->dev, pg, pg_off, um->len,
+			dsts[i] = dma_map_page(dma_dev, pg, pg_off, um->len,
 					       DMA_BIDIRECTIONAL);
-			ret = dma_mapping_error(dev->dev, dsts[i]);
+			ret = dma_mapping_error(dma_dev, dsts[i]);
 			if (ret) {
 				result("dst mapping error", total_tests,
 				       src->off, dst->off, len, ret);

From 991b96e0f12234e2a7e66d67275981b53572acd8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:31 +0200
Subject: [PATCH 137/230] dt-bindings: dma: ti: Add document for K3 BCDMA

New binding document for
Texas Instruments K3 Block Copy DMA (BCDMA).

BCDMA is introduced as part of AM64.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201208090440.31792-12-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/ti/k3-bcdma.yaml  | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml

diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml
new file mode 100644
index 000000000000..b15f68c499cb
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml
@@ -0,0 +1,164 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/ti/k3-bcdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments K3 DMSS BCDMA Device Tree Bindings
+
+maintainers:
+  - Peter Ujfalusi <peter.ujfalusi@ti.com>
+
+description: |
+  The Block Copy DMA (BCDMA) is intended to perform similar functions as the TR
+  mode channels of K3 UDMA-P.
+  BCDMA includes block copy channels and Split channels.
+
+  Block copy channels mainly used for memory to memory transfers, but with
+  optional triggers a block copy channel can service peripherals by accessing
+  directly to memory mapped registers or area.
+
+  Split channels can be used to service PSI-L based peripherals.
+  The peripherals can be PSI-L native or legacy, non PSI-L native peripherals
+  with PDMAs. PDMA is tasked to act as a bridge between the PSI-L fabric and the
+  legacy peripheral.
+
+  PDMAs can be configured via BCDMA split channel's peer registers to match with
+  the configuration of the legacy peripheral.
+
+allOf:
+  - $ref: /schemas/dma/dma-controller.yaml#
+
+properties:
+  compatible:
+    const: ti,am64-dmss-bcdma
+
+  "#dma-cells":
+    const: 3
+    description: |
+      cell 1: type of the BCDMA channel to be used to service the peripheral:
+        0 - split channel
+        1 - block copy channel using global trigger 1
+        2 - block copy channel using global trigger 2
+        3 - block copy channel using local trigger
+
+      cell 2: parameter for the channel:
+        if cell 1 is 0 (split channel):
+          PSI-L thread ID of the remote (to BCDMA) end.
+          Valid ranges for thread ID depends on the data movement direction:
+          for source thread IDs (rx): 0 - 0x7fff
+          for destination thread IDs (tx): 0x8000 - 0xffff
+
+          Please refer to the device documentation for the PSI-L thread map and
+          also the PSI-L peripheral chapter for the correct thread ID.
+        if cell 1 is 1 or 2 (block copy channel using global trigger):
+          Unused, ignored
+
+          The trigger must be configured for the channel externally to BCDMA,
+          channels using global triggers should not be requested directly, but
+          via DMA event router.
+        if cell 1 is 3 (block copy channel using local trigger):
+          bchan number of the locally triggered channel
+
+      cell 3: ASEL value for the channel
+
+  reg:
+    maxItems: 5
+
+  reg-names:
+    items:
+      - const: gcfg
+      - const: bchanrt
+      - const: rchanrt
+      - const: tchanrt
+      - const: ringrt
+
+  msi-parent: true
+
+  ti,asel:
+    $ref: /schemas/types.yaml#/definitions/uint32
+    description: ASEL value for non slave channels
+
+  ti,sci-rm-range-bchan:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of BCDMA block-copy channel resource subtypes for resource
+      allocation for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+  ti,sci-rm-range-tchan:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of BCDMA split tx channel resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+  ti,sci-rm-range-rchan:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of BCDMA split rx channel resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+required:
+  - compatible
+  - "#dma-cells"
+  - reg
+  - reg-names
+  - msi-parent
+  - ti,sci
+  - ti,sci-dev-id
+  - ti,sci-rm-range-bchan
+  - ti,sci-rm-range-tchan
+  - ti,sci-rm-range-rchan
+
+unevaluatedProperties: false
+
+examples:
+  - |+
+    cbass_main {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        main_dmss {
+            compatible = "simple-mfd";
+            #address-cells = <2>;
+            #size-cells = <2>;
+            dma-ranges;
+            ranges;
+
+            ti,sci-dev-id = <25>;
+
+            main_bcdma: dma-controller@485c0100 {
+                compatible = "ti,am64-dmss-bcdma";
+
+                reg = <0x0 0x485c0100 0x0 0x100>,
+                      <0x0 0x4c000000 0x0 0x20000>,
+                      <0x0 0x4a820000 0x0 0x20000>,
+                      <0x0 0x4aa40000 0x0 0x20000>,
+                      <0x0 0x4bc00000 0x0 0x100000>;
+                reg-names = "gcfg", "bchanrt", "rchanrt", "tchanrt", "ringrt";
+                msi-parent = <&inta_main_dmss>;
+                #dma-cells = <3>;
+
+                ti,sci = <&dmsc>;
+                ti,sci-dev-id = <26>;
+
+                ti,sci-rm-range-bchan = <0x20>; /* BLOCK_COPY_CHAN */
+                ti,sci-rm-range-rchan = <0x21>; /* SPLIT_TR_RX_CHAN */
+                ti,sci-rm-range-tchan = <0x22>; /* SPLIT_TR_TX_CHAN */
+            };
+        };
+    };

From 1d92cec649a38f9f4f488d5e359a27a4bb5f182d Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:32 +0200
Subject: [PATCH 138/230] dt-bindings: dma: ti: Add document for K3 PKTDMA

New binding document for
Texas Instruments K3 Packet DMA (PKTDMA).

PKTDMA is introduced as part of AM64.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20201208090440.31792-13-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../devicetree/bindings/dma/ti/k3-pktdma.yaml | 172 ++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml

diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml
new file mode 100644
index 000000000000..b13ab60cd740
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma/ti/k3-pktdma.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Texas Instruments K3 DMSS PKTDMA Device Tree Bindings
+
+maintainers:
+  - Peter Ujfalusi <peter.ujfalusi@ti.com>
+
+description: |
+  The Packet DMA (PKTDMA) is intended to perform similar functions as the packet
+  mode channels of K3 UDMA-P.
+  PKTDMA only includes Split channels to service PSI-L based peripherals.
+
+  The peripherals can be PSI-L native or legacy, non PSI-L native peripherals
+  with PDMAs. PDMA is tasked to act as a bridge between the PSI-L fabric and the
+  legacy peripheral.
+
+  PDMAs can be configured via PKTDMA split channel's peer registers to match
+  with the configuration of the legacy peripheral.
+
+allOf:
+  - $ref: /schemas/dma/dma-controller.yaml#
+
+properties:
+  compatible:
+    const: ti,am64-dmss-pktdma
+
+  "#dma-cells":
+    const: 2
+    description: |
+      The first cell is the PSI-L  thread ID of the remote (to PKTDMA) end.
+      Valid ranges for thread ID depends on the data movement direction:
+      for source thread IDs (rx): 0 - 0x7fff
+      for destination thread IDs (tx): 0x8000 - 0xffff
+
+      Please refer to the device documentation for the PSI-L thread map and also
+      the PSI-L peripheral chapter for the correct thread ID.
+
+      The second cell is the ASEL value for the channel
+
+  reg:
+    maxItems: 4
+
+  reg-names:
+    items:
+      - const: gcfg
+      - const: rchanrt
+      - const: tchanrt
+      - const: ringrt
+
+  msi-parent: true
+
+  ti,sci-rm-range-tchan:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of PKTDMA split tx channel resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+  ti,sci-rm-range-tflow:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of PKTDMA split tx flow resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+  ti,sci-rm-range-rchan:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of PKTDMA split rx channel resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+  ti,sci-rm-range-rflow:
+    $ref: /schemas/types.yaml#/definitions/uint32-array
+    description: |
+      Array of PKTDMA split rx flow resource subtypes for resource allocation
+      for this host
+    minItems: 1
+    # Should be enough
+    maxItems: 255
+    items:
+      maximum: 0x3f
+
+required:
+  - compatible
+  - "#dma-cells"
+  - reg
+  - reg-names
+  - msi-parent
+  - ti,sci
+  - ti,sci-dev-id
+  - ti,sci-rm-range-tchan
+  - ti,sci-rm-range-tflow
+  - ti,sci-rm-range-rchan
+  - ti,sci-rm-range-rflow
+
+unevaluatedProperties: false
+
+examples:
+  - |+
+    cbass_main {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        main_dmss {
+            compatible = "simple-mfd";
+            #address-cells = <2>;
+            #size-cells = <2>;
+            dma-ranges;
+            ranges;
+
+            ti,sci-dev-id = <25>;
+
+            main_pktdma: dma-controller@485c0000 {
+                compatible = "ti,am64-dmss-pktdma";
+
+                reg = <0x0 0x485c0000 0x0 0x100>,
+                      <0x0 0x4a800000 0x0 0x20000>,
+                      <0x0 0x4aa00000 0x0 0x40000>,
+                      <0x0 0x4b800000 0x0 0x400000>;
+                reg-names = "gcfg", "rchanrt", "tchanrt", "ringrt";
+                msi-parent = <&inta_main_dmss>;
+                #dma-cells = <2>;
+
+                ti,sci = <&dmsc>;
+                ti,sci-dev-id = <30>;
+
+                ti,sci-rm-range-tchan = <0x23>, /* UNMAPPED_TX_CHAN */
+                                        <0x24>, /* CPSW_TX_CHAN */
+                                        <0x25>, /* SAUL_TX_0_CHAN */
+                                        <0x26>, /* SAUL_TX_1_CHAN */
+                                        <0x27>, /* ICSSG_0_TX_CHAN */
+                                        <0x28>; /* ICSSG_1_TX_CHAN */
+                ti,sci-rm-range-tflow = <0x10>, /* RING_UNMAPPED_TX_CHAN */
+                                        <0x11>, /* RING_CPSW_TX_CHAN */
+                                        <0x12>, /* RING_SAUL_TX_0_CHAN */
+                                        <0x13>, /* RING_SAUL_TX_1_CHAN */
+                                        <0x14>, /* RING_ICSSG_0_TX_CHAN */
+                                        <0x15>; /* RING_ICSSG_1_TX_CHAN */
+                ti,sci-rm-range-rchan = <0x29>, /* UNMAPPED_RX_CHAN */
+                                        <0x2b>, /* CPSW_RX_CHAN */
+                                        <0x2d>, /* SAUL_RX_0_CHAN */
+                                        <0x2f>, /* SAUL_RX_1_CHAN */
+                                        <0x31>, /* SAUL_RX_2_CHAN */
+                                        <0x33>, /* SAUL_RX_3_CHAN */
+                                        <0x35>, /* ICSSG_0_RX_CHAN */
+                                        <0x37>; /* ICSSG_1_RX_CHAN */
+                ti,sci-rm-range-rflow = <0x2a>, /* FLOW_UNMAPPED_RX_CHAN */
+                                        <0x2c>, /* FLOW_CPSW_RX_CHAN */
+                                        <0x2e>, /* FLOW_SAUL_RX_0/1_CHAN */
+                                        <0x32>, /* FLOW_SAUL_RX_2/3_CHAN */
+                                        <0x36>, /* FLOW_ICSSG_0_RX_CHAN */
+                                        <0x38>; /* FLOW_ICSSG_1_RX_CHAN */
+            };
+        };
+    };

From b9366e2577a38ca5322f326cff9752c2008597c6 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:33 +0200
Subject: [PATCH 139/230] dmaengine: ti: k3-psil: Extend psil_endpoint_config
 for K3 PKTDMA

Additional fields needed for K3 PKTDMA to be able to handle the mapped
channels (channels are locked to handle specific threads) and flow ranges
for these mapped threads.
PKTDMA also introduces tflow for tx channels which can not be found in
K3 UDMA architecture.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-14-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/k3-psil.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h
index 1962f75fa2d3..36e22c5a0f29 100644
--- a/include/linux/dma/k3-psil.h
+++ b/include/linux/dma/k3-psil.h
@@ -50,6 +50,15 @@ enum psil_endpoint_type {
  * @channel_tpl:	Desired throughput level for the channel
  * @pdma_acc32:		ACC32 must be enabled on the PDMA side
  * @pdma_burst:		BURST must be enabled on the PDMA side
+ * @mapped_channel_id:	PKTDMA thread to channel mapping for mapped channels.
+ *			The thread must be serviced by the specified channel if
+ *			mapped_channel_id is >= 0 in case of PKTDMA
+ * @flow_start:		PKDMA flow range start of mapped channel. Unmapped
+ *			channels use flow_id == chan_id
+ * @flow_num:		PKDMA flow count of mapped channel. Unmapped channels
+ *			use flow_id == chan_id
+ * @default_flow_id:	PKDMA default (r)flow index of mapped channel.
+ *			Must be within the flow range of the mapped channel.
  */
 struct psil_endpoint_config {
 	enum psil_endpoint_type ep_type;
@@ -63,6 +72,13 @@ struct psil_endpoint_config {
 	/* PDMA properties, valid for PSIL_EP_PDMA_* */
 	unsigned pdma_acc32:1;
 	unsigned pdma_burst:1;
+
+	/* PKDMA mapped channel */
+	int mapped_channel_id;
+	/* PKTDMA tflow and rflow ranges for mapped channel */
+	u16 flow_start;
+	u16 flow_num;
+	u16 default_flow_id;
 };
 
 int psil_set_new_ep_config(struct device *dev, const char *name,

From 2329725d1a228fb0d6424ee8d499f266020cc113 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:34 +0200
Subject: [PATCH 140/230] dmaengine: ti: k3-psil: Add initial map for AM64

Add initial PSI-L map file for AM64.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-15-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/Makefile       |   3 +-
 drivers/dma/ti/k3-psil-am64.c | 158 ++++++++++++++++++++++++++++++++++
 drivers/dma/ti/k3-psil-priv.h |   1 +
 drivers/dma/ti/k3-psil.c      |   1 +
 4 files changed, 162 insertions(+), 1 deletion(-)
 create mode 100644 drivers/dma/ti/k3-psil-am64.c

diff --git a/drivers/dma/ti/Makefile b/drivers/dma/ti/Makefile
index 0c67254caee6..bd496efadff7 100644
--- a/drivers/dma/ti/Makefile
+++ b/drivers/dma/ti/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_TI_K3_UDMA_GLUE_LAYER) += k3-udma-glue.o
 obj-$(CONFIG_TI_K3_PSIL) += k3-psil.o \
 			    k3-psil-am654.o \
 			    k3-psil-j721e.o \
-			    k3-psil-j7200.o
+			    k3-psil-j7200.o \
+			    k3-psil-am64.o
 obj-$(CONFIG_TI_DMA_CROSSBAR) += dma-crossbar.o
diff --git a/drivers/dma/ti/k3-psil-am64.c b/drivers/dma/ti/k3-psil-am64.c
new file mode 100644
index 000000000000..9fdeaa11a4fc
--- /dev/null
+++ b/drivers/dma/ti/k3-psil-am64.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com
+ *  Author: Peter Ujfalusi <peter.ujfalusi@ti.com>
+ */
+
+#include <linux/kernel.h>
+
+#include "k3-psil-priv.h"
+
+#define PSIL_PDMA_XY_TR(x)					\
+	{							\
+		.thread_id = x,					\
+		.ep_config = {					\
+			.ep_type = PSIL_EP_PDMA_XY,		\
+			.mapped_channel_id = -1,		\
+			.default_flow_id = -1,			\
+		},						\
+	}
+
+#define PSIL_PDMA_XY_PKT(x)					\
+	{							\
+		.thread_id = x,					\
+		.ep_config = {					\
+			.ep_type = PSIL_EP_PDMA_XY,		\
+			.mapped_channel_id = -1,		\
+			.default_flow_id = -1,			\
+			.pkt_mode = 1,				\
+		},						\
+	}
+
+#define PSIL_ETHERNET(x, ch, flow_base, flow_cnt)		\
+	{							\
+		.thread_id = x,					\
+		.ep_config = {					\
+			.ep_type = PSIL_EP_NATIVE,		\
+			.pkt_mode = 1,				\
+			.needs_epib = 1,			\
+			.psd_size = 16,				\
+			.mapped_channel_id = ch,		\
+			.flow_start = flow_base,		\
+			.flow_num = flow_cnt,			\
+			.default_flow_id = flow_base,		\
+		},						\
+	}
+
+#define PSIL_SAUL(x, ch, flow_base, flow_cnt, default_flow, tx)	\
+	{							\
+		.thread_id = x,					\
+		.ep_config = {					\
+			.ep_type = PSIL_EP_NATIVE,		\
+			.pkt_mode = 1,				\
+			.needs_epib = 1,			\
+			.psd_size = 64,				\
+			.mapped_channel_id = ch,		\
+			.flow_start = flow_base,		\
+			.flow_num = flow_cnt,			\
+			.default_flow_id = default_flow,	\
+			.notdpkt = tx,				\
+		},						\
+	}
+
+/* PSI-L source thread IDs, used for RX (DMA_DEV_TO_MEM) */
+static struct psil_ep am64_src_ep_map[] = {
+	/* SAUL */
+	PSIL_SAUL(0x4000, 17, 32, 8, 32, 0),
+	PSIL_SAUL(0x4001, 18, 32, 8, 33, 0),
+	PSIL_SAUL(0x4002, 19, 40, 8, 40, 0),
+	PSIL_SAUL(0x4003, 20, 40, 8, 41, 0),
+	/* ICSS_G0 */
+	PSIL_ETHERNET(0x4100, 21, 48, 16),
+	PSIL_ETHERNET(0x4101, 22, 64, 16),
+	PSIL_ETHERNET(0x4102, 23, 80, 16),
+	PSIL_ETHERNET(0x4103, 24, 96, 16),
+	/* ICSS_G1 */
+	PSIL_ETHERNET(0x4200, 25, 112, 16),
+	PSIL_ETHERNET(0x4201, 26, 128, 16),
+	PSIL_ETHERNET(0x4202, 27, 144, 16),
+	PSIL_ETHERNET(0x4203, 28, 160, 16),
+	/* PDMA_MAIN0 - SPI0-3 */
+	PSIL_PDMA_XY_PKT(0x4300),
+	PSIL_PDMA_XY_PKT(0x4301),
+	PSIL_PDMA_XY_PKT(0x4302),
+	PSIL_PDMA_XY_PKT(0x4303),
+	PSIL_PDMA_XY_PKT(0x4304),
+	PSIL_PDMA_XY_PKT(0x4305),
+	PSIL_PDMA_XY_PKT(0x4306),
+	PSIL_PDMA_XY_PKT(0x4307),
+	PSIL_PDMA_XY_PKT(0x4308),
+	PSIL_PDMA_XY_PKT(0x4309),
+	PSIL_PDMA_XY_PKT(0x430a),
+	PSIL_PDMA_XY_PKT(0x430b),
+	PSIL_PDMA_XY_PKT(0x430c),
+	PSIL_PDMA_XY_PKT(0x430d),
+	PSIL_PDMA_XY_PKT(0x430e),
+	PSIL_PDMA_XY_PKT(0x430f),
+	/* PDMA_MAIN0 - USART0-1 */
+	PSIL_PDMA_XY_PKT(0x4310),
+	PSIL_PDMA_XY_PKT(0x4311),
+	/* PDMA_MAIN1 - SPI4 */
+	PSIL_PDMA_XY_PKT(0x4400),
+	PSIL_PDMA_XY_PKT(0x4401),
+	PSIL_PDMA_XY_PKT(0x4402),
+	PSIL_PDMA_XY_PKT(0x4403),
+	/* PDMA_MAIN1 - USART2-6 */
+	PSIL_PDMA_XY_PKT(0x4404),
+	PSIL_PDMA_XY_PKT(0x4405),
+	PSIL_PDMA_XY_PKT(0x4406),
+	PSIL_PDMA_XY_PKT(0x4407),
+	PSIL_PDMA_XY_PKT(0x4408),
+	/* PDMA_MAIN1 - ADCs */
+	PSIL_PDMA_XY_TR(0x440f),
+	PSIL_PDMA_XY_TR(0x4410),
+	/* CPSW2 */
+	PSIL_ETHERNET(0x4500, 16, 16, 16),
+};
+
+/* PSI-L destination thread IDs, used for TX (DMA_MEM_TO_DEV) */
+static struct psil_ep am64_dst_ep_map[] = {
+	/* SAUL */
+	PSIL_SAUL(0xc000, 24, 80, 8, 80, 1),
+	PSIL_SAUL(0xc001, 25, 88, 8, 88, 1),
+	/* ICSS_G0 */
+	PSIL_ETHERNET(0xc100, 26, 96, 1),
+	PSIL_ETHERNET(0xc101, 27, 97, 1),
+	PSIL_ETHERNET(0xc102, 28, 98, 1),
+	PSIL_ETHERNET(0xc103, 29, 99, 1),
+	PSIL_ETHERNET(0xc104, 30, 100, 1),
+	PSIL_ETHERNET(0xc105, 31, 101, 1),
+	PSIL_ETHERNET(0xc106, 32, 102, 1),
+	PSIL_ETHERNET(0xc107, 33, 103, 1),
+	/* ICSS_G1 */
+	PSIL_ETHERNET(0xc200, 34, 104, 1),
+	PSIL_ETHERNET(0xc201, 35, 105, 1),
+	PSIL_ETHERNET(0xc202, 36, 106, 1),
+	PSIL_ETHERNET(0xc203, 37, 107, 1),
+	PSIL_ETHERNET(0xc204, 38, 108, 1),
+	PSIL_ETHERNET(0xc205, 39, 109, 1),
+	PSIL_ETHERNET(0xc206, 40, 110, 1),
+	PSIL_ETHERNET(0xc207, 41, 111, 1),
+	/* CPSW2 */
+	PSIL_ETHERNET(0xc500, 16, 16, 8),
+	PSIL_ETHERNET(0xc501, 17, 24, 8),
+	PSIL_ETHERNET(0xc502, 18, 32, 8),
+	PSIL_ETHERNET(0xc503, 19, 40, 8),
+	PSIL_ETHERNET(0xc504, 20, 48, 8),
+	PSIL_ETHERNET(0xc505, 21, 56, 8),
+	PSIL_ETHERNET(0xc506, 22, 64, 8),
+	PSIL_ETHERNET(0xc507, 23, 72, 8),
+};
+
+struct psil_ep_map am64_ep_map = {
+	.name = "am64",
+	.src = am64_src_ep_map,
+	.src_count = ARRAY_SIZE(am64_src_ep_map),
+	.dst = am64_dst_ep_map,
+	.dst_count = ARRAY_SIZE(am64_dst_ep_map),
+};
diff --git a/drivers/dma/ti/k3-psil-priv.h b/drivers/dma/ti/k3-psil-priv.h
index b4b0fb359eff..b74e192e3c2d 100644
--- a/drivers/dma/ti/k3-psil-priv.h
+++ b/drivers/dma/ti/k3-psil-priv.h
@@ -40,5 +40,6 @@ struct psil_endpoint_config *psil_get_ep_config(u32 thread_id);
 extern struct psil_ep_map am654_ep_map;
 extern struct psil_ep_map j721e_ep_map;
 extern struct psil_ep_map j7200_ep_map;
+extern struct psil_ep_map am64_ep_map;
 
 #endif /* K3_PSIL_PRIV_H_ */
diff --git a/drivers/dma/ti/k3-psil.c b/drivers/dma/ti/k3-psil.c
index 837853aab95a..13ce7367d870 100644
--- a/drivers/dma/ti/k3-psil.c
+++ b/drivers/dma/ti/k3-psil.c
@@ -20,6 +20,7 @@ static const struct soc_device_attribute k3_soc_devices[] = {
 	{ .family = "AM65X", .data = &am654_ep_map },
 	{ .family = "J721E", .data = &j721e_ep_map },
 	{ .family = "J7200", .data = &j7200_ep_map },
+	{ .family = "AM64X", .data = &am64_ep_map },
 	{ /* sentinel */ }
 };
 

From fc373e47d72605cc3f5012ddda49d2dca430d51f Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:35 +0200
Subject: [PATCH 141/230] dmaengine: ti: Add support for k3 event routers

In k3 architecture a DMA channel (in TR momde) can be triggered by global
events, origination from different modules.

The events for triggers can be sent from any module which is connected to
PSI-L fabric, but the event number to be sent is DMA channel specific, it
is only known after the channel itself is requested.

The router operation needs to be split up:
- route_allocate: configure the dma_spec for the DMA and store the
  configuration which is needed for the router's input
- set_event: callback used by the DMA driver to set the event number for
  the channel and enable the routing

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-16-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/dma/k3-event-router.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 include/linux/dma/k3-event-router.h

diff --git a/include/linux/dma/k3-event-router.h b/include/linux/dma/k3-event-router.h
new file mode 100644
index 000000000000..e3f88b2f87be
--- /dev/null
+++ b/include/linux/dma/k3-event-router.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com
+ */
+
+#ifndef K3_EVENT_ROUTER_
+#define K3_EVENT_ROUTER_
+
+#include <linux/types.h>
+
+struct k3_event_route_data {
+	void *priv;
+	int (*set_event)(void *priv, u32 event);
+};
+
+#endif /* K3_EVENT_ROUTER_ */

From d782298c6f6b854452965b56d91616dfb60490c5 Mon Sep 17 00:00:00 2001
From: Grygorii Strashko <grygorii.strashko@ti.com>
Date: Tue, 8 Dec 2020 11:04:36 +0200
Subject: [PATCH 142/230] soc: ti: k3-ringacc: add AM64 DMA rings support.

The DMAs in AM64 have built in rings compared to AM654/J721e/J7200 where a
separate and generic ringacc is used.

The ring SW interface is similar to ringacc with some major architectural
differences, like

They are part of the DMA (BCDMA or PKTDMA).

They are dual mode rings are modeled as pair of Rings objects which has
common configuration and memory buffer, but separate real-time control
register sets for each direction mem2dev (forward) and dev2mem (reverse).

The ringacc driver must be initialized for DMA rings use with
k3_ringacc_dmarings_init() as it is not an independent device as ringacc
is.

AM64 rings must be requested only using k3_ringacc_request_rings_pair(),
and forward ring must always be initialized/configured. After this any
other Ringacc APIs can be used without any callers changes.

Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-17-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soc/ti/k3-ringacc.c       | 325 +++++++++++++++++++++++++++++-
 include/linux/soc/ti/k3-ringacc.h |  17 ++
 2 files changed, 335 insertions(+), 7 deletions(-)

diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c
index 119164abcb41..c88c305ba367 100644
--- a/drivers/soc/ti/k3-ringacc.c
+++ b/drivers/soc/ti/k3-ringacc.c
@@ -11,6 +11,7 @@
 #include <linux/of.h>
 #include <linux/platform_device.h>
 #include <linux/sys_soc.h>
+#include <linux/dma/ti-cppi5.h>
 #include <linux/soc/ti/k3-ringacc.h>
 #include <linux/soc/ti/ti_sci_protocol.h>
 #include <linux/soc/ti/ti_sci_inta_msi.h>
@@ -21,6 +22,7 @@ static LIST_HEAD(k3_ringacc_list);
 static DEFINE_MUTEX(k3_ringacc_list_lock);
 
 #define K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK		GENMASK(19, 0)
+#define K3_DMARING_CFG_RING_SIZE_ELCNT_MASK		GENMASK(15, 0)
 
 /**
  * struct k3_ring_rt_regs - The RA realtime Control/Status Registers region
@@ -43,7 +45,13 @@ struct k3_ring_rt_regs {
 	u32	hwindx;
 };
 
-#define K3_RINGACC_RT_REGS_STEP	0x1000
+#define K3_RINGACC_RT_REGS_STEP			0x1000
+#define K3_DMARING_RT_REGS_STEP			0x2000
+#define K3_DMARING_RT_REGS_REVERSE_OFS		0x1000
+#define K3_RINGACC_RT_OCC_MASK			GENMASK(20, 0)
+#define K3_DMARING_RT_OCC_TDOWN_COMPLETE	BIT(31)
+#define K3_DMARING_RT_DB_ENTRY_MASK		GENMASK(7, 0)
+#define K3_DMARING_RT_DB_TDOWN_ACK		BIT(31)
 
 /**
  * struct k3_ring_fifo_regs - The Ring Accelerator Queues Registers region
@@ -122,6 +130,7 @@ struct k3_ring_state {
 	u32 occ;
 	u32 windex;
 	u32 rindex;
+	u32 tdown_complete:1;
 };
 
 /**
@@ -143,6 +152,7 @@ struct k3_ring_state {
  * @use_count: Use count for shared rings
  * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY)
  * @dma_dev: device to be used for DMA API (allocation, mapping)
+ * @asel: Address Space Select value for physical addresses
  */
 struct k3_ring {
 	struct k3_ring_rt_regs __iomem *rt;
@@ -157,12 +167,15 @@ struct k3_ring {
 	u32		flags;
 #define K3_RING_FLAG_BUSY	BIT(1)
 #define K3_RING_FLAG_SHARED	BIT(2)
+#define K3_RING_FLAG_REVERSE	BIT(3)
 	struct k3_ring_state state;
 	u32		ring_id;
 	struct k3_ringacc	*parent;
 	u32		use_count;
 	int		proxy_id;
 	struct device	*dma_dev;
+	u32		asel;
+#define K3_ADDRESS_ASEL_SHIFT	48
 };
 
 struct k3_ringacc_ops {
@@ -188,6 +201,7 @@ struct k3_ringacc_ops {
  * @tisci_ring_ops: ti-sci rings ops
  * @tisci_dev_id: ti-sci device id
  * @ops: SoC specific ringacc operation
+ * @dma_rings: indicate DMA ring (dual ring within BCDMA/PKTDMA)
  */
 struct k3_ringacc {
 	struct device *dev;
@@ -210,6 +224,7 @@ struct k3_ringacc {
 	u32 tisci_dev_id;
 
 	const struct k3_ringacc_ops *ops;
+	bool dma_rings;
 };
 
 /**
@@ -221,6 +236,21 @@ struct k3_ringacc_soc_data {
 	unsigned dma_ring_reset_quirk:1;
 };
 
+static int k3_ringacc_ring_read_occ(struct k3_ring *ring)
+{
+	return readl(&ring->rt->occ) & K3_RINGACC_RT_OCC_MASK;
+}
+
+static void k3_ringacc_ring_update_occ(struct k3_ring *ring)
+{
+	u32 val;
+
+	val = readl(&ring->rt->occ);
+
+	ring->state.occ = val & K3_RINGACC_RT_OCC_MASK;
+	ring->state.tdown_complete = !!(val & K3_DMARING_RT_OCC_TDOWN_COMPLETE);
+}
+
 static long k3_ringacc_ring_get_fifo_pos(struct k3_ring *ring)
 {
 	return K3_RINGACC_FIFO_WINDOW_SIZE_BYTES -
@@ -234,12 +264,24 @@ static void *k3_ringacc_get_elm_addr(struct k3_ring *ring, u32 idx)
 
 static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem);
 static int k3_ringacc_ring_pop_mem(struct k3_ring *ring, void *elem);
+static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem);
+static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem);
 
 static struct k3_ring_ops k3_ring_mode_ring_ops = {
 		.push_tail = k3_ringacc_ring_push_mem,
 		.pop_head = k3_ringacc_ring_pop_mem,
 };
 
+static struct k3_ring_ops k3_dmaring_fwd_ops = {
+		.push_tail = k3_ringacc_ring_push_mem,
+		.pop_head = k3_dmaring_fwd_pop,
+};
+
+static struct k3_ring_ops k3_dmaring_reverse_ops = {
+		/* Reverse side of the DMA ring can only be popped by SW */
+		.pop_head = k3_dmaring_reverse_pop,
+};
+
 static int k3_ringacc_ring_push_io(struct k3_ring *ring, void *elem);
 static int k3_ringacc_ring_pop_io(struct k3_ring *ring, void *elem);
 static int k3_ringacc_ring_push_head_io(struct k3_ring *ring, void *elem);
@@ -342,6 +384,40 @@ error:
 }
 EXPORT_SYMBOL_GPL(k3_ringacc_request_ring);
 
+static int k3_dmaring_request_dual_ring(struct k3_ringacc *ringacc, int fwd_id,
+					struct k3_ring **fwd_ring,
+					struct k3_ring **compl_ring)
+{
+	int ret = 0;
+
+	/*
+	 * DMA rings must be requested by ID, completion ring is the reverse
+	 * side of the forward ring
+	 */
+	if (fwd_id < 0)
+		return -EINVAL;
+
+	mutex_lock(&ringacc->req_lock);
+
+	if (test_bit(fwd_id, ringacc->rings_inuse)) {
+		ret = -EBUSY;
+		goto error;
+	}
+
+	*fwd_ring = &ringacc->rings[fwd_id];
+	*compl_ring = &ringacc->rings[fwd_id + ringacc->num_rings];
+	set_bit(fwd_id, ringacc->rings_inuse);
+	ringacc->rings[fwd_id].use_count++;
+	dev_dbg(ringacc->dev, "Giving ring#%d\n", fwd_id);
+
+	mutex_unlock(&ringacc->req_lock);
+	return 0;
+
+error:
+	mutex_unlock(&ringacc->req_lock);
+	return ret;
+}
+
 int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc,
 				  int fwd_id, int compl_id,
 				  struct k3_ring **fwd_ring,
@@ -352,6 +428,10 @@ int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc,
 	if (!fwd_ring || !compl_ring)
 		return -EINVAL;
 
+	if (ringacc->dma_rings)
+		return k3_dmaring_request_dual_ring(ringacc, fwd_id,
+						    fwd_ring, compl_ring);
+
 	*fwd_ring = k3_ringacc_request_ring(ringacc, fwd_id, 0);
 	if (!(*fwd_ring))
 		return -ENODEV;
@@ -421,7 +501,7 @@ void k3_ringacc_ring_reset_dma(struct k3_ring *ring, u32 occ)
 		goto reset;
 
 	if (!occ)
-		occ = readl(&ring->rt->occ);
+		occ = k3_ringacc_ring_read_occ(ring);
 
 	if (occ) {
 		u32 db_ring_cnt, db_ring_cnt_cur;
@@ -496,6 +576,13 @@ int k3_ringacc_ring_free(struct k3_ring *ring)
 
 	ringacc = ring->parent;
 
+	/*
+	 * DMA rings: rings shared memory and configuration, only forward ring
+	 * is configured and reverse ring considered as slave.
+	 */
+	if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE))
+		return 0;
+
 	dev_dbg(ring->parent->dev, "flags: 0x%08x\n", ring->flags);
 
 	if (!test_bit(ring->ring_id, ringacc->rings_inuse))
@@ -517,6 +604,8 @@ int k3_ringacc_ring_free(struct k3_ring *ring)
 	ring->flags = 0;
 	ring->ops = NULL;
 	ring->dma_dev = NULL;
+	ring->asel = 0;
+
 	if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) {
 		clear_bit(ring->proxy_id, ringacc->proxy_inuse);
 		ring->proxy = NULL;
@@ -581,6 +670,7 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring)
 	ring_cfg.count = ring->size;
 	ring_cfg.mode = ring->mode;
 	ring_cfg.size = ring->elm_size;
+	ring_cfg.asel = ring->asel;
 
 	ret = ringacc->tisci_ring_ops->set_cfg(ringacc->tisci, &ring_cfg);
 	if (ret)
@@ -590,6 +680,90 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring)
 	return ret;
 }
 
+static int k3_dmaring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg)
+{
+	struct k3_ringacc *ringacc;
+	struct k3_ring *reverse_ring;
+	int ret = 0;
+
+	if (cfg->elm_size != K3_RINGACC_RING_ELSIZE_8 ||
+	    cfg->mode != K3_RINGACC_RING_MODE_RING ||
+	    cfg->size & ~K3_DMARING_CFG_RING_SIZE_ELCNT_MASK)
+		return -EINVAL;
+
+	ringacc = ring->parent;
+
+	/*
+	 * DMA rings: rings shared memory and configuration, only forward ring
+	 * is configured and reverse ring considered as slave.
+	 */
+	if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE))
+		return 0;
+
+	if (!test_bit(ring->ring_id, ringacc->rings_inuse))
+		return -EINVAL;
+
+	ring->size = cfg->size;
+	ring->elm_size = cfg->elm_size;
+	ring->mode = cfg->mode;
+	ring->asel = cfg->asel;
+	ring->dma_dev = cfg->dma_dev;
+	if (!ring->dma_dev) {
+		dev_warn(ringacc->dev, "dma_dev is not provided for ring%d\n",
+			 ring->ring_id);
+		ring->dma_dev = ringacc->dev;
+	}
+
+	memset(&ring->state, 0, sizeof(ring->state));
+
+	ring->ops = &k3_dmaring_fwd_ops;
+
+	ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev,
+						 ring->size * (4 << ring->elm_size),
+						 &ring->ring_mem_dma, GFP_KERNEL);
+	if (!ring->ring_mem_virt) {
+		dev_err(ringacc->dev, "Failed to alloc ring mem\n");
+		ret = -ENOMEM;
+		goto err_free_ops;
+	}
+
+	ret = k3_ringacc_ring_cfg_sci(ring);
+	if (ret)
+		goto err_free_mem;
+
+	ring->flags |= K3_RING_FLAG_BUSY;
+
+	k3_ringacc_ring_dump(ring);
+
+	/* DMA rings: configure reverse ring */
+	reverse_ring = &ringacc->rings[ring->ring_id + ringacc->num_rings];
+	reverse_ring->size = cfg->size;
+	reverse_ring->elm_size = cfg->elm_size;
+	reverse_ring->mode = cfg->mode;
+	reverse_ring->asel = cfg->asel;
+	memset(&reverse_ring->state, 0, sizeof(reverse_ring->state));
+	reverse_ring->ops = &k3_dmaring_reverse_ops;
+
+	reverse_ring->ring_mem_virt = ring->ring_mem_virt;
+	reverse_ring->ring_mem_dma = ring->ring_mem_dma;
+	reverse_ring->flags |= K3_RING_FLAG_BUSY;
+	k3_ringacc_ring_dump(reverse_ring);
+
+	return 0;
+
+err_free_mem:
+	dma_free_coherent(ring->dma_dev,
+			  ring->size * (4 << ring->elm_size),
+			  ring->ring_mem_virt,
+			  ring->ring_mem_dma);
+err_free_ops:
+	ring->ops = NULL;
+	ring->proxy = NULL;
+	ring->dma_dev = NULL;
+	ring->asel = 0;
+	return ret;
+}
+
 int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg)
 {
 	struct k3_ringacc *ringacc;
@@ -597,8 +771,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg)
 
 	if (!ring || !cfg)
 		return -EINVAL;
+
 	ringacc = ring->parent;
 
+	if (ringacc->dma_rings)
+		return k3_dmaring_cfg(ring, cfg);
+
 	if (cfg->elm_size > K3_RINGACC_RING_ELSIZE_256 ||
 	    cfg->mode >= K3_RINGACC_RING_MODE_INVALID ||
 	    cfg->size & ~K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK ||
@@ -705,7 +883,7 @@ u32 k3_ringacc_ring_get_free(struct k3_ring *ring)
 		return -EINVAL;
 
 	if (!ring->state.free)
-		ring->state.free = ring->size - readl(&ring->rt->occ);
+		ring->state.free = ring->size - k3_ringacc_ring_read_occ(ring);
 
 	return ring->state.free;
 }
@@ -716,7 +894,7 @@ u32 k3_ringacc_ring_get_occ(struct k3_ring *ring)
 	if (!ring || !(ring->flags & K3_RING_FLAG_BUSY))
 		return -EINVAL;
 
-	return readl(&ring->rt->occ);
+	return k3_ringacc_ring_read_occ(ring);
 }
 EXPORT_SYMBOL_GPL(k3_ringacc_ring_get_occ);
 
@@ -892,6 +1070,72 @@ static int k3_ringacc_ring_pop_tail_io(struct k3_ring *ring, void *elem)
 					 K3_RINGACC_ACCESS_MODE_POP_HEAD);
 }
 
+/*
+ * The element is 48 bits of address + ASEL bits in the ring.
+ * ASEL is used by the DMAs and should be removed for the kernel as it is not
+ * part of the physical memory address.
+ */
+static void k3_dmaring_remove_asel_from_elem(u64 *elem)
+{
+	*elem &= GENMASK_ULL(K3_ADDRESS_ASEL_SHIFT - 1, 0);
+}
+
+static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem)
+{
+	void *elem_ptr;
+	u32 elem_idx;
+
+	/*
+	 * DMA rings: forward ring is always tied DMA channel and HW does not
+	 * maintain any state data required for POP operation and its unknown
+	 * how much elements were consumed by HW. So, to actually
+	 * do POP, the read pointer has to be recalculated every time.
+	 */
+	ring->state.occ = k3_ringacc_ring_read_occ(ring);
+	if (ring->state.windex >= ring->state.occ)
+		elem_idx = ring->state.windex - ring->state.occ;
+	else
+		elem_idx = ring->size - (ring->state.occ - ring->state.windex);
+
+	elem_ptr = k3_ringacc_get_elm_addr(ring, elem_idx);
+	memcpy(elem, elem_ptr, (4 << ring->elm_size));
+	k3_dmaring_remove_asel_from_elem(elem);
+
+	ring->state.occ--;
+	writel(-1, &ring->rt->db);
+
+	dev_dbg(ring->parent->dev, "%s: occ%d Windex%d Rindex%d pos_ptr%px\n",
+		__func__, ring->state.occ, ring->state.windex, elem_idx,
+		elem_ptr);
+	return 0;
+}
+
+static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem)
+{
+	void *elem_ptr;
+
+	elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.rindex);
+
+	if (ring->state.occ) {
+		memcpy(elem, elem_ptr, (4 << ring->elm_size));
+		k3_dmaring_remove_asel_from_elem(elem);
+
+		ring->state.rindex = (ring->state.rindex + 1) % ring->size;
+		ring->state.occ--;
+		writel(-1 & K3_DMARING_RT_DB_ENTRY_MASK, &ring->rt->db);
+	} else if (ring->state.tdown_complete) {
+		dma_addr_t *value = elem;
+
+		*value = CPPI5_TDCM_MARKER;
+		writel(K3_DMARING_RT_DB_TDOWN_ACK, &ring->rt->db);
+		ring->state.tdown_complete = false;
+	}
+
+	dev_dbg(ring->parent->dev, "%s: occ%d index%d pos_ptr%px\n",
+		__func__, ring->state.occ, ring->state.rindex, elem_ptr);
+	return 0;
+}
+
 static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem)
 {
 	void *elem_ptr;
@@ -899,6 +1143,11 @@ static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem)
 	elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.windex);
 
 	memcpy(elem_ptr, elem, (4 << ring->elm_size));
+	if (ring->parent->dma_rings) {
+		u64 *addr = elem_ptr;
+
+		*addr |= ((u64)ring->asel << K3_ADDRESS_ASEL_SHIFT);
+	}
 
 	ring->state.windex = (ring->state.windex + 1) % ring->size;
 	ring->state.free--;
@@ -975,12 +1224,12 @@ int k3_ringacc_ring_pop(struct k3_ring *ring, void *elem)
 		return -EINVAL;
 
 	if (!ring->state.occ)
-		ring->state.occ = k3_ringacc_ring_get_occ(ring);
+		k3_ringacc_ring_update_occ(ring);
 
 	dev_dbg(ring->parent->dev, "ring_pop: occ%d index%d\n", ring->state.occ,
 		ring->state.rindex);
 
-	if (!ring->state.occ)
+	if (!ring->state.occ && !ring->state.tdown_complete)
 		return -ENODATA;
 
 	if (ring->ops && ring->ops->pop_head)
@@ -998,7 +1247,7 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem)
 		return -EINVAL;
 
 	if (!ring->state.occ)
-		ring->state.occ = k3_ringacc_ring_get_occ(ring);
+		k3_ringacc_ring_update_occ(ring);
 
 	dev_dbg(ring->parent->dev, "ring_pop_tail: occ%d index%d\n",
 		ring->state.occ, ring->state.rindex);
@@ -1203,6 +1452,68 @@ static const struct of_device_id k3_ringacc_of_match[] = {
 	{},
 };
 
+struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev,
+					    struct k3_ringacc_init_data *data)
+{
+	struct device *dev = &pdev->dev;
+	struct k3_ringacc *ringacc;
+	void __iomem *base_rt;
+	struct resource *res;
+	int i;
+
+	ringacc = devm_kzalloc(dev, sizeof(*ringacc), GFP_KERNEL);
+	if (!ringacc)
+		return ERR_PTR(-ENOMEM);
+
+	ringacc->dev = dev;
+	ringacc->dma_rings = true;
+	ringacc->num_rings = data->num_rings;
+	ringacc->tisci = data->tisci;
+	ringacc->tisci_dev_id = data->tisci_dev_id;
+
+	mutex_init(&ringacc->req_lock);
+
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ringrt");
+	base_rt = devm_ioremap_resource(dev, res);
+	if (IS_ERR(base_rt))
+		return base_rt;
+
+	ringacc->rings = devm_kzalloc(dev,
+				      sizeof(*ringacc->rings) *
+				      ringacc->num_rings * 2,
+				      GFP_KERNEL);
+	ringacc->rings_inuse = devm_kcalloc(dev,
+					    BITS_TO_LONGS(ringacc->num_rings),
+					    sizeof(unsigned long), GFP_KERNEL);
+
+	if (!ringacc->rings || !ringacc->rings_inuse)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < ringacc->num_rings; i++) {
+		struct k3_ring *ring = &ringacc->rings[i];
+
+		ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i;
+		ring->parent = ringacc;
+		ring->ring_id = i;
+		ring->proxy_id = K3_RINGACC_PROXY_NOT_USED;
+
+		ring = &ringacc->rings[ringacc->num_rings + i];
+		ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i +
+			   K3_DMARING_RT_REGS_REVERSE_OFS;
+		ring->parent = ringacc;
+		ring->ring_id = i;
+		ring->proxy_id = K3_RINGACC_PROXY_NOT_USED;
+		ring->flags = K3_RING_FLAG_REVERSE;
+	}
+
+	ringacc->tisci_ring_ops = &ringacc->tisci->ops.rm_ring_ops;
+
+	dev_info(dev, "Number of rings: %u\n", ringacc->num_rings);
+
+	return ringacc;
+}
+EXPORT_SYMBOL_GPL(k3_ringacc_dmarings_init);
+
 static int k3_ringacc_probe(struct platform_device *pdev)
 {
 	const struct ringacc_match_data *match_data;
diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h
index 658dc71d2901..39b022b92598 100644
--- a/include/linux/soc/ti/k3-ringacc.h
+++ b/include/linux/soc/ti/k3-ringacc.h
@@ -70,6 +70,7 @@ struct k3_ring;
  * @dma_dev: Master device which is using and accessing to the ring
  *	memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations
  *	should be done using this device.
+ * @asel: Address Space Select value for physical addresses
  */
 struct k3_ring_cfg {
 	u32 size;
@@ -79,6 +80,7 @@ struct k3_ring_cfg {
 	u32 flags;
 
 	struct device *dma_dev;
+	u32 asel;
 };
 
 #define K3_RINGACC_RING_ID_ANY (-1)
@@ -250,4 +252,19 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem);
 
 u32 k3_ringacc_get_tisci_dev_id(struct k3_ring *ring);
 
+/* DMA ring support */
+struct ti_sci_handle;
+
+/**
+ * struct struct k3_ringacc_init_data - Initialization data for DMA rings
+ */
+struct k3_ringacc_init_data {
+	const struct ti_sci_handle *tisci;
+	u32 tisci_dev_id;
+	u32 num_rings;
+};
+
+struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev,
+					    struct k3_ringacc_init_data *data);
+
 #endif /* __SOC_TI_K3_RINGACC_API_H_ */

From 017794739702d444ca48115ff0fcdce19edb5559 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:37 +0200
Subject: [PATCH 143/230] dmaengine: ti: k3-udma: Initial support for K3 BCDMA

One of the DMAs introduced with AM64 is the Block Copy DMA (BCDMA).
It serves similar purpose as K3 UDMAP channels in TR mode.

The rings for the BCDMA is integrated within the DMA itself instead of
using rings from the general purpose ringacc.

A BCDMA have two different type of channels:
- Block Copy Channels (bchan)
- Split Channels (tchan and rchan)

tchan and rchan can be used to service PSI-L peripherals, similarly to
K3 UDMA channels.

bchan can be only used for block copy operation (TR type15) like the
paired K3 UDMA tchan/rchan configured in block copy mode.
bchans can be also used to service peripherals directly if an external
trigger is selected for the channel.

Most of the driver code can be reused for BCDMA bchan/tchan/rchan support
but new setup and allocation functions are needed to handle the
differences between the DMAs.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-18-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 1266 +++++++++++++++++++++++++++++++++++---
 drivers/dma/ti/k3-udma.h |   12 +-
 2 files changed, 1186 insertions(+), 92 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index b89afa602532..f327d436f8ad 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -26,6 +26,7 @@
 #include <linux/soc/ti/k3-ringacc.h>
 #include <linux/soc/ti/ti_sci_protocol.h>
 #include <linux/soc/ti/ti_sci_inta_msi.h>
+#include <linux/dma/k3-event-router.h>
 #include <linux/dma/ti-cppi5.h>
 
 #include "../virt-dma.h"
@@ -55,14 +56,25 @@ struct udma_static_tr {
 
 struct udma_chan;
 
+enum k3_dma_type {
+	DMA_TYPE_UDMA = 0,
+	DMA_TYPE_BCDMA,
+};
+
 enum udma_mmr {
 	MMR_GCFG = 0,
+	MMR_BCHANRT,
 	MMR_RCHANRT,
 	MMR_TCHANRT,
 	MMR_LAST,
 };
 
-static const char * const mmr_names[] = { "gcfg", "rchanrt", "tchanrt" };
+static const char * const mmr_names[] = {
+	[MMR_GCFG] = "gcfg",
+	[MMR_BCHANRT] = "bchanrt",
+	[MMR_RCHANRT] = "rchanrt",
+	[MMR_TCHANRT] = "tchanrt",
+};
 
 struct udma_tchan {
 	void __iomem *reg_rt;
@@ -72,6 +84,8 @@ struct udma_tchan {
 	struct k3_ring *tc_ring; /* Transmit Completion ring */
 };
 
+#define udma_bchan udma_tchan
+
 struct udma_rflow {
 	int id;
 	struct k3_ring *fd_ring; /* Free Descriptor ring */
@@ -84,11 +98,25 @@ struct udma_rchan {
 	int id;
 };
 
+struct udma_oes_offsets {
+	/* K3 UDMA Output Event Offset */
+	u32 udma_rchan;
+
+	/* BCDMA Output Event Offsets */
+	u32 bcdma_bchan_data;
+	u32 bcdma_bchan_ring;
+	u32 bcdma_tchan_data;
+	u32 bcdma_tchan_ring;
+	u32 bcdma_rchan_data;
+	u32 bcdma_rchan_ring;
+};
+
 #define UDMA_FLAG_PDMA_ACC32		BIT(0)
 #define UDMA_FLAG_PDMA_BURST		BIT(1)
 #define UDMA_FLAG_TDTYPE		BIT(2)
 
 struct udma_match_data {
+	enum k3_dma_type type;
 	u32 psil_base;
 	bool enable_memcpy_support;
 	u32 flags;
@@ -96,7 +124,8 @@ struct udma_match_data {
 };
 
 struct udma_soc_data {
-	u32 rchan_oes_offset;
+	struct udma_oes_offsets oes;
+	u32 bcdma_trigger_event_offset;
 };
 
 struct udma_hwdesc {
@@ -139,16 +168,19 @@ struct udma_dev {
 
 	struct udma_rx_flush rx_flush;
 
+	int bchan_cnt;
 	int tchan_cnt;
 	int echan_cnt;
 	int rchan_cnt;
 	int rflow_cnt;
+	unsigned long *bchan_map;
 	unsigned long *tchan_map;
 	unsigned long *rchan_map;
 	unsigned long *rflow_gp_map;
 	unsigned long *rflow_gp_map_allocated;
 	unsigned long *rflow_in_use;
 
+	struct udma_bchan *bchans;
 	struct udma_tchan *tchans;
 	struct udma_rchan *rchans;
 	struct udma_rflow *rflows;
@@ -156,6 +188,7 @@ struct udma_dev {
 	struct udma_chan *channels;
 	u32 psil_base;
 	u32 atype;
+	u32 asel;
 };
 
 struct udma_desc {
@@ -200,6 +233,7 @@ struct udma_chan_config {
 	bool notdpkt; /* Suppress sending TDC packet */
 	int remote_thread_id;
 	u32 atype;
+	u32 asel;
 	u32 src_thread;
 	u32 dst_thread;
 	enum psil_endpoint_type ep_type;
@@ -207,6 +241,8 @@ struct udma_chan_config {
 	bool enable_burst;
 	enum udma_tp_level channel_tpl; /* Channel Throughput Level */
 
+	u32 tr_trigger_type;
+
 	enum dma_transfer_direction dir;
 };
 
@@ -214,11 +250,13 @@ struct udma_chan {
 	struct virt_dma_chan vc;
 	struct dma_slave_config	cfg;
 	struct udma_dev *ud;
+	struct device *dma_dev;
 	struct udma_desc *desc;
 	struct udma_desc *terminated_desc;
 	struct udma_static_tr static_tr;
 	char *name;
 
+	struct udma_bchan *bchan;
 	struct udma_tchan *tchan;
 	struct udma_rchan *rchan;
 	struct udma_rflow *rflow;
@@ -354,6 +392,30 @@ static int navss_psil_unpair(struct udma_dev *ud, u32 src_thread,
 						src_thread, dst_thread);
 }
 
+static void k3_configure_chan_coherency(struct dma_chan *chan, u32 asel)
+{
+	struct device *chan_dev = &chan->dev->device;
+
+	if (asel == 0) {
+		/* No special handling for the channel */
+		chan->dev->chan_dma_dev = false;
+
+		chan_dev->dma_coherent = false;
+		chan_dev->dma_parms = NULL;
+	} else if (asel == 14 || asel == 15) {
+		chan->dev->chan_dma_dev = true;
+
+		chan_dev->dma_coherent = true;
+		dma_coerce_mask_and_coherent(chan_dev, DMA_BIT_MASK(48));
+		chan_dev->dma_parms = chan_dev->parent->dma_parms;
+	} else {
+		dev_warn(chan->device->dev, "Invalid ASEL value: %u\n", asel);
+
+		chan_dev->dma_coherent = false;
+		chan_dev->dma_parms = NULL;
+	}
+}
+
 static void udma_reset_uchan(struct udma_chan *uc)
 {
 	memset(&uc->config, 0, sizeof(uc->config));
@@ -440,9 +502,7 @@ static void udma_free_hwdesc(struct udma_chan *uc, struct udma_desc *d)
 			d->hwdesc[i].cppi5_desc_vaddr = NULL;
 		}
 	} else if (d->hwdesc[0].cppi5_desc_vaddr) {
-		struct udma_dev *ud = uc->ud;
-
-		dma_free_coherent(ud->dev, d->hwdesc[0].cppi5_desc_size,
+		dma_free_coherent(uc->dma_dev, d->hwdesc[0].cppi5_desc_size,
 				  d->hwdesc[0].cppi5_desc_vaddr,
 				  d->hwdesc[0].cppi5_desc_paddr);
 
@@ -671,8 +731,10 @@ static void udma_reset_counters(struct udma_chan *uc)
 		val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PCNT_REG);
 		udma_tchanrt_write(uc, UDMA_CHAN_RT_PCNT_REG, val);
 
-		val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PEER_BCNT_REG);
-		udma_tchanrt_write(uc, UDMA_CHAN_RT_PEER_BCNT_REG, val);
+		if (!uc->bchan) {
+			val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PEER_BCNT_REG);
+			udma_tchanrt_write(uc, UDMA_CHAN_RT_PEER_BCNT_REG, val);
+		}
 	}
 
 	if (uc->rchan) {
@@ -1233,6 +1295,42 @@ static struct udma_##res *__udma_reserve_##res(struct udma_dev *ud,	\
 UDMA_RESERVE_RESOURCE(tchan);
 UDMA_RESERVE_RESOURCE(rchan);
 
+static struct udma_bchan *__bcdma_reserve_bchan(struct udma_dev *ud, int id)
+{
+	if (id >= 0) {
+		if (test_bit(id, ud->bchan_map)) {
+			dev_err(ud->dev, "bchan%d is in use\n", id);
+			return ERR_PTR(-ENOENT);
+		}
+	} else {
+		id = find_next_zero_bit(ud->bchan_map, ud->bchan_cnt, 0);
+		if (id == ud->bchan_cnt)
+			return ERR_PTR(-ENOENT);
+	}
+
+	set_bit(id, ud->bchan_map);
+	return &ud->bchans[id];
+}
+
+static int bcdma_get_bchan(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+
+	if (uc->bchan) {
+		dev_dbg(ud->dev, "chan%d: already have bchan%d allocated\n",
+			uc->id, uc->bchan->id);
+		return 0;
+	}
+
+	uc->bchan = __bcdma_reserve_bchan(ud, -1);
+	if (IS_ERR(uc->bchan))
+		return PTR_ERR(uc->bchan);
+
+	uc->tchan = uc->bchan;
+
+	return 0;
+}
+
 static int udma_get_tchan(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
@@ -1325,6 +1423,19 @@ static int udma_get_rflow(struct udma_chan *uc, int flow_id)
 	return PTR_ERR_OR_ZERO(uc->rflow);
 }
 
+static void bcdma_put_bchan(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+
+	if (uc->bchan) {
+		dev_dbg(ud->dev, "chan%d: put bchan%d\n", uc->id,
+			uc->bchan->id);
+		clear_bit(uc->bchan->id, ud->bchan_map);
+		uc->bchan = NULL;
+		uc->tchan = NULL;
+	}
+}
+
 static void udma_put_rchan(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
@@ -1361,6 +1472,65 @@ static void udma_put_rflow(struct udma_chan *uc)
 	}
 }
 
+static void bcdma_free_bchan_resources(struct udma_chan *uc)
+{
+	if (!uc->bchan)
+		return;
+
+	k3_ringacc_ring_free(uc->bchan->tc_ring);
+	k3_ringacc_ring_free(uc->bchan->t_ring);
+	uc->bchan->tc_ring = NULL;
+	uc->bchan->t_ring = NULL;
+	k3_configure_chan_coherency(&uc->vc.chan, 0);
+
+	bcdma_put_bchan(uc);
+}
+
+static int bcdma_alloc_bchan_resources(struct udma_chan *uc)
+{
+	struct k3_ring_cfg ring_cfg;
+	struct udma_dev *ud = uc->ud;
+	int ret;
+
+	ret = bcdma_get_bchan(uc);
+	if (ret)
+		return ret;
+
+	ret = k3_ringacc_request_rings_pair(ud->ringacc, uc->bchan->id, -1,
+					    &uc->bchan->t_ring,
+					    &uc->bchan->tc_ring);
+	if (ret) {
+		ret = -EBUSY;
+		goto err_ring;
+	}
+
+	memset(&ring_cfg, 0, sizeof(ring_cfg));
+	ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
+	ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8;
+	ring_cfg.mode = K3_RINGACC_RING_MODE_RING;
+
+	k3_configure_chan_coherency(&uc->vc.chan, ud->asel);
+	ring_cfg.asel = ud->asel;
+	ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan);
+
+	ret = k3_ringacc_ring_cfg(uc->bchan->t_ring, &ring_cfg);
+	if (ret)
+		goto err_ringcfg;
+
+	return 0;
+
+err_ringcfg:
+	k3_ringacc_ring_free(uc->bchan->tc_ring);
+	uc->bchan->tc_ring = NULL;
+	k3_ringacc_ring_free(uc->bchan->t_ring);
+	uc->bchan->t_ring = NULL;
+	k3_configure_chan_coherency(&uc->vc.chan, 0);
+err_ring:
+	bcdma_put_bchan(uc);
+
+	return ret;
+}
+
 static void udma_free_tx_resources(struct udma_chan *uc)
 {
 	if (!uc->tchan)
@@ -1378,15 +1548,19 @@ static int udma_alloc_tx_resources(struct udma_chan *uc)
 {
 	struct k3_ring_cfg ring_cfg;
 	struct udma_dev *ud = uc->ud;
-	int ret;
+	struct udma_tchan *tchan;
+	int ring_idx, ret;
 
 	ret = udma_get_tchan(uc);
 	if (ret)
 		return ret;
 
-	ret = k3_ringacc_request_rings_pair(ud->ringacc, uc->tchan->id, -1,
-					    &uc->tchan->t_ring,
-					    &uc->tchan->tc_ring);
+	tchan = uc->tchan;
+	ring_idx = ud->bchan_cnt + tchan->id;
+
+	ret = k3_ringacc_request_rings_pair(ud->ringacc, ring_idx, -1,
+					    &tchan->t_ring,
+					    &tchan->tc_ring);
 	if (ret) {
 		ret = -EBUSY;
 		goto err_ring;
@@ -1395,10 +1569,18 @@ static int udma_alloc_tx_resources(struct udma_chan *uc)
 	memset(&ring_cfg, 0, sizeof(ring_cfg));
 	ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
 	ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8;
-	ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE;
+	if (ud->match_data->type == DMA_TYPE_UDMA) {
+		ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE;
+	} else {
+		ring_cfg.mode = K3_RINGACC_RING_MODE_RING;
 
-	ret = k3_ringacc_ring_cfg(uc->tchan->t_ring, &ring_cfg);
-	ret |= k3_ringacc_ring_cfg(uc->tchan->tc_ring, &ring_cfg);
+		k3_configure_chan_coherency(&uc->vc.chan, uc->config.asel);
+		ring_cfg.asel = uc->config.asel;
+		ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan);
+	}
+
+	ret = k3_ringacc_ring_cfg(tchan->t_ring, &ring_cfg);
+	ret |= k3_ringacc_ring_cfg(tchan->tc_ring, &ring_cfg);
 
 	if (ret)
 		goto err_ringcfg;
@@ -1458,7 +1640,8 @@ static int udma_alloc_rx_resources(struct udma_chan *uc)
 	}
 
 	rflow = uc->rflow;
-	fd_ring_id = ud->tchan_cnt + ud->echan_cnt + uc->rchan->id;
+	fd_ring_id = ud->bchan_cnt + ud->tchan_cnt + ud->echan_cnt +
+		     uc->rchan->id;
 	ret = k3_ringacc_request_rings_pair(ud->ringacc, fd_ring_id, -1,
 					    &rflow->fd_ring, &rflow->r_ring);
 	if (ret) {
@@ -1468,15 +1651,25 @@ static int udma_alloc_rx_resources(struct udma_chan *uc)
 
 	memset(&ring_cfg, 0, sizeof(ring_cfg));
 
-	if (uc->config.pkt_mode)
-		ring_cfg.size = SG_MAX_SEGMENTS;
-	else
-		ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
-
 	ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8;
-	ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE;
+	if (ud->match_data->type == DMA_TYPE_UDMA) {
+		if (uc->config.pkt_mode)
+			ring_cfg.size = SG_MAX_SEGMENTS;
+		else
+			ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
+
+		ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE;
+	} else {
+		ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
+		ring_cfg.mode = K3_RINGACC_RING_MODE_RING;
+
+		k3_configure_chan_coherency(&uc->vc.chan, uc->config.asel);
+		ring_cfg.asel = uc->config.asel;
+		ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan);
+	}
 
 	ret = k3_ringacc_ring_cfg(rflow->fd_ring, &ring_cfg);
+
 	ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE;
 	ret |= k3_ringacc_ring_cfg(rflow->r_ring, &ring_cfg);
 
@@ -1498,7 +1691,18 @@ err_rflow:
 	return ret;
 }
 
-#define TISCI_TCHAN_VALID_PARAMS (				\
+#define TISCI_BCDMA_BCHAN_VALID_PARAMS (			\
+	TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID |	\
+	TI_SCI_MSG_VALUE_RM_UDMAP_CH_EXTENDED_CH_TYPE_VALID)
+
+#define TISCI_BCDMA_TCHAN_VALID_PARAMS (			\
+	TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID |	\
+	TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID)
+
+#define TISCI_BCDMA_RCHAN_VALID_PARAMS (			\
+	TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID)
+
+#define TISCI_UDMA_TCHAN_VALID_PARAMS (				\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID |	\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID |	\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID |	\
@@ -1508,7 +1712,7 @@ err_rflow:
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID |		\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID)
 
-#define TISCI_RCHAN_VALID_PARAMS (				\
+#define TISCI_UDMA_RCHAN_VALID_PARAMS (				\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID |	\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID |		\
 	TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID |		\
@@ -1533,7 +1737,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc)
 	struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 };
 	struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 };
 
-	req_tx.valid_params = TISCI_TCHAN_VALID_PARAMS;
+	req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS;
 	req_tx.nav_id = tisci_rm->tisci_dev_id;
 	req_tx.index = tchan->id;
 	req_tx.tx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR;
@@ -1547,7 +1751,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc)
 		return ret;
 	}
 
-	req_rx.valid_params = TISCI_RCHAN_VALID_PARAMS;
+	req_rx.valid_params = TISCI_UDMA_RCHAN_VALID_PARAMS;
 	req_rx.nav_id = tisci_rm->tisci_dev_id;
 	req_rx.index = rchan->id;
 	req_rx.rx_fetch_size = sizeof(struct cppi5_desc_hdr_t) >> 2;
@@ -1562,6 +1766,27 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc)
 	return ret;
 }
 
+static int bcdma_tisci_m2m_channel_config(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops;
+	struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 };
+	struct udma_bchan *bchan = uc->bchan;
+	int ret = 0;
+
+	req_tx.valid_params = TISCI_BCDMA_BCHAN_VALID_PARAMS;
+	req_tx.nav_id = tisci_rm->tisci_dev_id;
+	req_tx.extended_ch_type = TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN;
+	req_tx.index = bchan->id;
+
+	ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx);
+	if (ret)
+		dev_err(ud->dev, "bchan%d cfg failed %d\n", bchan->id, ret);
+
+	return ret;
+}
+
 static int udma_tisci_tx_channel_config(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
@@ -1582,7 +1807,7 @@ static int udma_tisci_tx_channel_config(struct udma_chan *uc)
 		fetch_size = sizeof(struct cppi5_desc_hdr_t);
 	}
 
-	req_tx.valid_params = TISCI_TCHAN_VALID_PARAMS;
+	req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS;
 	req_tx.nav_id = tisci_rm->tisci_dev_id;
 	req_tx.index = tchan->id;
 	req_tx.tx_chan_type = mode;
@@ -1605,6 +1830,33 @@ static int udma_tisci_tx_channel_config(struct udma_chan *uc)
 	return ret;
 }
 
+static int bcdma_tisci_tx_channel_config(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops;
+	struct udma_tchan *tchan = uc->tchan;
+	struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 };
+	int ret = 0;
+
+	req_tx.valid_params = TISCI_BCDMA_TCHAN_VALID_PARAMS;
+	req_tx.nav_id = tisci_rm->tisci_dev_id;
+	req_tx.index = tchan->id;
+	req_tx.tx_supr_tdpkt = uc->config.notdpkt;
+	if (ud->match_data->flags & UDMA_FLAG_TDTYPE) {
+		/* wait for peer to complete the teardown for PDMAs */
+		req_tx.valid_params |=
+				TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID;
+		req_tx.tx_tdtype = 1;
+	}
+
+	ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx);
+	if (ret)
+		dev_err(ud->dev, "tchan%d cfg failed %d\n", tchan->id, ret);
+
+	return ret;
+}
+
 static int udma_tisci_rx_channel_config(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
@@ -1627,7 +1879,7 @@ static int udma_tisci_rx_channel_config(struct udma_chan *uc)
 		fetch_size = sizeof(struct cppi5_desc_hdr_t);
 	}
 
-	req_rx.valid_params = TISCI_RCHAN_VALID_PARAMS;
+	req_rx.valid_params = TISCI_UDMA_RCHAN_VALID_PARAMS;
 	req_rx.nav_id = tisci_rm->tisci_dev_id;
 	req_rx.index = rchan->id;
 	req_rx.rx_fetch_size =  fetch_size >> 2;
@@ -1686,6 +1938,26 @@ static int udma_tisci_rx_channel_config(struct udma_chan *uc)
 	return 0;
 }
 
+static int bcdma_tisci_rx_channel_config(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops;
+	struct udma_rchan *rchan = uc->rchan;
+	struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 };
+	int ret = 0;
+
+	req_rx.valid_params = TISCI_BCDMA_RCHAN_VALID_PARAMS;
+	req_rx.nav_id = tisci_rm->tisci_dev_id;
+	req_rx.index = rchan->id;
+
+	ret = tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx);
+	if (ret)
+		dev_err(ud->dev, "rchan%d cfg failed %d\n", rchan->id, ret);
+
+	return ret;
+}
+
 static int udma_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct udma_chan *uc = to_udma_chan(chan);
@@ -1695,6 +1967,8 @@ static int udma_alloc_chan_resources(struct dma_chan *chan)
 	u32 irq_udma_idx;
 	int ret;
 
+	uc->dma_dev = ud->dev;
+
 	if (uc->config.pkt_mode || uc->config.dir == DMA_MEM_TO_MEM) {
 		uc->use_dma_pool = true;
 		/* in case of MEM_TO_MEM we have maximum of two TRs */
@@ -1790,7 +2064,7 @@ static int udma_alloc_chan_resources(struct dma_chan *chan)
 					K3_PSIL_DST_THREAD_ID_OFFSET;
 
 		irq_ring = uc->rflow->r_ring;
-		irq_udma_idx = soc_data->rchan_oes_offset + uc->rchan->id;
+		irq_udma_idx = soc_data->oes.udma_rchan + uc->rchan->id;
 
 		ret = udma_tisci_rx_channel_config(uc);
 		break;
@@ -1890,6 +2164,218 @@ err_cleanup:
 	return ret;
 }
 
+static int bcdma_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct udma_chan *uc = to_udma_chan(chan);
+	struct udma_dev *ud = to_udma_dev(chan->device);
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+	u32 irq_udma_idx, irq_ring_idx;
+	int ret;
+
+	/* Only TR mode is supported */
+	uc->config.pkt_mode = false;
+
+	/*
+	 * Make sure that the completion is in a known state:
+	 * No teardown, the channel is idle
+	 */
+	reinit_completion(&uc->teardown_completed);
+	complete_all(&uc->teardown_completed);
+	uc->state = UDMA_CHAN_IS_IDLE;
+
+	switch (uc->config.dir) {
+	case DMA_MEM_TO_MEM:
+		/* Non synchronized - mem to mem type of transfer */
+		dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-MEM\n", __func__,
+			uc->id);
+
+		ret = bcdma_alloc_bchan_resources(uc);
+		if (ret)
+			return ret;
+
+		irq_ring_idx = uc->bchan->id + oes->bcdma_bchan_ring;
+		irq_udma_idx = uc->bchan->id + oes->bcdma_bchan_data;
+
+		ret = bcdma_tisci_m2m_channel_config(uc);
+		break;
+	case DMA_MEM_TO_DEV:
+		/* Slave transfer synchronized - mem to dev (TX) trasnfer */
+		dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-DEV\n", __func__,
+			uc->id);
+
+		ret = udma_alloc_tx_resources(uc);
+		if (ret) {
+			uc->config.remote_thread_id = -1;
+			return ret;
+		}
+
+		uc->config.src_thread = ud->psil_base + uc->tchan->id;
+		uc->config.dst_thread = uc->config.remote_thread_id;
+		uc->config.dst_thread |= K3_PSIL_DST_THREAD_ID_OFFSET;
+
+		irq_ring_idx = uc->tchan->id + oes->bcdma_tchan_ring;
+		irq_udma_idx = uc->tchan->id + oes->bcdma_tchan_data;
+
+		ret = bcdma_tisci_tx_channel_config(uc);
+		break;
+	case DMA_DEV_TO_MEM:
+		/* Slave transfer synchronized - dev to mem (RX) trasnfer */
+		dev_dbg(uc->ud->dev, "%s: chan%d as DEV-to-MEM\n", __func__,
+			uc->id);
+
+		ret = udma_alloc_rx_resources(uc);
+		if (ret) {
+			uc->config.remote_thread_id = -1;
+			return ret;
+		}
+
+		uc->config.src_thread = uc->config.remote_thread_id;
+		uc->config.dst_thread = (ud->psil_base + uc->rchan->id) |
+					K3_PSIL_DST_THREAD_ID_OFFSET;
+
+		irq_ring_idx = uc->rchan->id + oes->bcdma_rchan_ring;
+		irq_udma_idx = uc->rchan->id + oes->bcdma_rchan_data;
+
+		ret = bcdma_tisci_rx_channel_config(uc);
+		break;
+	default:
+		/* Can not happen */
+		dev_err(uc->ud->dev, "%s: chan%d invalid direction (%u)\n",
+			__func__, uc->id, uc->config.dir);
+		return -EINVAL;
+	}
+
+	/* check if the channel configuration was successful */
+	if (ret)
+		goto err_res_free;
+
+	if (udma_is_chan_running(uc)) {
+		dev_warn(ud->dev, "chan%d: is running!\n", uc->id);
+		udma_reset_chan(uc, false);
+		if (udma_is_chan_running(uc)) {
+			dev_err(ud->dev, "chan%d: won't stop!\n", uc->id);
+			ret = -EBUSY;
+			goto err_res_free;
+		}
+	}
+
+	uc->dma_dev = dmaengine_get_dma_device(chan);
+	if (uc->config.dir == DMA_MEM_TO_MEM  && !uc->config.tr_trigger_type) {
+		uc->config.hdesc_size = cppi5_trdesc_calc_size(
+					sizeof(struct cppi5_tr_type15_t), 2);
+
+		uc->hdesc_pool = dma_pool_create(uc->name, ud->ddev.dev,
+						 uc->config.hdesc_size,
+						 ud->desc_align,
+						 0);
+		if (!uc->hdesc_pool) {
+			dev_err(ud->ddev.dev,
+				"Descriptor pool allocation failed\n");
+			uc->use_dma_pool = false;
+			return -ENOMEM;
+		}
+
+		uc->use_dma_pool = true;
+	} else if (uc->config.dir != DMA_MEM_TO_MEM) {
+		/* PSI-L pairing */
+		ret = navss_psil_pair(ud, uc->config.src_thread,
+				      uc->config.dst_thread);
+		if (ret) {
+			dev_err(ud->dev,
+				"PSI-L pairing failed: 0x%04x -> 0x%04x\n",
+				uc->config.src_thread, uc->config.dst_thread);
+			goto err_res_free;
+		}
+
+		uc->psil_paired = true;
+	}
+
+	uc->irq_num_ring = ti_sci_inta_msi_get_virq(ud->dev, irq_ring_idx);
+	if (uc->irq_num_ring <= 0) {
+		dev_err(ud->dev, "Failed to get ring irq (index: %u)\n",
+			irq_ring_idx);
+		ret = -EINVAL;
+		goto err_psi_free;
+	}
+
+	ret = request_irq(uc->irq_num_ring, udma_ring_irq_handler,
+			  IRQF_TRIGGER_HIGH, uc->name, uc);
+	if (ret) {
+		dev_err(ud->dev, "chan%d: ring irq request failed\n", uc->id);
+		goto err_irq_free;
+	}
+
+	/* Event from BCDMA (TR events) only needed for slave channels */
+	if (is_slave_direction(uc->config.dir)) {
+		uc->irq_num_udma = ti_sci_inta_msi_get_virq(ud->dev,
+							    irq_udma_idx);
+		if (uc->irq_num_udma <= 0) {
+			dev_err(ud->dev, "Failed to get bcdma irq (index: %u)\n",
+				irq_udma_idx);
+			free_irq(uc->irq_num_ring, uc);
+			ret = -EINVAL;
+			goto err_irq_free;
+		}
+
+		ret = request_irq(uc->irq_num_udma, udma_udma_irq_handler, 0,
+				  uc->name, uc);
+		if (ret) {
+			dev_err(ud->dev, "chan%d: BCDMA irq request failed\n",
+				uc->id);
+			free_irq(uc->irq_num_ring, uc);
+			goto err_irq_free;
+		}
+	} else {
+		uc->irq_num_udma = 0;
+	}
+
+	udma_reset_rings(uc);
+
+	INIT_DELAYED_WORK_ONSTACK(&uc->tx_drain.work,
+				  udma_check_tx_completion);
+	return 0;
+
+err_irq_free:
+	uc->irq_num_ring = 0;
+	uc->irq_num_udma = 0;
+err_psi_free:
+	if (uc->psil_paired)
+		navss_psil_unpair(ud, uc->config.src_thread,
+				  uc->config.dst_thread);
+	uc->psil_paired = false;
+err_res_free:
+	bcdma_free_bchan_resources(uc);
+	udma_free_tx_resources(uc);
+	udma_free_rx_resources(uc);
+
+	udma_reset_uchan(uc);
+
+	if (uc->use_dma_pool) {
+		dma_pool_destroy(uc->hdesc_pool);
+		uc->use_dma_pool = false;
+	}
+
+	return ret;
+}
+
+static int bcdma_router_config(struct dma_chan *chan)
+{
+	struct k3_event_route_data *router_data = chan->route_data;
+	struct udma_chan *uc = to_udma_chan(chan);
+	u32 trigger_event;
+
+	if (!uc->bchan)
+		return -EINVAL;
+
+	if (uc->config.tr_trigger_type != 1 && uc->config.tr_trigger_type != 2)
+		return -EINVAL;
+
+	trigger_event = uc->ud->soc_data->bcdma_trigger_event_offset;
+	trigger_event += (uc->bchan->id * 2) + uc->config.tr_trigger_type - 1;
+
+	return router_data->set_event(router_data->priv, trigger_event);
+}
+
 static int udma_slave_config(struct dma_chan *chan,
 			     struct dma_slave_config *cfg)
 {
@@ -2034,6 +2520,7 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl,
 	size_t tr_size;
 	int num_tr = 0;
 	int tr_idx = 0;
+	u64 asel;
 
 	/* estimate the number of TRs we will need */
 	for_each_sg(sgl, sgent, sglen, i) {
@@ -2051,6 +2538,11 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl,
 
 	d->sglen = sglen;
 
+	if (uc->ud->match_data->type == DMA_TYPE_UDMA)
+		asel = 0;
+	else
+		asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT;
+
 	tr_req = d->hwdesc[0].tr_req_base;
 	for_each_sg(sgl, sgent, sglen, i) {
 		dma_addr_t sg_addr = sg_dma_address(sgent);
@@ -2069,6 +2561,7 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl,
 			      false, CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
 		cppi5_tr_csf_set(&tr_req[tr_idx].flags, CPPI5_TR_CSF_SUPR_EVT);
 
+		sg_addr |= asel;
 		tr_req[tr_idx].addr = sg_addr;
 		tr_req[tr_idx].icnt0 = tr0_cnt0;
 		tr_req[tr_idx].icnt1 = tr0_cnt1;
@@ -2098,6 +2591,205 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl,
 	return d;
 }
 
+static struct udma_desc *
+udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl,
+				unsigned int sglen,
+				enum dma_transfer_direction dir,
+				unsigned long tx_flags, void *context)
+{
+	struct scatterlist *sgent;
+	struct cppi5_tr_type15_t *tr_req = NULL;
+	enum dma_slave_buswidth dev_width;
+	u16 tr_cnt0, tr_cnt1;
+	dma_addr_t dev_addr;
+	struct udma_desc *d;
+	unsigned int i;
+	size_t tr_size, sg_len;
+	int num_tr = 0;
+	int tr_idx = 0;
+	u32 burst, trigger_size, port_window;
+	u64 asel;
+
+	if (dir == DMA_DEV_TO_MEM) {
+		dev_addr = uc->cfg.src_addr;
+		dev_width = uc->cfg.src_addr_width;
+		burst = uc->cfg.src_maxburst;
+		port_window = uc->cfg.src_port_window_size;
+	} else if (dir == DMA_MEM_TO_DEV) {
+		dev_addr = uc->cfg.dst_addr;
+		dev_width = uc->cfg.dst_addr_width;
+		burst = uc->cfg.dst_maxburst;
+		port_window = uc->cfg.dst_port_window_size;
+	} else {
+		dev_err(uc->ud->dev, "%s: bad direction?\n", __func__);
+		return NULL;
+	}
+
+	if (!burst)
+		burst = 1;
+
+	if (port_window) {
+		if (port_window != burst) {
+			dev_err(uc->ud->dev,
+				"The burst must be equal to port_window\n");
+			return NULL;
+		}
+
+		tr_cnt0 = dev_width * port_window;
+		tr_cnt1 = 1;
+	} else {
+		tr_cnt0 = dev_width;
+		tr_cnt1 = burst;
+	}
+	trigger_size = tr_cnt0 * tr_cnt1;
+
+	/* estimate the number of TRs we will need */
+	for_each_sg(sgl, sgent, sglen, i) {
+		sg_len = sg_dma_len(sgent);
+
+		if (sg_len % trigger_size) {
+			dev_err(uc->ud->dev,
+				"Not aligned SG entry (%zu for %u)\n", sg_len,
+				trigger_size);
+			return NULL;
+		}
+
+		if (sg_len / trigger_size < SZ_64K)
+			num_tr++;
+		else
+			num_tr += 2;
+	}
+
+	/* Now allocate and setup the descriptor. */
+	tr_size = sizeof(struct cppi5_tr_type15_t);
+	d = udma_alloc_tr_desc(uc, tr_size, num_tr, dir);
+	if (!d)
+		return NULL;
+
+	d->sglen = sglen;
+
+	if (uc->ud->match_data->type == DMA_TYPE_UDMA) {
+		asel = 0;
+	} else {
+		asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT;
+		dev_addr |= asel;
+	}
+
+	tr_req = d->hwdesc[0].tr_req_base;
+	for_each_sg(sgl, sgent, sglen, i) {
+		u16 tr0_cnt2, tr0_cnt3, tr1_cnt2;
+		dma_addr_t sg_addr = sg_dma_address(sgent);
+
+		sg_len = sg_dma_len(sgent);
+		num_tr = udma_get_tr_counters(sg_len / trigger_size, 0,
+					      &tr0_cnt2, &tr0_cnt3, &tr1_cnt2);
+		if (num_tr < 0) {
+			dev_err(uc->ud->dev, "size %zu is not supported\n",
+				sg_len);
+			udma_free_hwdesc(uc, d);
+			kfree(d);
+			return NULL;
+		}
+
+		cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15, false,
+			      true, CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
+		cppi5_tr_csf_set(&tr_req[tr_idx].flags, CPPI5_TR_CSF_SUPR_EVT);
+		cppi5_tr_set_trigger(&tr_req[tr_idx].flags,
+				     uc->config.tr_trigger_type,
+				     CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, 0, 0);
+
+		sg_addr |= asel;
+		if (dir == DMA_DEV_TO_MEM) {
+			tr_req[tr_idx].addr = dev_addr;
+			tr_req[tr_idx].icnt0 = tr_cnt0;
+			tr_req[tr_idx].icnt1 = tr_cnt1;
+			tr_req[tr_idx].icnt2 = tr0_cnt2;
+			tr_req[tr_idx].icnt3 = tr0_cnt3;
+			tr_req[tr_idx].dim1 = (-1) * tr_cnt0;
+
+			tr_req[tr_idx].daddr = sg_addr;
+			tr_req[tr_idx].dicnt0 = tr_cnt0;
+			tr_req[tr_idx].dicnt1 = tr_cnt1;
+			tr_req[tr_idx].dicnt2 = tr0_cnt2;
+			tr_req[tr_idx].dicnt3 = tr0_cnt3;
+			tr_req[tr_idx].ddim1 = tr_cnt0;
+			tr_req[tr_idx].ddim2 = trigger_size;
+			tr_req[tr_idx].ddim3 = trigger_size * tr0_cnt2;
+		} else {
+			tr_req[tr_idx].addr = sg_addr;
+			tr_req[tr_idx].icnt0 = tr_cnt0;
+			tr_req[tr_idx].icnt1 = tr_cnt1;
+			tr_req[tr_idx].icnt2 = tr0_cnt2;
+			tr_req[tr_idx].icnt3 = tr0_cnt3;
+			tr_req[tr_idx].dim1 = tr_cnt0;
+			tr_req[tr_idx].dim2 = trigger_size;
+			tr_req[tr_idx].dim3 = trigger_size * tr0_cnt2;
+
+			tr_req[tr_idx].daddr = dev_addr;
+			tr_req[tr_idx].dicnt0 = tr_cnt0;
+			tr_req[tr_idx].dicnt1 = tr_cnt1;
+			tr_req[tr_idx].dicnt2 = tr0_cnt2;
+			tr_req[tr_idx].dicnt3 = tr0_cnt3;
+			tr_req[tr_idx].ddim1 = (-1) * tr_cnt0;
+		}
+
+		tr_idx++;
+
+		if (num_tr == 2) {
+			cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15,
+				      false, true,
+				      CPPI5_TR_EVENT_SIZE_COMPLETION, 0);
+			cppi5_tr_csf_set(&tr_req[tr_idx].flags,
+					 CPPI5_TR_CSF_SUPR_EVT);
+			cppi5_tr_set_trigger(&tr_req[tr_idx].flags,
+					     uc->config.tr_trigger_type,
+					     CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC,
+					     0, 0);
+
+			sg_addr += trigger_size * tr0_cnt2 * tr0_cnt3;
+			if (dir == DMA_DEV_TO_MEM) {
+				tr_req[tr_idx].addr = dev_addr;
+				tr_req[tr_idx].icnt0 = tr_cnt0;
+				tr_req[tr_idx].icnt1 = tr_cnt1;
+				tr_req[tr_idx].icnt2 = tr1_cnt2;
+				tr_req[tr_idx].icnt3 = 1;
+				tr_req[tr_idx].dim1 = (-1) * tr_cnt0;
+
+				tr_req[tr_idx].daddr = sg_addr;
+				tr_req[tr_idx].dicnt0 = tr_cnt0;
+				tr_req[tr_idx].dicnt1 = tr_cnt1;
+				tr_req[tr_idx].dicnt2 = tr1_cnt2;
+				tr_req[tr_idx].dicnt3 = 1;
+				tr_req[tr_idx].ddim1 = tr_cnt0;
+				tr_req[tr_idx].ddim2 = trigger_size;
+			} else {
+				tr_req[tr_idx].addr = sg_addr;
+				tr_req[tr_idx].icnt0 = tr_cnt0;
+				tr_req[tr_idx].icnt1 = tr_cnt1;
+				tr_req[tr_idx].icnt2 = tr1_cnt2;
+				tr_req[tr_idx].icnt3 = 1;
+				tr_req[tr_idx].dim1 = tr_cnt0;
+				tr_req[tr_idx].dim2 = trigger_size;
+
+				tr_req[tr_idx].daddr = dev_addr;
+				tr_req[tr_idx].dicnt0 = tr_cnt0;
+				tr_req[tr_idx].dicnt1 = tr_cnt1;
+				tr_req[tr_idx].dicnt2 = tr1_cnt2;
+				tr_req[tr_idx].dicnt3 = 1;
+				tr_req[tr_idx].ddim1 = (-1) * tr_cnt0;
+			}
+			tr_idx++;
+		}
+
+		d->residue += sg_len;
+	}
+
+	cppi5_tr_csf_set(&tr_req[tr_idx - 1].flags,
+			 CPPI5_TR_CSF_SUPR_EVT | CPPI5_TR_CSF_EOP);
+
+	return d;
+}
+
 static int udma_configure_statictr(struct udma_chan *uc, struct udma_desc *d,
 				   enum dma_slave_buswidth dev_width,
 				   u16 elcnt)
@@ -2339,7 +3031,8 @@ udma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	struct udma_desc *d;
 	u32 burst;
 
-	if (dir != uc->config.dir) {
+	if (dir != uc->config.dir &&
+	    (uc->config.dir == DMA_MEM_TO_MEM && !uc->config.tr_trigger_type)) {
 		dev_err(chan->device->dev,
 			"%s: chan%d is for %s, not supporting %s\n",
 			__func__, uc->id,
@@ -2365,9 +3058,12 @@ udma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 	if (uc->config.pkt_mode)
 		d = udma_prep_slave_sg_pkt(uc, sgl, sglen, dir, tx_flags,
 					   context);
-	else
+	else if (is_slave_direction(uc->config.dir))
 		d = udma_prep_slave_sg_tr(uc, sgl, sglen, dir, tx_flags,
 					  context);
+	else
+		d = udma_prep_slave_sg_triggered_tr(uc, sgl, sglen, dir,
+						    tx_flags, context);
 
 	if (!d)
 		return NULL;
@@ -2421,7 +3117,12 @@ udma_prep_dma_cyclic_tr(struct udma_chan *uc, dma_addr_t buf_addr,
 		return NULL;
 
 	tr_req = d->hwdesc[0].tr_req_base;
-	period_addr = buf_addr;
+	if (uc->ud->match_data->type == DMA_TYPE_UDMA)
+		period_addr = buf_addr;
+	else
+		period_addr = buf_addr |
+			((u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT);
+
 	for (i = 0; i < periods; i++) {
 		int tr_idx = i * num_tr;
 
@@ -2627,6 +3328,11 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	d->tr_idx = 0;
 	d->residue = len;
 
+	if (uc->ud->match_data->type != DMA_TYPE_UDMA) {
+		src |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT;
+		dest |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT;
+	}
+
 	tr_req = d->hwdesc[0].tr_req_base;
 
 	cppi5_tr_init(&tr_req[0].flags, CPPI5_TR_TYPE15, false, true,
@@ -2984,6 +3690,7 @@ static void udma_free_chan_resources(struct dma_chan *chan)
 	vchan_free_chan_resources(&uc->vc);
 	tasklet_kill(&uc->vc.task);
 
+	bcdma_free_bchan_resources(uc);
 	udma_free_tx_resources(uc);
 	udma_free_rx_resources(uc);
 	udma_reset_uchan(uc);
@@ -2995,10 +3702,13 @@ static void udma_free_chan_resources(struct dma_chan *chan)
 }
 
 static struct platform_driver udma_driver;
+static struct platform_driver bcdma_driver;
 
 struct udma_filter_param {
 	int remote_thread_id;
 	u32 atype;
+	u32 asel;
+	u32 tr_trigger_type;
 };
 
 static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
@@ -3009,7 +3719,8 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 	struct udma_chan *uc;
 	struct udma_dev *ud;
 
-	if (chan->device->dev->driver != &udma_driver.driver)
+	if (chan->device->dev->driver != &udma_driver.driver &&
+	    chan->device->dev->driver != &bcdma_driver.driver)
 		return false;
 
 	uc = to_udma_chan(chan);
@@ -3023,13 +3734,25 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 		return false;
 	}
 
+	if (filter_param->asel > 15) {
+		dev_err(ud->dev, "Invalid channel asel: %u\n",
+			filter_param->asel);
+		return false;
+	}
+
 	ucc->remote_thread_id = filter_param->remote_thread_id;
 	ucc->atype = filter_param->atype;
+	ucc->asel = filter_param->asel;
+	ucc->tr_trigger_type = filter_param->tr_trigger_type;
 
-	if (ucc->remote_thread_id & K3_PSIL_DST_THREAD_ID_OFFSET)
+	if (ucc->tr_trigger_type) {
+		ucc->dir = DMA_MEM_TO_MEM;
+		goto triggered_bchan;
+	} else if (ucc->remote_thread_id & K3_PSIL_DST_THREAD_ID_OFFSET) {
 		ucc->dir = DMA_MEM_TO_DEV;
-	else
+	} else {
 		ucc->dir = DMA_DEV_TO_MEM;
+	}
 
 	ep_config = psil_get_ep_config(ucc->remote_thread_id);
 	if (IS_ERR(ep_config)) {
@@ -3038,6 +3761,19 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 		ucc->dir = DMA_MEM_TO_MEM;
 		ucc->remote_thread_id = -1;
 		ucc->atype = 0;
+		ucc->asel = 0;
+		return false;
+	}
+
+	if (ud->match_data->type == DMA_TYPE_BCDMA &&
+	    ep_config->pkt_mode) {
+		dev_err(ud->dev,
+			"Only TR mode is supported (psi-l thread 0x%04x)\n",
+			ucc->remote_thread_id);
+		ucc->dir = DMA_MEM_TO_MEM;
+		ucc->remote_thread_id = -1;
+		ucc->atype = 0;
+		ucc->asel = 0;
 		return false;
 	}
 
@@ -3069,6 +3805,13 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 		ucc->remote_thread_id, dmaengine_get_direction_text(ucc->dir));
 
 	return true;
+
+triggered_bchan:
+	dev_dbg(ud->dev, "chan%d: triggered channel (type: %u)\n", uc->id,
+		ucc->tr_trigger_type);
+
+	return true;
+
 }
 
 static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec,
@@ -3079,14 +3822,33 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec,
 	struct udma_filter_param filter_param;
 	struct dma_chan *chan;
 
-	if (dma_spec->args_count != 1 && dma_spec->args_count != 2)
-		return NULL;
+	if (ud->match_data->type == DMA_TYPE_BCDMA) {
+		if (dma_spec->args_count != 3)
+			return NULL;
 
-	filter_param.remote_thread_id = dma_spec->args[0];
-	if (dma_spec->args_count == 2)
-		filter_param.atype = dma_spec->args[1];
-	else
+		filter_param.tr_trigger_type = dma_spec->args[0];
+		filter_param.remote_thread_id = dma_spec->args[1];
+		filter_param.asel = dma_spec->args[2];
 		filter_param.atype = 0;
+	} else {
+		if (dma_spec->args_count != 1 && dma_spec->args_count != 2)
+			return NULL;
+
+		filter_param.remote_thread_id = dma_spec->args[0];
+		filter_param.tr_trigger_type = 0;
+		if (dma_spec->args_count == 2) {
+			if (ud->match_data->type == DMA_TYPE_UDMA) {
+				filter_param.atype = dma_spec->args[1];
+				filter_param.asel = 0;
+			} else {
+				filter_param.atype = 0;
+				filter_param.asel = dma_spec->args[1];
+			}
+		} else {
+			filter_param.atype = 0;
+			filter_param.asel = 0;
+		}
+	}
 
 	chan = __dma_request_channel(&mask, udma_dma_filter_fn, &filter_param,
 				     ofdma->of_node);
@@ -3099,18 +3861,21 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec,
 }
 
 static struct udma_match_data am654_main_data = {
+	.type = DMA_TYPE_UDMA,
 	.psil_base = 0x1000,
 	.enable_memcpy_support = true,
 	.statictr_z_mask = GENMASK(11, 0),
 };
 
 static struct udma_match_data am654_mcu_data = {
+	.type = DMA_TYPE_UDMA,
 	.psil_base = 0x6000,
 	.enable_memcpy_support = false,
 	.statictr_z_mask = GENMASK(11, 0),
 };
 
 static struct udma_match_data j721e_main_data = {
+	.type = DMA_TYPE_UDMA,
 	.psil_base = 0x1000,
 	.enable_memcpy_support = true,
 	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
@@ -3118,12 +3883,21 @@ static struct udma_match_data j721e_main_data = {
 };
 
 static struct udma_match_data j721e_mcu_data = {
+	.type = DMA_TYPE_UDMA,
 	.psil_base = 0x6000,
 	.enable_memcpy_support = false, /* MEM_TO_MEM is slow via MCU UDMA */
 	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
 	.statictr_z_mask = GENMASK(23, 0),
 };
 
+static struct udma_match_data am64_bcdma_data = {
+	.type = DMA_TYPE_BCDMA,
+	.psil_base = 0x2000, /* for tchan and rchan, not applicable to bchan */
+	.enable_memcpy_support = true, /* Supported via bchan */
+	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
+	.statictr_z_mask = GENMASK(23, 0),
+};
+
 static const struct of_device_id udma_of_match[] = {
 	{
 		.compatible = "ti,am654-navss-main-udmap",
@@ -3142,30 +3916,88 @@ static const struct of_device_id udma_of_match[] = {
 	{ /* Sentinel */ },
 };
 
+static const struct of_device_id bcdma_of_match[] = {
+	{
+		.compatible = "ti,am64-dmss-bcdma",
+		.data = &am64_bcdma_data,
+	},
+	{ /* Sentinel */ },
+};
+
 static struct udma_soc_data am654_soc_data = {
-	.rchan_oes_offset = 0x200,
+	.oes = {
+		.udma_rchan = 0x200,
+	},
 };
 
 static struct udma_soc_data j721e_soc_data = {
-	.rchan_oes_offset = 0x400,
+	.oes = {
+		.udma_rchan = 0x400,
+	},
 };
 
 static struct udma_soc_data j7200_soc_data = {
-	.rchan_oes_offset = 0x80,
+	.oes = {
+		.udma_rchan = 0x80,
+	},
+};
+
+static struct udma_soc_data am64_soc_data = {
+	.oes = {
+		.bcdma_bchan_data = 0x2200,
+		.bcdma_bchan_ring = 0x2400,
+		.bcdma_tchan_data = 0x2800,
+		.bcdma_tchan_ring = 0x2a00,
+		.bcdma_rchan_data = 0x2e00,
+		.bcdma_rchan_ring = 0x3000,
+	},
+	.bcdma_trigger_event_offset = 0xc400,
 };
 
 static const struct soc_device_attribute k3_soc_devices[] = {
 	{ .family = "AM65X", .data = &am654_soc_data },
 	{ .family = "J721E", .data = &j721e_soc_data },
 	{ .family = "J7200", .data = &j7200_soc_data },
+	{ .family = "AM64X", .data = &am64_soc_data },
 	{ /* sentinel */ }
 };
 
 static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud)
 {
+	u32 cap2, cap3;
 	int i;
 
-	for (i = 0; i < MMR_LAST; i++) {
+	ud->mmrs[MMR_GCFG] = devm_platform_ioremap_resource_byname(pdev, mmr_names[MMR_GCFG]);
+	if (IS_ERR(ud->mmrs[MMR_GCFG]))
+		return PTR_ERR(ud->mmrs[MMR_GCFG]);
+
+	cap2 = udma_read(ud->mmrs[MMR_GCFG], 0x28);
+	cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c);
+
+	switch (ud->match_data->type) {
+	case DMA_TYPE_UDMA:
+		ud->rflow_cnt = UDMA_CAP3_RFLOW_CNT(cap3);
+		ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2);
+		ud->echan_cnt = UDMA_CAP2_ECHAN_CNT(cap2);
+		ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2);
+		break;
+	case DMA_TYPE_BCDMA:
+		ud->bchan_cnt = BCDMA_CAP2_BCHAN_CNT(cap2);
+		ud->tchan_cnt = BCDMA_CAP2_TCHAN_CNT(cap2);
+		ud->rchan_cnt = BCDMA_CAP2_RCHAN_CNT(cap2);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	for (i = 1; i < MMR_LAST; i++) {
+		if (i == MMR_BCHANRT && ud->bchan_cnt == 0)
+			continue;
+		if (i == MMR_TCHANRT && ud->tchan_cnt == 0)
+			continue;
+		if (i == MMR_RCHANRT && ud->rchan_cnt == 0)
+			continue;
+
 		ud->mmrs[i] = devm_platform_ioremap_resource_byname(pdev, mmr_names[i]);
 		if (IS_ERR(ud->mmrs[i]))
 			return PTR_ERR(ud->mmrs[i]);
@@ -3185,27 +4017,23 @@ static void udma_mark_resource_ranges(struct udma_dev *ud, unsigned long *map,
 		rm_desc->num_sec);
 }
 
+static const char * const range_names[] = {
+	[RM_RANGE_BCHAN] = "ti,sci-rm-range-bchan",
+	[RM_RANGE_TCHAN] = "ti,sci-rm-range-tchan",
+	[RM_RANGE_RCHAN] = "ti,sci-rm-range-rchan",
+	[RM_RANGE_RFLOW] = "ti,sci-rm-range-rflow"
+};
+
 static int udma_setup_resources(struct udma_dev *ud)
 {
+	int ret, i, j;
 	struct device *dev = ud->dev;
-	int ch_count, ret, i, j;
-	u32 cap2, cap3;
 	struct ti_sci_resource *rm_res, irq_res;
 	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
-	static const char * const range_names[] = { "ti,sci-rm-range-tchan",
-						    "ti,sci-rm-range-rchan",
-						    "ti,sci-rm-range-rflow" };
-
-	cap2 = udma_read(ud->mmrs[MMR_GCFG], UDMA_CAP_REG(2));
-	cap3 = udma_read(ud->mmrs[MMR_GCFG], UDMA_CAP_REG(3));
-
-	ud->rflow_cnt = UDMA_CAP3_RFLOW_CNT(cap3);
-	ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2);
-	ud->echan_cnt = UDMA_CAP2_ECHAN_CNT(cap2);
-	ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2);
-	ch_count  = ud->tchan_cnt + ud->rchan_cnt;
+	u32 cap3;
 
 	/* Set up the throughput level start indexes */
+	cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c);
 	if (of_device_is_compatible(dev->of_node,
 				    "ti,am654-navss-main-udmap")) {
 		ud->tpl_levels = 2;
@@ -3262,11 +4090,15 @@ static int udma_setup_resources(struct udma_dev *ud)
 	bitmap_set(ud->rflow_gp_map, 0, ud->rflow_cnt);
 
 	/* Get resource ranges from tisci */
-	for (i = 0; i < RM_RANGE_LAST; i++)
+	for (i = 0; i < RM_RANGE_LAST; i++) {
+		if (i == RM_RANGE_BCHAN)
+			continue;
+
 		tisci_rm->rm_ranges[i] =
 			devm_ti_sci_get_of_resource(tisci_rm->tisci, dev,
 						    tisci_rm->tisci_dev_id,
 						    (char *)range_names[i]);
+	}
 
 	/* tchan ranges */
 	rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN];
@@ -3304,12 +4136,12 @@ static int udma_setup_resources(struct udma_dev *ud)
 	for (j = 0; j < rm_res->sets; j++, i++) {
 		if (rm_res->desc[j].num) {
 			irq_res.desc[i].start = rm_res->desc[j].start +
-					ud->soc_data->rchan_oes_offset;
+					ud->soc_data->oes.udma_rchan;
 			irq_res.desc[i].num = rm_res->desc[j].num;
 		}
 		if (rm_res->desc[j].num_sec) {
 			irq_res.desc[i].start_sec = rm_res->desc[j].start_sec +
-					ud->soc_data->rchan_oes_offset;
+					ud->soc_data->oes.udma_rchan;
 			irq_res.desc[i].num_sec = rm_res->desc[j].num_sec;
 		}
 	}
@@ -3332,6 +4164,174 @@ static int udma_setup_resources(struct udma_dev *ud)
 						  &rm_res->desc[i], "gp-rflow");
 	}
 
+	return 0;
+}
+
+static int bcdma_setup_resources(struct udma_dev *ud)
+{
+	int ret, i, j;
+	struct device *dev = ud->dev;
+	struct ti_sci_resource *rm_res, irq_res;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+
+	ud->bchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->bchan_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+	ud->bchans = devm_kcalloc(dev, ud->bchan_cnt, sizeof(*ud->bchans),
+				  GFP_KERNEL);
+	ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+	ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans),
+				  GFP_KERNEL);
+	ud->rchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->rchan_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+	ud->rchans = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rchans),
+				  GFP_KERNEL);
+	/* BCDMA do not really have flows, but the driver expect it */
+	ud->rflow_in_use = devm_kcalloc(dev, BITS_TO_LONGS(ud->rchan_cnt),
+					sizeof(unsigned long),
+					GFP_KERNEL);
+	ud->rflows = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rflows),
+				  GFP_KERNEL);
+
+	if (!ud->bchan_map || !ud->tchan_map || !ud->rchan_map ||
+	    !ud->rflow_in_use || !ud->bchans || !ud->tchans || !ud->rchans ||
+	    !ud->rflows)
+		return -ENOMEM;
+
+	/* TPL is not yet supported for BCDMA */
+	ud->tpl_levels = 1;
+
+	/* Get resource ranges from tisci */
+	for (i = 0; i < RM_RANGE_LAST; i++) {
+		if (i == RM_RANGE_RFLOW)
+			continue;
+		if (i == RM_RANGE_BCHAN && ud->bchan_cnt == 0)
+			continue;
+		if (i == RM_RANGE_TCHAN && ud->tchan_cnt == 0)
+			continue;
+		if (i == RM_RANGE_RCHAN && ud->rchan_cnt == 0)
+			continue;
+
+		tisci_rm->rm_ranges[i] =
+			devm_ti_sci_get_of_resource(tisci_rm->tisci, dev,
+						    tisci_rm->tisci_dev_id,
+						    (char *)range_names[i]);
+	}
+
+	irq_res.sets = 0;
+
+	/* bchan ranges */
+	if (ud->bchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_BCHAN];
+		if (IS_ERR(rm_res)) {
+			bitmap_zero(ud->bchan_map, ud->bchan_cnt);
+		} else {
+			bitmap_fill(ud->bchan_map, ud->bchan_cnt);
+			for (i = 0; i < rm_res->sets; i++)
+				udma_mark_resource_ranges(ud, ud->bchan_map,
+							  &rm_res->desc[i],
+							  "bchan");
+		}
+		irq_res.sets += rm_res->sets;
+	}
+
+	/* tchan ranges */
+	if (ud->tchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN];
+		if (IS_ERR(rm_res)) {
+			bitmap_zero(ud->tchan_map, ud->tchan_cnt);
+		} else {
+			bitmap_fill(ud->tchan_map, ud->tchan_cnt);
+			for (i = 0; i < rm_res->sets; i++)
+				udma_mark_resource_ranges(ud, ud->tchan_map,
+							  &rm_res->desc[i],
+							  "tchan");
+		}
+		irq_res.sets += rm_res->sets * 2;
+	}
+
+	/* rchan ranges */
+	if (ud->rchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN];
+		if (IS_ERR(rm_res)) {
+			bitmap_zero(ud->rchan_map, ud->rchan_cnt);
+		} else {
+			bitmap_fill(ud->rchan_map, ud->rchan_cnt);
+			for (i = 0; i < rm_res->sets; i++)
+				udma_mark_resource_ranges(ud, ud->rchan_map,
+							  &rm_res->desc[i],
+							  "rchan");
+		}
+		irq_res.sets += rm_res->sets * 2;
+	}
+
+	irq_res.desc = kcalloc(irq_res.sets, sizeof(*irq_res.desc), GFP_KERNEL);
+	if (ud->bchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_BCHAN];
+		for (i = 0; i < rm_res->sets; i++) {
+			irq_res.desc[i].start = rm_res->desc[i].start +
+						oes->bcdma_bchan_ring;
+			irq_res.desc[i].num = rm_res->desc[i].num;
+		}
+	}
+	if (ud->tchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN];
+		for (j = 0; j < rm_res->sets; j++, i += 2) {
+			irq_res.desc[i].start = rm_res->desc[j].start +
+						oes->bcdma_tchan_data;
+			irq_res.desc[i].num = rm_res->desc[j].num;
+
+			irq_res.desc[i + 1].start = rm_res->desc[j].start +
+						oes->bcdma_tchan_ring;
+			irq_res.desc[i + 1].num = rm_res->desc[j].num;
+		}
+	}
+	if (ud->rchan_cnt) {
+		rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN];
+		for (j = 0; j < rm_res->sets; j++, i += 2) {
+			irq_res.desc[i].start = rm_res->desc[j].start +
+						oes->bcdma_rchan_data;
+			irq_res.desc[i].num = rm_res->desc[j].num;
+
+			irq_res.desc[i + 1].start = rm_res->desc[j].start +
+						oes->bcdma_rchan_ring;
+			irq_res.desc[i + 1].num = rm_res->desc[j].num;
+		}
+	}
+
+	ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res);
+	kfree(irq_res.desc);
+	if (ret) {
+		dev_err(ud->dev, "Failed to allocate MSI interrupts\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int setup_resources(struct udma_dev *ud)
+{
+	struct device *dev = ud->dev;
+	int ch_count, ret;
+
+	switch (ud->match_data->type) {
+	case DMA_TYPE_UDMA:
+		ret = udma_setup_resources(ud);
+		break;
+	case DMA_TYPE_BCDMA:
+		ret = bcdma_setup_resources(ud);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (ret)
+		return ret;
+
+	ch_count  = ud->bchan_cnt + ud->tchan_cnt + ud->rchan_cnt;
+	if (ud->bchan_cnt)
+		ch_count -= bitmap_weight(ud->bchan_map, ud->bchan_cnt);
 	ch_count -= bitmap_weight(ud->tchan_map, ud->tchan_cnt);
 	ch_count -= bitmap_weight(ud->rchan_map, ud->rchan_cnt);
 	if (!ch_count)
@@ -3342,12 +4342,32 @@ static int udma_setup_resources(struct udma_dev *ud)
 	if (!ud->channels)
 		return -ENOMEM;
 
-	dev_info(dev, "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n",
-		 ch_count,
-		 ud->tchan_cnt - bitmap_weight(ud->tchan_map, ud->tchan_cnt),
-		 ud->rchan_cnt - bitmap_weight(ud->rchan_map, ud->rchan_cnt),
-		 ud->rflow_cnt - bitmap_weight(ud->rflow_gp_map,
-					       ud->rflow_cnt));
+	switch (ud->match_data->type) {
+	case DMA_TYPE_UDMA:
+		dev_info(dev,
+			 "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n",
+			 ch_count,
+			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
+						       ud->tchan_cnt),
+			 ud->rchan_cnt - bitmap_weight(ud->rchan_map,
+						       ud->rchan_cnt),
+			 ud->rflow_cnt - bitmap_weight(ud->rflow_gp_map,
+						       ud->rflow_cnt));
+		break;
+	case DMA_TYPE_BCDMA:
+		dev_info(dev,
+			 "Channels: %d (bchan: %u, tchan: %u, rchan: %u)\n",
+			 ch_count,
+			 ud->bchan_cnt - bitmap_weight(ud->bchan_map,
+						       ud->bchan_cnt),
+			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
+						       ud->tchan_cnt),
+			 ud->rchan_cnt - bitmap_weight(ud->rchan_map,
+						       ud->rchan_cnt));
+		break;
+	default:
+		break;
+	}
 
 	return ch_count;
 }
@@ -3456,10 +4476,19 @@ static void udma_dbg_summary_show_chan(struct seq_file *s,
 
 	seq_printf(s, " %-13s| %s", dma_chan_name(chan),
 		   chan->dbg_client_name ?: "in-use");
-	seq_printf(s, " (%s, ", dmaengine_get_direction_text(uc->config.dir));
+	if (ucc->tr_trigger_type)
+		seq_puts(s, " (triggered, ");
+	else
+		seq_printf(s, " (%s, ",
+			   dmaengine_get_direction_text(uc->config.dir));
 
 	switch (uc->config.dir) {
 	case DMA_MEM_TO_MEM:
+		if (uc->ud->match_data->type == DMA_TYPE_BCDMA) {
+			seq_printf(s, "bchan%d)\n", uc->bchan->id);
+			return;
+		}
+
 		seq_printf(s, "chan%d pair [0x%04x -> 0x%04x], ", uc->tchan->id,
 			   ucc->src_thread, ucc->dst_thread);
 		break;
@@ -3531,6 +4560,23 @@ static int udma_probe(struct platform_device *pdev)
 	if (!ud)
 		return -ENOMEM;
 
+	match = of_match_node(udma_of_match, dev->of_node);
+	if (!match) {
+		match = of_match_node(bcdma_of_match, dev->of_node);
+		if (!match) {
+			dev_err(dev, "No compatible match found\n");
+			return -ENODEV;
+		}
+	}
+	ud->match_data = match->data;
+
+	soc = soc_device_match(k3_soc_devices);
+	if (!soc) {
+		dev_err(dev, "No compatible SoC found\n");
+		return -ENODEV;
+	}
+	ud->soc_data = soc->data;
+
 	ret = udma_get_mmrs(pdev, ud);
 	if (ret)
 		return ret;
@@ -3554,16 +4600,38 @@ static int udma_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	ret = of_property_read_u32(dev->of_node, "ti,udma-atype", &ud->atype);
-	if (!ret && ud->atype > 2) {
-		dev_err(dev, "Invalid atype: %u\n", ud->atype);
-		return -EINVAL;
+	if (ud->match_data->type == DMA_TYPE_UDMA) {
+		ret = of_property_read_u32(dev->of_node, "ti,udma-atype",
+					   &ud->atype);
+		if (!ret && ud->atype > 2) {
+			dev_err(dev, "Invalid atype: %u\n", ud->atype);
+			return -EINVAL;
+		}
+	} else {
+		ret = of_property_read_u32(dev->of_node, "ti,asel",
+					   &ud->asel);
+		if (!ret && ud->asel > 15) {
+			dev_err(dev, "Invalid asel: %u\n", ud->asel);
+			return -EINVAL;
+		}
 	}
 
 	ud->tisci_rm.tisci_udmap_ops = &ud->tisci_rm.tisci->ops.rm_udmap_ops;
 	ud->tisci_rm.tisci_psil_ops = &ud->tisci_rm.tisci->ops.rm_psil_ops;
 
-	ud->ringacc = of_k3_ringacc_get_by_phandle(dev->of_node, "ti,ringacc");
+	if (ud->match_data->type == DMA_TYPE_UDMA) {
+		ud->ringacc = of_k3_ringacc_get_by_phandle(dev->of_node, "ti,ringacc");
+	} else {
+		struct k3_ringacc_init_data ring_init_data;
+
+		ring_init_data.tisci = ud->tisci_rm.tisci;
+		ring_init_data.tisci_dev_id = ud->tisci_rm.tisci_dev_id;
+		ring_init_data.num_rings = ud->bchan_cnt + ud->tchan_cnt +
+					   ud->rchan_cnt;
+
+		ud->ringacc = k3_ringacc_dmarings_init(pdev, &ring_init_data);
+	}
+
 	if (IS_ERR(ud->ringacc))
 		return PTR_ERR(ud->ringacc);
 
@@ -3574,24 +4642,9 @@ static int udma_probe(struct platform_device *pdev)
 		return -EPROBE_DEFER;
 	}
 
-	match = of_match_node(udma_of_match, dev->of_node);
-	if (!match) {
-		dev_err(dev, "No compatible match found\n");
-		return -ENODEV;
-	}
-	ud->match_data = match->data;
-
-	soc = soc_device_match(k3_soc_devices);
-	if (!soc) {
-		dev_err(dev, "No compatible SoC found\n");
-		return -ENODEV;
-	}
-	ud->soc_data = soc->data;
-
 	dma_cap_set(DMA_SLAVE, ud->ddev.cap_mask);
 	dma_cap_set(DMA_CYCLIC, ud->ddev.cap_mask);
 
-	ud->ddev.device_alloc_chan_resources = udma_alloc_chan_resources;
 	ud->ddev.device_config = udma_slave_config;
 	ud->ddev.device_prep_slave_sg = udma_prep_slave_sg;
 	ud->ddev.device_prep_dma_cyclic = udma_prep_dma_cyclic;
@@ -3605,7 +4658,21 @@ static int udma_probe(struct platform_device *pdev)
 	ud->ddev.dbg_summary_show = udma_dbg_summary_show;
 #endif
 
+	switch (ud->match_data->type) {
+	case DMA_TYPE_UDMA:
+		ud->ddev.device_alloc_chan_resources =
+					udma_alloc_chan_resources;
+		break;
+	case DMA_TYPE_BCDMA:
+		ud->ddev.device_alloc_chan_resources =
+					bcdma_alloc_chan_resources;
+		ud->ddev.device_router_config = bcdma_router_config;
+		break;
+	default:
+		return -EINVAL;
+	}
 	ud->ddev.device_free_chan_resources = udma_free_chan_resources;
+
 	ud->ddev.src_addr_widths = TI_UDMAC_BUSWIDTHS;
 	ud->ddev.dst_addr_widths = TI_UDMAC_BUSWIDTHS;
 	ud->ddev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV);
@@ -3613,7 +4680,8 @@ static int udma_probe(struct platform_device *pdev)
 	ud->ddev.copy_align = DMAENGINE_ALIGN_8_BYTES;
 	ud->ddev.desc_metadata_modes = DESC_METADATA_CLIENT |
 				       DESC_METADATA_ENGINE;
-	if (ud->match_data->enable_memcpy_support) {
+	if (ud->match_data->enable_memcpy_support &&
+	    !(ud->match_data->type == DMA_TYPE_BCDMA && ud->bchan_cnt == 0)) {
 		dma_cap_set(DMA_MEMCPY, ud->ddev.cap_mask);
 		ud->ddev.device_prep_dma_memcpy = udma_prep_dma_memcpy;
 		ud->ddev.directions |= BIT(DMA_MEM_TO_MEM);
@@ -3626,7 +4694,7 @@ static int udma_probe(struct platform_device *pdev)
 	INIT_LIST_HEAD(&ud->ddev.channels);
 	INIT_LIST_HEAD(&ud->desc_to_purge);
 
-	ch_count = udma_setup_resources(ud);
+	ch_count = setup_resources(ud);
 	if (ch_count <= 0)
 		return ch_count;
 
@@ -3641,6 +4709,13 @@ static int udma_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	for (i = 0; i < ud->bchan_cnt; i++) {
+		struct udma_bchan *bchan = &ud->bchans[i];
+
+		bchan->id = i;
+		bchan->reg_rt = ud->mmrs[MMR_BCHANRT] + i * 0x1000;
+	}
+
 	for (i = 0; i < ud->tchan_cnt; i++) {
 		struct udma_tchan *tchan = &ud->tchans[i];
 
@@ -3667,6 +4742,7 @@ static int udma_probe(struct platform_device *pdev)
 		uc->ud = ud;
 		uc->vc.desc_free = udma_desc_free;
 		uc->id = i;
+		uc->bchan = NULL;
 		uc->tchan = NULL;
 		uc->rchan = NULL;
 		uc->config.remote_thread_id = -1;
@@ -3708,5 +4784,15 @@ static struct platform_driver udma_driver = {
 };
 builtin_platform_driver(udma_driver);
 
+static struct platform_driver bcdma_driver = {
+	.driver = {
+		.name	= "ti-bcdma",
+		.of_match_table = bcdma_of_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe		= udma_probe,
+};
+builtin_platform_driver(bcdma_driver);
+
 /* Private interfaces to UDMA */
 #include "k3-udma-private.c"
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index b4334b1b7b14..c97ed5271fd9 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -18,7 +18,7 @@
 #define UDMA_RX_FLOW_ID_FW_OES_REG	0x80
 #define UDMA_RX_FLOW_ID_FW_STATUS_REG	0x88
 
-/* TCHANRT/RCHANRT registers */
+/* BCHANRT/TCHANRT/RCHANRT registers */
 #define UDMA_CHAN_RT_CTL_REG		0x0
 #define UDMA_CHAN_RT_SWTRIG_REG		0x8
 #define UDMA_CHAN_RT_STDATA_REG		0x80
@@ -45,6 +45,10 @@
 #define UDMA_CAP3_HCHAN_CNT(val)	(((val) >> 14) & 0x1ff)
 #define UDMA_CAP3_UCHAN_CNT(val)	(((val) >> 23) & 0x1ff)
 
+#define BCDMA_CAP2_BCHAN_CNT(val)	((val) & 0x1ff)
+#define BCDMA_CAP2_TCHAN_CNT(val)	(((val) >> 9) & 0x1ff)
+#define BCDMA_CAP2_RCHAN_CNT(val)	(((val) >> 18) & 0x1ff)
+
 /* UDMA_CHAN_RT_CTL_REG */
 #define UDMA_CHAN_RT_CTL_EN		BIT(31)
 #define UDMA_CHAN_RT_CTL_TDOWN		BIT(30)
@@ -82,13 +86,17 @@
  */
 #define PDMA_STATIC_TR_Z(x, mask)	((x) & (mask))
 
+/* Address Space Select */
+#define K3_ADDRESS_ASEL_SHIFT		48
+
 struct udma_dev;
 struct udma_tchan;
 struct udma_rchan;
 struct udma_rflow;
 
 enum udma_rm_range {
-	RM_RANGE_TCHAN = 0,
+	RM_RANGE_BCHAN = 0,
+	RM_RANGE_TCHAN,
 	RM_RANGE_RCHAN,
 	RM_RANGE_RFLOW,
 	RM_RANGE_LAST,

From 8844898028d4127b0071ecb5a72e1894bd8b21d6 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:38 +0200
Subject: [PATCH 144/230] dmaengine: ti: k3-udma: Add support for BCDMA channel
 TPL handling

Unlike UDMAP the BCDMA defines the channel TPL levels per channel type.
In UDMAP the number of high and ultra-high channels applies to both tchan
and rchan.

BCDMA defines the TPL per channel types: bchan, tchan and rchan can have
different number of high and ultra-high channels.

In order to support BCDMA channel TPL we need to move the tpl information
as per channel type property for the DMAs.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-19-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma.c | 117 ++++++++++++++++++++++++++-------------
 drivers/dma/ti/k3-udma.h |   6 ++
 2 files changed, 85 insertions(+), 38 deletions(-)

diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index f327d436f8ad..0dc3430fea40 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -146,6 +146,11 @@ struct udma_rx_flush {
 	dma_addr_t buffer_paddr;
 };
 
+struct udma_tpl {
+	u8 levels;
+	u32 start_idx[3];
+};
+
 struct udma_dev {
 	struct dma_device ddev;
 	struct device *dev;
@@ -153,8 +158,9 @@ struct udma_dev {
 	const struct udma_match_data *match_data;
 	const struct udma_soc_data *soc_data;
 
-	u8 tpl_levels;
-	u32 tpl_start_idx[3];
+	struct udma_tpl bchan_tpl;
+	struct udma_tpl tchan_tpl;
+	struct udma_tpl rchan_tpl;
 
 	size_t desc_align; /* alignment to use for descriptors */
 
@@ -1276,10 +1282,10 @@ static struct udma_##res *__udma_reserve_##res(struct udma_dev *ud,	\
 	} else {							\
 		int start;						\
 									\
-		if (tpl >= ud->tpl_levels)				\
-			tpl = ud->tpl_levels - 1;			\
+		if (tpl >= ud->res##_tpl.levels)			\
+			tpl = ud->res##_tpl.levels - 1;			\
 									\
-		start = ud->tpl_start_idx[tpl];				\
+		start = ud->res##_tpl.start_idx[tpl];			\
 									\
 		id = find_next_zero_bit(ud->res##_map, ud->res##_cnt,	\
 					start);				\
@@ -1292,29 +1298,14 @@ static struct udma_##res *__udma_reserve_##res(struct udma_dev *ud,	\
 	return &ud->res##s[id];						\
 }
 
+UDMA_RESERVE_RESOURCE(bchan);
 UDMA_RESERVE_RESOURCE(tchan);
 UDMA_RESERVE_RESOURCE(rchan);
 
-static struct udma_bchan *__bcdma_reserve_bchan(struct udma_dev *ud, int id)
-{
-	if (id >= 0) {
-		if (test_bit(id, ud->bchan_map)) {
-			dev_err(ud->dev, "bchan%d is in use\n", id);
-			return ERR_PTR(-ENOENT);
-		}
-	} else {
-		id = find_next_zero_bit(ud->bchan_map, ud->bchan_cnt, 0);
-		if (id == ud->bchan_cnt)
-			return ERR_PTR(-ENOENT);
-	}
-
-	set_bit(id, ud->bchan_map);
-	return &ud->bchans[id];
-}
-
 static int bcdma_get_bchan(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
+	enum udma_tp_level tpl;
 
 	if (uc->bchan) {
 		dev_dbg(ud->dev, "chan%d: already have bchan%d allocated\n",
@@ -1322,7 +1313,16 @@ static int bcdma_get_bchan(struct udma_chan *uc)
 		return 0;
 	}
 
-	uc->bchan = __bcdma_reserve_bchan(ud, -1);
+	/*
+	 * Use normal channels for peripherals, and highest TPL channel for
+	 * mem2mem
+	 */
+	if (uc->config.tr_trigger_type)
+		tpl = 0;
+	else
+		tpl = ud->bchan_tpl.levels - 1;
+
+	uc->bchan = __udma_reserve_bchan(ud, tpl, -1);
 	if (IS_ERR(uc->bchan))
 		return PTR_ERR(uc->bchan);
 
@@ -1384,8 +1384,11 @@ static int udma_get_chan_pair(struct udma_chan *uc)
 
 	/* Can be optimized, but let's have it like this for now */
 	end = min(ud->tchan_cnt, ud->rchan_cnt);
-	/* Try to use the highest TPL channel pair for MEM_TO_MEM channels */
-	chan_id = ud->tpl_start_idx[ud->tpl_levels - 1];
+	/*
+	 * Try to use the highest TPL channel pair for MEM_TO_MEM channels
+	 * Note: in UDMAP the channel TPL is symmetric between tchan and rchan
+	 */
+	chan_id = ud->tchan_tpl.start_idx[ud->tchan_tpl.levels - 1];
 	for (; chan_id < end; chan_id++) {
 		if (!test_bit(chan_id, ud->tchan_map) &&
 		    !test_bit(chan_id, ud->rchan_map))
@@ -4036,23 +4039,27 @@ static int udma_setup_resources(struct udma_dev *ud)
 	cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c);
 	if (of_device_is_compatible(dev->of_node,
 				    "ti,am654-navss-main-udmap")) {
-		ud->tpl_levels = 2;
-		ud->tpl_start_idx[0] = 8;
+		ud->tchan_tpl.levels = 2;
+		ud->tchan_tpl.start_idx[0] = 8;
 	} else if (of_device_is_compatible(dev->of_node,
 					   "ti,am654-navss-mcu-udmap")) {
-		ud->tpl_levels = 2;
-		ud->tpl_start_idx[0] = 2;
+		ud->tchan_tpl.levels = 2;
+		ud->tchan_tpl.start_idx[0] = 2;
 	} else if (UDMA_CAP3_UCHAN_CNT(cap3)) {
-		ud->tpl_levels = 3;
-		ud->tpl_start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3);
-		ud->tpl_start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
+		ud->tchan_tpl.levels = 3;
+		ud->tchan_tpl.start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3);
+		ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
 	} else if (UDMA_CAP3_HCHAN_CNT(cap3)) {
-		ud->tpl_levels = 2;
-		ud->tpl_start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
+		ud->tchan_tpl.levels = 2;
+		ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
 	} else {
-		ud->tpl_levels = 1;
+		ud->tchan_tpl.levels = 1;
 	}
 
+	ud->rchan_tpl.levels = ud->tchan_tpl.levels;
+	ud->rchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0];
+	ud->rchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1];
+
 	ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt),
 					   sizeof(unsigned long), GFP_KERNEL);
 	ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans),
@@ -4174,6 +4181,43 @@ static int bcdma_setup_resources(struct udma_dev *ud)
 	struct ti_sci_resource *rm_res, irq_res;
 	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
 	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+	u32 cap;
+
+	/* Set up the throughput level start indexes */
+	cap = udma_read(ud->mmrs[MMR_GCFG], 0x2c);
+	if (BCDMA_CAP3_UBCHAN_CNT(cap)) {
+		ud->bchan_tpl.levels = 3;
+		ud->bchan_tpl.start_idx[1] = BCDMA_CAP3_UBCHAN_CNT(cap);
+		ud->bchan_tpl.start_idx[0] = BCDMA_CAP3_HBCHAN_CNT(cap);
+	} else if (BCDMA_CAP3_HBCHAN_CNT(cap)) {
+		ud->bchan_tpl.levels = 2;
+		ud->bchan_tpl.start_idx[0] = BCDMA_CAP3_HBCHAN_CNT(cap);
+	} else {
+		ud->bchan_tpl.levels = 1;
+	}
+
+	cap = udma_read(ud->mmrs[MMR_GCFG], 0x30);
+	if (BCDMA_CAP4_URCHAN_CNT(cap)) {
+		ud->rchan_tpl.levels = 3;
+		ud->rchan_tpl.start_idx[1] = BCDMA_CAP4_URCHAN_CNT(cap);
+		ud->rchan_tpl.start_idx[0] = BCDMA_CAP4_HRCHAN_CNT(cap);
+	} else if (BCDMA_CAP4_HRCHAN_CNT(cap)) {
+		ud->rchan_tpl.levels = 2;
+		ud->rchan_tpl.start_idx[0] = BCDMA_CAP4_HRCHAN_CNT(cap);
+	} else {
+		ud->rchan_tpl.levels = 1;
+	}
+
+	if (BCDMA_CAP4_UTCHAN_CNT(cap)) {
+		ud->tchan_tpl.levels = 3;
+		ud->tchan_tpl.start_idx[1] = BCDMA_CAP4_UTCHAN_CNT(cap);
+		ud->tchan_tpl.start_idx[0] = BCDMA_CAP4_HTCHAN_CNT(cap);
+	} else if (BCDMA_CAP4_HTCHAN_CNT(cap)) {
+		ud->tchan_tpl.levels = 2;
+		ud->tchan_tpl.start_idx[0] = BCDMA_CAP4_HTCHAN_CNT(cap);
+	} else {
+		ud->tchan_tpl.levels = 1;
+	}
 
 	ud->bchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->bchan_cnt),
 					   sizeof(unsigned long), GFP_KERNEL);
@@ -4199,9 +4243,6 @@ static int bcdma_setup_resources(struct udma_dev *ud)
 	    !ud->rflows)
 		return -ENOMEM;
 
-	/* TPL is not yet supported for BCDMA */
-	ud->tpl_levels = 1;
-
 	/* Get resource ranges from tisci */
 	for (i = 0; i < RM_RANGE_LAST; i++) {
 		if (i == RM_RANGE_RFLOW)
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index c97ed5271fd9..3ec38b383b76 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -48,6 +48,12 @@
 #define BCDMA_CAP2_BCHAN_CNT(val)	((val) & 0x1ff)
 #define BCDMA_CAP2_TCHAN_CNT(val)	(((val) >> 9) & 0x1ff)
 #define BCDMA_CAP2_RCHAN_CNT(val)	(((val) >> 18) & 0x1ff)
+#define BCDMA_CAP3_HBCHAN_CNT(val)	(((val) >> 14) & 0x1ff)
+#define BCDMA_CAP3_UBCHAN_CNT(val)	(((val) >> 23) & 0x1ff)
+#define BCDMA_CAP4_HRCHAN_CNT(val)	((val) & 0xff)
+#define BCDMA_CAP4_URCHAN_CNT(val)	(((val) >> 8) & 0xff)
+#define BCDMA_CAP4_HTCHAN_CNT(val)	(((val) >> 16) & 0xff)
+#define BCDMA_CAP4_UTCHAN_CNT(val)	(((val) >> 24) & 0xff)
 
 /* UDMA_CHAN_RT_CTL_REG */
 #define UDMA_CHAN_RT_CTL_EN		BIT(31)

From d2abc982333c02f9e1ff1c6b3782174f5b7662d7 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Tue, 8 Dec 2020 11:04:39 +0200
Subject: [PATCH 145/230] dmaengine: ti: k3-udma: Initial support for K3 PKTDMA

One of the DMAs introduced with AM64 is the Packet DMA (PKTDMA).
It serves similar purpose as K3 UDMAP channels in packet mode, but with
notable differences, like tflow support and channels being allocated to
service specific peripherals.
The rings for the PKTDMA is integrated within the DMA itself instead of
using rings from the general purpose ringacc.

PKTDMA can be used to service PSI-L peripherals, similarly to
K3 UDMA channels.

Most of the driver code can be reused for PKTDMA tchan/rchan support but
new setup and allocation functions are needed to handle the differences
between the DMAs.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-20-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-private.c |   9 +
 drivers/dma/ti/k3-udma.c         | 545 +++++++++++++++++++++++++++++--
 drivers/dma/ti/k3-udma.h         |   4 +
 3 files changed, 533 insertions(+), 25 deletions(-)

diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c
index 346a4dd9640a..5436b19d656e 100644
--- a/drivers/dma/ti/k3-udma-private.c
+++ b/drivers/dma/ti/k3-udma-private.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL(xudma_free_gp_rflow_range);
 
 bool xudma_rflow_is_gp(struct udma_dev *ud, int id)
 {
+	if (!ud->rflow_gp_map)
+		return false;
+
 	return !test_bit(id, ud->rflow_gp_map);
 }
 EXPORT_SYMBOL(xudma_rflow_is_gp);
@@ -119,6 +122,12 @@ void xudma_rflow_put(struct udma_dev *ud, struct udma_rflow *p)
 }
 EXPORT_SYMBOL(xudma_rflow_put);
 
+int xudma_get_rflow_ring_offset(struct udma_dev *ud)
+{
+	return ud->tflow_cnt;
+}
+EXPORT_SYMBOL(xudma_get_rflow_ring_offset);
+
 #define XUDMA_GET_RESOURCE_ID(res)					\
 int xudma_##res##_get_id(struct udma_##res *p)				\
 {									\
diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index 0dc3430fea40..87157cbae1b8 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -59,6 +59,7 @@ struct udma_chan;
 enum k3_dma_type {
 	DMA_TYPE_UDMA = 0,
 	DMA_TYPE_BCDMA,
+	DMA_TYPE_PKTDMA,
 };
 
 enum udma_mmr {
@@ -82,6 +83,8 @@ struct udma_tchan {
 	int id;
 	struct k3_ring *t_ring; /* Transmit ring */
 	struct k3_ring *tc_ring; /* Transmit Completion ring */
+	int tflow_id; /* applicable only for PKTDMA */
+
 };
 
 #define udma_bchan udma_tchan
@@ -109,6 +112,10 @@ struct udma_oes_offsets {
 	u32 bcdma_tchan_ring;
 	u32 bcdma_rchan_data;
 	u32 bcdma_rchan_ring;
+
+	/* PKTDMA Output Event Offsets */
+	u32 pktdma_tchan_flow;
+	u32 pktdma_rchan_flow;
 };
 
 #define UDMA_FLAG_PDMA_ACC32		BIT(0)
@@ -179,12 +186,14 @@ struct udma_dev {
 	int echan_cnt;
 	int rchan_cnt;
 	int rflow_cnt;
+	int tflow_cnt;
 	unsigned long *bchan_map;
 	unsigned long *tchan_map;
 	unsigned long *rchan_map;
 	unsigned long *rflow_gp_map;
 	unsigned long *rflow_gp_map_allocated;
 	unsigned long *rflow_in_use;
+	unsigned long *tflow_map;
 
 	struct udma_bchan *bchans;
 	struct udma_tchan *tchans;
@@ -249,6 +258,11 @@ struct udma_chan_config {
 
 	u32 tr_trigger_type;
 
+	/* PKDMA mapped channel */
+	int mapped_channel_id;
+	/* PKTDMA default tflow or rflow for mapped channel */
+	int default_flow_id;
+
 	enum dma_transfer_direction dir;
 };
 
@@ -426,6 +440,8 @@ static void udma_reset_uchan(struct udma_chan *uc)
 {
 	memset(&uc->config, 0, sizeof(uc->config));
 	uc->config.remote_thread_id = -1;
+	uc->config.mapped_channel_id = -1;
+	uc->config.default_flow_id = -1;
 	uc->state = UDMA_CHAN_IS_IDLE;
 }
 
@@ -815,10 +831,16 @@ static void udma_start_desc(struct udma_chan *uc)
 {
 	struct udma_chan_config *ucc = &uc->config;
 
-	if (ucc->pkt_mode && (uc->cyclic || ucc->dir == DMA_DEV_TO_MEM)) {
+	if (uc->ud->match_data->type == DMA_TYPE_UDMA && ucc->pkt_mode &&
+	    (uc->cyclic || ucc->dir == DMA_DEV_TO_MEM)) {
 		int i;
 
-		/* Push all descriptors to ring for packet mode cyclic or RX */
+		/*
+		 * UDMA only: Push all descriptors to ring for packet mode
+		 * cyclic or RX
+		 * PKTDMA supports pre-linked descriptor and cyclic is not
+		 * supported
+		 */
 		for (i = 0; i < uc->desc->sglen; i++)
 			udma_push_to_ring(uc, i);
 	} else {
@@ -1248,10 +1270,12 @@ static struct udma_rflow *__udma_get_rflow(struct udma_dev *ud, int id)
 	if (test_bit(id, ud->rflow_in_use))
 		return ERR_PTR(-ENOENT);
 
-	/* GP rflow has to be allocated first */
-	if (!test_bit(id, ud->rflow_gp_map) &&
-	    !test_bit(id, ud->rflow_gp_map_allocated))
-		return ERR_PTR(-EINVAL);
+	if (ud->rflow_gp_map) {
+		/* GP rflow has to be allocated first */
+		if (!test_bit(id, ud->rflow_gp_map) &&
+		    !test_bit(id, ud->rflow_gp_map_allocated))
+			return ERR_PTR(-EINVAL);
+	}
 
 	dev_dbg(ud->dev, "get rflow%d\n", id);
 	set_bit(id, ud->rflow_in_use);
@@ -1341,9 +1365,39 @@ static int udma_get_tchan(struct udma_chan *uc)
 		return 0;
 	}
 
-	uc->tchan = __udma_reserve_tchan(ud, uc->config.channel_tpl, -1);
+	/*
+	 * mapped_channel_id is -1 for UDMA, BCDMA and PKTDMA unmapped channels.
+	 * For PKTDMA mapped channels it is configured to a channel which must
+	 * be used to service the peripheral.
+	 */
+	uc->tchan = __udma_reserve_tchan(ud, uc->config.channel_tpl,
+					 uc->config.mapped_channel_id);
+	if (IS_ERR(uc->tchan))
+		return PTR_ERR(uc->tchan);
 
-	return PTR_ERR_OR_ZERO(uc->tchan);
+	if (ud->tflow_cnt) {
+		int tflow_id;
+
+		/* Only PKTDMA have support for tx flows */
+		if (uc->config.default_flow_id >= 0)
+			tflow_id = uc->config.default_flow_id;
+		else
+			tflow_id = uc->tchan->id;
+
+		if (test_bit(tflow_id, ud->tflow_map)) {
+			dev_err(ud->dev, "tflow%d is in use\n", tflow_id);
+			clear_bit(uc->tchan->id, ud->tchan_map);
+			uc->tchan = NULL;
+			return -ENOENT;
+		}
+
+		uc->tchan->tflow_id = tflow_id;
+		set_bit(tflow_id, ud->tflow_map);
+	} else {
+		uc->tchan->tflow_id = -1;
+	}
+
+	return 0;
 }
 
 static int udma_get_rchan(struct udma_chan *uc)
@@ -1356,7 +1410,13 @@ static int udma_get_rchan(struct udma_chan *uc)
 		return 0;
 	}
 
-	uc->rchan = __udma_reserve_rchan(ud, uc->config.channel_tpl, -1);
+	/*
+	 * mapped_channel_id is -1 for UDMA, BCDMA and PKTDMA unmapped channels.
+	 * For PKTDMA mapped channels it is configured to a channel which must
+	 * be used to service the peripheral.
+	 */
+	uc->rchan = __udma_reserve_rchan(ud, uc->config.channel_tpl,
+					 uc->config.mapped_channel_id);
 
 	return PTR_ERR_OR_ZERO(uc->rchan);
 }
@@ -1403,6 +1463,9 @@ static int udma_get_chan_pair(struct udma_chan *uc)
 	uc->tchan = &ud->tchans[chan_id];
 	uc->rchan = &ud->rchans[chan_id];
 
+	/* UDMA does not use tx flows */
+	uc->tchan->tflow_id = -1;
+
 	return 0;
 }
 
@@ -1459,6 +1522,10 @@ static void udma_put_tchan(struct udma_chan *uc)
 		dev_dbg(ud->dev, "chan%d: put tchan%d\n", uc->id,
 			uc->tchan->id);
 		clear_bit(uc->tchan->id, ud->tchan_map);
+
+		if (uc->tchan->tflow_id >= 0)
+			clear_bit(uc->tchan->tflow_id, ud->tflow_map);
+
 		uc->tchan = NULL;
 	}
 }
@@ -1559,7 +1626,10 @@ static int udma_alloc_tx_resources(struct udma_chan *uc)
 		return ret;
 
 	tchan = uc->tchan;
-	ring_idx = ud->bchan_cnt + tchan->id;
+	if (tchan->tflow_id >= 0)
+		ring_idx = tchan->tflow_id;
+	else
+		ring_idx = ud->bchan_cnt + tchan->id;
 
 	ret = k3_ringacc_request_rings_pair(ud->ringacc, ring_idx, -1,
 					    &tchan->t_ring,
@@ -1636,15 +1706,23 @@ static int udma_alloc_rx_resources(struct udma_chan *uc)
 	if (uc->config.dir == DMA_MEM_TO_MEM)
 		return 0;
 
-	ret = udma_get_rflow(uc, uc->rchan->id);
+	if (uc->config.default_flow_id >= 0)
+		ret = udma_get_rflow(uc, uc->config.default_flow_id);
+	else
+		ret = udma_get_rflow(uc, uc->rchan->id);
+
 	if (ret) {
 		ret = -EBUSY;
 		goto err_rflow;
 	}
 
 	rflow = uc->rflow;
-	fd_ring_id = ud->bchan_cnt + ud->tchan_cnt + ud->echan_cnt +
-		     uc->rchan->id;
+	if (ud->tflow_cnt)
+		fd_ring_id = ud->tflow_cnt + rflow->id;
+	else
+		fd_ring_id = ud->bchan_cnt + ud->tchan_cnt + ud->echan_cnt +
+			     uc->rchan->id;
+
 	ret = k3_ringacc_request_rings_pair(ud->ringacc, fd_ring_id, -1,
 					    &rflow->fd_ring, &rflow->r_ring);
 	if (ret) {
@@ -1860,6 +1938,8 @@ static int bcdma_tisci_tx_channel_config(struct udma_chan *uc)
 	return ret;
 }
 
+#define pktdma_tisci_tx_channel_config bcdma_tisci_tx_channel_config
+
 static int udma_tisci_rx_channel_config(struct udma_chan *uc)
 {
 	struct udma_dev *ud = uc->ud;
@@ -1961,6 +2041,52 @@ static int bcdma_tisci_rx_channel_config(struct udma_chan *uc)
 	return ret;
 }
 
+static int pktdma_tisci_rx_channel_config(struct udma_chan *uc)
+{
+	struct udma_dev *ud = uc->ud;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops;
+	struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 };
+	struct ti_sci_msg_rm_udmap_flow_cfg flow_req = { 0 };
+	int ret = 0;
+
+	req_rx.valid_params = TISCI_BCDMA_RCHAN_VALID_PARAMS;
+	req_rx.nav_id = tisci_rm->tisci_dev_id;
+	req_rx.index = uc->rchan->id;
+
+	ret = tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx);
+	if (ret) {
+		dev_err(ud->dev, "rchan%d cfg failed %d\n", uc->rchan->id, ret);
+		return ret;
+	}
+
+	flow_req.valid_params =
+		TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID |
+		TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID |
+		TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID;
+
+	flow_req.nav_id = tisci_rm->tisci_dev_id;
+	flow_req.flow_index = uc->rflow->id;
+
+	if (uc->config.needs_epib)
+		flow_req.rx_einfo_present = 1;
+	else
+		flow_req.rx_einfo_present = 0;
+	if (uc->config.psd_size)
+		flow_req.rx_psinfo_present = 1;
+	else
+		flow_req.rx_psinfo_present = 0;
+	flow_req.rx_error_handling = 1;
+
+	ret = tisci_ops->rx_flow_cfg(tisci_rm->tisci, &flow_req);
+
+	if (ret)
+		dev_err(ud->dev, "flow%d config failed: %d\n", uc->rflow->id,
+			ret);
+
+	return ret;
+}
+
 static int udma_alloc_chan_resources(struct dma_chan *chan)
 {
 	struct udma_chan *uc = to_udma_chan(chan);
@@ -2379,6 +2505,157 @@ static int bcdma_router_config(struct dma_chan *chan)
 	return router_data->set_event(router_data->priv, trigger_event);
 }
 
+static int pktdma_alloc_chan_resources(struct dma_chan *chan)
+{
+	struct udma_chan *uc = to_udma_chan(chan);
+	struct udma_dev *ud = to_udma_dev(chan->device);
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+	u32 irq_ring_idx;
+	int ret;
+
+	/*
+	 * Make sure that the completion is in a known state:
+	 * No teardown, the channel is idle
+	 */
+	reinit_completion(&uc->teardown_completed);
+	complete_all(&uc->teardown_completed);
+	uc->state = UDMA_CHAN_IS_IDLE;
+
+	switch (uc->config.dir) {
+	case DMA_MEM_TO_DEV:
+		/* Slave transfer synchronized - mem to dev (TX) trasnfer */
+		dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-DEV\n", __func__,
+			uc->id);
+
+		ret = udma_alloc_tx_resources(uc);
+		if (ret) {
+			uc->config.remote_thread_id = -1;
+			return ret;
+		}
+
+		uc->config.src_thread = ud->psil_base + uc->tchan->id;
+		uc->config.dst_thread = uc->config.remote_thread_id;
+		uc->config.dst_thread |= K3_PSIL_DST_THREAD_ID_OFFSET;
+
+		irq_ring_idx = uc->tchan->tflow_id + oes->pktdma_tchan_flow;
+
+		ret = pktdma_tisci_tx_channel_config(uc);
+		break;
+	case DMA_DEV_TO_MEM:
+		/* Slave transfer synchronized - dev to mem (RX) trasnfer */
+		dev_dbg(uc->ud->dev, "%s: chan%d as DEV-to-MEM\n", __func__,
+			uc->id);
+
+		ret = udma_alloc_rx_resources(uc);
+		if (ret) {
+			uc->config.remote_thread_id = -1;
+			return ret;
+		}
+
+		uc->config.src_thread = uc->config.remote_thread_id;
+		uc->config.dst_thread = (ud->psil_base + uc->rchan->id) |
+					K3_PSIL_DST_THREAD_ID_OFFSET;
+
+		irq_ring_idx = uc->rflow->id + oes->pktdma_rchan_flow;
+
+		ret = pktdma_tisci_rx_channel_config(uc);
+		break;
+	default:
+		/* Can not happen */
+		dev_err(uc->ud->dev, "%s: chan%d invalid direction (%u)\n",
+			__func__, uc->id, uc->config.dir);
+		return -EINVAL;
+	}
+
+	/* check if the channel configuration was successful */
+	if (ret)
+		goto err_res_free;
+
+	if (udma_is_chan_running(uc)) {
+		dev_warn(ud->dev, "chan%d: is running!\n", uc->id);
+		udma_reset_chan(uc, false);
+		if (udma_is_chan_running(uc)) {
+			dev_err(ud->dev, "chan%d: won't stop!\n", uc->id);
+			ret = -EBUSY;
+			goto err_res_free;
+		}
+	}
+
+	uc->dma_dev = dmaengine_get_dma_device(chan);
+	uc->hdesc_pool = dma_pool_create(uc->name, uc->dma_dev,
+					 uc->config.hdesc_size, ud->desc_align,
+					 0);
+	if (!uc->hdesc_pool) {
+		dev_err(ud->ddev.dev,
+			"Descriptor pool allocation failed\n");
+		uc->use_dma_pool = false;
+		ret = -ENOMEM;
+		goto err_res_free;
+	}
+
+	uc->use_dma_pool = true;
+
+	/* PSI-L pairing */
+	ret = navss_psil_pair(ud, uc->config.src_thread, uc->config.dst_thread);
+	if (ret) {
+		dev_err(ud->dev, "PSI-L pairing failed: 0x%04x -> 0x%04x\n",
+			uc->config.src_thread, uc->config.dst_thread);
+		goto err_res_free;
+	}
+
+	uc->psil_paired = true;
+
+	uc->irq_num_ring = ti_sci_inta_msi_get_virq(ud->dev, irq_ring_idx);
+	if (uc->irq_num_ring <= 0) {
+		dev_err(ud->dev, "Failed to get ring irq (index: %u)\n",
+			irq_ring_idx);
+		ret = -EINVAL;
+		goto err_psi_free;
+	}
+
+	ret = request_irq(uc->irq_num_ring, udma_ring_irq_handler,
+			  IRQF_TRIGGER_HIGH, uc->name, uc);
+	if (ret) {
+		dev_err(ud->dev, "chan%d: ring irq request failed\n", uc->id);
+		goto err_irq_free;
+	}
+
+	uc->irq_num_udma = 0;
+
+	udma_reset_rings(uc);
+
+	INIT_DELAYED_WORK_ONSTACK(&uc->tx_drain.work,
+				  udma_check_tx_completion);
+
+	if (uc->tchan)
+		dev_dbg(ud->dev,
+			"chan%d: tchan%d, tflow%d, Remote thread: 0x%04x\n",
+			uc->id, uc->tchan->id, uc->tchan->tflow_id,
+			uc->config.remote_thread_id);
+	else if (uc->rchan)
+		dev_dbg(ud->dev,
+			"chan%d: rchan%d, rflow%d, Remote thread: 0x%04x\n",
+			uc->id, uc->rchan->id, uc->rflow->id,
+			uc->config.remote_thread_id);
+	return 0;
+
+err_irq_free:
+	uc->irq_num_ring = 0;
+err_psi_free:
+	navss_psil_unpair(ud, uc->config.src_thread, uc->config.dst_thread);
+	uc->psil_paired = false;
+err_res_free:
+	udma_free_tx_resources(uc);
+	udma_free_rx_resources(uc);
+
+	udma_reset_uchan(uc);
+
+	dma_pool_destroy(uc->hdesc_pool);
+	uc->use_dma_pool = false;
+
+	return ret;
+}
+
 static int udma_slave_config(struct dma_chan *chan,
 			     struct dma_slave_config *cfg)
 {
@@ -2857,6 +3134,7 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl,
 	struct udma_desc *d;
 	u32 ring_id;
 	unsigned int i;
+	u64 asel;
 
 	d = kzalloc(struct_size(d, hwdesc, sglen), GFP_NOWAIT);
 	if (!d)
@@ -2870,6 +3148,11 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl,
 	else
 		ring_id = k3_ringacc_get_ring_id(uc->tchan->tc_ring);
 
+	if (uc->ud->match_data->type == DMA_TYPE_UDMA)
+		asel = 0;
+	else
+		asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT;
+
 	for_each_sg(sgl, sgent, sglen, i) {
 		struct udma_hwdesc *hwdesc = &d->hwdesc[i];
 		dma_addr_t sg_addr = sg_dma_address(sgent);
@@ -2904,14 +3187,16 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl,
 		}
 
 		/* attach the sg buffer to the descriptor */
+		sg_addr |= asel;
 		cppi5_hdesc_attach_buf(desc, sg_addr, sg_len, sg_addr, sg_len);
 
 		/* Attach link as host buffer descriptor */
 		if (h_desc)
 			cppi5_hdesc_link_hbdesc(h_desc,
-						hwdesc->cppi5_desc_paddr);
+						hwdesc->cppi5_desc_paddr | asel);
 
-		if (dir == DMA_MEM_TO_DEV)
+		if (uc->ud->match_data->type == DMA_TYPE_PKTDMA ||
+		    dir == DMA_MEM_TO_DEV)
 			h_desc = desc;
 	}
 
@@ -3190,6 +3475,9 @@ udma_prep_dma_cyclic_pkt(struct udma_chan *uc, dma_addr_t buf_addr,
 	else
 		ring_id = k3_ringacc_get_ring_id(uc->tchan->tc_ring);
 
+	if (uc->ud->match_data->type != DMA_TYPE_UDMA)
+		buf_addr |= (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT;
+
 	for (i = 0; i < periods; i++) {
 		struct udma_hwdesc *hwdesc = &d->hwdesc[i];
 		dma_addr_t period_addr = buf_addr + (period_len * i);
@@ -3706,6 +3994,7 @@ static void udma_free_chan_resources(struct dma_chan *chan)
 
 static struct platform_driver udma_driver;
 static struct platform_driver bcdma_driver;
+static struct platform_driver pktdma_driver;
 
 struct udma_filter_param {
 	int remote_thread_id;
@@ -3723,7 +4012,8 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 	struct udma_dev *ud;
 
 	if (chan->device->dev->driver != &udma_driver.driver &&
-	    chan->device->dev->driver != &bcdma_driver.driver)
+	    chan->device->dev->driver != &bcdma_driver.driver &&
+	    chan->device->dev->driver != &pktdma_driver.driver)
 		return false;
 
 	uc = to_udma_chan(chan);
@@ -3785,6 +4075,15 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param)
 	ucc->notdpkt = ep_config->notdpkt;
 	ucc->ep_type = ep_config->ep_type;
 
+	if (ud->match_data->type == DMA_TYPE_PKTDMA &&
+	    ep_config->mapped_channel_id >= 0) {
+		ucc->mapped_channel_id = ep_config->mapped_channel_id;
+		ucc->default_flow_id = ep_config->default_flow_id;
+	} else {
+		ucc->mapped_channel_id = -1;
+		ucc->default_flow_id = -1;
+	}
+
 	if (ucc->ep_type != PSIL_EP_NATIVE) {
 		const struct udma_match_data *match_data = ud->match_data;
 
@@ -3901,6 +4200,14 @@ static struct udma_match_data am64_bcdma_data = {
 	.statictr_z_mask = GENMASK(23, 0),
 };
 
+static struct udma_match_data am64_pktdma_data = {
+	.type = DMA_TYPE_PKTDMA,
+	.psil_base = 0x1000,
+	.enable_memcpy_support = false, /* PKTDMA does not support MEM_TO_MEM */
+	.flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE,
+	.statictr_z_mask = GENMASK(23, 0),
+};
+
 static const struct of_device_id udma_of_match[] = {
 	{
 		.compatible = "ti,am654-navss-main-udmap",
@@ -3927,6 +4234,14 @@ static const struct of_device_id bcdma_of_match[] = {
 	{ /* Sentinel */ },
 };
 
+static const struct of_device_id pktdma_of_match[] = {
+	{
+		.compatible = "ti,am64-dmss-pktdma",
+		.data = &am64_pktdma_data,
+	},
+	{ /* Sentinel */ },
+};
+
 static struct udma_soc_data am654_soc_data = {
 	.oes = {
 		.udma_rchan = 0x200,
@@ -3953,6 +4268,8 @@ static struct udma_soc_data am64_soc_data = {
 		.bcdma_tchan_ring = 0x2a00,
 		.bcdma_rchan_data = 0x2e00,
 		.bcdma_rchan_ring = 0x3000,
+		.pktdma_tchan_flow = 0x1200,
+		.pktdma_rchan_flow = 0x1600,
 	},
 	.bcdma_trigger_event_offset = 0xc400,
 };
@@ -3967,7 +4284,7 @@ static const struct soc_device_attribute k3_soc_devices[] = {
 
 static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud)
 {
-	u32 cap2, cap3;
+	u32 cap2, cap3, cap4;
 	int i;
 
 	ud->mmrs[MMR_GCFG] = devm_platform_ioremap_resource_byname(pdev, mmr_names[MMR_GCFG]);
@@ -3989,6 +4306,13 @@ static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud)
 		ud->tchan_cnt = BCDMA_CAP2_TCHAN_CNT(cap2);
 		ud->rchan_cnt = BCDMA_CAP2_RCHAN_CNT(cap2);
 		break;
+	case DMA_TYPE_PKTDMA:
+		cap4 = udma_read(ud->mmrs[MMR_GCFG], 0x30);
+		ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2);
+		ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2);
+		ud->rflow_cnt = UDMA_CAP3_RFLOW_CNT(cap3);
+		ud->tflow_cnt = PKTDMA_CAP4_TFLOW_CNT(cap4);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -4024,7 +4348,8 @@ static const char * const range_names[] = {
 	[RM_RANGE_BCHAN] = "ti,sci-rm-range-bchan",
 	[RM_RANGE_TCHAN] = "ti,sci-rm-range-tchan",
 	[RM_RANGE_RCHAN] = "ti,sci-rm-range-rchan",
-	[RM_RANGE_RFLOW] = "ti,sci-rm-range-rflow"
+	[RM_RANGE_RFLOW] = "ti,sci-rm-range-rflow",
+	[RM_RANGE_TFLOW] = "ti,sci-rm-range-tflow",
 };
 
 static int udma_setup_resources(struct udma_dev *ud)
@@ -4098,7 +4423,7 @@ static int udma_setup_resources(struct udma_dev *ud)
 
 	/* Get resource ranges from tisci */
 	for (i = 0; i < RM_RANGE_LAST; i++) {
-		if (i == RM_RANGE_BCHAN)
+		if (i == RM_RANGE_BCHAN || i == RM_RANGE_TFLOW)
 			continue;
 
 		tisci_rm->rm_ranges[i] =
@@ -4245,7 +4570,7 @@ static int bcdma_setup_resources(struct udma_dev *ud)
 
 	/* Get resource ranges from tisci */
 	for (i = 0; i < RM_RANGE_LAST; i++) {
-		if (i == RM_RANGE_RFLOW)
+		if (i == RM_RANGE_RFLOW || i == RM_RANGE_TFLOW)
 			continue;
 		if (i == RM_RANGE_BCHAN && ud->bchan_cnt == 0)
 			continue;
@@ -4351,6 +4676,134 @@ static int bcdma_setup_resources(struct udma_dev *ud)
 	return 0;
 }
 
+static int pktdma_setup_resources(struct udma_dev *ud)
+{
+	int ret, i, j;
+	struct device *dev = ud->dev;
+	struct ti_sci_resource *rm_res, irq_res;
+	struct udma_tisci_rm *tisci_rm = &ud->tisci_rm;
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+	u32 cap3;
+
+	/* Set up the throughput level start indexes */
+	cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c);
+	if (UDMA_CAP3_UCHAN_CNT(cap3)) {
+		ud->tchan_tpl.levels = 3;
+		ud->tchan_tpl.start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3);
+		ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
+	} else if (UDMA_CAP3_HCHAN_CNT(cap3)) {
+		ud->tchan_tpl.levels = 2;
+		ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3);
+	} else {
+		ud->tchan_tpl.levels = 1;
+	}
+
+	ud->tchan_tpl.levels = ud->tchan_tpl.levels;
+	ud->tchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0];
+	ud->tchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1];
+
+	ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+	ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans),
+				  GFP_KERNEL);
+	ud->rchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->rchan_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+	ud->rchans = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rchans),
+				  GFP_KERNEL);
+	ud->rflow_in_use = devm_kcalloc(dev, BITS_TO_LONGS(ud->rflow_cnt),
+					sizeof(unsigned long),
+					GFP_KERNEL);
+	ud->rflows = devm_kcalloc(dev, ud->rflow_cnt, sizeof(*ud->rflows),
+				  GFP_KERNEL);
+	ud->tflow_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tflow_cnt),
+					   sizeof(unsigned long), GFP_KERNEL);
+
+	if (!ud->tchan_map || !ud->rchan_map || !ud->tflow_map || !ud->tchans ||
+	    !ud->rchans || !ud->rflows || !ud->rflow_in_use)
+		return -ENOMEM;
+
+	/* Get resource ranges from tisci */
+	for (i = 0; i < RM_RANGE_LAST; i++) {
+		if (i == RM_RANGE_BCHAN)
+			continue;
+
+		tisci_rm->rm_ranges[i] =
+			devm_ti_sci_get_of_resource(tisci_rm->tisci, dev,
+						    tisci_rm->tisci_dev_id,
+						    (char *)range_names[i]);
+	}
+
+	/* tchan ranges */
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN];
+	if (IS_ERR(rm_res)) {
+		bitmap_zero(ud->tchan_map, ud->tchan_cnt);
+	} else {
+		bitmap_fill(ud->tchan_map, ud->tchan_cnt);
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->tchan_map,
+						  &rm_res->desc[i], "tchan");
+	}
+
+	/* rchan ranges */
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN];
+	if (IS_ERR(rm_res)) {
+		bitmap_zero(ud->rchan_map, ud->rchan_cnt);
+	} else {
+		bitmap_fill(ud->rchan_map, ud->rchan_cnt);
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->rchan_map,
+						  &rm_res->desc[i], "rchan");
+	}
+
+	/* rflow ranges */
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_RFLOW];
+	if (IS_ERR(rm_res)) {
+		/* all rflows are assigned exclusively to Linux */
+		bitmap_zero(ud->rflow_in_use, ud->rflow_cnt);
+	} else {
+		bitmap_fill(ud->rflow_in_use, ud->rflow_cnt);
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->rflow_in_use,
+						  &rm_res->desc[i], "rflow");
+	}
+	irq_res.sets = rm_res->sets;
+
+	/* tflow ranges */
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_TFLOW];
+	if (IS_ERR(rm_res)) {
+		/* all tflows are assigned exclusively to Linux */
+		bitmap_zero(ud->tflow_map, ud->tflow_cnt);
+	} else {
+		bitmap_fill(ud->tflow_map, ud->tflow_cnt);
+		for (i = 0; i < rm_res->sets; i++)
+			udma_mark_resource_ranges(ud, ud->tflow_map,
+						  &rm_res->desc[i], "tflow");
+	}
+	irq_res.sets += rm_res->sets;
+
+	irq_res.desc = kcalloc(irq_res.sets, sizeof(*irq_res.desc), GFP_KERNEL);
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_TFLOW];
+	for (i = 0; i < rm_res->sets; i++) {
+		irq_res.desc[i].start = rm_res->desc[i].start +
+					oes->pktdma_tchan_flow;
+		irq_res.desc[i].num = rm_res->desc[i].num;
+	}
+	rm_res = tisci_rm->rm_ranges[RM_RANGE_RFLOW];
+	for (j = 0; j < rm_res->sets; j++, i++) {
+		irq_res.desc[i].start = rm_res->desc[j].start +
+					oes->pktdma_rchan_flow;
+		irq_res.desc[i].num = rm_res->desc[j].num;
+	}
+	ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res);
+	kfree(irq_res.desc);
+	if (ret) {
+		dev_err(ud->dev, "Failed to allocate MSI interrupts\n");
+		return ret;
+	}
+
+	return 0;
+}
+
 static int setup_resources(struct udma_dev *ud)
 {
 	struct device *dev = ud->dev;
@@ -4363,6 +4816,9 @@ static int setup_resources(struct udma_dev *ud)
 	case DMA_TYPE_BCDMA:
 		ret = bcdma_setup_resources(ud);
 		break;
+	case DMA_TYPE_PKTDMA:
+		ret = pktdma_setup_resources(ud);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -4406,6 +4862,14 @@ static int setup_resources(struct udma_dev *ud)
 			 ud->rchan_cnt - bitmap_weight(ud->rchan_map,
 						       ud->rchan_cnt));
 		break;
+	case DMA_TYPE_PKTDMA:
+		dev_info(dev,
+			 "Channels: %d (tchan: %u, rchan: %u)\n",
+			 ch_count,
+			 ud->tchan_cnt - bitmap_weight(ud->tchan_map,
+						       ud->tchan_cnt),
+			 ud->rchan_cnt - bitmap_weight(ud->rchan_map,
+						       ud->rchan_cnt));
 	default:
 		break;
 	}
@@ -4536,10 +5000,14 @@ static void udma_dbg_summary_show_chan(struct seq_file *s,
 	case DMA_DEV_TO_MEM:
 		seq_printf(s, "rchan%d [0x%04x -> 0x%04x], ", uc->rchan->id,
 			   ucc->src_thread, ucc->dst_thread);
+		if (uc->ud->match_data->type == DMA_TYPE_PKTDMA)
+			seq_printf(s, "rflow%d, ", uc->rflow->id);
 		break;
 	case DMA_MEM_TO_DEV:
 		seq_printf(s, "tchan%d [0x%04x -> 0x%04x], ", uc->tchan->id,
 			   ucc->src_thread, ucc->dst_thread);
+		if (uc->ud->match_data->type == DMA_TYPE_PKTDMA)
+			seq_printf(s, "tflow%d, ", uc->tchan->tflow_id);
 		break;
 	default:
 		seq_printf(s, ")\n");
@@ -4602,8 +5070,10 @@ static int udma_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	match = of_match_node(udma_of_match, dev->of_node);
-	if (!match) {
+	if (!match)
 		match = of_match_node(bcdma_of_match, dev->of_node);
+	if (!match) {
+		match = of_match_node(pktdma_of_match, dev->of_node);
 		if (!match) {
 			dev_err(dev, "No compatible match found\n");
 			return -ENODEV;
@@ -4667,8 +5137,14 @@ static int udma_probe(struct platform_device *pdev)
 
 		ring_init_data.tisci = ud->tisci_rm.tisci;
 		ring_init_data.tisci_dev_id = ud->tisci_rm.tisci_dev_id;
-		ring_init_data.num_rings = ud->bchan_cnt + ud->tchan_cnt +
-					   ud->rchan_cnt;
+		if (ud->match_data->type == DMA_TYPE_BCDMA) {
+			ring_init_data.num_rings = ud->bchan_cnt +
+						   ud->tchan_cnt +
+						   ud->rchan_cnt;
+		} else {
+			ring_init_data.num_rings = ud->rflow_cnt +
+						   ud->tflow_cnt;
+		}
 
 		ud->ringacc = k3_ringacc_dmarings_init(pdev, &ring_init_data);
 	}
@@ -4684,11 +5160,14 @@ static int udma_probe(struct platform_device *pdev)
 	}
 
 	dma_cap_set(DMA_SLAVE, ud->ddev.cap_mask);
-	dma_cap_set(DMA_CYCLIC, ud->ddev.cap_mask);
+	/* cyclic operation is not supported via PKTDMA */
+	if (ud->match_data->type != DMA_TYPE_PKTDMA) {
+		dma_cap_set(DMA_CYCLIC, ud->ddev.cap_mask);
+		ud->ddev.device_prep_dma_cyclic = udma_prep_dma_cyclic;
+	}
 
 	ud->ddev.device_config = udma_slave_config;
 	ud->ddev.device_prep_slave_sg = udma_prep_slave_sg;
-	ud->ddev.device_prep_dma_cyclic = udma_prep_dma_cyclic;
 	ud->ddev.device_issue_pending = udma_issue_pending;
 	ud->ddev.device_tx_status = udma_tx_status;
 	ud->ddev.device_pause = udma_pause;
@@ -4709,6 +5188,10 @@ static int udma_probe(struct platform_device *pdev)
 					bcdma_alloc_chan_resources;
 		ud->ddev.device_router_config = bcdma_router_config;
 		break;
+	case DMA_TYPE_PKTDMA:
+		ud->ddev.device_alloc_chan_resources =
+					pktdma_alloc_chan_resources;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -4787,6 +5270,8 @@ static int udma_probe(struct platform_device *pdev)
 		uc->tchan = NULL;
 		uc->rchan = NULL;
 		uc->config.remote_thread_id = -1;
+		uc->config.mapped_channel_id = -1;
+		uc->config.default_flow_id = -1;
 		uc->config.dir = DMA_MEM_TO_MEM;
 		uc->name = devm_kasprintf(dev, GFP_KERNEL, "%s chan%d",
 					  dev_name(dev), i);
@@ -4835,5 +5320,15 @@ static struct platform_driver bcdma_driver = {
 };
 builtin_platform_driver(bcdma_driver);
 
+static struct platform_driver pktdma_driver = {
+	.driver = {
+		.name	= "ti-pktdma",
+		.of_match_table = pktdma_of_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe		= udma_probe,
+};
+builtin_platform_driver(pktdma_driver);
+
 /* Private interfaces to UDMA */
 #include "k3-udma-private.c"
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index 3ec38b383b76..ccb19f286daf 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -55,6 +55,8 @@
 #define BCDMA_CAP4_HTCHAN_CNT(val)	(((val) >> 16) & 0xff)
 #define BCDMA_CAP4_UTCHAN_CNT(val)	(((val) >> 24) & 0xff)
 
+#define PKTDMA_CAP4_TFLOW_CNT(val)	((val) & 0x3fff)
+
 /* UDMA_CHAN_RT_CTL_REG */
 #define UDMA_CHAN_RT_CTL_EN		BIT(31)
 #define UDMA_CHAN_RT_CTL_TDOWN		BIT(30)
@@ -105,6 +107,7 @@ enum udma_rm_range {
 	RM_RANGE_TCHAN,
 	RM_RANGE_RCHAN,
 	RM_RANGE_RFLOW,
+	RM_RANGE_TFLOW,
 	RM_RANGE_LAST,
 };
 
@@ -152,5 +155,6 @@ void xudma_tchanrt_write(struct udma_tchan *tchan, int reg, u32 val);
 u32 xudma_rchanrt_read(struct udma_rchan *rchan, int reg);
 void xudma_rchanrt_write(struct udma_rchan *rchan, int reg, u32 val);
 bool xudma_rflow_is_gp(struct udma_dev *ud, int id);
+int xudma_get_rflow_ring_offset(struct udma_dev *ud);
 
 #endif /* K3_UDMA_H_ */

From 5b65781d06ea90ef2f8e51a13352c43c3daa8cdc Mon Sep 17 00:00:00 2001
From: Vignesh Raghavendra <vigneshr@ti.com>
Date: Tue, 8 Dec 2020 11:04:40 +0200
Subject: [PATCH 146/230] dmaengine: ti: k3-udma-glue: Add support for K3
 PKTDMA

This commit adds support for PKTDMA in k3-udma glue driver. Use new
psil_endpoint_config struct to get static data for a given channel or a
flow during setup.  Make sure that the RX flows being mapped to a RX
channel is within the range of flows that is been allocated to that RX
channel.

Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201208090440.31792-21-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/ti/k3-udma-glue.c    | 291 +++++++++++++++++++++++++++----
 drivers/dma/ti/k3-udma-private.c |  24 +++
 drivers/dma/ti/k3-udma.h         |   4 +
 include/linux/dma/k3-udma-glue.h |   8 +
 4 files changed, 289 insertions(+), 38 deletions(-)

diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c
index e6ebcd98c02a..4fdd9f06b723 100644
--- a/drivers/dma/ti/k3-udma-glue.c
+++ b/drivers/dma/ti/k3-udma-glue.c
@@ -22,6 +22,7 @@
 
 struct k3_udma_glue_common {
 	struct device *dev;
+	struct device chan_dev;
 	struct udma_dev *udmax;
 	const struct udma_tisci_rm *tisci_rm;
 	struct k3_ringacc *ringacc;
@@ -32,7 +33,8 @@ struct k3_udma_glue_common {
 	bool epib;
 	u32  psdata_size;
 	u32  swdata_size;
-	u32  atype;
+	u32  atype_asel;
+	struct psil_endpoint_config *ep_config;
 };
 
 struct k3_udma_glue_tx_channel {
@@ -53,6 +55,8 @@ struct k3_udma_glue_tx_channel {
 	bool tx_filt_einfo;
 	bool tx_filt_pswords;
 	bool tx_supr_tdpkt;
+
+	int udma_tflow_id;
 };
 
 struct k3_udma_glue_rx_flow {
@@ -81,6 +85,16 @@ struct k3_udma_glue_rx_channel {
 	u32 flows_ready;
 };
 
+static void k3_udma_chan_dev_release(struct device *dev)
+{
+	/* The struct containing the device is devm managed */
+}
+
+static struct class k3_udma_glue_devclass = {
+	.name		= "k3_udma_glue_chan",
+	.dev_release	= k3_udma_chan_dev_release,
+};
+
 #define K3_UDMAX_TDOWN_TIMEOUT_US 1000
 
 static int of_k3_udma_glue_parse(struct device_node *udmax_np,
@@ -100,7 +114,6 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np,
 		const char *name, struct k3_udma_glue_common *common,
 		bool tx_chn)
 {
-	struct psil_endpoint_config *ep_config;
 	struct of_phandle_args dma_spec;
 	u32 thread_id;
 	int ret = 0;
@@ -117,15 +130,26 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np,
 				       &dma_spec))
 		return -ENOENT;
 
+	ret = of_k3_udma_glue_parse(dma_spec.np, common);
+	if (ret)
+		goto out_put_spec;
+
 	thread_id = dma_spec.args[0];
 	if (dma_spec.args_count == 2) {
-		if (dma_spec.args[1] > 2) {
+		if (dma_spec.args[1] > 2 && !xudma_is_pktdma(common->udmax)) {
 			dev_err(common->dev, "Invalid channel atype: %u\n",
 				dma_spec.args[1]);
 			ret = -EINVAL;
 			goto out_put_spec;
 		}
-		common->atype = dma_spec.args[1];
+		if (dma_spec.args[1] > 15 && xudma_is_pktdma(common->udmax)) {
+			dev_err(common->dev, "Invalid channel asel: %u\n",
+				dma_spec.args[1]);
+			ret = -EINVAL;
+			goto out_put_spec;
+		}
+
+		common->atype_asel = dma_spec.args[1];
 	}
 
 	if (tx_chn && !(thread_id & K3_PSIL_DST_THREAD_ID_OFFSET)) {
@@ -139,25 +163,23 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np,
 	}
 
 	/* get psil endpoint config */
-	ep_config = psil_get_ep_config(thread_id);
-	if (IS_ERR(ep_config)) {
+	common->ep_config = psil_get_ep_config(thread_id);
+	if (IS_ERR(common->ep_config)) {
 		dev_err(common->dev,
 			"No configuration for psi-l thread 0x%04x\n",
 			thread_id);
-		ret = PTR_ERR(ep_config);
+		ret = PTR_ERR(common->ep_config);
 		goto out_put_spec;
 	}
 
-	common->epib = ep_config->needs_epib;
-	common->psdata_size = ep_config->psd_size;
+	common->epib = common->ep_config->needs_epib;
+	common->psdata_size = common->ep_config->psd_size;
 
 	if (tx_chn)
 		common->dst_thread = thread_id;
 	else
 		common->src_thread = thread_id;
 
-	ret = of_k3_udma_glue_parse(dma_spec.np, common);
-
 out_put_spec:
 	of_node_put(dma_spec.np);
 	return ret;
@@ -223,7 +245,7 @@ static int k3_udma_glue_cfg_tx_chn(struct k3_udma_glue_tx_channel *tx_chn)
 		req.tx_supr_tdpkt = 1;
 	req.tx_fetch_size = tx_chn->common.hdesc_size >> 2;
 	req.txcq_qnum = k3_ringacc_get_ring_id(tx_chn->ringtxcq);
-	req.tx_atype = tx_chn->common.atype;
+	req.tx_atype = tx_chn->common.atype_asel;
 
 	return tisci_rm->tisci_udmap_ops->tx_ch_cfg(tisci_rm->tisci, &req);
 }
@@ -255,8 +277,14 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev,
 						tx_chn->common.psdata_size,
 						tx_chn->common.swdata_size);
 
+	if (xudma_is_pktdma(tx_chn->common.udmax))
+		tx_chn->udma_tchan_id = tx_chn->common.ep_config->mapped_channel_id;
+	else
+		tx_chn->udma_tchan_id = -1;
+
 	/* request and cfg UDMAP TX channel */
-	tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, -1);
+	tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax,
+					      tx_chn->udma_tchan_id);
 	if (IS_ERR(tx_chn->udma_tchanx)) {
 		ret = PTR_ERR(tx_chn->udma_tchanx);
 		dev_err(dev, "UDMAX tchanx get err %d\n", ret);
@@ -264,11 +292,34 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev,
 	}
 	tx_chn->udma_tchan_id = xudma_tchan_get_id(tx_chn->udma_tchanx);
 
+	tx_chn->common.chan_dev.class = &k3_udma_glue_devclass;
+	tx_chn->common.chan_dev.parent = xudma_get_device(tx_chn->common.udmax);
+	dev_set_name(&tx_chn->common.chan_dev, "tchan%d-0x%04x",
+		     tx_chn->udma_tchan_id, tx_chn->common.dst_thread);
+	ret = device_register(&tx_chn->common.chan_dev);
+	if (ret) {
+		dev_err(dev, "Channel Device registration failed %d\n", ret);
+		tx_chn->common.chan_dev.parent = NULL;
+		goto err;
+	}
+
+	if (xudma_is_pktdma(tx_chn->common.udmax)) {
+		/* prepare the channel device as coherent */
+		tx_chn->common.chan_dev.dma_coherent = true;
+		dma_coerce_mask_and_coherent(&tx_chn->common.chan_dev,
+					     DMA_BIT_MASK(48));
+	}
+
 	atomic_set(&tx_chn->free_pkts, cfg->txcq_cfg.size);
 
+	if (xudma_is_pktdma(tx_chn->common.udmax))
+		tx_chn->udma_tflow_id = tx_chn->common.ep_config->default_flow_id;
+	else
+		tx_chn->udma_tflow_id = tx_chn->udma_tchan_id;
+
 	/* request and cfg rings */
 	ret =  k3_ringacc_request_rings_pair(tx_chn->common.ringacc,
-					     tx_chn->udma_tchan_id, -1,
+					     tx_chn->udma_tflow_id, -1,
 					     &tx_chn->ringtx,
 					     &tx_chn->ringtxcq);
 	if (ret) {
@@ -280,6 +331,12 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev,
 	cfg->tx_cfg.dma_dev = k3_udma_glue_tx_get_dma_device(tx_chn);
 	cfg->txcq_cfg.dma_dev = cfg->tx_cfg.dma_dev;
 
+	/* Set the ASEL value for DMA rings of PKTDMA */
+	if (xudma_is_pktdma(tx_chn->common.udmax)) {
+		cfg->tx_cfg.asel = tx_chn->common.atype_asel;
+		cfg->txcq_cfg.asel = tx_chn->common.atype_asel;
+	}
+
 	ret = k3_ringacc_ring_cfg(tx_chn->ringtx, &cfg->tx_cfg);
 	if (ret) {
 		dev_err(dev, "Failed to cfg ringtx %d\n", ret);
@@ -331,6 +388,11 @@ void k3_udma_glue_release_tx_chn(struct k3_udma_glue_tx_channel *tx_chn)
 
 	if (tx_chn->ringtx)
 		k3_ringacc_ring_free(tx_chn->ringtx);
+
+	if (tx_chn->common.chan_dev.parent) {
+		device_unregister(&tx_chn->common.chan_dev);
+		tx_chn->common.chan_dev.parent = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_release_tx_chn);
 
@@ -443,13 +505,10 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn,
 			       void *data,
 			       void (*cleanup)(void *data, dma_addr_t desc_dma))
 {
+	struct device *dev = tx_chn->common.dev;
 	dma_addr_t desc_dma;
 	int occ_tx, i, ret;
 
-	/* reset TXCQ as it is not input for udma - expected to be empty */
-	if (tx_chn->ringtxcq)
-		k3_ringacc_ring_reset(tx_chn->ringtxcq);
-
 	/*
 	 * TXQ reset need to be special way as it is input for udma and its
 	 * state cached by udma, so:
@@ -458,17 +517,20 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn,
 	 * 3) reset TXQ in a special way
 	 */
 	occ_tx = k3_ringacc_ring_get_occ(tx_chn->ringtx);
-	dev_dbg(tx_chn->common.dev, "TX reset occ_tx %u\n", occ_tx);
+	dev_dbg(dev, "TX reset occ_tx %u\n", occ_tx);
 
 	for (i = 0; i < occ_tx; i++) {
 		ret = k3_ringacc_ring_pop(tx_chn->ringtx, &desc_dma);
 		if (ret) {
-			dev_err(tx_chn->common.dev, "TX reset pop %d\n", ret);
+			if (ret != -ENODATA)
+				dev_err(dev, "TX reset pop %d\n", ret);
 			break;
 		}
 		cleanup(data, desc_dma);
 	}
 
+	/* reset TXCQ as it is not input for udma - expected to be empty */
+	k3_ringacc_ring_reset(tx_chn->ringtxcq);
 	k3_ringacc_ring_reset_dma(tx_chn->ringtx, occ_tx);
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_reset_tx_chn);
@@ -487,7 +549,12 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_txcq_id);
 
 int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn)
 {
-	tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq);
+	if (xudma_is_pktdma(tx_chn->common.udmax)) {
+		tx_chn->virq = xudma_pktdma_tflow_get_irq(tx_chn->common.udmax,
+							  tx_chn->udma_tflow_id);
+	} else {
+		tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq);
+	}
 
 	return tx_chn->virq;
 }
@@ -496,10 +563,36 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq);
 struct device *
 	k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn)
 {
+	if (xudma_is_pktdma(tx_chn->common.udmax) &&
+	    (tx_chn->common.atype_asel == 14 || tx_chn->common.atype_asel == 15))
+		return &tx_chn->common.chan_dev;
+
 	return xudma_get_device(tx_chn->common.udmax);
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device);
 
+void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn,
+				       dma_addr_t *addr)
+{
+	if (!xudma_is_pktdma(tx_chn->common.udmax) ||
+	    !tx_chn->common.atype_asel)
+		return;
+
+	*addr |= (u64)tx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT;
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_tx_dma_to_cppi5_addr);
+
+void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn,
+				       dma_addr_t *addr)
+{
+	if (!xudma_is_pktdma(tx_chn->common.udmax) ||
+	    !tx_chn->common.atype_asel)
+		return;
+
+	*addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0);
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_tx_cppi5_to_dma_addr);
+
 static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 {
 	const struct udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm;
@@ -511,8 +604,6 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 	req.valid_params = TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID |
 			   TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID |
 			   TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID |
-			   TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID |
-			   TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID |
 			   TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID;
 
 	req.nav_id = tisci_rm->tisci_dev_id;
@@ -524,13 +615,16 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 	 * req.rxcq_qnum = k3_ringacc_get_ring_id(rx_chn->flows[0].ringrx);
 	 */
 	req.rxcq_qnum = 0xFFFF;
-	if (rx_chn->flow_num && rx_chn->flow_id_base != rx_chn->udma_rchan_id) {
+	if (!xudma_is_pktdma(rx_chn->common.udmax) && rx_chn->flow_num &&
+	    rx_chn->flow_id_base != rx_chn->udma_rchan_id) {
 		/* Default flow + extra ones */
+		req.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID |
+				    TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID;
 		req.flowid_start = rx_chn->flow_id_base;
 		req.flowid_cnt = rx_chn->flow_num;
 	}
 	req.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR;
-	req.rx_atype = rx_chn->common.atype;
+	req.rx_atype = rx_chn->common.atype_asel;
 
 	ret = tisci_rm->tisci_udmap_ops->rx_ch_cfg(tisci_rm->tisci, &req);
 	if (ret)
@@ -584,10 +678,18 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn,
 		goto err_rflow_put;
 	}
 
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		rx_ringfdq_id = flow->udma_rflow_id +
+				xudma_get_rflow_ring_offset(rx_chn->common.udmax);
+		rx_ring_id = 0;
+	} else {
+		rx_ring_id = flow_cfg->ring_rxq_id;
+		rx_ringfdq_id = flow_cfg->ring_rxfdq0_id;
+	}
+
 	/* request and cfg rings */
 	ret =  k3_ringacc_request_rings_pair(rx_chn->common.ringacc,
-					     flow_cfg->ring_rxfdq0_id,
-					     flow_cfg->ring_rxq_id,
+					     rx_ringfdq_id, rx_ring_id,
 					     &flow->ringrxfdq,
 					     &flow->ringrx);
 	if (ret) {
@@ -599,6 +701,12 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn,
 	flow_cfg->rx_cfg.dma_dev = k3_udma_glue_rx_get_dma_device(rx_chn);
 	flow_cfg->rxfdq_cfg.dma_dev = flow_cfg->rx_cfg.dma_dev;
 
+	/* Set the ASEL value for DMA rings of PKTDMA */
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		flow_cfg->rx_cfg.asel = rx_chn->common.atype_asel;
+		flow_cfg->rxfdq_cfg.asel = rx_chn->common.atype_asel;
+	}
+
 	ret = k3_ringacc_ring_cfg(flow->ringrx, &flow_cfg->rx_cfg);
 	if (ret) {
 		dev_err(dev, "Failed to cfg ringrx %d\n", ret);
@@ -757,6 +865,7 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name,
 				 struct k3_udma_glue_rx_channel_cfg *cfg)
 {
 	struct k3_udma_glue_rx_channel *rx_chn;
+	struct psil_endpoint_config *ep_cfg;
 	int ret, i;
 
 	if (cfg->flow_id_num <= 0)
@@ -784,8 +893,16 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name,
 						rx_chn->common.psdata_size,
 						rx_chn->common.swdata_size);
 
+	ep_cfg = rx_chn->common.ep_config;
+
+	if (xudma_is_pktdma(rx_chn->common.udmax))
+		rx_chn->udma_rchan_id = ep_cfg->mapped_channel_id;
+	else
+		rx_chn->udma_rchan_id = -1;
+
 	/* request and cfg UDMAP RX channel */
-	rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, -1);
+	rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax,
+					      rx_chn->udma_rchan_id);
 	if (IS_ERR(rx_chn->udma_rchanx)) {
 		ret = PTR_ERR(rx_chn->udma_rchanx);
 		dev_err(dev, "UDMAX rchanx get err %d\n", ret);
@@ -793,12 +910,48 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name,
 	}
 	rx_chn->udma_rchan_id = xudma_rchan_get_id(rx_chn->udma_rchanx);
 
-	rx_chn->flow_num = cfg->flow_id_num;
-	rx_chn->flow_id_base = cfg->flow_id_base;
+	rx_chn->common.chan_dev.class = &k3_udma_glue_devclass;
+	rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax);
+	dev_set_name(&rx_chn->common.chan_dev, "rchan%d-0x%04x",
+		     rx_chn->udma_rchan_id, rx_chn->common.src_thread);
+	ret = device_register(&rx_chn->common.chan_dev);
+	if (ret) {
+		dev_err(dev, "Channel Device registration failed %d\n", ret);
+		rx_chn->common.chan_dev.parent = NULL;
+		goto err;
+	}
 
-	/* Use RX channel id as flow id: target dev can't generate flow_id */
-	if (cfg->flow_id_use_rxchan_id)
-		rx_chn->flow_id_base = rx_chn->udma_rchan_id;
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		/* prepare the channel device as coherent */
+		rx_chn->common.chan_dev.dma_coherent = true;
+		dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev,
+					     DMA_BIT_MASK(48));
+	}
+
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		int flow_start = cfg->flow_id_base;
+		int flow_end;
+
+		if (flow_start == -1)
+			flow_start = ep_cfg->flow_start;
+
+		flow_end = flow_start + cfg->flow_id_num - 1;
+		if (flow_start < ep_cfg->flow_start ||
+		    flow_end > (ep_cfg->flow_start + ep_cfg->flow_num - 1)) {
+			dev_err(dev, "Invalid flow range requested\n");
+			ret = -EINVAL;
+			goto err;
+		}
+		rx_chn->flow_id_base = flow_start;
+	} else {
+		rx_chn->flow_id_base = cfg->flow_id_base;
+
+		/* Use RX channel id as flow id: target dev can't generate flow_id */
+		if (cfg->flow_id_use_rxchan_id)
+			rx_chn->flow_id_base = rx_chn->udma_rchan_id;
+	}
+
+	rx_chn->flow_num = cfg->flow_id_num;
 
 	rx_chn->flows = devm_kcalloc(dev, rx_chn->flow_num,
 				     sizeof(*rx_chn->flows), GFP_KERNEL);
@@ -888,6 +1041,24 @@ k3_udma_glue_request_remote_rx_chn(struct device *dev, const char *name,
 		goto err;
 	}
 
+	rx_chn->common.chan_dev.class = &k3_udma_glue_devclass;
+	rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax);
+	dev_set_name(&rx_chn->common.chan_dev, "rchan_remote-0x%04x",
+		     rx_chn->common.src_thread);
+	ret = device_register(&rx_chn->common.chan_dev);
+	if (ret) {
+		dev_err(dev, "Channel Device registration failed %d\n", ret);
+		rx_chn->common.chan_dev.parent = NULL;
+		goto err;
+	}
+
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		/* prepare the channel device as coherent */
+		rx_chn->common.chan_dev.dma_coherent = true;
+		dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev,
+					     DMA_BIT_MASK(48));
+	}
+
 	ret = k3_udma_glue_allocate_rx_flows(rx_chn, cfg);
 	if (ret)
 		goto err;
@@ -940,6 +1111,11 @@ void k3_udma_glue_release_rx_chn(struct k3_udma_glue_rx_channel *rx_chn)
 	if (!IS_ERR_OR_NULL(rx_chn->udma_rchanx))
 		xudma_rchan_put(rx_chn->common.udmax,
 				rx_chn->udma_rchanx);
+
+	if (rx_chn->common.chan_dev.parent) {
+		device_unregister(&rx_chn->common.chan_dev);
+		rx_chn->common.chan_dev.parent = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_release_rx_chn);
 
@@ -1151,12 +1327,10 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn,
 	/* reset RXCQ as it is not input for udma - expected to be empty */
 	occ_rx = k3_ringacc_ring_get_occ(flow->ringrx);
 	dev_dbg(dev, "RX reset flow %u occ_rx %u\n", flow_num, occ_rx);
-	if (flow->ringrx)
-		k3_ringacc_ring_reset(flow->ringrx);
 
 	/* Skip RX FDQ in case one FDQ is used for the set of flows */
 	if (skip_fdq)
-		return;
+		goto do_reset;
 
 	/*
 	 * RX FDQ reset need to be special way as it is input for udma and its
@@ -1171,13 +1345,17 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn,
 	for (i = 0; i < occ_rx; i++) {
 		ret = k3_ringacc_ring_pop(flow->ringrxfdq, &desc_dma);
 		if (ret) {
-			dev_err(dev, "RX reset pop %d\n", ret);
+			if (ret != -ENODATA)
+				dev_err(dev, "RX reset pop %d\n", ret);
 			break;
 		}
 		cleanup(data, desc_dma);
 	}
 
 	k3_ringacc_ring_reset_dma(flow->ringrxfdq, occ_rx);
+
+do_reset:
+	k3_ringacc_ring_reset(flow->ringrx);
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_reset_rx_chn);
 
@@ -1207,7 +1385,12 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn,
 
 	flow = &rx_chn->flows[flow_num];
 
-	flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx);
+	if (xudma_is_pktdma(rx_chn->common.udmax)) {
+		flow->virq = xudma_pktdma_rflow_get_irq(rx_chn->common.udmax,
+							flow->udma_rflow_id);
+	} else {
+		flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx);
+	}
 
 	return flow->virq;
 }
@@ -1216,6 +1399,38 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq);
 struct device *
 	k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn)
 {
+	if (xudma_is_pktdma(rx_chn->common.udmax) &&
+	    (rx_chn->common.atype_asel == 14 || rx_chn->common.atype_asel == 15))
+		return &rx_chn->common.chan_dev;
+
 	return xudma_get_device(rx_chn->common.udmax);
 }
 EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device);
+
+void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn,
+				       dma_addr_t *addr)
+{
+	if (!xudma_is_pktdma(rx_chn->common.udmax) ||
+	    !rx_chn->common.atype_asel)
+		return;
+
+	*addr |= (u64)rx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT;
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_rx_dma_to_cppi5_addr);
+
+void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn,
+				       dma_addr_t *addr)
+{
+	if (!xudma_is_pktdma(rx_chn->common.udmax) ||
+	    !rx_chn->common.atype_asel)
+		return;
+
+	*addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0);
+}
+EXPORT_SYMBOL_GPL(k3_udma_glue_rx_cppi5_to_dma_addr);
+
+static int __init k3_udma_glue_class_init(void)
+{
+	return class_register(&k3_udma_glue_devclass);
+}
+arch_initcall(k3_udma_glue_class_init);
diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c
index 5436b19d656e..eb4795c089bd 100644
--- a/drivers/dma/ti/k3-udma-private.c
+++ b/drivers/dma/ti/k3-udma-private.c
@@ -157,3 +157,27 @@ void xudma_##res##rt_write(struct udma_##res *p, int reg, u32 val)	\
 EXPORT_SYMBOL(xudma_##res##rt_write)
 XUDMA_RT_IO_FUNCTIONS(tchan);
 XUDMA_RT_IO_FUNCTIONS(rchan);
+
+int xudma_is_pktdma(struct udma_dev *ud)
+{
+	return ud->match_data->type == DMA_TYPE_PKTDMA;
+}
+EXPORT_SYMBOL(xudma_is_pktdma);
+
+int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id)
+{
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+
+	return ti_sci_inta_msi_get_virq(ud->dev, udma_tflow_id +
+					oes->pktdma_tchan_flow);
+}
+EXPORT_SYMBOL(xudma_pktdma_tflow_get_irq);
+
+int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id)
+{
+	const struct udma_oes_offsets *oes = &ud->soc_data->oes;
+
+	return ti_sci_inta_msi_get_virq(ud->dev, udma_rflow_id +
+					oes->pktdma_rchan_flow);
+}
+EXPORT_SYMBOL(xudma_pktdma_rflow_get_irq);
diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h
index ccb19f286daf..d349c6d482ae 100644
--- a/drivers/dma/ti/k3-udma.h
+++ b/drivers/dma/ti/k3-udma.h
@@ -157,4 +157,8 @@ void xudma_rchanrt_write(struct udma_rchan *rchan, int reg, u32 val);
 bool xudma_rflow_is_gp(struct udma_dev *ud, int id);
 int xudma_get_rflow_ring_offset(struct udma_dev *ud);
 
+int xudma_is_pktdma(struct udma_dev *ud);
+
+int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id);
+int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id);
 #endif /* K3_UDMA_H_ */
diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h
index d7c12f31377c..e443be4d3b4b 100644
--- a/include/linux/dma/k3-udma-glue.h
+++ b/include/linux/dma/k3-udma-glue.h
@@ -43,6 +43,10 @@ u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn);
 int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn);
 struct device *
 	k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn);
+void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn,
+				       dma_addr_t *addr);
+void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn,
+				       dma_addr_t *addr);
 
 enum {
 	K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0,
@@ -134,5 +138,9 @@ int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn,
 				 u32 flow_idx);
 struct device *
 	k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn);
+void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn,
+				       dma_addr_t *addr);
+void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn,
+				       dma_addr_t *addr);
 
 #endif /* K3_UDMA_GLUE_H_ */

From 115ff12aecfd55376d704fa2c0a2d117e5827f9f Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Mon, 14 Dec 2020 08:54:21 +0200
Subject: [PATCH 147/230] soc: ti: k3-ringacc: Use correct error casting in
 k3_ringacc_dmarings_init

Use ERR_CAST() when devm_ioremap_resource() fails.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20201214065421.5138-1-peter.ujfalusi@ti.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soc/ti/k3-ringacc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c
index c88c305ba367..b495b0d5d0fa 100644
--- a/drivers/soc/ti/k3-ringacc.c
+++ b/drivers/soc/ti/k3-ringacc.c
@@ -1476,7 +1476,7 @@ struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev,
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ringrt");
 	base_rt = devm_ioremap_resource(dev, res);
 	if (IS_ERR(base_rt))
-		return base_rt;
+		return ERR_CAST(base_rt);
 
 	ringacc->rings = devm_kzalloc(dev,
 				      sizeof(*ringacc->rings) *

From ac9645c87380e39a8fa87a1b51721efcdea89dbf Mon Sep 17 00:00:00 2001
From: Dan Aloni <dan@kernelim.com>
Date: Sat, 5 Dec 2020 11:28:35 +0200
Subject: [PATCH 148/230] sunrpc: fix xs_read_xdr_buf for partial pages receive

When receiving pages data, return value 'ret' when positive includes
`buf->page_base`, so we should subtract that before it is used for
changing `offset` and comparing against `want`.

This was discovered on the very rare cases where the server returned a
chunk of bytes that when added to the already received amount of bytes
for the pages happened to match the current `recv.len`, for example
on this case:

     buf->page_base : 258356
     actually received from socket: 1740
     ret : 260096
     want : 260096

In this case neither of the two 'if ... goto out' trigger, and we
continue to tail parsing.

Worth to mention that the ensuing EMSGSIZE from the continued execution of
`xs_read_xdr_buf` may be observed by an application due to 4 superfluous
bytes being added to the pages data.

Fixes: 277e4ab7d530 ("SUNRPC: Simplify TCP receive code by switching to using iterators")
Signed-off-by: Dan Aloni <dan@kernelim.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtsock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c93ff70da3f9..c56a66cdf4ac 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags,
 		if (ret <= 0)
 			goto sock_err;
 		xs_flush_bvec(buf->bvec, ret, seek + buf->page_base);
-		offset += ret - buf->page_base;
+		ret -= buf->page_base;
+		offset += ret;
 		if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC))
 			goto out;
 		if (ret != want)

From a1f26739ccdcc6967617998bd200dd907f7ff80a Mon Sep 17 00:00:00 2001
From: Frank van der Linden <fllinden@amazon.com>
Date: Wed, 2 Dec 2020 00:34:11 +0000
Subject: [PATCH 149/230] NFSv4.2: improve page handling for GETXATTR

XDRBUF_SPARSE_PAGES can cause problems for the RDMA transport,
and it's easy enough to allocate enough pages for the request
up front, so do that.

Also, since we've allocated the pages anyway, use the full
page aligned length for the receive buffer. This will allow
caching of valid replies that are too large for the caller,
but that still fit in the allocated pages.

Signed-off-by: Frank van der Linden <fllinden@amazon.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42proc.c | 47 +++++++++++++++++++++++++++++++++++-----------
 fs/nfs/nfs42xdr.c  | 13 ++++++++-----
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 4fc61e3d098d..b9836e2ce4a2 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1173,14 +1173,12 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name,
 }
 
 static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
-				void *buf, size_t buflen)
+				void *buf, size_t buflen, struct page **pages,
+				size_t plen)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct page *pages[NFS4XATTR_MAXPAGES] = {};
 	struct nfs42_getxattrargs arg = {
 		.fh		= NFS_FH(inode),
-		.xattr_pages	= pages,
-		.xattr_len	= buflen,
 		.xattr_name	= name,
 	};
 	struct nfs42_getxattrres res;
@@ -1189,7 +1187,10 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
-	int ret, np;
+	ssize_t ret;
+
+	arg.xattr_len = plen;
+	arg.xattr_pages = pages;
 
 	ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args,
 	    &res.seq_res, 0);
@@ -1214,10 +1215,6 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name,
 		_copy_from_pages(buf, pages, 0, res.xattr_len);
 	}
 
-	np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE);
-	while (--np >= 0)
-		__free_page(pages[np]);
-
 	return res.xattr_len;
 }
 
@@ -1292,16 +1289,44 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
 			      void *buf, size_t buflen)
 {
 	struct nfs4_exception exception = { };
-	ssize_t err;
+	ssize_t err, np, i;
+	struct page **pages;
 
+	np = nfs_page_array_len(0, buflen ?: XATTR_SIZE_MAX);
+	pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	for (i = 0; i < np; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i]) {
+			np = i + 1;
+			goto out;
+		}
+	}
+
+	/*
+	 * The GETXATTR op has no length field in the call, and the
+	 * xattr data is at the end of the reply.
+	 *
+	 * There is no downside in using the page-aligned length. It will
+	 * allow receiving and caching xattrs that are too large for the
+	 * caller but still fit in the page-rounded value.
+	 */
 	do {
-		err = _nfs42_proc_getxattr(inode, name, buf, buflen);
+		err = _nfs42_proc_getxattr(inode, name, buf, buflen,
+			pages, np * PAGE_SIZE);
 		if (err >= 0)
 			break;
 		err = nfs4_handle_exception(NFS_SERVER(inode), err,
 				&exception);
 	} while (exception.retry);
 
+out:
+	while (--np >= 0)
+		__free_page(pages[np]);
+	kfree(pages);
+
 	return err;
 }
 
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 103978ff76c9..c0b8fcd266c9 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr,
 		return -EIO;
 
 	len = be32_to_cpup(p);
+
+	/*
+	 * Only check against the page length here. The actual
+	 * requested length may be smaller, but that is only
+	 * checked against after possibly caching a valid reply.
+	 */
 	if (len > req->rq_rcv_buf.page_len)
 		return -ERANGE;
 
@@ -1477,7 +1483,6 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
 	};
 	uint32_t replen;
-	size_t plen;
 
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
@@ -1485,10 +1490,8 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr,
 	replen = hdr.replen + op_decode_hdr_maxsz + 1;
 	encode_getxattr(xdr, args->xattr_name, &hdr);
 
-	plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX;
-
-	rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, replen);
-	req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES;
+	rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len,
+				replen);
 
 	encode_nops(&hdr);
 }

From 15261b9126cd5bb2ad8521da49d8f5c042d904c7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 8 Dec 2020 18:29:02 -0500
Subject: [PATCH 150/230] xprtrdma: Fix XDRBUF_SPARSE_PAGES support

Olga K. observed that rpcrdma_marsh_req() allocates sparse pages
only when it has determined that a Reply chunk is necessary. There
are plenty of cases where no Reply chunk is needed, but the
XDRBUF_SPARSE_PAGES flag is set. The result would be a crash in
rpcrdma_inline_fixup() when it tries to copy parts of the received
Reply into a missing page.

To avoid crashing, handle sparse page allocation up front.

Until XATTR support was added, this issue did not appear often
because the only SPARSE_PAGES consumer always expected a reply large
enough to always require a Reply chunk.

Reported-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xprtrdma/rpc_rdma.c | 40 ++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 0f5120c7668f..c48536f2121f 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
 		r_xprt->rx_ep->re_max_inline_recv;
 }
 
+/* ACL likes to be lazy in allocating pages. For TCP, these
+ * pages can be allocated during receive processing. Not true
+ * for RDMA, which must always provision receive buffers
+ * up front.
+ */
+static noinline int
+rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
+{
+	struct page **ppages;
+	int len;
+
+	len = buf->page_len;
+	ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
+	while (len > 0) {
+		if (!*ppages)
+			*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
+		if (!*ppages)
+			return -ENOBUFS;
+		ppages++;
+		len -= PAGE_SIZE;
+	}
+
+	return 0;
+}
+
 /* Split @vec on page boundaries into SGEs. FMR registers pages, not
  * a byte range. Other modes coalesce these SGEs into a single MR
  * when they can.
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
 	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
 	page_base = offset_in_page(xdrbuf->page_base);
 	while (len) {
-		/* ACL likes to be lazy in allocating pages - ACLs
-		 * are small by default but can get huge.
-		 */
-		if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) {
-			if (!*ppages)
-				*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
-			if (!*ppages)
-				return -ENOBUFS;
-		}
 		seg->mr_page = *ppages;
 		seg->mr_offset = (char *)page_base;
 		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
@@ -867,6 +883,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
 	__be32 *p;
 	int ret;
 
+	if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
+		ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
+		if (ret)
+			return ret;
+	}
+
 	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
 	xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
 			rqst);

From 1f70ea700909d77d5658c33b6bf13e9123416ff1 Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@gmail.com>
Date: Mon, 7 Dec 2020 17:03:52 +0800
Subject: [PATCH 151/230] NFSv4.1: use BITS_PER_LONG macro in nfs4session.h

Use the existing BITS_PER_LONG macro instead of calculating the value.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4session.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b996ee23f1ba..3de425f59b3a 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -34,7 +34,7 @@ enum nfs4_slot_tbl_state {
 	NFS4_SLOT_TBL_DRAINING,
 };
 
-#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long))
+#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, BITS_PER_LONG)
 struct nfs4_slot_table {
 	struct nfs4_session *session;		/* Parent session */
 	struct nfs4_slot *slots;		/* seqid per slot */

From c54e959b36cbdb0cb2f2805e3e945dd83476a5c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 7 Dec 2020 13:30:46 -0500
Subject: [PATCH 152/230] SUNRPC: _shift_data_left/right_pages should check the
 shift length

Exit early if the shift is zero.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c852d199c789..5833329c132c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -225,6 +225,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base,
 
 	BUG_ON(pgfrom_base <= pgto_base);
 
+	if (!len)
+		return;
+
 	pgto = pages + (pgto_base >> PAGE_SHIFT);
 	pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
 
@@ -307,6 +310,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
 
 	BUG_ON(pgto_base <= pgfrom_base);
 
+	if (!len)
+		return;
+
 	pgto_base += len;
 	pgfrom_base += len;
 
@@ -405,6 +411,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
 	char *vto;
 	size_t copy;
 
+	if (!len)
+		return;
+
 	pgto = pages + (pgbase >> PAGE_SHIFT);
 	pgbase &= ~PAGE_MASK;
 
@@ -449,6 +458,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 	char *vfrom;
 	size_t copy;
 
+	if (!len)
+		return;
+
 	pgfrom = pages + (pgbase >> PAGE_SHIFT);
 	pgbase &= ~PAGE_MASK;
 

From 9a20f6f4e6ba9713605fbf7e7426ca22f1181545 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 4 Dec 2020 15:16:46 -0500
Subject: [PATCH 153/230] SUNRPC: Fixes for xdr_align_data()

The main use case right now for xdr_align_data() is to shift the page
data to the left, and in practice shrink the total XDR data buffer.
This patch ensures that we fix up the accounting for the buffer length
as we shift that data around.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |   2 +-
 net/sunrpc/xdr.c           | 174 ++++++++++++++++++++++++++++---------
 2 files changed, 133 insertions(+), 43 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 9548d075e06d..2b4e44bb0654 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -252,7 +252,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
-extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t);
+extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t);
 
 /**
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 5833329c132c..c474339ba9ac 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -266,26 +266,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base,
 	} while ((len -= copy) != 0);
 }
 
-static void
-_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len)
-{
-	struct kvec *tail = buf->tail;
-
-	if (len > tail->iov_len)
-		len = tail->iov_len;
-
-	_copy_to_pages(buf->pages,
-		       buf->page_base + pgto,
-		       (char *)tail->iov_base,
-		       len);
-	tail->iov_len -= len;
-
-	if (tail->iov_len > 0)
-		memmove((char *)tail->iov_base,
-				tail->iov_base + len,
-				tail->iov_len);
-}
-
 /**
  * _shift_data_right_pages
  * @pages: vector of pages containing both the source and dest memory area.
@@ -516,6 +496,109 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len)
 	} while ((len -= zero) != 0);
 }
 
+static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
+				   unsigned int len, unsigned int shift)
+{
+	const struct kvec *tail = buf->tail;
+
+	if (base >= tail->iov_len)
+		return;
+	if (len > tail->iov_len - base)
+		len = tail->iov_len - base;
+	/* Shift data into head */
+	if (shift > buf->page_len + base) {
+		const struct kvec *head = buf->head;
+		unsigned int hdto =
+			head->iov_len + buf->page_len + base - shift;
+		unsigned int hdlen = len;
+
+		if (WARN_ONCE(shift > head->iov_len + buf->page_len + base,
+			      "SUNRPC: Misaligned data.\n"))
+			return;
+		if (hdto + hdlen > head->iov_len)
+			hdlen = head->iov_len - hdto;
+		memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen);
+		base += hdlen;
+		len -= hdlen;
+		if (!len)
+			return;
+	}
+	/* Shift data into pages */
+	if (shift > base) {
+		unsigned int pgto = buf->page_len + base - shift;
+		unsigned int pglen = len;
+
+		if (pgto + pglen > buf->page_len)
+			pglen = buf->page_len - pgto;
+		_copy_to_pages(buf->pages, buf->page_base + pgto,
+			       tail->iov_base + base, pglen);
+		base += pglen;
+		len -= pglen;
+		if (!len)
+			return;
+	}
+	memmove(tail->iov_base + base - shift, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_left(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	unsigned int pgto;
+
+	if (base >= buf->page_len)
+		return;
+	if (len > buf->page_len - base)
+		len = buf->page_len - base;
+	/* Shift data into head */
+	if (shift > base) {
+		const struct kvec *head = buf->head;
+		unsigned int hdto = head->iov_len + base - shift;
+		unsigned int hdlen = len;
+
+		if (WARN_ONCE(shift > head->iov_len + base,
+			      "SUNRPC: Misaligned data.\n"))
+			return;
+		if (hdto + hdlen > head->iov_len)
+			hdlen = head->iov_len - hdto;
+		_copy_from_pages(head->iov_base + hdto, buf->pages,
+				 buf->page_base + base, hdlen);
+		base += hdlen;
+		len -= hdlen;
+		if (!len)
+			return;
+	}
+	pgto = base - shift;
+	_shift_data_left_pages(buf->pages, buf->page_base + pgto,
+			       buf->page_base + base, len);
+}
+
+static void xdr_buf_tail_shift_left(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	if (!shift || !len)
+		return;
+	xdr_buf_tail_copy_left(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
+				     unsigned int base, unsigned int len,
+				     unsigned int shift)
+{
+	if (!shift || !len)
+		return;
+	if (base >= buf->page_len) {
+		xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift);
+		return;
+	}
+	xdr_buf_pages_copy_left(buf, base, len, shift);
+	len += base;
+	if (len <= buf->page_len)
+		return;
+	xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift);
+}
+
 /**
  * xdr_shrink_bufhead
  * @buf: xdr_buf
@@ -1261,38 +1344,45 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
 
-uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length)
+unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
+			    unsigned int length)
 {
 	struct xdr_buf *buf = xdr->buf;
-	unsigned int from, bytes;
-	unsigned int shift = 0;
-
-	if ((offset + length) < offset ||
-	    (offset + length) > buf->page_len)
-		length = buf->page_len - offset;
+	unsigned int from, bytes, len;
+	unsigned int shift;
 
 	xdr_realign_pages(xdr);
 	from = xdr_page_pos(xdr);
-	bytes = xdr_stream_remaining(xdr);
-	if (length < bytes)
-		bytes = length;
+
+	if (from >= buf->page_len + buf->tail->iov_len)
+		return 0;
+	if (from + buf->head->iov_len >= buf->len)
+		return 0;
+
+	len = buf->len - buf->head->iov_len;
+
+	/* We only shift data left! */
+	if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n",
+		      from, offset))
+		return 0;
+	if (WARN_ONCE(offset > buf->page_len,
+		      "SUNRPC: buffer overflow. offset=%u, page_len=%u\n",
+		      offset, buf->page_len))
+		return 0;
 
 	/* Move page data to the left */
-	if (from > offset) {
-		shift = min_t(unsigned int, bytes, buf->page_len - from);
-		_shift_data_left_pages(buf->pages,
-				       buf->page_base + offset,
-				       buf->page_base + from,
-				       shift);
-		bytes -= shift;
+	shift = from - offset;
+	xdr_buf_pages_shift_left(buf, from, len, shift);
+	xdr->buf->len -= shift;
+	xdr->nwords -= XDR_QUADLEN(shift);
 
-		/* Move tail data into the pages, if necessary */
-		if (bytes > 0)
-			_shift_data_left_tail(buf, offset + shift, bytes);
-	}
+	bytes = xdr_stream_remaining(xdr);
+	if (length > bytes)
+		length = bytes;
+	bytes -= length;
 
 	xdr->nwords -= XDR_QUADLEN(length);
-	xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr));
+	xdr_set_page(xdr, offset + length, bytes);
 	return length;
 }
 EXPORT_SYMBOL_GPL(xdr_align_data);

From c4f2f591f02c392ea7de018d2733748bf4c7b5f5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 4 Dec 2020 17:15:09 -0500
Subject: [PATCH 154/230] SUNRPC: Fix xdr_expand_hole()

We do want to try to grow the buffer if possible, but if that attempt
fails, we still want to move the data and truncate the XDR message.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h |   2 +-
 net/sunrpc/xdr.c           | 270 ++++++++++++++++++++++++-------------
 2 files changed, 178 insertions(+), 94 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 2b4e44bb0654..178f499e2283 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -253,7 +253,7 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
-extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t);
+extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 
 /**
  * xdr_stream_remaining - Return the number of bytes remaining in the stream
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c474339ba9ac..e0906ed24374 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -334,46 +334,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
 	} while ((len -= copy) != 0);
 }
 
-static unsigned int
-_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len)
-{
-	struct kvec *tail = buf->tail;
-	unsigned int tailbuf_len;
-	unsigned int result = 0;
-	size_t copy;
-
-	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
-
-	/* Shift the tail first */
-	if (tailbuf_len != 0) {
-		unsigned int free_space = tailbuf_len - tail->iov_len;
-
-		if (len < free_space)
-			free_space = len;
-		if (len > free_space)
-			len = free_space;
-
-		tail->iov_len += free_space;
-		copy = len;
-
-		if (tail->iov_len > len) {
-			char *p = (char *)tail->iov_base + len;
-			memmove(p, tail->iov_base, tail->iov_len - free_space);
-			result += tail->iov_len - free_space;
-		} else
-			copy = tail->iov_len;
-
-		/* Copy from the inlined pages into the tail */
-		_copy_from_pages((char *)tail->iov_base,
-					 buf->pages,
-					 buf->page_base + pgfrom,
-					 copy);
-		result += copy;
-	}
-
-	return result;
-}
-
 /**
  * _copy_to_pages
  * @pages: array of pages
@@ -464,18 +424,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 }
 EXPORT_SYMBOL_GPL(_copy_from_pages);
 
+static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base,
+			     unsigned int len)
+{
+	if (base >= iov->iov_len)
+		return;
+	if (len > iov->iov_len - base)
+		len = iov->iov_len - base;
+	memset(iov->iov_base + base, 0, len);
+}
+
 /**
- * _zero_pages
- * @pages: array of pages
- * @pgbase: beginning page vector address
+ * xdr_buf_pages_zero
+ * @buf: xdr_buf
+ * @pgbase: beginning offset
  * @len: length
  */
-static void
-_zero_pages(struct page **pages, size_t pgbase, size_t len)
+static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase,
+			       unsigned int len)
 {
+	struct page **pages = buf->pages;
 	struct page **page;
 	char *vpage;
-	size_t zero;
+	unsigned int zero;
+
+	if (!len)
+		return;
+	if (pgbase >= buf->page_len) {
+		xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len);
+		return;
+	}
+	if (pgbase + len > buf->page_len) {
+		xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len);
+		len = buf->page_len - pgbase;
+	}
+
+	pgbase += buf->page_base;
 
 	page = pages + (pgbase >> PAGE_SHIFT);
 	pgbase &= ~PAGE_MASK;
@@ -496,6 +480,103 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len)
 	} while ((len -= zero) != 0);
 }
 
+static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len)
+{
+	struct kvec *head = buf->head;
+	struct kvec *tail = buf->tail;
+	unsigned int sum = head->iov_len + buf->page_len + tail->iov_len;
+	unsigned int free_space;
+
+	if (sum > buf->len) {
+		free_space = min_t(unsigned int, sum - buf->len, len);
+		buf->len += free_space;
+		len -= free_space;
+		if (!len)
+			return;
+	}
+
+	if (buf->buflen > sum) {
+		/* Expand the tail buffer */
+		free_space = min_t(unsigned int, buf->buflen - sum, len);
+		tail->iov_len += free_space;
+		buf->len += free_space;
+	}
+}
+
+static void xdr_buf_tail_copy_right(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	const struct kvec *tail = buf->tail;
+	unsigned int to = base + shift;
+
+	if (to >= tail->iov_len)
+		return;
+	if (len + to > tail->iov_len)
+		len = tail->iov_len - to;
+	memmove(tail->iov_base + to, tail->iov_base + base, len);
+}
+
+static void xdr_buf_pages_copy_right(const struct xdr_buf *buf,
+				     unsigned int base, unsigned int len,
+				     unsigned int shift)
+{
+	const struct kvec *tail = buf->tail;
+	unsigned int to = base + shift;
+	unsigned int pglen = 0;
+	unsigned int talen = 0, tato = 0;
+
+	if (base >= buf->page_len)
+		return;
+	if (len > buf->page_len - base)
+		len = buf->page_len - base;
+	if (to >= buf->page_len) {
+		tato = to - buf->page_len;
+		if (tail->iov_len >= len + tato)
+			talen = len;
+		else if (tail->iov_len > tato)
+			talen = tail->iov_len - tato;
+	} else if (len + to >= buf->page_len) {
+		pglen = buf->page_len - to;
+		talen = len - pglen;
+		if (talen > tail->iov_len)
+			talen = tail->iov_len;
+	} else
+		pglen = len;
+
+	_copy_from_pages(tail->iov_base + tato, buf->pages,
+			 buf->page_base + base + pglen, talen);
+	_shift_data_right_pages(buf->pages, buf->page_base + to,
+				buf->page_base + base, pglen);
+}
+
+static void xdr_buf_tail_shift_right(const struct xdr_buf *buf,
+				     unsigned int base, unsigned int len,
+				     unsigned int shift)
+{
+	const struct kvec *tail = buf->tail;
+
+	if (base >= tail->iov_len || !shift || !len)
+		return;
+	xdr_buf_tail_copy_right(buf, base, len, shift);
+}
+
+static void xdr_buf_pages_shift_right(const struct xdr_buf *buf,
+				      unsigned int base, unsigned int len,
+				      unsigned int shift)
+{
+	if (!shift || !len)
+		return;
+	if (base >= buf->page_len) {
+		xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift);
+		return;
+	}
+	if (base + len > buf->page_len)
+		xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len,
+					 shift);
+	xdr_buf_pages_copy_right(buf, base, len, shift);
+}
+
 static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
 				   unsigned int len, unsigned int shift)
 {
@@ -685,30 +766,33 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
 }
 
 /**
- * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes
+ * xdr_shrink_pagelen - shrinks buf->pages to @len bytes
  * @buf: xdr_buf
- * @len: bytes to remove from buf->pages
+ * @len: new page buffer length
  *
  * The extra data is not lost, but is instead moved into buf->tail.
  * Returns the actual number of bytes moved.
  */
-static unsigned int
-xdr_shrink_pagelen(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len)
 {
-	unsigned int pglen = buf->page_len;
-	unsigned int result;
+	unsigned int shift, buflen = buf->len - buf->head->iov_len;
 
-	if (len > buf->page_len)
-		len = buf-> page_len;
-
-	result = _shift_data_right_tail(buf, pglen - len, len);
-	buf->page_len -= len;
-	buf->buflen -= len;
-	/* Have we truncated the message? */
-	if (buf->len > buf->buflen)
-		buf->len = buf->buflen;
-
-	return result;
+	WARN_ON_ONCE(len > buf->page_len);
+	if (buf->head->iov_len >= buf->len || len > buflen)
+		buflen = len;
+	if (buf->page_len > buflen) {
+		buf->buflen -= buf->page_len - buflen;
+		buf->page_len = buflen;
+	}
+	if (len >= buf->page_len)
+		return 0;
+	shift = buf->page_len - len;
+	xdr_buf_try_expand(buf, shift);
+	xdr_buf_pages_shift_right(buf, len, buflen - len, shift);
+	buf->page_len = len;
+	buf->len -= shift;
+	buf->buflen -= shift;
+	return shift;
 }
 
 void
@@ -728,6 +812,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
 }
 EXPORT_SYMBOL_GPL(xdr_stream_pos);
 
+static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+	unsigned int blen = xdr->buf->len;
+
+	xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0;
+}
+
+static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos)
+{
+	xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len);
+}
+
 /**
  * xdr_page_pos - Return the current offset from the start of the xdr pages
  * @xdr: pointer to struct xdr_stream
@@ -1291,7 +1387,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 	struct xdr_buf *buf = xdr->buf;
 	unsigned int nwords = XDR_QUADLEN(len);
 	unsigned int cur = xdr_stream_pos(xdr);
-	unsigned int copied, offset;
+	unsigned int copied;
 
 	if (xdr->nwords == 0)
 		return 0;
@@ -1305,9 +1401,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 		len = buf->page_len;
 	else if (nwords < xdr->nwords) {
 		/* Truncate page data and move it into the tail */
-		offset = buf->page_len - len;
-		copied = xdr_shrink_pagelen(buf, offset);
-		trace_rpc_xdr_alignment(xdr, offset, copied);
+		copied = xdr_shrink_pagelen(buf, len);
+		trace_rpc_xdr_alignment(xdr, len, copied);
 		xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	}
 	return len;
@@ -1387,39 +1482,28 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
 }
 EXPORT_SYMBOL_GPL(xdr_align_data);
 
-uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length)
+unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
+			     unsigned int length)
 {
 	struct xdr_buf *buf = xdr->buf;
-	unsigned int bytes;
-	unsigned int from;
-	unsigned int truncated = 0;
-
-	if ((offset + length) < offset ||
-	    (offset + length) > buf->page_len)
-		length = buf->page_len - offset;
+	unsigned int from, to, shift;
 
 	xdr_realign_pages(xdr);
 	from = xdr_page_pos(xdr);
-	bytes = xdr_stream_remaining(xdr);
+	to = xdr_align_size(offset + length);
 
-	if (offset + length + bytes > buf->page_len) {
-		unsigned int shift = (offset + length + bytes) - buf->page_len;
-		unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift);
-		truncated = shift - res;
-		xdr->nwords -= XDR_QUADLEN(truncated);
-		bytes -= shift;
-	}
+	/* Could the hole be behind us? */
+	if (to > from) {
+		unsigned int buflen = buf->len - buf->head->iov_len;
+		shift = to - from;
+		xdr_buf_try_expand(buf, shift);
+		xdr_buf_pages_shift_right(buf, from, buflen, shift);
+		xdr_stream_page_set_pos(xdr, to);
+	} else if (to != from)
+		xdr_align_data(xdr, to, 0);
+	xdr_buf_pages_zero(buf, offset, length);
 
-	/* Now move the page data over and zero pages */
-	if (bytes > 0)
-		_shift_data_right_pages(buf->pages,
-					buf->page_base + offset + length,
-					buf->page_base + from,
-					bytes);
-	_zero_pages(buf->pages, buf->page_base + offset, length);
-
-	buf->len += length - (from - offset) - truncated;
-	xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr));
+	xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
 	return length;
 }
 EXPORT_SYMBOL_GPL(xdr_expand_hole);

From 6707fbd7d3be72da4ebde7f56b46814befd2db19 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 6 Dec 2020 10:15:04 -0500
Subject: [PATCH 155/230] SUNRPC: Cleanup xdr_shrink_bufhead()

Clean up xdr_shrink_bufhead() to use the new helpers instead of doing
its own thing.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 164 +++++++++++++++++++++++++----------------------
 1 file changed, 87 insertions(+), 77 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e0906ed24374..19eaa38f7d16 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -550,6 +550,53 @@ static void xdr_buf_pages_copy_right(const struct xdr_buf *buf,
 				buf->page_base + base, pglen);
 }
 
+static void xdr_buf_head_copy_right(const struct xdr_buf *buf,
+				    unsigned int base, unsigned int len,
+				    unsigned int shift)
+{
+	const struct kvec *head = buf->head;
+	const struct kvec *tail = buf->tail;
+	unsigned int to = base + shift;
+	unsigned int pglen = 0, pgto = 0;
+	unsigned int talen = 0, tato = 0;
+
+	if (base >= head->iov_len)
+		return;
+	if (len > head->iov_len - base)
+		len = head->iov_len - base;
+	if (to >= buf->page_len + head->iov_len) {
+		tato = to - buf->page_len - head->iov_len;
+		talen = len;
+	} else if (to >= head->iov_len) {
+		pgto = to - head->iov_len;
+		pglen = len;
+		if (pgto + pglen > buf->page_len) {
+			talen = pgto + pglen - buf->page_len;
+			pglen -= talen;
+		}
+	} else {
+		pglen = len - to;
+		if (pglen > buf->page_len) {
+			talen = pglen - buf->page_len;
+			pglen = buf->page_len;
+		}
+	}
+
+	len -= talen;
+	base += len;
+	if (talen + tato > tail->iov_len)
+		talen = tail->iov_len > tato ? tail->iov_len - tato : 0;
+	memcpy(tail->iov_base + tato, head->iov_base + base, talen);
+
+	len -= pglen;
+	base -= pglen;
+	_copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base,
+		       pglen);
+
+	base -= len;
+	memmove(head->iov_base + to, head->iov_base + base, len);
+}
+
 static void xdr_buf_tail_shift_right(const struct xdr_buf *buf,
 				     unsigned int base, unsigned int len,
 				     unsigned int shift)
@@ -577,6 +624,25 @@ static void xdr_buf_pages_shift_right(const struct xdr_buf *buf,
 	xdr_buf_pages_copy_right(buf, base, len, shift);
 }
 
+static void xdr_buf_head_shift_right(const struct xdr_buf *buf,
+				     unsigned int base, unsigned int len,
+				     unsigned int shift)
+{
+	const struct kvec *head = buf->head;
+
+	if (!shift)
+		return;
+	if (base >= head->iov_len) {
+		xdr_buf_pages_shift_right(buf, head->iov_len - base, len,
+					  shift);
+		return;
+	}
+	if (base + len > head->iov_len)
+		xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len,
+					  shift);
+	xdr_buf_head_copy_right(buf, base, len, shift);
+}
+
 static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base,
 				   unsigned int len, unsigned int shift)
 {
@@ -683,86 +749,31 @@ static void xdr_buf_pages_shift_left(const struct xdr_buf *buf,
 /**
  * xdr_shrink_bufhead
  * @buf: xdr_buf
- * @len: bytes to remove from buf->head[0]
+ * @len: new length of buf->head[0]
  *
- * Shrinks XDR buffer's header kvec buf->head[0] by
+ * Shrinks XDR buffer's header kvec buf->head[0], setting it to
  * 'len' bytes. The extra data is not lost, but is instead
  * moved into the inlined pages and/or the tail.
  */
-static unsigned int
-xdr_shrink_bufhead(struct xdr_buf *buf, size_t len)
+static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len)
 {
-	struct kvec *head, *tail;
-	size_t copy, offs;
-	unsigned int pglen = buf->page_len;
-	unsigned int result;
-
-	result = 0;
-	tail = buf->tail;
-	head = buf->head;
+	struct kvec *head = buf->head;
+	unsigned int shift, buflen = max(buf->len, len);
 
 	WARN_ON_ONCE(len > head->iov_len);
-	if (len > head->iov_len)
-		len = head->iov_len;
-
-	/* Shift the tail first */
-	if (tail->iov_len != 0) {
-		if (tail->iov_len > len) {
-			copy = tail->iov_len - len;
-			memmove((char *)tail->iov_base + len,
-					tail->iov_base, copy);
-			result += copy;
-		}
-		/* Copy from the inlined pages into the tail */
-		copy = len;
-		if (copy > pglen)
-			copy = pglen;
-		offs = len - copy;
-		if (offs >= tail->iov_len)
-			copy = 0;
-		else if (copy > tail->iov_len - offs)
-			copy = tail->iov_len - offs;
-		if (copy != 0) {
-			_copy_from_pages((char *)tail->iov_base + offs,
-					buf->pages,
-					buf->page_base + pglen + offs - len,
-					copy);
-			result += copy;
-		}
-		/* Do we also need to copy data from the head into the tail ? */
-		if (len > pglen) {
-			offs = copy = len - pglen;
-			if (copy > tail->iov_len)
-				copy = tail->iov_len;
-			memcpy(tail->iov_base,
-					(char *)head->iov_base +
-					head->iov_len - offs,
-					copy);
-			result += copy;
-		}
+	if (head->iov_len > buflen) {
+		buf->buflen -= head->iov_len - buflen;
+		head->iov_len = buflen;
 	}
-	/* Now handle pages */
-	if (pglen != 0) {
-		if (pglen > len)
-			_shift_data_right_pages(buf->pages,
-					buf->page_base + len,
-					buf->page_base,
-					pglen - len);
-		copy = len;
-		if (len > pglen)
-			copy = pglen;
-		_copy_to_pages(buf->pages, buf->page_base,
-				(char *)head->iov_base + head->iov_len - len,
-				copy);
-		result += copy;
-	}
-	head->iov_len -= len;
-	buf->buflen -= len;
-	/* Have we truncated the message? */
-	if (buf->len > buf->buflen)
-		buf->len = buf->buflen;
-
-	return result;
+	if (len >= head->iov_len)
+		return 0;
+	shift = head->iov_len - len;
+	xdr_buf_try_expand(buf, shift);
+	xdr_buf_head_shift_right(buf, len, buflen - len, shift);
+	head->iov_len = len;
+	buf->buflen -= shift;
+	buf->len -= shift;
+	return shift;
 }
 
 /**
@@ -798,7 +809,7 @@ static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len)
 void
 xdr_shift_buf(struct xdr_buf *buf, size_t len)
 {
-	xdr_shrink_bufhead(buf, len);
+	xdr_shrink_bufhead(buf, buf->head->iov_len - len);
 }
 EXPORT_SYMBOL_GPL(xdr_shift_buf);
 
@@ -1371,13 +1382,12 @@ static void xdr_realign_pages(struct xdr_stream *xdr)
 	struct xdr_buf *buf = xdr->buf;
 	struct kvec *iov = buf->head;
 	unsigned int cur = xdr_stream_pos(xdr);
-	unsigned int copied, offset;
+	unsigned int copied;
 
 	/* Realign pages to current pointer position */
 	if (iov->iov_len > cur) {
-		offset = iov->iov_len - cur;
-		copied = xdr_shrink_bufhead(buf, offset);
-		trace_rpc_xdr_alignment(xdr, offset, copied);
+		copied = xdr_shrink_bufhead(buf, cur);
+		trace_rpc_xdr_alignment(xdr, cur, copied);
 		xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	}
 }

From e43ac22b83921928479da0bad25aaee3d95c2b1a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Sun, 6 Dec 2020 12:41:41 -0500
Subject: [PATCH 156/230] SUNRPC: _copy_to/from_pages() now check for zero
 length

Clean up callers of _copy_to/from_pages() that still check for a zero
length.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 19eaa38f7d16..01918e60b67b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1665,8 +1665,7 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
 	len -= this_len;
 	obj += this_len;
 	this_len = min_t(unsigned int, len, subbuf->page_len);
-	if (this_len)
-		_copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
+	_copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len);
 	len -= this_len;
 	obj += this_len;
 	this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);
@@ -1696,8 +1695,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
 	len -= this_len;
 	obj += this_len;
 	this_len = min_t(unsigned int, len, subbuf->page_len);
-	if (this_len)
-		_copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
+	_copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len);
 	len -= this_len;
 	obj += this_len;
 	this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len);

From 5a5f1c2c2cbb6ddef637abb7c7e7cab20b9cc933 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 15:02:14 -0500
Subject: [PATCH 157/230] SUNRPC: Clean up open coded setting of the xdr_stream
 'nwords' field

Move the setting of the xdr_stream 'nwords' field into the helpers that
reset the xdr_stream cursor.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 01918e60b67b..f0444bf5617c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1183,6 +1183,15 @@ static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
 	return len - base;
 }
 
+static unsigned int xdr_set_tail_base(struct xdr_stream *xdr,
+				      unsigned int base, unsigned int len)
+{
+	struct xdr_buf *buf = xdr->buf;
+
+	xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len);
+	return xdr_set_iov(xdr, buf->tail, base, len);
+}
+
 static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
 				      unsigned int base, unsigned int len)
 {
@@ -1201,6 +1210,7 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr,
 	if (len > maxlen)
 		len = maxlen;
 
+	xdr_stream_page_set_pos(xdr, base);
 	base += xdr->buf->page_base;
 
 	pgnr = base >> PAGE_SHIFT;
@@ -1223,7 +1233,7 @@ static void xdr_set_page(struct xdr_stream *xdr, unsigned int base,
 {
 	if (xdr_set_page_base(xdr, base, len) == 0) {
 		base -= xdr->buf->page_len;
-		xdr_set_iov(xdr, xdr->buf->tail, base, len);
+		xdr_set_tail_base(xdr, base, len);
 	}
 }
 
@@ -1236,7 +1246,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
 	if (newbase < xdr->buf->page_len)
 		xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr));
 	else
-		xdr_set_iov(xdr, xdr->buf->tail, 0, xdr_stream_remaining(xdr));
+		xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr));
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -1388,7 +1398,7 @@ static void xdr_realign_pages(struct xdr_stream *xdr)
 	if (iov->iov_len > cur) {
 		copied = xdr_shrink_bufhead(buf, cur);
 		trace_rpc_xdr_alignment(xdr, cur, copied);
-		xdr->nwords = XDR_QUADLEN(buf->len - cur);
+		xdr_set_page(xdr, 0, buf->page_len);
 	}
 }
 
@@ -1396,7 +1406,6 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 {
 	struct xdr_buf *buf = xdr->buf;
 	unsigned int nwords = XDR_QUADLEN(len);
-	unsigned int cur = xdr_stream_pos(xdr);
 	unsigned int copied;
 
 	if (xdr->nwords == 0)
@@ -1413,7 +1422,6 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
 		/* Truncate page data and move it into the tail */
 		copied = xdr_shrink_pagelen(buf, len);
 		trace_rpc_xdr_alignment(xdr, len, copied);
-		xdr->nwords = XDR_QUADLEN(buf->len - cur);
 	}
 	return len;
 }
@@ -1439,12 +1447,10 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 	if (pglen == 0)
 		return 0;
 
-	xdr->nwords -= nwords;
 	base = (nwords << 2) - pglen;
 	end = xdr_stream_remaining(xdr) - pglen;
 
-	if (xdr_set_iov(xdr, xdr->buf->tail, base, end) == 0)
-		xdr->nwords = 0;
+	xdr_set_tail_base(xdr, base, end);
 	return len <= pglen ? len : pglen;
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
@@ -1478,15 +1484,13 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset,
 	/* Move page data to the left */
 	shift = from - offset;
 	xdr_buf_pages_shift_left(buf, from, len, shift);
-	xdr->buf->len -= shift;
-	xdr->nwords -= XDR_QUADLEN(shift);
 
 	bytes = xdr_stream_remaining(xdr);
 	if (length > bytes)
 		length = bytes;
 	bytes -= length;
 
-	xdr->nwords -= XDR_QUADLEN(length);
+	xdr->buf->len -= shift;
 	xdr_set_page(xdr, offset + length, bytes);
 	return length;
 }
@@ -1508,12 +1512,11 @@ unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset,
 		shift = to - from;
 		xdr_buf_try_expand(buf, shift);
 		xdr_buf_pages_shift_right(buf, from, buflen, shift);
-		xdr_stream_page_set_pos(xdr, to);
+		xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
 	} else if (to != from)
 		xdr_align_data(xdr, to, 0);
 	xdr_buf_pages_zero(buf, offset, length);
 
-	xdr_set_page(xdr, to, xdr_stream_remaining(xdr));
 	return length;
 }
 EXPORT_SYMBOL_GPL(xdr_expand_hole);

From f8d0e60f1056687826abc1eded98f0ea067dfc4c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 22:56:18 -0500
Subject: [PATCH 158/230] SUNRPC: Cleanup - constify a number of xdr_buf
 helpers

There are a number of xdr helpers for struct xdr_buf that do not change
the structure itself. Mark those as taking const pointers for
documentation purposes.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 include/linux/sunrpc/xdr.h | 22 ++++++++--------
 net/sunrpc/xdr.c           | 53 +++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 40 deletions(-)

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 178f499e2283..68d49fdc4ee9 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -128,8 +128,8 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *);
 
 void	xdr_inline_pages(struct xdr_buf *, unsigned int,
 			 struct page **, unsigned int, unsigned int);
-void	xdr_terminate_string(struct xdr_buf *, const u32);
-size_t	xdr_buf_pagecount(struct xdr_buf *buf);
+void	xdr_terminate_string(const struct xdr_buf *, const u32);
+size_t	xdr_buf_pagecount(const struct xdr_buf *buf);
 int	xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp);
 void	xdr_free_bvec(struct xdr_buf *buf);
 
@@ -182,14 +182,14 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
  * XDR buffer helper functions
  */
 extern void xdr_shift_buf(struct xdr_buf *, size_t);
-extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
-extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
+extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *);
+extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
 extern void xdr_buf_trim(struct xdr_buf *, unsigned int);
-extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
-extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
+extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int);
+extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int);
 
-extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32);
-extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *);
+extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32);
+extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *);
 
 struct xdr_array2_desc;
 typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem);
@@ -200,9 +200,9 @@ struct xdr_array2_desc {
 	xdr_xcode_elem_t xcode;
 };
 
-extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base,
 			     struct xdr_array2_desc *desc);
-extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base,
 			     struct xdr_array2_desc *desc);
 extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase,
 			     size_t len);
@@ -251,7 +251,7 @@ extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buf
 extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
 extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
-extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
 extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length);
 extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length);
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f0444bf5617c..2e91fbd70f11 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace);
  * @len: length of string, in bytes
  *
  */
-void
-xdr_terminate_string(struct xdr_buf *buf, const u32 len)
+void xdr_terminate_string(const struct xdr_buf *buf, const u32 len)
 {
 	char *kaddr;
 
@@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
 }
 EXPORT_SYMBOL_GPL(xdr_terminate_string);
 
-size_t
-xdr_buf_pagecount(struct xdr_buf *buf)
+size_t xdr_buf_pagecount(const struct xdr_buf *buf)
 {
 	if (!buf->page_len)
 		return 0;
@@ -1545,8 +1543,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page);
 
 static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0};
 
-void
-xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
+void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf)
 {
 	buf->head[0] = *iov;
 	buf->tail[0] = empty_iov;
@@ -1569,9 +1566,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
  *
  * Returns -1 if base of length are out of bounds.
  */
-int
-xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
-			unsigned int base, unsigned int len)
+int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf,
+		       unsigned int base, unsigned int len)
 {
 	subbuf->buflen = subbuf->len = len;
 	if (base < buf->head[0].iov_len) {
@@ -1659,7 +1655,8 @@ fix_len:
 }
 EXPORT_SYMBOL_GPL(xdr_buf_trim);
 
-static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf,
+				      void *obj, unsigned int len)
 {
 	unsigned int this_len;
 
@@ -1676,7 +1673,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne
 }
 
 /* obj is assumed to point to allocated memory of size at least len: */
-int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+			    void *obj, unsigned int len)
 {
 	struct xdr_buf subbuf;
 	int status;
@@ -1689,7 +1687,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u
 }
 EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf);
 
-static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len)
+static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf,
+				     void *obj, unsigned int len)
 {
 	unsigned int this_len;
 
@@ -1706,7 +1705,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned
 }
 
 /* obj is assumed to point to allocated memory of size at least len: */
-int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len)
+int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base,
+			   void *obj, unsigned int len)
 {
 	struct xdr_buf subbuf;
 	int status;
@@ -1719,8 +1719,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un
 }
 EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf);
 
-int
-xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
+int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj)
 {
 	__be32	raw;
 	int	status;
@@ -1733,8 +1732,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj)
 }
 EXPORT_SYMBOL_GPL(xdr_decode_word);
 
-int
-xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
+int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj)
 {
 	__be32	raw = cpu_to_be32(obj);
 
@@ -1743,9 +1741,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
 EXPORT_SYMBOL_GPL(xdr_encode_word);
 
 /* Returns 0 on success, or else a negative error code. */
-static int
-xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
-		 struct xdr_array2_desc *desc, int encode)
+static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base,
+			    struct xdr_array2_desc *desc, int encode)
 {
 	char *elem = NULL, *c;
 	unsigned int copied = 0, todo, avail_here;
@@ -1937,9 +1934,8 @@ out:
 	return err;
 }
 
-int
-xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
-		  struct xdr_array2_desc *desc)
+int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base,
+		      struct xdr_array2_desc *desc)
 {
 	if (base >= buf->len)
 		return -EINVAL;
@@ -1948,9 +1944,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
 }
 EXPORT_SYMBOL_GPL(xdr_decode_array2);
 
-int
-xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
-		  struct xdr_array2_desc *desc)
+int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base,
+		      struct xdr_array2_desc *desc)
 {
 	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
 	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
@@ -1960,9 +1955,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
 }
 EXPORT_SYMBOL_GPL(xdr_encode_array2);
 
-int
-xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
-		int (*actor)(struct scatterlist *, void *), void *data)
+int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset,
+		    unsigned int len,
+		    int (*actor)(struct scatterlist *, void *), void *data)
 {
 	int i, ret = 0;
 	unsigned int page_len, thislen, page_offset;

From 5802f7c2a6b876b2810e3e9f26d719961f12e251 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 10 Dec 2020 08:55:35 -0500
Subject: [PATCH 159/230] SUNRPC: When expanding the buffer, we may need grow
 the sparse pages

If we're shifting the page data to the right, and this happens to be a
sparse page array, then we may need to allocate new pages in order to
receive the data.

Reported-by: "Mkrtchyan, Tigran" <tigran.mkrtchyan@desy.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 net/sunrpc/xdr.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 2e91fbd70f11..60d4442c5273 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -478,16 +478,47 @@ static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase,
 	} while ((len -= zero) != 0);
 }
 
+static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf,
+					      unsigned int buflen, gfp_t gfp)
+{
+	unsigned int i, npages, pagelen;
+
+	if (!(buf->flags & XDRBUF_SPARSE_PAGES))
+		return buflen;
+	if (buflen <= buf->head->iov_len)
+		return buflen;
+	pagelen = buflen - buf->head->iov_len;
+	if (pagelen > buf->page_len)
+		pagelen = buf->page_len;
+	npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for (i = 0; i < npages; i++) {
+		if (!buf->pages[i])
+			continue;
+		buf->pages[i] = alloc_page(gfp);
+		if (likely(buf->pages[i]))
+			continue;
+		buflen -= pagelen;
+		pagelen = i << PAGE_SHIFT;
+		if (pagelen > buf->page_base)
+			buflen += pagelen - buf->page_base;
+		break;
+	}
+	return buflen;
+}
+
 static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len)
 {
 	struct kvec *head = buf->head;
 	struct kvec *tail = buf->tail;
 	unsigned int sum = head->iov_len + buf->page_len + tail->iov_len;
-	unsigned int free_space;
+	unsigned int free_space, newlen;
 
 	if (sum > buf->len) {
 		free_space = min_t(unsigned int, sum - buf->len, len);
-		buf->len += free_space;
+		newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space,
+						   GFP_KERNEL);
+		free_space = newlen - buf->len;
+		buf->len = newlen;
 		len -= free_space;
 		if (!len)
 			return;

From 1ee6310119a5b4460324111a8c4536054356b963 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 13:11:11 -0500
Subject: [PATCH 160/230] NFSv4.2: Ensure we always reset the result->count in
 decode_read_plus()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index c0b8fcd266c9..1c21db640f4d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1087,6 +1087,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 	if (unlikely(!p))
 		return -EIO;
 
+	res->count = 0;
 	eof = be32_to_cpup(p++);
 	segments = be32_to_cpup(p++);
 	if (segments == 0)

From 5c4afe2ab624cb8156e987ff929e00632fb56aeb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 07:40:04 -0500
Subject: [PATCH 161/230] NFSv4.2: decode_read_plus_data() must skip padding
 after data segment

All XDR opaque object sizes are 32-bit aligned, and a data segment is no
exception.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 1c21db640f4d..4c6bce3dbaeb 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1038,7 +1038,9 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re
 
 	p = xdr_decode_hyper(p, &offset);
 	count = be32_to_cpup(p);
-	recvd = xdr_align_data(xdr, res->count, count);
+	recvd = xdr_align_data(xdr, res->count, xdr_align_size(count));
+	if (recvd > count)
+		recvd = count;
 	res->count += recvd;
 
 	if (count > recvd) {

From 82f98c8b116bd769a47688ca5227f94826ae8a2a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 08:14:55 -0500
Subject: [PATCH 162/230] NFSv4.2: decode_read_plus_hole() needs to check the
 extent offset

The server is allowed to return a hole extent with an offset that starts
before the offset supplied in the READ_PLUS argument. Ensure that we
support that case too.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 4c6bce3dbaeb..f9faa131a4f5 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1053,8 +1053,9 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re
 	return 0;
 }
 
-static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res,
-				 uint32_t *eof)
+static int decode_read_plus_hole(struct xdr_stream *xdr,
+				 struct nfs_pgio_args *args,
+				 struct nfs_pgio_res *res, uint32_t *eof)
 {
 	uint64_t offset, length, recvd;
 	__be32 *p;
@@ -1065,6 +1066,20 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re
 
 	p = xdr_decode_hyper(p, &offset);
 	p = xdr_decode_hyper(p, &length);
+	if (offset != args->offset + res->count) {
+		/* Server returned an out-of-sequence extent */
+		if (offset > args->offset + res->count ||
+		    offset + length < args->offset + res->count) {
+			dprintk("NFS: server returned out of sequence extent: "
+				"offset/size = %llu/%llu != expected %llu\n",
+				(unsigned long long)offset,
+				(unsigned long long)length,
+				(unsigned long long)(args->offset +
+						     res->count));
+			return 1;
+		}
+		length -= args->offset + res->count - offset;
+	}
 	recvd = xdr_expand_hole(xdr, res->count, length);
 	res->count += recvd;
 
@@ -1077,6 +1092,9 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re
 
 static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 {
+	struct nfs_pgio_header *hdr =
+		container_of(res, struct nfs_pgio_header, res);
+	struct nfs_pgio_args *args = &hdr->args;
 	uint32_t eof, segments, type;
 	int status, i;
 	__be32 *p;
@@ -1104,7 +1122,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 		if (type == NFS4_CONTENT_DATA)
 			status = decode_read_plus_data(xdr, res, &eof);
 		else if (type == NFS4_CONTENT_HOLE)
-			status = decode_read_plus_hole(xdr, res, &eof);
+			status = decode_read_plus_hole(xdr, args, res, &eof);
 		else
 			return -EINVAL;
 

From dac3b1059b499c570f02cd94f3172d8c8df3a9dd Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 08:41:01 -0500
Subject: [PATCH 163/230] NFSv4.2: Handle hole lengths that exceed the
 READ_PLUS read buffer

If a hole extends beyond the READ_PLUS read buffer, then we want to fill
just the remaining buffer with zeros. Also ignore eof...

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index f9faa131a4f5..6ba2a28e7e03 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1080,6 +1080,12 @@ static int decode_read_plus_hole(struct xdr_stream *xdr,
 		}
 		length -= args->offset + res->count - offset;
 	}
+	if (length + res->count > args->count) {
+		*eof = 0;
+		if (unlikely(res->count >= args->count))
+			return 1;
+		length = args->count - res->count;
+	}
 	recvd = xdr_expand_hole(xdr, res->count, length);
 	res->count += recvd;
 

From 503b934a752f7e789a5f33217520e0a79f3096ac Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 07:51:29 -0500
Subject: [PATCH 164/230] NFSv4.2: Don't error when exiting early on a
 READ_PLUS buffer overflow

Expanding the READ_PLUS extents can cause the read buffer to overflow.
If it does, then don't error, but just exit early.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6ba2a28e7e03..9ef5261a1a70 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1025,16 +1025,16 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
 	return decode_op_hdr(xdr, OP_DEALLOCATE);
 }
 
-static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res,
-				 uint32_t *eof)
+static int decode_read_plus_data(struct xdr_stream *xdr,
+				 struct nfs_pgio_res *res)
 {
 	uint32_t count, recvd;
 	uint64_t offset;
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 8 + 4);
-	if (unlikely(!p))
-		return -EIO;
+	if (!p)
+		return 1;
 
 	p = xdr_decode_hyper(p, &offset);
 	count = be32_to_cpup(p);
@@ -1043,13 +1043,8 @@ static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *re
 		recvd = count;
 	res->count += recvd;
 
-	if (count > recvd) {
-		dprintk("NFS: server cheating in read reply: "
-				"count %u > recvd %u\n", count, recvd);
-		*eof = 0;
+	if (count > recvd)
 		return 1;
-	}
-
 	return 0;
 }
 
@@ -1061,8 +1056,8 @@ static int decode_read_plus_hole(struct xdr_stream *xdr,
 	__be32 *p;
 
 	p = xdr_inline_decode(xdr, 8 + 8);
-	if (unlikely(!p))
-		return -EIO;
+	if (!p)
+		return 1;
 
 	p = xdr_decode_hyper(p, &offset);
 	p = xdr_decode_hyper(p, &length);
@@ -1089,10 +1084,8 @@ static int decode_read_plus_hole(struct xdr_stream *xdr,
 	recvd = xdr_expand_hole(xdr, res->count, length);
 	res->count += recvd;
 
-	if (recvd < length) {
-		*eof = 0;
+	if (recvd < length)
 		return 1;
-	}
 	return 0;
 }
 
@@ -1121,12 +1114,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 
 	for (i = 0; i < segments; i++) {
 		p = xdr_inline_decode(xdr, 4);
-		if (unlikely(!p))
-			return -EIO;
+		if (!p)
+			goto early_out;
 
 		type = be32_to_cpup(p++);
 		if (type == NFS4_CONTENT_DATA)
-			status = decode_read_plus_data(xdr, res, &eof);
+			status = decode_read_plus_data(xdr, res);
 		else if (type == NFS4_CONTENT_HOLE)
 			status = decode_read_plus_hole(xdr, args, res, &eof);
 		else
@@ -1135,12 +1128,17 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 		if (status < 0)
 			return status;
 		if (status > 0)
-			break;
+			goto early_out;
 	}
 
 out:
 	res->eof = eof;
 	return 0;
+early_out:
+	if (unlikely(!i))
+		return -EIO;
+	res->eof = 0;
+	return 0;
 }
 
 static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)

From 7aedc687c9f62e0d22b3231a100030e02344be1a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Tue, 8 Dec 2020 09:03:51 -0500
Subject: [PATCH 165/230] NFSv4.2: Deal with potential READ_PLUS data extent
 buffer overflow

If the server returns more data than we have buffer space for, then
we need to truncate and exit early.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42xdr.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 9ef5261a1a70..8386ca45a43f 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -1026,6 +1026,7 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re
 }
 
 static int decode_read_plus_data(struct xdr_stream *xdr,
+				 struct nfs_pgio_args *args,
 				 struct nfs_pgio_res *res)
 {
 	uint32_t count, recvd;
@@ -1041,8 +1042,12 @@ static int decode_read_plus_data(struct xdr_stream *xdr,
 	recvd = xdr_align_data(xdr, res->count, xdr_align_size(count));
 	if (recvd > count)
 		recvd = count;
+	if (res->count + recvd > args->count) {
+		if (args->count > res->count)
+			res->count += args->count - res->count;
+		return 1;
+	}
 	res->count += recvd;
-
 	if (count > recvd)
 		return 1;
 	return 0;
@@ -1119,7 +1124,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
 
 		type = be32_to_cpup(p++);
 		if (type == NFS4_CONTENT_DATA)
-			status = decode_read_plus_data(xdr, res);
+			status = decode_read_plus_data(xdr, args, res);
 		else if (type == NFS4_CONTENT_HOLE)
 			status = decode_read_plus_hole(xdr, args, res, &eof);
 		else

From 5c3485bb12c90945f86d6b1c901bbe76aa8b45c9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 10 Dec 2020 09:34:34 -0500
Subject: [PATCH 166/230] NFSv4.2/pnfs: Don't use READ_PLUS with pNFS yet

We have no way of tracking server READ_PLUS support in pNFS for now, so
just disable it.

Reported-by: "Mkrtchyan, Tigran" <tigran.mkrtchyan@desy.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ab40d0e6a74..61a07dcb963d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5320,17 +5320,17 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 }
 
 #ifdef CONFIG_NFS_V4_2
-static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+				    struct rpc_message *msg)
 {
-	if (server->caps & NFS_CAP_READ_PLUS)
+	/* Note: We don't use READ_PLUS with pNFS yet */
+	if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp)
 		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
-	else
-		msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 #else
-static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg)
+static void nfs42_read_plus_support(struct nfs_pgio_header *hdr,
+				    struct rpc_message *msg)
 {
-	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
 }
 #endif /* CONFIG_NFS_V4_2 */
 
@@ -5340,7 +5340,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
 	hdr->timestamp   = jiffies;
 	if (!hdr->pgio_done_cb)
 		hdr->pgio_done_cb = nfs4_read_done_cb;
-	nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg);
+	msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+	nfs42_read_plus_support(hdr, msg);
 	nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
 }
 

From dc167e38a014e04e4484b969ee05765232249b0d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 25 Sep 2020 09:27:09 -0400
Subject: [PATCH 167/230] ceph: don't WARN when removing caps due to
 blocklisting

We expect to remove dirty caps when the client is blocklisted. Don't
throw a warning in that case.

[ idryomov: break unnecessarily long line ]

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ded4229c314a..8552d1082b0e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1149,7 +1149,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 	/* remove from inode's cap rbtree, and clear auth cap */
 	rb_erase(&cap->ci_node, &ci->i_caps);
 	if (ci->i_auth_cap == cap) {
-		WARN_ON_ONCE(!list_empty(&ci->i_dirty_item));
+		WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
+			     !mdsc->fsc->blocklisted);
 		ci->i_auth_cap = NULL;
 	}
 

From aa5c791053c7deecded06f6525fc6e917cb2061b Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 6 Oct 2020 08:38:20 -0400
Subject: [PATCH 168/230] ceph: make fsc->mount_state an int

This field is an unsigned long currently, which is a bit of a waste on
most arches since this just holds an enum. Make it (signed) int instead.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/super.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 482473e4cce1..f9e35e81c9af 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -106,7 +106,7 @@ struct ceph_fs_client {
 	struct ceph_mount_options *mount_options;
 	struct ceph_client *client;
 
-	unsigned long mount_state;
+	int mount_state;
 
 	unsigned long last_auto_reconnect;
 	bool blocklisted;

From 50c9132ddfb2024e96900407beeec660cf9848bd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 25 Sep 2020 07:55:39 -0400
Subject: [PATCH 169/230] ceph: add new RECOVER mount_state when recovering
 session

When recovering a session (a'la recover_session=clean), we want to do
all of the operations that we do on a forced umount, but changing the
mount state to SHUTDOWN is can cause queued MDS requests to fail when
the session comes back. Most of those can idle until the session is
recovered in this situation.

Reserve SHUTDOWN state for forced umount, and make a new RECOVER state
for the forced reconnect situation. Change several tests for equality with
SHUTDOWN to test for that or RECOVER.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c               |  4 ++--
 fs/ceph/caps.c               |  2 +-
 fs/ceph/inode.c              |  2 +-
 fs/ceph/mds_client.c         |  4 ++--
 fs/ceph/super.c              | 14 ++++++++++----
 include/linux/ceph/libceph.h |  1 +
 6 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 35c83f65475b..e10b07edc95c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -840,7 +840,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
-	if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 		if (ci->i_wrbuffer_ref > 0) {
 			pr_warn_ratelimited(
 				"writepage_start %p %lld forced umount\n",
@@ -1264,7 +1264,7 @@ ceph_find_incompatible(struct page *page)
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
-	if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 		dout(" page %p forced umount\n", page);
 		return ERR_PTR(-EIO);
 	}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8552d1082b0e..c74d8182fb48 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2747,7 +2747,7 @@ again:
 			goto out_unlock;
 		}
 
-		if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 			dout("get_cap_refs %p forced umount\n", inode);
 			ret = -EIO;
 			goto out_unlock;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 526faf4778ce..02b11a4a4d39 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1888,7 +1888,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
 
 	mutex_lock(&ci->i_truncate_mutex);
 
-	if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 		pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
 				    inode, ceph_ino(inode));
 		mapping_set_error(inode->i_mapping, -EIO);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8f1d7500a7ec..a2d6ef808f70 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1595,7 +1595,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc = fsc->mdsc;
 
-		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
 			if (inode->i_data.nrpages > 0)
 				invalidate = true;
 			if (ci->i_wrbuffer_ref > 0)
@@ -4678,7 +4678,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
 	u64 want_tid, want_flush;
 
-	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
+	if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
 		return;
 
 	dout("sync\n");
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 33ba6f0aa55c..9b1b7f4cfdd4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -831,6 +831,13 @@ static void destroy_caches(void)
 	ceph_fscache_unregister();
 }
 
+static void __ceph_umount_begin(struct ceph_fs_client *fsc)
+{
+	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
+	ceph_mdsc_force_umount(fsc->mdsc);
+	fsc->filp_gen++; // invalidate open files
+}
+
 /*
  * ceph_umount_begin - initiate forced umount.  Tear down the
  * mount, skipping steps that may hang while waiting for server(s).
@@ -843,9 +850,7 @@ static void ceph_umount_begin(struct super_block *sb)
 	if (!fsc)
 		return;
 	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
-	ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
-	ceph_mdsc_force_umount(fsc->mdsc);
-	fsc->filp_gen++; // invalidate open files
+	__ceph_umount_begin(fsc);
 }
 
 static const struct super_operations ceph_super_ops = {
@@ -1234,7 +1239,8 @@ int ceph_force_reconnect(struct super_block *sb)
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 	int err = 0;
 
-	ceph_umount_begin(sb);
+	fsc->mount_state = CEPH_MOUNT_RECOVER;
+	__ceph_umount_begin(fsc);
 
 	/* Make sure all page caches get invalidated.
 	 * see remove_session_caps_cb() */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index c8645f0b797d..eb5a7ca13f9c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -104,6 +104,7 @@ enum {
 	CEPH_MOUNT_UNMOUNTING,
 	CEPH_MOUNT_UNMOUNTED,
 	CEPH_MOUNT_SHUTDOWN,
+	CEPH_MOUNT_RECOVER,
 };
 
 static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)

From dbeec07bc84f8229322d7919692a17adae1e388e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 25 Sep 2020 09:07:49 -0400
Subject: [PATCH 170/230] ceph: remove timeout on allowing reconnect after
 blocklisting

30 minutes is a long time to wait, and this makes it difficult to test
the feature by manually blocklisting clients. Remove the timeout
infrastructure and just allow the client to reconnect at will.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 5 -----
 fs/ceph/super.h      | 1 -
 2 files changed, 6 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2d6ef808f70..de95e08815c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4374,12 +4374,7 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
 	if (!READ_ONCE(fsc->blocklisted))
 		return;
 
-	if (fsc->last_auto_reconnect &&
-	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
-		return;
-
 	pr_info("auto reconnect after blocklisted\n");
-	fsc->last_auto_reconnect = jiffies;
 	ceph_force_reconnect(fsc->sb);
 }
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f9e35e81c9af..f097237a5ad3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -108,7 +108,6 @@ struct ceph_fs_client {
 
 	int mount_state;
 
-	unsigned long last_auto_reconnect;
 	bool blocklisted;
 
 	bool have_copy_from2;

From 4ae3713fe45a289f37c479412a991bc51c502013 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 21 Sep 2020 13:12:53 -0400
Subject: [PATCH 171/230] ceph: queue MDS requests to REJECTED sessions when
 CLEANRECOVER is set

Ilya noticed that the first access to a blacklisted mount would often
get back -EACCES, but then subsequent calls would be OK. The problem is
in __do_request. If the session is marked as REJECTED, a hard error is
returned instead of waiting for a new session to come into being.

When the session is REJECTED and the mount was done with
recover_session=clean, queue the request to the waiting_for_map queue,
which will be awoken after tearing down the old session. We can only
do this for sync requests though, so check for async ones first and
just let the callers redrive a sync request.

URL: https://tracker.ceph.com/issues/47385
Reported-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index de95e08815c0..aab848527db9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2818,10 +2818,6 @@ static void __do_request(struct ceph_mds_client *mdsc,
 	     ceph_session_state_name(session->s_state));
 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
 	    session->s_state != CEPH_MDS_SESSION_HUNG) {
-		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
-			err = -EACCES;
-			goto out_session;
-		}
 		/*
 		 * We cannot queue async requests since the caps and delegated
 		 * inodes are bound to the session. Just return -EJUKEBOX and
@@ -2831,6 +2827,20 @@ static void __do_request(struct ceph_mds_client *mdsc,
 			err = -EJUKEBOX;
 			goto out_session;
 		}
+
+		/*
+		 * If the session has been REJECTED, then return a hard error,
+		 * unless it's a CLEANRECOVER mount, in which case we'll queue
+		 * it to the mdsc queue.
+		 */
+		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
+			if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
+				list_add(&req->r_wait, &mdsc->waiting_for_map);
+			else
+				err = -EACCES;
+			goto out_session;
+		}
+
 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
 		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
 			err = __open_session(mdsc, session);

From 06a1ad438b7b8d4fd689114a305b37cb526ff638 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 29 Sep 2020 19:32:19 -0400
Subject: [PATCH 172/230] ceph: fix up some warnings on W=1 builds

Convert some decodes into unused variables into skips, and fix up some
non-kerneldoc comment headers to not start with "/**".

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c       | 11 ++++-------
 fs/ceph/locks.c      |  8 ++++----
 fs/ceph/mds_client.c | 25 ++++++-------------------
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c74d8182fb48..1c4d72c5f236 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4028,15 +4028,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	}
 
 	if (msg_version >= 8) {
-		u64 flush_tid;
-		u32 caller_uid, caller_gid;
 		u32 pool_ns_len;
 
 		/* version >= 6 */
-		ceph_decode_64_safe(&p, end, flush_tid, bad);
+		ceph_decode_skip_64(&p, end, bad);	// flush_tid
 		/* version >= 7 */
-		ceph_decode_32_safe(&p, end, caller_uid, bad);
-		ceph_decode_32_safe(&p, end, caller_gid, bad);
+		ceph_decode_skip_32(&p, end, bad);	// caller_uid
+		ceph_decode_skip_32(&p, end, bad);	// caller_gid
 		/* version >= 8 */
 		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
 		if (pool_ns_len > 0) {
@@ -4059,9 +4057,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	}
 
 	if (msg_version >= 11) {
-		u32 flags;
 		/* version >= 10 */
-		ceph_decode_32_safe(&p, end, flags, bad);
+		ceph_decode_skip_32(&p, end, bad); // flags
 		/* version >= 11 */
 		extra_info.dirstat_valid = true;
 		ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 048a435a29be..fa8a847743d0 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -57,7 +57,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = {
 	.fl_release_private = ceph_fl_release_lock,
 };
 
-/**
+/*
  * Implement fcntl and flock locking functions.
  */
 static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
@@ -225,7 +225,7 @@ static int try_unlock_file(struct file *file, struct file_lock *fl)
 	return 1;
 }
 
-/**
+/*
  * Attempt to set an fcntl lock.
  * For now, this just goes away to the server. Later it may be more awesome.
  */
@@ -408,7 +408,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock,
 	return err;
 }
 
-/**
+/*
  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
  * array. Must be called with inode->i_lock already held.
  * If we encounter more of a specific lock type than expected, return -ENOSPC.
@@ -458,7 +458,7 @@ fail:
 	return err;
 }
 
-/**
+/*
  * Copy the encoded flock and fcntl locks into the pagelist.
  * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
  * sequential flock locks.
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index aab848527db9..b3c941503f8c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -516,13 +516,9 @@ static int parse_reply_info_create(void **p, void *end,
 			/* Malformed reply? */
 			info->has_create_ino = false;
 		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
-			u8 struct_v, struct_compat;
-			u32 len;
-
 			info->has_create_ino = true;
-			ceph_decode_8_safe(p, end, struct_v, bad);
-			ceph_decode_8_safe(p, end, struct_compat, bad);
-			ceph_decode_32_safe(p, end, len, bad);
+			/* struct_v, struct_compat, and len */
+			ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
 			ceph_decode_64_safe(p, end, info->ino, bad);
 			ret = ceph_parse_deleg_inos(p, end, s);
 			if (ret)
@@ -4860,10 +4856,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 	void *p = msg->front.iov_base;
 	void *end = p + msg->front.iov_len;
 	u32 epoch;
-	u32 map_len;
 	u32 num_fs;
 	u32 mount_fscid = (u32)-1;
-	u8 struct_v, struct_cv;
 	int err = -EINVAL;
 
 	ceph_decode_need(&p, end, sizeof(u32), bad);
@@ -4871,24 +4865,17 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	dout("handle_fsmap epoch %u\n", epoch);
 
-	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
-	struct_v = ceph_decode_8(&p);
-	struct_cv = ceph_decode_8(&p);
-	map_len = ceph_decode_32(&p);
+	/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
+	ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
 
-	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
-	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
-
-	num_fs = ceph_decode_32(&p);
+	ceph_decode_32_safe(&p, end, num_fs, bad);
 	while (num_fs-- > 0) {
 		void *info_p, *info_end;
 		u32 info_len;
-		u8 info_v, info_cv;
 		u32 fscid, namelen;
 
 		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
-		info_v = ceph_decode_8(&p);
-		info_cv = ceph_decode_8(&p);
+		p += 2;		// info_v, info_cv
 		info_len = ceph_decode_32(&p);
 		ceph_decode_need(&p, end, info_len, bad);
 		info_p = p;

From 81048c00d133512e4b4a848b0653d782a83e5911 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 3 Nov 2020 14:00:26 -0500
Subject: [PATCH 173/230] ceph: acquire Fs caps when getting dir stats

We only update the inode's dirstats when we have Fs caps from the MDS.

Declare a new VXATTR_FLAG_DIRSTAT that we set on all dirstats, and have
the vxattr handling code acquire those caps when it's set.

URL: https://tracker.ceph.com/issues/48104
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/xattr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 197cb1234341..0fd05d3d4399 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -42,6 +42,7 @@ struct ceph_vxattr {
 #define VXATTR_FLAG_READONLY		(1<<0)
 #define VXATTR_FLAG_HIDDEN		(1<<1)
 #define VXATTR_FLAG_RSTAT		(1<<2)
+#define VXATTR_FLAG_DIRSTAT		(1<<3)
 
 /* layouts */
 
@@ -347,9 +348,9 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(dir, layout, object_size),
 	XATTR_LAYOUT_FIELD(dir, layout, pool),
 	XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
-	XATTR_NAME_CEPH(dir, entries, 0),
-	XATTR_NAME_CEPH(dir, files, 0),
-	XATTR_NAME_CEPH(dir, subdirs, 0),
+	XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT),
+	XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT),
+	XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT),
 	XATTR_RSTAT_FIELD(dir, rentries),
 	XATTR_RSTAT_FIELD(dir, rfiles),
 	XATTR_RSTAT_FIELD(dir, rsubdirs),
@@ -837,6 +838,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 		int mask = 0;
 		if (vxattr->flags & VXATTR_FLAG_RSTAT)
 			mask |= CEPH_STAT_RSTAT;
+		if (vxattr->flags & VXATTR_FLAG_DIRSTAT)
+			mask |= CEPH_CAP_FILE_SHARED;
 		err = ceph_do_getattr(inode, mask, true);
 		if (err)
 			return err;

From 8ba3b8c7fba4631a6689d976264067b446af4c1e Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Thu, 5 Nov 2020 23:30:21 -0500
Subject: [PATCH 174/230] ceph: send dentry lease metrics to MDS daemon

For the old ceph version, if it received this one metric message
containing the dentry lease metric info, it will just ignore it.

URL: https://tracker.ceph.com/issues/43423
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/metric.c | 18 +++++++++++++++---
 fs/ceph/metric.h | 14 ++++++++++++++
 2 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index fee4c4778313..5ec94bd4c1de 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -16,6 +16,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
 	struct ceph_metric_read_latency *read;
 	struct ceph_metric_write_latency *write;
 	struct ceph_metric_metadata_latency *meta;
+	struct ceph_metric_dlease *dlease;
 	struct ceph_client_metric *m = &mdsc->metric;
 	u64 nr_caps = atomic64_read(&m->total_caps);
 	struct ceph_msg *msg;
@@ -25,7 +26,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
 	s32 len;
 
 	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
-	      + sizeof(*meta);
+	      + sizeof(*meta) + sizeof(*dlease);
 
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
 	if (!msg) {
@@ -42,8 +43,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
 	cap->ver = 1;
 	cap->compat = 1;
 	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
-	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
-	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+	cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis));
 	cap->total = cpu_to_le64(nr_caps);
 	items++;
 
@@ -83,6 +84,17 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
 	meta->nsec = cpu_to_le32(ts.tv_nsec);
 	items++;
 
+	/* encode the dentry lease metric */
+	dlease = (struct ceph_metric_dlease *)(meta + 1);
+	dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE);
+	dlease->ver = 1;
+	dlease->compat = 1;
+	dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10);
+	dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit));
+	dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis));
+	dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries));
+	items++;
+
 	put_unaligned_le32(items, &head->num);
 	msg->front.iov_len = len;
 	msg->hdr.version = cpu_to_le16(1);
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 710f3f1dceab..af6038ff39d4 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -27,6 +27,7 @@ enum ceph_metric_type {
 	CLIENT_METRIC_TYPE_READ_LATENCY,	\
 	CLIENT_METRIC_TYPE_WRITE_LATENCY,	\
 	CLIENT_METRIC_TYPE_METADATA_LATENCY,	\
+	CLIENT_METRIC_TYPE_DENTRY_LEASE,	\
 						\
 	CLIENT_METRIC_TYPE_MAX,			\
 }
@@ -80,6 +81,19 @@ struct ceph_metric_metadata_latency {
 	__le32 nsec;
 } __packed;
 
+/* metric dentry lease header */
+struct ceph_metric_dlease {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __packed;
+
 struct ceph_metric_head {
 	__le32 num;	/* the number of metrics that will be sent */
 } __packed;

From 04fabb1199d1f995d6b9a1c42c046ac4bdac2d19 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 9 Nov 2020 13:14:07 -0500
Subject: [PATCH 175/230] ceph: ensure we have Fs caps when fetching dir link
 count

The link count for a directory is defined as inode->i_subdirs + 2,
(for "." and ".."). i_subdirs is only populated when Fs caps are held.
Ensure we grab Fs caps when fetching the link count for a directory.

[ idryomov: break unnecessarily long line ]

URL: https://tracker.ceph.com/issues/48125
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 02b11a4a4d39..12140da244f3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2340,15 +2340,23 @@ int ceph_permission(struct inode *inode, int mask)
 }
 
 /* Craft a mask of needed caps given a set of requested statx attrs. */
-static int statx_to_caps(u32 want)
+static int statx_to_caps(u32 want, umode_t mode)
 {
 	int mask = 0;
 
 	if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
 		mask |= CEPH_CAP_AUTH_SHARED;
 
-	if (want & (STATX_NLINK|STATX_CTIME))
-		mask |= CEPH_CAP_LINK_SHARED;
+	if (want & (STATX_NLINK|STATX_CTIME)) {
+		/*
+		 * The link count for directories depends on inode->i_subdirs,
+		 * and that is only updated when Fs caps are held.
+		 */
+		if (S_ISDIR(mode))
+			mask |= CEPH_CAP_FILE_SHARED;
+		else
+			mask |= CEPH_CAP_LINK_SHARED;
+	}
 
 	if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
 		    STATX_BLOCKS))
@@ -2374,8 +2382,9 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 
 	/* Skip the getattr altogether if we're asked not to sync */
 	if (!(flags & AT_STATX_DONT_SYNC)) {
-		err = ceph_do_getattr(inode, statx_to_caps(request_mask),
-				      flags & AT_STATX_FORCE_SYNC);
+		err = ceph_do_getattr(inode,
+				statx_to_caps(request_mask, inode->i_mode),
+				flags & AT_STATX_FORCE_SYNC);
 		if (err)
 			return err;
 	}

From 36c9478d6069994848c8897755b4380aa0a29dd3 Mon Sep 17 00:00:00 2001
From: "Liu, Changcheng" <changcheng.liu@aliyun.com>
Date: Tue, 10 Nov 2020 21:20:08 +0800
Subject: [PATCH 176/230] libceph: remove unused port macros

1. monitor's default port is defined by CEPH_MON_PORT
2. CEPH_PORT_START and CEPH_PORT_LAST are not needed.

Signed-off-by: Changcheng Liu <changcheng.liu@aliyun.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/msgr.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 9e50aede46c8..46939485f2c3 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -8,15 +8,6 @@
 
 #define CEPH_MON_PORT    6789  /* default monitor port */
 
-/*
- * client-side processes will try to bind to ports in this
- * range, simply for the benefit of tools like nmap or wireshark
- * that would like to identify the protocol.
- */
-#define CEPH_PORT_FIRST  6789
-#define CEPH_PORT_START  6800  /* non-monitors start here */
-#define CEPH_PORT_LAST   6900
-
 /*
  * tcp connection banner.  include a protocol version. and adjust
  * whenever the wire protocol changes.  try to keep this string length

From 247b1f19dbeb4855cb891ca01428d7a81c1657a7 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 11 Nov 2020 09:29:39 +0800
Subject: [PATCH 177/230] ceph: add status debugfs file

This will help list some useful client side info, like the client
entity address/name and blocklisted status, etc.

URL: https://tracker.ceph.com/issues/48057
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/debugfs.c | 20 ++++++++++++++++++++
 fs/ceph/super.h   |  1 +
 2 files changed, 21 insertions(+)

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7a8fbe3e4751..66989c880adb 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -304,11 +304,25 @@ static int mds_sessions_show(struct seq_file *s, void *ptr)
 	return 0;
 }
 
+static int status_show(struct seq_file *s, void *p)
+{
+	struct ceph_fs_client *fsc = s->private;
+	struct ceph_entity_inst *inst = &fsc->client->msgr.inst;
+	struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client);
+
+	seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name),
+		   ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce));
+	seq_printf(s, "blocklisted: %s\n", fsc->blocklisted ? "true" : "false");
+
+	return 0;
+}
+
 DEFINE_SHOW_ATTRIBUTE(mdsmap);
 DEFINE_SHOW_ATTRIBUTE(mdsc);
 DEFINE_SHOW_ATTRIBUTE(caps);
 DEFINE_SHOW_ATTRIBUTE(mds_sessions);
 DEFINE_SHOW_ATTRIBUTE(metric);
+DEFINE_SHOW_ATTRIBUTE(status);
 
 
 /*
@@ -394,6 +408,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 						fsc->client->debugfs_dir,
 						fsc,
 						&caps_fops);
+
+	fsc->debugfs_status = debugfs_create_file("status",
+						  0400,
+						  fsc->client->debugfs_dir,
+						  fsc,
+						  &status_fops);
 }
 
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f097237a5ad3..5138b75923f9 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -128,6 +128,7 @@ struct ceph_fs_client {
 	struct dentry *debugfs_bdi;
 	struct dentry *debugfs_mdsc, *debugfs_mdsmap;
 	struct dentry *debugfs_metric;
+	struct dentry *debugfs_status;
 	struct dentry *debugfs_mds_sessions;
 #endif
 

From 5a9e2f5d5590fc70514083bd8771ec04de538387 Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 11 Nov 2020 09:29:40 +0800
Subject: [PATCH 178/230] ceph: add ceph.{cluster_fsid/client_id} vxattrs

These two vxattrs will only exist in local client side, with which
we can easily know which mountpoint the file belongs to and also
they can help locate the debugfs path quickly.

URL: https://tracker.ceph.com/issues/48057
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/xattr.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0fd05d3d4399..e89750a1f039 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -304,6 +304,23 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
 				ci->i_snap_btime.tv_nsec);
 }
 
+static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
+					  char *val, size_t size)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+
+	return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
+}
+
+static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
+				       char *val, size_t size)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+
+	return ceph_fmt_xattr(val, size, "client%lld",
+			      ceph_client_gid(fsc->client));
+}
+
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 #define CEPH_XATTR_NAME2(_type, _name, _name2)	\
 	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -407,6 +424,24 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
 
+static struct ceph_vxattr ceph_common_vxattrs[] = {
+	{
+		.name = "ceph.cluster_fsid",
+		.name_size = sizeof("ceph.cluster_fsid"),
+		.getxattr_cb = ceph_vxattrcb_cluster_fsid,
+		.exists_cb = NULL,
+		.flags = VXATTR_FLAG_READONLY,
+	},
+	{
+		.name = "ceph.client_id",
+		.name_size = sizeof("ceph.client_id"),
+		.getxattr_cb = ceph_vxattrcb_client_id,
+		.exists_cb = NULL,
+		.flags = VXATTR_FLAG_READONLY,
+	},
+	{ .name = NULL, 0 }	/* Required table terminator */
+};
+
 static struct ceph_vxattr *ceph_inode_vxattrs(struct inode *inode)
 {
 	if (S_ISDIR(inode->i_mode))
@@ -429,6 +464,13 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
 		}
 	}
 
+	vxattr = ceph_common_vxattrs;
+	while (vxattr->name) {
+		if (!strcmp(vxattr->name, name))
+			return vxattr;
+		vxattr++;
+	}
+
 	return NULL;
 }
 

From 4a357f5069428afc7c48cb4bdc95c864b7a5c862 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 10 Nov 2020 16:24:40 -0500
Subject: [PATCH 179/230] ceph: pass down the flags to
 grab_cache_page_write_begin

write_begin operations are passed a flags parameter that we need to
mirror here, so that we don't (e.g.) recurse back into filesystem code
inappropriately.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/addr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e10b07edc95c..950552944436 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1321,7 +1321,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len);
 
 	for (;;) {
-		page = grab_cache_page_write_begin(mapping, index, 0);
+		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			r = -ENOMEM;
 			break;

From e5cafce3ad0f8652d6849314d951459c2bff7233 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.de>
Date: Thu, 12 Nov 2020 10:45:12 +0000
Subject: [PATCH 180/230] ceph: fix race in concurrent __ceph_remove_cap
 invocations

A NULL pointer dereference may occur in __ceph_remove_cap with some of the
callbacks used in ceph_iterate_session_caps, namely trim_caps_cb and
remove_session_caps_cb. Those callers hold the session->s_mutex, so they
are prevented from concurrent execution, but ceph_evict_inode does not.

Since the callers of this function hold the i_ceph_lock, the fix is simply
a matter of returning immediately if caps->ci is NULL.

Cc: stable@vger.kernel.org
URL: https://tracker.ceph.com/issues/43272
Suggested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Luis Henriques <lhenriques@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/caps.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1c4d72c5f236..255a512f1277 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1140,12 +1140,19 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
-	struct ceph_mds_client *mdsc =
-		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
+	struct ceph_mds_client *mdsc;
 	int removed = 0;
 
+	/* 'ci' being NULL means the remove have already occurred */
+	if (!ci) {
+		dout("%s: cap inode is NULL\n", __func__);
+		return;
+	}
+
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
+	mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+
 	/* remove from inode's cap rbtree, and clear auth cap */
 	rb_erase(&cap->ci_node, &ci->i_caps);
 	if (ci->i_auth_cap == cap) {

From ccd1acdf1c49b835504b235461fd24e2ed826764 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.de>
Date: Thu, 12 Nov 2020 11:25:32 +0000
Subject: [PATCH 181/230] ceph: downgrade warning from mdsmap decode to debug

While the MDS cluster is unstable and changing state the client may get
mdsmap updates that will trigger warnings:

  [144692.478400] ceph: mdsmap_decode got incorrect state(up:standby-replay)
  [144697.489552] ceph: mdsmap_decode got incorrect state(up:standby-replay)
  [144697.489580] ceph: mdsmap_decode got incorrect state(up:standby-replay)

This patch downgrades these warnings to debug, as they may flood the logs
if the cluster is unstable for a while.

Signed-off-by: Luis Henriques <lhenriques@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mdsmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index e4aba6c6d3b5..1096d1d3a84c 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -243,8 +243,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		}
 
 		if (state <= 0) {
-			pr_warn("mdsmap_decode got incorrect state(%s)\n",
-				ceph_mds_state_name(state));
+			dout("mdsmap_decode got incorrect state(%s)\n",
+			     ceph_mds_state_name(state));
 			continue;
 		}
 

From 68cbb8056a4c24c6a38ad2b79e0a9764b235e8fa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 12 Nov 2020 09:37:59 -0500
Subject: [PATCH 182/230] ceph: fix inode refcount leak when ceph_fill_inode on
 non-I_NEW inode fails

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 12140da244f3..ada906280e3a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1335,6 +1335,8 @@ retry_lookup:
 				in, ceph_vinop(in));
 			if (in->i_state & I_NEW)
 				discard_new_inode(in);
+			else
+				iput(in);
 			goto done;
 		}
 		req->r_target_inode = in;

From 6646ea1c8e8716ab6b8b60ff4930f808442cfe12 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.de>
Date: Thu, 12 Nov 2020 15:23:21 +0000
Subject: [PATCH 183/230] Revert "ceph: allow rename operation under different
 quota realms"

This reverts commit dffdcd71458e699e839f0bf47c3d42d64210b939.

When doing a rename across quota realms, there's a corner case that isn't
handled correctly.  Here's a testcase:

  mkdir files limit
  truncate files/file -s 10G
  setfattr limit -n ceph.quota.max_bytes -v 1000000
  mv files limit/

The above will succeed because ftruncate(2) won't immediately notify the
MDSs with the new file size, and thus the quota realms stats won't be
updated.

Since the possible fixes for this issue would have a huge performance impact,
the solution for now is to simply revert to returning -EXDEV when doing a cross
quota realms rename.

URL: https://tracker.ceph.com/issues/48203
Signed-off-by: Luis Henriques <lhenriques@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/dir.c   |  9 ++++----
 fs/ceph/quota.c | 58 +------------------------------------------------
 fs/ceph/super.h |  3 +--
 3 files changed, 6 insertions(+), 64 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a4d48370b2b3..858ee7362ff5 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1202,12 +1202,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 			op = CEPH_MDS_OP_RENAMESNAP;
 		else
 			return -EROFS;
-	} else if (old_dir != new_dir) {
-		err = ceph_quota_check_rename(mdsc, d_inode(old_dentry),
-					      new_dir);
-		if (err)
-			return err;
 	}
+	/* don't allow cross-quota renames */
+	if ((old_dir != new_dir) &&
+	    (!ceph_quota_is_same_realm(old_dir, new_dir)))
+		return -EXDEV;
 
 	dout("rename dir %p dentry %p to dir %p dentry %p\n",
 	     old_dir, old_dentry, new_dir, new_dentry);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9b785f11e95a..4e32c9600ecc 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -264,7 +264,7 @@ restart:
 	return NULL;
 }
 
-static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
+bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
 	struct ceph_snap_realm *old_realm, *new_realm;
@@ -516,59 +516,3 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
 	return is_updated;
 }
 
-/*
- * ceph_quota_check_rename - check if a rename can be executed
- * @mdsc:	MDS client instance
- * @old:	inode to be copied
- * @new:	destination inode (directory)
- *
- * This function verifies if a rename (e.g. moving a file or directory) can be
- * executed.  It forces an rstat update in the @new target directory (and in the
- * source @old as well, if it's a directory).  The actual check is done both for
- * max_files and max_bytes.
- *
- * This function returns 0 if it's OK to do the rename, or, if quotas are
- * exceeded, -EXDEV (if @old is a directory) or -EDQUOT.
- */
-int ceph_quota_check_rename(struct ceph_mds_client *mdsc,
-			    struct inode *old, struct inode *new)
-{
-	struct ceph_inode_info *ci_old = ceph_inode(old);
-	int ret = 0;
-
-	if (ceph_quota_is_same_realm(old, new))
-		return 0;
-
-	/*
-	 * Get the latest rstat for target directory (and for source, if a
-	 * directory)
-	 */
-	ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false);
-	if (ret)
-		return ret;
-
-	if (S_ISDIR(old->i_mode)) {
-		ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false);
-		if (ret)
-			return ret;
-		ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP,
-					   ci_old->i_rbytes);
-		if (!ret)
-			ret = check_quota_exceeded(new,
-						   QUOTA_CHECK_MAX_FILES_OP,
-						   ci_old->i_rfiles +
-						   ci_old->i_rsubdirs);
-		if (ret)
-			ret = -EXDEV;
-	} else {
-		ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP,
-					   i_size_read(old));
-		if (!ret)
-			ret = check_quota_exceeded(new,
-						   QUOTA_CHECK_MAX_FILES_OP, 1);
-		if (ret)
-			ret = -EDQUOT;
-	}
-
-	return ret;
-}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5138b75923f9..b62d8fee3b86 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1222,14 +1222,13 @@ extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
 			      struct ceph_mds_session *session,
 			      struct ceph_msg *msg);
 extern bool ceph_quota_is_max_files_exceeded(struct inode *inode);
+extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new);
 extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode,
 					     loff_t newlen);
 extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode,
 						loff_t newlen);
 extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
 				     struct kstatfs *buf);
-extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc,
-				   struct inode *old, struct inode *new);
 extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
 
 #endif /* _FS_CEPH_SUPER_H */

From bca9fc14c70fcbbebc84954cc39994e463fb9468 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 12 Nov 2020 10:03:38 -0500
Subject: [PATCH 184/230] ceph: when filling trace, call ceph_get_inode outside
 of mutexes

Geng Jichao reported a rather complex deadlock involving several
moving parts:

1) readahead is issued against an inode and some of its pages are locked
   while the read is in flight

2) the same inode is evicted from the cache, and this task gets stuck
   waiting for the page lock because of the above readahead

3) another task is processing a reply trace, and looks up the inode
   being evicted while holding the s_mutex. That ends up waiting for the
   eviction to complete

4) a write reply for an unrelated inode is then processed in the
   ceph_con_workfn job. It calls ceph_check_caps after putting wrbuffer
   caps, and that gets stuck waiting on the s_mutex held by 3.

The reply to "1" is stuck behind the write reply in "4", so we deadlock
at that point.

This patch changes the trace processing to call ceph_get_inode outside
of the s_mutex and snap_rwsem, which should break the cycle above.

[ idryomov: break unnecessarily long lines ]

URL: https://tracker.ceph.com/issues/47998
Reported-by: Geng Jichao <gengjichao@jd.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Luis Henriques <lhenriques@suse.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c      | 13 ++++---------
 fs/ceph/mds_client.c | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ada906280e3a..11ccdee4ed6a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1315,15 +1315,10 @@ retry_lookup:
 	}
 
 	if (rinfo->head->is_target) {
-		tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		in = ceph_get_inode(sb, tvino);
-		if (IS_ERR(in)) {
-			err = PTR_ERR(in);
-			goto done;
-		}
+		/* Should be filled in by handle_reply */
+		BUG_ON(!req->r_target_inode);
 
+		in = req->r_target_inode;
 		err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
 				NULL, session,
 				(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
@@ -1333,13 +1328,13 @@ retry_lookup:
 		if (err < 0) {
 			pr_err("ceph_fill_inode badness %p %llx.%llx\n",
 				in, ceph_vinop(in));
+			req->r_target_inode = NULL;
 			if (in->i_state & I_NEW)
 				discard_new_inode(in);
 			else
 				iput(in);
 			goto done;
 		}
-		req->r_target_inode = in;
 		if (in->i_state & I_NEW)
 			unlock_new_inode(in);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b3c941503f8c..81eaf5212eee 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3179,6 +3179,23 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
 	mutex_unlock(&mdsc->mutex);
 
+	/* Must find target inode outside of mutexes to avoid deadlocks */
+	if ((err >= 0) && rinfo->head->is_target) {
+		struct inode *in;
+		struct ceph_vino tvino = {
+			.ino  = le64_to_cpu(rinfo->targeti.in->ino),
+			.snap = le64_to_cpu(rinfo->targeti.in->snapid)
+		};
+
+		in = ceph_get_inode(mdsc->fsc->sb, tvino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			mutex_lock(&session->s_mutex);
+			goto out_err;
+		}
+		req->r_target_inode = in;
+	}
+
 	mutex_lock(&session->s_mutex);
 	if (err < 0) {
 		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);

From dd980fc0d598f90745dfcf3806bcc65452e03265 Mon Sep 17 00:00:00 2001
From: Luis Henriques <lhenriques@suse.de>
Date: Mon, 23 Nov 2020 17:38:46 +0000
Subject: [PATCH 185/230] ceph: add ceph.caps vxattr

Add a new vxattr that allows userspace to list the caps for a specific
directory or file.

[ jlayton: change format delimiter to '/' ]

Signed-off-by: Luis Henriques <lhenriques@suse.de>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/xattr.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index e89750a1f039..cd8c7aaa23a0 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -321,6 +321,19 @@ static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
 			      ceph_client_gid(fsc->client));
 }
 
+static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val,
+					size_t size)
+{
+	int issued;
+
+	spin_lock(&ci->i_ceph_lock);
+	issued = __ceph_caps_issued(ci, NULL);
+	spin_unlock(&ci->i_ceph_lock);
+
+	return ceph_fmt_xattr(val, size, "%s/0x%x",
+			      ceph_cap_string(issued), issued);
+}
+
 #define CEPH_XATTR_NAME(_type, _name)	XATTR_CEPH_PREFIX #_type "." #_name
 #define CEPH_XATTR_NAME2(_type, _name, _name2)	\
 	XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -396,6 +409,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 		.exists_cb = ceph_vxattrcb_snap_btime_exists,
 		.flags = VXATTR_FLAG_READONLY,
 	},
+	{
+		.name = "ceph.caps",
+		.name_size = sizeof("ceph.caps"),
+		.getxattr_cb = ceph_vxattrcb_caps,
+		.exists_cb = NULL,
+		.flags = VXATTR_FLAG_HIDDEN,
+	},
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
 
@@ -421,6 +441,13 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 		.exists_cb = ceph_vxattrcb_snap_btime_exists,
 		.flags = VXATTR_FLAG_READONLY,
 	},
+	{
+		.name = "ceph.caps",
+		.name_size = sizeof("ceph.caps"),
+		.getxattr_cb = ceph_vxattrcb_caps,
+		.exists_cb = NULL,
+		.flags = VXATTR_FLAG_HIDDEN,
+	},
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
 

From 4a756db2a149f25483a7c63f013ff96372a0b2cb Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 4 Dec 2020 18:54:21 +0000
Subject: [PATCH 186/230] ceph: remove redundant assignment to variable i

The variable i is being initialized with a value that is never read
and it is being updated later with a new value in a for-loop.  The
initialization is redundant and can be removed.

Addresses-Coverity: ("Unused value")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 81eaf5212eee..70d347989603 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1239,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 {
 	struct ceph_msg *msg;
 	struct ceph_mds_session_head *h;
-	int i = -1;
+	int i;
 	int extra_bytes = 0;
 	int metadata_key_count = 0;
 	struct ceph_options *opt = mdsc->fsc->client->options;

From 968cd14edc3acff251f98bdc1eb15f13f05dd5fb Mon Sep 17 00:00:00 2001
From: Xiubo Li <xiubli@redhat.com>
Date: Wed, 9 Dec 2020 10:52:20 +0800
Subject: [PATCH 187/230] ceph: set osdmap epoch for setxattr

When setting the file/dir layout, it may need data pool info. So
in mds server, it needs to check the osdmap. At present, if mds
doesn't find the data pool specified, it will try to get the latest
osdmap. Now if pass the osd epoch for setxattr, the mds server can
only check this epoch of osdmap.

URL: https://tracker.ceph.com/issues/48504
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 2 +-
 fs/ceph/xattr.c              | 3 +++
 include/linux/ceph/ceph_fs.h | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 70d347989603..75034f7d8f46 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2533,7 +2533,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		goto out_free2;
 	}
 
-	msg->hdr.version = cpu_to_le16(2);
+	msg->hdr.version = cpu_to_le16(3);
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
 	head = msg->front.iov_base;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index cd8c7aaa23a0..24997982de01 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1022,6 +1022,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_osd_client *osdc = &fsc->client->osdc;
 	struct ceph_pagelist *pagelist = NULL;
 	int op = CEPH_MDS_OP_SETXATTR;
 	int err;
@@ -1060,6 +1061,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
 
 	if (op == CEPH_MDS_OP_SETXATTR) {
 		req->r_args.setxattr.flags = cpu_to_le32(flags);
+		req->r_args.setxattr.osdmap_epoch =
+			cpu_to_le32(osdc->osdmap->epoch);
 		req->r_pagelist = pagelist;
 		pagelist = NULL;
 	}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 455e9b9e2adf..c0f1b921ec69 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -424,6 +424,7 @@ union ceph_mds_request_args {
 	} __attribute__ ((packed)) open;
 	struct {
 		__le32 flags;
+		__le32 osdmap_epoch; /* used for setting file/dir layouts */
 	} __attribute__ ((packed)) setxattr;
 	struct {
 		struct ceph_file_layout_legacy layout;

From 0f51a983616c22a56d231950812f895e46dae256 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 9 Dec 2020 10:27:01 -0500
Subject: [PATCH 188/230] ceph: don't reach into request header for readdir
 info

We already have a pointer to the argument struct in req->r_args. Use that
instead of groveling around in the ceph_mds_request_head.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/inode.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 11ccdee4ed6a..adc8fc3c5d85 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1594,8 +1594,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	struct dentry *dn;
 	struct inode *in;
 	int err = 0, skipped = 0, ret, i;
-	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
 	u32 last_hash = 0;
 	u32 fpos_offset;
 	struct ceph_readdir_cache_control cache_ctl = {};
@@ -1612,7 +1611,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 		} else if (rinfo->offset_hash) {
 			/* mds understands offset_hash */
 			WARN_ON_ONCE(req->r_readdir_offset != 2);
-			last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
+			last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
 		}
 	}
 

From 7fe0cdeb0f88988dce8a77e963d15539abba1f18 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 8 Dec 2020 11:24:09 -0500
Subject: [PATCH 189/230] ceph: take a cred reference instead of tracking
 individual uid/gid

Replace req->r_uid/r_gid with an r_cred pointer and take a reference to
that at the point where we previously would sample the two.  Use that to
populate the uid and gid in the header and release the reference when
the request is freed.

This should enable us to later add support for sending supplementary
group lists in MDS requests.

[ idryomov: break unnecessarily long lines ]

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 10 ++++++----
 fs/ceph/mds_client.h |  3 +--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 75034f7d8f46..3028660c4226 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -833,6 +833,7 @@ void ceph_mdsc_release_request(struct kref *kref)
 	}
 	kfree(req->r_path1);
 	kfree(req->r_path2);
+	put_cred(req->r_cred);
 	if (req->r_pagelist)
 		ceph_pagelist_release(req->r_pagelist);
 	put_request_session(req);
@@ -888,8 +889,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 	ceph_mdsc_get_request(req);
 	insert_request(&mdsc->request_tree, req);
 
-	req->r_uid = current_fsuid();
-	req->r_gid = current_fsgid();
+	req->r_cred = get_current_cred();
 
 	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
 		mdsc->oldest_tid = req->r_tid;
@@ -2542,8 +2542,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
 	head->op = cpu_to_le32(req->r_op);
-	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
-	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
+	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+						 req->r_cred->fsuid));
+	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+						 req->r_cred->fsgid));
 	head->ino = cpu_to_le64(req->r_deleg_ino);
 	head->args = req->r_args;
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f5adbebcb38e..eaa7c5422116 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -275,8 +275,7 @@ struct ceph_mds_request {
 
 	union ceph_mds_request_args r_args;
 	int r_fmode;        /* file mode, if expecting cap */
-	kuid_t r_uid;
-	kgid_t r_gid;
+	const struct cred *r_cred;
 	int r_request_release_offset;
 	struct timespec64 r_stamp;
 

From 396bd62c6912d0fd66287f004017982e542706e7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 9 Dec 2020 08:24:18 -0500
Subject: [PATCH 190/230] ceph: clean up argument lists to
 __prepare_send_request and __send_request

We can always get the mdsc from the session, so there's no need to pass
it in as a separate argument. Pass the session to __prepare_send_request
as well, to prepare for later patches that will need to access it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3028660c4226..14d0a11b7d88 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2636,10 +2636,12 @@ static void complete_request(struct ceph_mds_client *mdsc,
 /*
  * called under mdsc->mutex
  */
-static int __prepare_send_request(struct ceph_mds_client *mdsc,
+static int __prepare_send_request(struct ceph_mds_session *session,
 				  struct ceph_mds_request *req,
-				  int mds, bool drop_cap_releases)
+				  bool drop_cap_releases)
 {
+	int mds = session->s_mds;
+	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_mds_request_head *rhead;
 	struct ceph_msg *msg;
 	int flags = 0;
@@ -2723,15 +2725,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 /*
  * called under mdsc->mutex
  */
-static int __send_request(struct ceph_mds_client *mdsc,
-			  struct ceph_mds_session *session,
+static int __send_request(struct ceph_mds_session *session,
 			  struct ceph_mds_request *req,
 			  bool drop_cap_releases)
 {
 	int err;
 
-	err = __prepare_send_request(mdsc, req, session->s_mds,
-				     drop_cap_releases);
+	err = __prepare_send_request(session, req, drop_cap_releases);
 	if (!err) {
 		ceph_msg_get(req->r_request);
 		ceph_con_send(&session->s_con, req->r_request);
@@ -2858,7 +2858,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
 	if (req->r_request_started == 0)   /* note request start time */
 		req->r_request_started = jiffies;
 
-	err = __send_request(mdsc, session, req, false);
+	err = __send_request(session, req, false);
 
 out_session:
 	ceph_put_mds_session(session);
@@ -3539,7 +3539,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 
 	mutex_lock(&mdsc->mutex);
 	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
-		__send_request(mdsc, session, req, true);
+		__send_request(session, req, true);
 
 	/*
 	 * also re-send old requests when MDS enters reconnect stage. So that MDS
@@ -3560,7 +3560,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 
 		ceph_mdsc_release_dir_caps_no_check(req);
 
-		__send_request(mdsc, session, req, true);
+		__send_request(session, req, true);
 	}
 	mutex_unlock(&mdsc->mutex);
 }

From 4f1ddb1ea874c7703528a8c21b77b7f2462ee247 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 9 Dec 2020 10:12:59 -0500
Subject: [PATCH 191/230] ceph: implement updated ceph_mds_request_head
 structure

When we added the btime feature in mainline ceph, we had to extend
struct ceph_mds_request_args so that it could be set. Implement the same
in the kernel client.

Rename ceph_mds_request_head with a _old extension, and a union
ceph_mds_request_args_ext to allow for the extended size of the new
header format.

Add the appropriate code to handle both formats in struct
create_request_message and key the behavior on whether the peer supports
CEPH_FEATURE_FS_BTIME.

The gid_list field in the payload is now populated from the saved
credential. For now, we don't add any support for setting the btime via
setattr, but this does enable us to add that in the future.

[ idryomov: break unnecessarily long lines ]

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c         | 75 +++++++++++++++++++++++++++++-------
 include/linux/ceph/ceph_fs.h | 32 ++++++++++++++-
 2 files changed, 93 insertions(+), 14 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 14d0a11b7d88..a256d95ec99a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2478,21 +2478,24 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 /*
  * called under mdsc->mutex
  */
-static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
+static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
 					       struct ceph_mds_request *req,
-					       int mds, bool drop_cap_releases)
+					       bool drop_cap_releases)
 {
+	int mds = session->s_mds;
+	struct ceph_mds_client *mdsc = session->s_mdsc;
 	struct ceph_msg *msg;
-	struct ceph_mds_request_head *head;
+	struct ceph_mds_request_head_old *head;
 	const char *path1 = NULL;
 	const char *path2 = NULL;
 	u64 ino1 = 0, ino2 = 0;
 	int pathlen1 = 0, pathlen2 = 0;
 	bool freepath1 = false, freepath2 = false;
-	int len;
+	int len, i;
 	u16 releases;
 	void *p, *end;
 	int ret;
+	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
 
 	ret = set_request_path_attr(req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2514,14 +2517,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		goto out_free1;
 	}
 
-	len = sizeof(*head) +
-		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+	if (legacy) {
+		/* Old style */
+		len = sizeof(*head);
+	} else {
+		/* New style: add gid_list and any later fields */
+		len = sizeof(struct ceph_mds_request_head) + sizeof(u32) +
+		      (sizeof(u64) * req->r_cred->group_info->ngroups);
+	}
+
+	len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
 		sizeof(struct ceph_timespec);
 
 	/* calculate (max) length for cap releases */
 	len += sizeof(struct ceph_mds_request_release) *
 		(!!req->r_inode_drop + !!req->r_dentry_drop +
 		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
+
 	if (req->r_dentry_drop)
 		len += pathlen1;
 	if (req->r_old_dentry_drop)
@@ -2533,11 +2545,25 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		goto out_free2;
 	}
 
-	msg->hdr.version = cpu_to_le16(3);
 	msg->hdr.tid = cpu_to_le64(req->r_tid);
 
-	head = msg->front.iov_base;
-	p = msg->front.iov_base + sizeof(*head);
+	/*
+	 * The old ceph_mds_request_header didn't contain a version field, and
+	 * one was added when we moved the message version from 3->4.
+	 */
+	if (legacy) {
+		msg->hdr.version = cpu_to_le16(3);
+		head = msg->front.iov_base;
+		p = msg->front.iov_base + sizeof(*head);
+	} else {
+		struct ceph_mds_request_head *new_head = msg->front.iov_base;
+
+		msg->hdr.version = cpu_to_le16(4);
+		new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+		head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
+		p = msg->front.iov_base + sizeof(*new_head);
+	}
+
 	end = msg->front.iov_base + msg->front.iov_len;
 
 	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
@@ -2590,6 +2616,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		ceph_encode_copy(&p, &ts, sizeof(ts));
 	}
 
+	/* gid list */
+	if (!legacy) {
+		ceph_encode_32(&p, req->r_cred->group_info->ngroups);
+		for (i = 0; i < req->r_cred->group_info->ngroups; i++)
+			ceph_encode_64(&p, from_kgid(&init_user_ns,
+				       req->r_cred->group_info->gid[i]));
+	}
+
 	if (WARN_ON_ONCE(p > end)) {
 		ceph_msg_put(msg);
 		msg = ERR_PTR(-ERANGE);
@@ -2633,6 +2667,18 @@ static void complete_request(struct ceph_mds_client *mdsc,
 	complete_all(&req->r_completion);
 }
 
+static struct ceph_mds_request_head_old *
+find_old_request_head(void *p, u64 features)
+{
+	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+	struct ceph_mds_request_head *new_head;
+
+	if (legacy)
+		return (struct ceph_mds_request_head_old *)p;
+	new_head = (struct ceph_mds_request_head *)p;
+	return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
+}
+
 /*
  * called under mdsc->mutex
  */
@@ -2642,7 +2688,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 {
 	int mds = session->s_mds;
 	struct ceph_mds_client *mdsc = session->s_mdsc;
-	struct ceph_mds_request_head *rhead;
+	struct ceph_mds_request_head_old *rhead;
 	struct ceph_msg *msg;
 	int flags = 0;
 
@@ -2661,6 +2707,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 
 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
 		void *p;
+
 		/*
 		 * Replay.  Do not regenerate message (and rebuild
 		 * paths, etc.); just use the original message.
@@ -2668,7 +2715,8 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 		 * d_move mangles the src name.
 		 */
 		msg = req->r_request;
-		rhead = msg->front.iov_base;
+		rhead = find_old_request_head(msg->front.iov_base,
+					      session->s_con.peer_features);
 
 		flags = le32_to_cpu(rhead->flags);
 		flags |= CEPH_MDS_FLAG_REPLAY;
@@ -2699,14 +2747,15 @@ static int __prepare_send_request(struct ceph_mds_session *session,
 		ceph_msg_put(req->r_request);
 		req->r_request = NULL;
 	}
-	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
+	msg = create_request_message(session, req, drop_cap_releases);
 	if (IS_ERR(msg)) {
 		req->r_err = PTR_ERR(msg);
 		return PTR_ERR(msg);
 	}
 	req->r_request = msg;
 
-	rhead = msg->front.iov_base;
+	rhead = find_old_request_head(msg->front.iov_base,
+				      session->s_con.peer_features);
 	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
 	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
 		flags |= CEPH_MDS_FLAG_REPLAY;
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index c0f1b921ec69..d44d98033d58 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -446,11 +446,25 @@ union ceph_mds_request_args {
 	} __attribute__ ((packed)) lookupino;
 } __attribute__ ((packed));
 
+union ceph_mds_request_args_ext {
+	union ceph_mds_request_args old;
+	struct {
+		__le32 mode;
+		__le32 uid;
+		__le32 gid;
+		struct ceph_timespec mtime;
+		struct ceph_timespec atime;
+		__le64 size, old_size;       /* old_size needed by truncate */
+		__le32 mask;                 /* CEPH_SETATTR_* */
+		struct ceph_timespec btime;
+	} __attribute__ ((packed)) setattr_ext;
+};
+
 #define CEPH_MDS_FLAG_REPLAY		1 /* this is a replayed op */
 #define CEPH_MDS_FLAG_WANT_DENTRY	2 /* want dentry in reply */
 #define CEPH_MDS_FLAG_ASYNC		4 /* request is asynchronous */
 
-struct ceph_mds_request_head {
+struct ceph_mds_request_head_old {
 	__le64 oldest_client_tid;
 	__le32 mdsmap_epoch;           /* on client */
 	__le32 flags;                  /* CEPH_MDS_FLAG_* */
@@ -463,6 +477,22 @@ struct ceph_mds_request_head {
 	union ceph_mds_request_args args;
 } __attribute__ ((packed));
 
+#define CEPH_MDS_REQUEST_HEAD_VERSION  1
+
+struct ceph_mds_request_head {
+	__le16 version;                /* struct version */
+	__le64 oldest_client_tid;
+	__le32 mdsmap_epoch;           /* on client */
+	__le32 flags;                  /* CEPH_MDS_FLAG_* */
+	__u8 num_retry, num_fwd;       /* count retry, fwd attempts */
+	__le16 num_releases;           /* # include cap/lease release records */
+	__le32 op;                     /* mds op code */
+	__le32 caller_uid, caller_gid;
+	__le64 ino;                    /* use this ino for openc, mkdir, mknod,
+					  etc. (if replaying) */
+	union ceph_mds_request_args_ext args;
+} __attribute__ ((packed));
+
 /* cap/lease release record */
 struct ceph_mds_request_release {
 	__le64 ino, cap_id;            /* ino and unique cap id */

From b77f8f0e4f271c29cf5cc071ea1b6bc3a675b340 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 5 Nov 2020 15:01:53 +0100
Subject: [PATCH 192/230] libceph: include middle_len in process_message() dout

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index af0f1fa24937..214ae2d17a90 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2528,12 +2528,13 @@ static void process_message(struct ceph_connection *con)
 	con->in_seq++;
 	mutex_unlock(&con->mutex);
 
-	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+	dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
 	     msg, le64_to_cpu(msg->hdr.seq),
 	     ENTITY_NAME(msg->hdr.src),
 	     le16_to_cpu(msg->hdr.type),
 	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
 	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
 	     le32_to_cpu(msg->hdr.data_len),
 	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
 	con->ops->dispatch(con, msg);

From 418af5b3bfc4f1ef4854e83c5be8a0bdce51e95c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 29 Oct 2020 14:49:10 +0100
Subject: [PATCH 193/230] libceph: lower exponential backoff delay

The current setting allows the backoff to climb up to 5 minutes.  This
is too high -- it becomes hard to tell whether the client is stuck on
something or just in backoff.

In userspace, ms_max_backoff is defaulted to 15 seconds.  Let's do the
same.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  4 ++--
 net/ceph/messenger.c           | 12 +++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 60b324efd1c4..b47c7cc4c90a 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -241,8 +241,8 @@ struct ceph_msg {
 };
 
 /* ceph connection fault delay defaults, for exponential backoff */
-#define BASE_DELAY_INTERVAL	(HZ/2)
-#define MAX_DELAY_INTERVAL	(5 * 60 * HZ)
+#define BASE_DELAY_INTERVAL	(HZ / 4)
+#define MAX_DELAY_INTERVAL	(15 * HZ)
 
 /*
  * A single connection with another host.
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 214ae2d17a90..f3eb66bab988 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2812,6 +2812,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
 		return -ENOENT;
 	}
 
+	if (delay >= HZ)
+		delay = round_jiffies_relative(delay);
+
 	dout("%s %p %lu\n", __func__, con, delay);
 	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
 		dout("%s %p - already queued\n", __func__, con);
@@ -2871,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con)
 	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
 		return false;
 
-	ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+	ret = queue_con_delay(con, con->delay);
 	if (ret) {
 		dout("%s: con %p FAILED to back off %lu\n", __func__,
 			con, con->delay);
@@ -3018,10 +3021,13 @@ static void con_fault(struct ceph_connection *con)
 	} else {
 		/* retry after a delay. */
 		con->state = CON_STATE_PREOPEN;
-		if (con->delay == 0)
+		if (!con->delay) {
 			con->delay = BASE_DELAY_INTERVAL;
-		else if (con->delay < MAX_DELAY_INTERVAL)
+		} else if (con->delay < MAX_DELAY_INTERVAL) {
 			con->delay *= 2;
+			if (con->delay > MAX_DELAY_INTERVAL)
+				con->delay = MAX_DELAY_INTERVAL;
+		}
 		con_flag_set(con, CON_FLAG_BACKOFF);
 		queue_con(con);
 	}

From 90b6561a0525d0888d5d705e343bacaaacd3c021 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 6 Nov 2020 17:09:24 +0100
Subject: [PATCH 194/230] libceph: don't call reset_connection() on
 version/feature mismatches

A fault due to a version mismatch or a feature set mismatch used to be
treated differently from other faults: the connection would get closed
without trying to reconnect and there was a ->bad_proto() connection op
for notifying about that.

This changed a long time ago, see commits 6384bb8b8e88 ("libceph: kill
bad_proto ceph connection op") and 0fa6ebc600bc ("libceph: fix protocol
feature mismatch failure path").  Nowadays these aren't any different
from other faults (i.e. we try to reconnect even though the mismatch
won't resolve until the server is replaced).  reset_connection() calls
there are rather confusing because reset_connection() resets a session
together an individual instance of the protocol.  This is cleaned up
in the next patch.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index f3eb66bab988..a73525de7b70 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2091,7 +2091,6 @@ static int process_connect(struct ceph_connection *con)
 		       ceph_pr_addr(&con->peer_addr),
 		       sup_feat, server_feat, server_feat & ~sup_feat);
 		con->error_msg = "missing required protocol features";
-		reset_connection(con);
 		return -1;
 
 	case CEPH_MSGR_TAG_BADPROTOVER:
@@ -2102,7 +2101,6 @@ static int process_connect(struct ceph_connection *con)
 		       le32_to_cpu(con->out_connect.protocol_version),
 		       le32_to_cpu(con->in_reply.protocol_version));
 		con->error_msg = "protocol version mismatch";
-		reset_connection(con);
 		return -1;
 
 	case CEPH_MSGR_TAG_BADAUTHORIZER:
@@ -2192,7 +2190,6 @@ static int process_connect(struct ceph_connection *con)
 			       ceph_pr_addr(&con->peer_addr),
 			       req_feat, server_feat, req_feat & ~server_feat);
 			con->error_msg = "missing required protocol features";
-			reset_connection(con);
 			return -1;
 		}
 

From 3596f4c1241d3c9b6a7cb03b7209c1897c3a5390 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 6 Nov 2020 17:07:07 +0100
Subject: [PATCH 195/230] libceph: split protocol reset bits out of
 reset_connection()

Move protocol reset bits into ceph_con_reset_protocol(), leaving
just session reset bits.

Note that con->out_skip is now reset on faults.  This fixes a crash
in the case of a stateful session getting a fault while in the middle
of revoking a message.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 50 +++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a73525de7b70..37166f130d44 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -613,6 +613,25 @@ static int con_close_socket(struct ceph_connection *con)
 	return rc;
 }
 
+static void ceph_con_reset_protocol(struct ceph_connection *con)
+{
+	dout("%s con %p\n", __func__, con);
+
+	con_close_socket(con);
+	if (con->in_msg) {
+		WARN_ON(con->in_msg->con != con);
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+	}
+	if (con->out_msg) {
+		WARN_ON(con->out_msg->con != con);
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+
+	con->out_skip = 0;
+}
+
 /*
  * Reset a connection.  Discard all incoming and outgoing messages
  * and clear *_seq state.
@@ -637,26 +656,16 @@ static void reset_connection(struct ceph_connection *con)
 	/* reset connection, out_queue, msg_ and connect_seq */
 	/* discard existing out_queue and msg_seq */
 	dout("reset_connection %p\n", con);
+
+	WARN_ON(con->in_msg);
+	WARN_ON(con->out_msg);
 	ceph_msg_remove_list(&con->out_queue);
 	ceph_msg_remove_list(&con->out_sent);
 
-	if (con->in_msg) {
-		BUG_ON(con->in_msg->con != con);
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-
 	con->connect_seq = 0;
 	con->out_seq = 0;
-	if (con->out_msg) {
-		BUG_ON(con->out_msg->con != con);
-		ceph_msg_put(con->out_msg);
-		con->out_msg = NULL;
-	}
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
-
-	con->out_skip = 0;
 }
 
 /*
@@ -673,10 +682,10 @@ void ceph_con_close(struct ceph_connection *con)
 	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
 	con_flag_clear(con, CON_FLAG_BACKOFF);
 
+	ceph_con_reset_protocol(con);
 	reset_connection(con);
 	con->peer_global_seq = 0;
 	cancel_con(con);
-	con_close_socket(con);
 	mutex_unlock(&con->mutex);
 }
 EXPORT_SYMBOL(ceph_con_close);
@@ -2986,7 +2995,7 @@ static void con_fault(struct ceph_connection *con)
 	       con->state != CON_STATE_NEGOTIATING &&
 	       con->state != CON_STATE_OPEN);
 
-	con_close_socket(con);
+	ceph_con_reset_protocol(con);
 
 	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
@@ -2994,17 +3003,6 @@ static void con_fault(struct ceph_connection *con)
 		return;
 	}
 
-	if (con->in_msg) {
-		BUG_ON(con->in_msg->con != con);
-		ceph_msg_put(con->in_msg);
-		con->in_msg = NULL;
-	}
-	if (con->out_msg) {
-		BUG_ON(con->out_msg->con != con);
-		ceph_msg_put(con->out_msg);
-		con->out_msg = NULL;
-	}
-
 	/* Requeue anything that hasn't been acked */
 	list_splice_init(&con->out_sent, &con->out_queue);
 

From 5963c3d01c8eec73d44a93fc0807b13369ffb63c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 6 Nov 2020 19:04:30 +0100
Subject: [PATCH 196/230] libceph: rename reset_connection() to
 ceph_con_reset_session()

With just session reset bits left, rename appropriately.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 37166f130d44..b1a936146855 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -651,11 +651,9 @@ static void ceph_msg_remove_list(struct list_head *head)
 	}
 }
 
-static void reset_connection(struct ceph_connection *con)
+static void ceph_con_reset_session(struct ceph_connection *con)
 {
-	/* reset connection, out_queue, msg_ and connect_seq */
-	/* discard existing out_queue and msg_seq */
-	dout("reset_connection %p\n", con);
+	dout("%s con %p\n", __func__, con);
 
 	WARN_ON(con->in_msg);
 	WARN_ON(con->out_msg);
@@ -683,7 +681,7 @@ void ceph_con_close(struct ceph_connection *con)
 	con_flag_clear(con, CON_FLAG_BACKOFF);
 
 	ceph_con_reset_protocol(con);
-	reset_connection(con);
+	ceph_con_reset_session(con);
 	con->peer_global_seq = 0;
 	cancel_con(con);
 	mutex_unlock(&con->mutex);
@@ -2140,7 +2138,7 @@ static int process_connect(struct ceph_connection *con)
 		pr_err("%s%lld %s connection reset\n",
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr));
-		reset_connection(con);
+		ceph_con_reset_session(con);
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)

From a3da057bbdb769c01ab06626ace3de160d40e973 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 11 Nov 2020 14:08:59 +0100
Subject: [PATCH 197/230] libceph: clear con->peer_global_seq on RESETSESSION

con->peer_global_seq is part of session state.  Clear it when
the server tells us to reset, not just in ceph_con_close().

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b1a936146855..1d4a51bf421c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -659,11 +659,12 @@ static void ceph_con_reset_session(struct ceph_connection *con)
 	WARN_ON(con->out_msg);
 	ceph_msg_remove_list(&con->out_queue);
 	ceph_msg_remove_list(&con->out_sent);
-
-	con->connect_seq = 0;
 	con->out_seq = 0;
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
+
+	con->connect_seq = 0;
+	con->peer_global_seq = 0;
 }
 
 /*
@@ -682,7 +683,6 @@ void ceph_con_close(struct ceph_connection *con)
 
 	ceph_con_reset_protocol(con);
 	ceph_con_reset_session(con);
-	con->peer_global_seq = 0;
 	cancel_con(con);
 	mutex_unlock(&con->mutex);
 }

From d3c1248cac2c07153ade346001dea001d8792479 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 11 Nov 2020 14:16:45 +0100
Subject: [PATCH 198/230] libceph: remove redundant session reset log message

Stick with pr_info message because session reset isn't an error most of
the time.  When it is (i.e. if the server denies the reconnect attempt),
we get a bunch of other pr_err messages.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1d4a51bf421c..a6d93280d3e9 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2135,9 +2135,9 @@ static int process_connect(struct ceph_connection *con)
 		 */
 		dout("process_connect got RESET peer seq %u\n",
 		     le32_to_cpu(con->in_reply.connect_seq));
-		pr_err("%s%lld %s connection reset\n",
-		       ENTITY_NAME(con->peer_name),
-		       ceph_pr_addr(&con->peer_addr));
+		pr_info("%s%lld %s session reset\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr));
 		ceph_con_reset_session(con);
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
@@ -2147,7 +2147,6 @@ static int process_connect(struct ceph_connection *con)
 
 		/* Tell ceph about it. */
 		mutex_unlock(&con->mutex);
-		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
 		if (con->ops->peer_reset)
 			con->ops->peer_reset(con);
 		mutex_lock(&con->mutex);

From 5cd8da3a1ca2160b8f9c2ff6a96762e66410ea38 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 13 Oct 2020 17:23:22 +0200
Subject: [PATCH 199/230] libceph: drop msg->ack_stamp field

It is set in process_ack() but never used.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 1 -
 net/ceph/messenger.c           | 1 -
 2 files changed, 2 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index b47c7cc4c90a..6f77e70db855 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -235,7 +235,6 @@ struct ceph_msg {
 	bool more_to_follow;
 	bool needs_out_seq;
 	int front_alloc_len;
-	unsigned long ack_stamp;        /* tx: when we were acked */
 
 	struct ceph_msgpool *pool;
 };
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a6d93280d3e9..29b00b2cecac 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2279,7 +2279,6 @@ static void process_ack(struct ceph_connection *con)
 			break;
 		dout("got ack for seq %llu type %d at %p\n", seq,
 		     le16_to_cpu(m->hdr.type), m);
-		m->ack_stamp = jiffies;
 		ceph_msg_remove(m);
 	}
 

From 0247192809e391009fec1b191080db953997477c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 13 Oct 2020 17:38:53 +0200
Subject: [PATCH 200/230] libceph: handle discarding acked and requeued
 messages separately

Make it easier to follow and remove dependency on msgr1 specific
CEPH_MSGR_TAG_SEQ.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 74 ++++++++++++++++++++++++++++++++------------
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 29b00b2cecac..922fa158ffb2 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -760,6 +760,56 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 	return ret;
 }
 
+/*
+ * Discard messages that have been acked by the server.
+ */
+static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
+{
+	struct ceph_msg *msg;
+	u64 seq;
+
+	dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
+	while (!list_empty(&con->out_sent)) {
+		msg = list_first_entry(&con->out_sent, struct ceph_msg,
+				       list_head);
+		WARN_ON(msg->needs_out_seq);
+		seq = le64_to_cpu(msg->hdr.seq);
+		if (seq > ack_seq)
+			break;
+
+		dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+		     msg, seq);
+		ceph_msg_remove(msg);
+	}
+}
+
+/*
+ * Discard messages that have been requeued in con_fault(), up to
+ * reconnect_seq.  This avoids gratuitously resending messages that
+ * the server had received and handled prior to reconnect.
+ */
+static void ceph_con_discard_requeued(struct ceph_connection *con,
+				      u64 reconnect_seq)
+{
+	struct ceph_msg *msg;
+	u64 seq;
+
+	dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
+	while (!list_empty(&con->out_queue)) {
+		msg = list_first_entry(&con->out_queue, struct ceph_msg,
+				       list_head);
+		if (msg->needs_out_seq)
+			break;
+		seq = le64_to_cpu(msg->hdr.seq);
+		if (seq > reconnect_seq)
+			break;
+
+		dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+		     msg, seq);
+		ceph_msg_remove(msg);
+	}
+}
+
 static void con_out_kvec_reset(struct ceph_connection *con)
 {
 	BUG_ON(con->out_skip);
@@ -2259,28 +2309,12 @@ static int read_partial_ack(struct ceph_connection *con)
  */
 static void process_ack(struct ceph_connection *con)
 {
-	struct ceph_msg *m;
 	u64 ack = le64_to_cpu(con->in_temp_ack);
-	u64 seq;
-	bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
-	struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
 
-	/*
-	 * In the reconnect case, con_fault() has requeued messages
-	 * in out_sent. We should cleanup old messages according to
-	 * the reconnect seq.
-	 */
-	while (!list_empty(list)) {
-		m = list_first_entry(list, struct ceph_msg, list_head);
-		if (reconnect && m->needs_out_seq)
-			break;
-		seq = le64_to_cpu(m->hdr.seq);
-		if (seq > ack)
-			break;
-		dout("got ack for seq %llu type %d at %p\n", seq,
-		     le16_to_cpu(m->hdr.type), m);
-		ceph_msg_remove(m);
-	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK)
+		ceph_con_discard_sent(con, ack);
+	else
+		ceph_con_discard_requeued(con, ack);
 
 	prepare_read_tag(con);
 }

From 8ee8abf797bb3cb6007e30ac17a15f93277b0e91 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 4 Nov 2020 10:26:39 +0100
Subject: [PATCH 201/230] libceph: change ceph_msg_data_cursor_init() to take
 cursor

Make it possible to have local cursors and embed them outside struct
ceph_msg.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 922fa158ffb2..ca3f39b4f664 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1170,10 +1170,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 	cursor->need_crc = true;
 }
 
-static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+				      struct ceph_msg *msg, size_t length)
 {
-	struct ceph_msg_data_cursor *cursor = &msg->cursor;
-
 	BUG_ON(!length);
 	BUG_ON(length > msg->data_length);
 	BUG_ON(!msg->num_data_items);
@@ -1278,7 +1277,7 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
 {
 	/* Initialize data cursor */
 
-	ceph_msg_data_cursor_init(msg, (size_t)data_len);
+	ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
 }
 
 /*

From fc4c128e15b50c73466dcd7234dde02f6fd9e4f8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 16 Nov 2020 17:27:50 +0100
Subject: [PATCH 202/230] libceph: change ceph_con_in_msg_alloc() to take hdr

ceph_con_in_msg_alloc() is protocol independent, but con->in_hdr (and
struct ceph_msg_header in general) is msgr1 specific.  While the struct
is deeply ingrained inside and outside the messenger, con->in_hdr field
can be separated.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ca3f39b4f664..d161878bc342 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2386,7 +2386,8 @@ static int read_partial_msg_data(struct ceph_connection *con)
 /*
  * read (part of) a message.
  */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
+static int ceph_con_in_msg_alloc(struct ceph_connection *con,
+				 struct ceph_msg_header *hdr, int *skip);
 
 static int read_partial_message(struct ceph_connection *con)
 {
@@ -2450,7 +2451,7 @@ static int read_partial_message(struct ceph_connection *con)
 
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
 		     front_len, data_len);
-		ret = ceph_con_in_msg_alloc(con, &skip);
+		ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip);
 		if (ret < 0)
 			return ret;
 
@@ -3455,9 +3456,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
  * On error (ENOMEM, EAGAIN, ...),
  *  - con->in_msg == NULL
  */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+static int ceph_con_in_msg_alloc(struct ceph_connection *con,
+				 struct ceph_msg_header *hdr, int *skip)
 {
-	struct ceph_msg_header *hdr = &con->in_hdr;
 	int middle_len = le32_to_cpu(hdr->middle_len);
 	struct ceph_msg *msg;
 	int ret = 0;
@@ -3489,7 +3490,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
 		con->error_msg = "error allocating memory for incoming message";
 		return -ENOMEM;
 	}
-	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+	memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
 
 	if (middle_len && !con->in_msg->middle) {
 		ret = ceph_alloc_middle(con, con->in_msg);

From 771294fe0724d92157048650f3585e7be606d0f8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 18 Nov 2020 16:37:14 +0100
Subject: [PATCH 203/230] libceph: factor out ceph_con_get_out_msg()

Move the logic of grabbing the next message from the queue into its own
function.  Like ceph_con_in_msg_alloc(), this is protocol independent.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 59 +++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d161878bc342..a912f2df9a2e 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1304,6 +1304,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
 	con->out_msg_done = true;
 }
 
+static void ceph_con_get_out_msg(struct ceph_connection *con);
+
 /*
  * Prepare headers for the next outgoing message.
  */
@@ -1325,26 +1327,8 @@ static void prepare_write_message(struct ceph_connection *con)
 			&con->out_temp_ack);
 	}
 
-	BUG_ON(list_empty(&con->out_queue));
-	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
-	con->out_msg = m;
-	BUG_ON(m->con != con);
-
-	/* put message on sent list */
-	ceph_msg_get(m);
-	list_move_tail(&m->list_head, &con->out_sent);
-
-	/*
-	 * only assign outgoing seq # if we haven't sent this message
-	 * yet.  if it is requeued, resend with it's original seq.
-	 */
-	if (m->needs_out_seq) {
-		m->hdr.seq = cpu_to_le64(++con->out_seq);
-		m->needs_out_seq = false;
-
-		if (con->ops->reencode_message)
-			con->ops->reencode_message(m);
-	}
+	ceph_con_get_out_msg(con);
+	m = con->out_msg;
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
@@ -3118,6 +3102,8 @@ static void clear_standby(struct ceph_connection *con)
 
 /*
  * Queue up an outgoing message on the given connection.
+ *
+ * Consumes a ref on @msg.
  */
 void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 {
@@ -3503,6 +3489,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con,
 	return ret;
 }
 
+static void ceph_con_get_out_msg(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+
+	BUG_ON(list_empty(&con->out_queue));
+	msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+	WARN_ON(msg->con != con);
+
+	/*
+	 * Put the message on "sent" list using a ref from ceph_con_send().
+	 * It is put when the message is acked or revoked.
+	 */
+	list_move_tail(&msg->list_head, &con->out_sent);
+
+	/*
+	 * Only assign outgoing seq # if we haven't sent this message
+	 * yet.  If it is requeued, resend with it's original seq.
+	 */
+	if (msg->needs_out_seq) {
+		msg->hdr.seq = cpu_to_le64(++con->out_seq);
+		msg->needs_out_seq = false;
+
+		if (con->ops->reencode_message)
+			con->ops->reencode_message(msg);
+	}
+
+	/*
+	 * Get a ref for out_msg.  It is put when we are done sending the
+	 * message or in case of a fault.
+	 */
+	WARN_ON(con->out_msg);
+	con->out_msg = ceph_msg_get(msg);
+}
 
 /*
  * Free a generically kmalloc'd message.

From fd1a154cad6c6a16960fa9c2c9c6427da129e461 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 5 Nov 2020 18:48:04 +0100
Subject: [PATCH 204/230] libceph: make sure our addr->port is zero and
 addr->nonce is non-zero

Our messenger instance addr->port is normally zero -- anything else is
nonsensical because as a client we connect to multiple servers and don't
listen on any port.  However, a user can supply an arbitrary addr:port
via ip option and the port is currently preserved.  Zero it.

Conversely, make sure our addr->nonce is non-zero.  A zero nonce is
special: in combination with a zero port, it is used to blocklist the
entire ip.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index a912f2df9a2e..0b432ce03a42 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -2042,6 +2042,8 @@ bad:
 
 static int process_banner(struct ceph_connection *con)
 {
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+
 	dout("process_banner on %p\n", con);
 
 	if (verify_hello(con) < 0)
@@ -2068,16 +2070,14 @@ static int process_banner(struct ceph_connection *con)
 	/*
 	 * did we learn our address?
 	 */
-	if (addr_is_blank(&con->msgr->inst.addr)) {
-		int port = addr_port(&con->msgr->inst.addr);
-
-		memcpy(&con->msgr->inst.addr.in_addr,
+	if (addr_is_blank(my_addr)) {
+		memcpy(&my_addr->in_addr,
 		       &con->peer_addr_for_me.in_addr,
 		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(&con->msgr->inst.addr, port);
+		addr_set_port(my_addr, 0);
 		encode_my_addr(con->msgr);
 		dout("process_banner learned my addr is %s\n",
-		     ceph_pr_addr(&con->msgr->inst.addr));
+		     ceph_pr_addr(my_addr));
 	}
 
 	return 0;
@@ -3058,12 +3058,19 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 {
 	spin_lock_init(&msgr->global_seq_lock);
 
-	if (myaddr)
-		msgr->inst.addr = *myaddr;
+	if (myaddr) {
+		memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
+		       sizeof(msgr->inst.addr.in_addr));
+		addr_set_port(&msgr->inst.addr, 0);
+	}
 
-	/* select a random nonce */
 	msgr->inst.addr.type = 0;
-	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+
+	/* generate a random non-zero nonce */
+	do {
+		get_random_bytes(&msgr->inst.addr.nonce,
+				 sizeof(msgr->inst.addr.nonce));
+	} while (!msgr->inst.addr.nonce);
 	encode_my_addr(msgr);
 
 	atomic_set(&msgr->stopping, 0);

From 2f68738037db30733caed6ac6278ba589d152afa Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 5 Nov 2020 18:42:33 +0100
Subject: [PATCH 205/230] libceph: don't export ceph_messenger_{init_fini}() to
 modules

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/messenger.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0b432ce03a42..bb79a59daf42 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3078,13 +3078,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 
 	dout("%s %p\n", __func__, msgr);
 }
-EXPORT_SYMBOL(ceph_messenger_init);
 
 void ceph_messenger_fini(struct ceph_messenger *msgr)
 {
 	put_net(read_pnet(&msgr->net));
 }
-EXPORT_SYMBOL(ceph_messenger_fini);
 
 static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
 {

From 30be780a87211de75b93935c20a0913e46744a3f Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 9 Nov 2020 14:11:26 +0100
Subject: [PATCH 206/230] libceph: make con->state an int

unsigned long is a leftover from when con->state used to be a set of
bits managed with set_bit(), clear_bit(), etc.  Save a bit of memory.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  2 +-
 net/ceph/messenger.c           | 16 ++++++----------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 6f77e70db855..f053de4f46dd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -257,13 +257,13 @@ struct ceph_connection {
 
 	struct ceph_messenger *msgr;
 
+	int state;
 	atomic_t sock_state;
 	struct socket *sock;
 	struct ceph_entity_addr peer_addr; /* peer address */
 	struct ceph_entity_addr peer_addr_for_me;
 
 	unsigned long flags;
-	unsigned long state;
 	const char *error_msg;  /* error message, if any */
 
 	struct ceph_entity_name peer_name; /* peer name */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index bb79a59daf42..9c92f101aa88 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -372,7 +372,7 @@ static void ceph_sock_data_ready(struct sock *sk)
 	}
 
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
-		dout("%s on %p state = %lu, queueing work\n", __func__,
+		dout("%s %p state = %d, queueing work\n", __func__,
 		     con, con->state);
 		queue_con(con);
 	}
@@ -406,7 +406,7 @@ static void ceph_sock_state_change(struct sock *sk)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
-	dout("%s %p state = %lu sk_state = %u\n", __func__,
+	dout("%s %p state = %d sk_state = %u\n", __func__,
 	     con, con->state, sk->sk_state);
 
 	switch (sk->sk_state) {
@@ -2582,7 +2582,7 @@ static int try_write(struct ceph_connection *con)
 {
 	int ret = 1;
 
-	dout("try_write start %p state %lu\n", con, con->state);
+	dout("try_write start %p state %d\n", con, con->state);
 	if (con->state != CON_STATE_PREOPEN &&
 	    con->state != CON_STATE_CONNECTING &&
 	    con->state != CON_STATE_NEGOTIATING &&
@@ -2600,7 +2600,7 @@ static int try_write(struct ceph_connection *con)
 
 		BUG_ON(con->in_msg);
 		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %lu\n",
+		dout("try_write initiating connect on %p new state %d\n",
 		     con, con->state);
 		ret = ceph_tcp_connect(con);
 		if (ret < 0) {
@@ -2679,7 +2679,7 @@ static int try_read(struct ceph_connection *con)
 	int ret = -1;
 
 more:
-	dout("try_read start on %p state %lu\n", con, con->state);
+	dout("try_read start %p state %d\n", con, con->state);
 	if (con->state != CON_STATE_CONNECTING &&
 	    con->state != CON_STATE_NEGOTIATING &&
 	    con->state != CON_STATE_OPEN)
@@ -2876,11 +2876,7 @@ static bool con_sock_closed(struct ceph_connection *con)
 	CASE(OPEN);
 	CASE(STANDBY);
 	default:
-		pr_warn("%s con %p unrecognized state %lu\n",
-			__func__, con, con->state);
-		con->error_msg = "unrecognized con state";
 		BUG();
-		break;
 	}
 #undef CASE
 
@@ -2998,7 +2994,7 @@ static void ceph_con_workfn(struct work_struct *work)
  */
 static void con_fault(struct ceph_connection *con)
 {
-	dout("fault %p state %lu to peer %s\n",
+	dout("fault %p state %d to peer %s\n",
 	     con, con->state, ceph_pr_addr(&con->peer_addr));
 
 	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),

From 6d7f62bfb5b5da6b0b37174c1fd32545f3b5b90d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 9 Nov 2020 14:59:02 +0100
Subject: [PATCH 207/230] libceph: rename and export con->state states

In preparation for msgr2, rename msgr1 specific states and move the
defines to the header file.

Also drop state transition comments.  They don't cover all possible
transitions (e.g. NEGOTIATING -> STANDBY, etc) and currently do more
harm than good.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 12 ++++-
 net/ceph/messenger.c           | 90 +++++++++++++++-------------------
 2 files changed, 50 insertions(+), 52 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index f053de4f46dd..e6be85b67c6c 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -239,6 +239,16 @@ struct ceph_msg {
 	struct ceph_msgpool *pool;
 };
 
+/*
+ * connection states
+ */
+#define CEPH_CON_S_CLOSED		1
+#define CEPH_CON_S_PREOPEN		2
+#define CEPH_CON_S_V1_BANNER		3
+#define CEPH_CON_S_V1_CONNECT_MSG	4
+#define CEPH_CON_S_OPEN			5
+#define CEPH_CON_S_STANDBY		6
+
 /* ceph connection fault delay defaults, for exponential backoff */
 #define BASE_DELAY_INTERVAL	(HZ / 4)
 #define MAX_DELAY_INTERVAL	(15 * HZ)
@@ -257,7 +267,7 @@ struct ceph_connection {
 
 	struct ceph_messenger *msgr;
 
-	int state;
+	int state;  /* CEPH_CON_S_* */
 	atomic_t sock_state;
 	struct socket *sock;
 	struct ceph_entity_addr peer_addr; /* peer address */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9c92f101aa88..adeb69ba6747 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -82,16 +82,6 @@
 #define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
 #define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
 
-/*
- * connection states
- */
-#define CON_STATE_CLOSED        1  /* -> PREOPEN */
-#define CON_STATE_PREOPEN       2  /* -> CONNECTING, CLOSED */
-#define CON_STATE_CONNECTING    3  /* -> NEGOTIATING, CLOSED */
-#define CON_STATE_NEGOTIATING   4  /* -> OPEN, CLOSED */
-#define CON_STATE_OPEN          5  /* -> STANDBY, CLOSED */
-#define CON_STATE_STANDBY       6  /* -> PREOPEN, CLOSED */
-
 /*
  * ceph_connection flag bits
  */
@@ -674,7 +664,7 @@ void ceph_con_close(struct ceph_connection *con)
 {
 	mutex_lock(&con->mutex);
 	dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
-	con->state = CON_STATE_CLOSED;
+	con->state = CEPH_CON_S_CLOSED;
 
 	con_flag_clear(con, CON_FLAG_LOSSYTX);	/* so we retry next connect */
 	con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
@@ -698,8 +688,8 @@ void ceph_con_open(struct ceph_connection *con,
 	mutex_lock(&con->mutex);
 	dout("con_open %p %s\n", con, ceph_pr_addr(addr));
 
-	WARN_ON(con->state != CON_STATE_CLOSED);
-	con->state = CON_STATE_PREOPEN;
+	WARN_ON(con->state != CEPH_CON_S_CLOSED);
+	con->state = CEPH_CON_S_PREOPEN;
 
 	con->peer_name.type = (__u8) entity_type;
 	con->peer_name.num = cpu_to_le64(entity_num);
@@ -739,7 +729,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
 
-	con->state = CON_STATE_CLOSED;
+	con->state = CEPH_CON_S_CLOSED;
 }
 EXPORT_SYMBOL(ceph_con_init);
 
@@ -2183,7 +2173,7 @@ static int process_connect(struct ceph_connection *con)
 		if (con->ops->peer_reset)
 			con->ops->peer_reset(con);
 		mutex_lock(&con->mutex);
-		if (con->state != CON_STATE_NEGOTIATING)
+		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
 			return -EAGAIN;
 		break;
 
@@ -2232,8 +2222,8 @@ static int process_connect(struct ceph_connection *con)
 			return -1;
 		}
 
-		WARN_ON(con->state != CON_STATE_NEGOTIATING);
-		con->state = CON_STATE_OPEN;
+		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+		con->state = CEPH_CON_S_OPEN;
 		con->auth_retry = 0;    /* we authenticated; clear flag */
 		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
 		con->connect_seq++;
@@ -2583,16 +2573,16 @@ static int try_write(struct ceph_connection *con)
 	int ret = 1;
 
 	dout("try_write start %p state %d\n", con, con->state);
-	if (con->state != CON_STATE_PREOPEN &&
-	    con->state != CON_STATE_CONNECTING &&
-	    con->state != CON_STATE_NEGOTIATING &&
-	    con->state != CON_STATE_OPEN)
+	if (con->state != CEPH_CON_S_PREOPEN &&
+	    con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
 		return 0;
 
 	/* open the socket first? */
-	if (con->state == CON_STATE_PREOPEN) {
+	if (con->state == CEPH_CON_S_PREOPEN) {
 		BUG_ON(con->sock);
-		con->state = CON_STATE_CONNECTING;
+		con->state = CEPH_CON_S_V1_BANNER;
 
 		con_out_kvec_reset(con);
 		prepare_write_banner(con);
@@ -2646,7 +2636,7 @@ more:
 	}
 
 do_next:
-	if (con->state == CON_STATE_OPEN) {
+	if (con->state == CEPH_CON_S_OPEN) {
 		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
 			prepare_write_keepalive(con);
 			goto more;
@@ -2680,9 +2670,9 @@ static int try_read(struct ceph_connection *con)
 
 more:
 	dout("try_read start %p state %d\n", con, con->state);
-	if (con->state != CON_STATE_CONNECTING &&
-	    con->state != CON_STATE_NEGOTIATING &&
-	    con->state != CON_STATE_OPEN)
+	if (con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
 		return 0;
 
 	BUG_ON(!con->sock);
@@ -2690,8 +2680,7 @@ more:
 	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
 	     con->in_base_pos);
 
-	if (con->state == CON_STATE_CONNECTING) {
-		dout("try_read connecting\n");
+	if (con->state == CEPH_CON_S_V1_BANNER) {
 		ret = read_partial_banner(con);
 		if (ret <= 0)
 			goto out;
@@ -2699,7 +2688,7 @@ more:
 		if (ret < 0)
 			goto out;
 
-		con->state = CON_STATE_NEGOTIATING;
+		con->state = CEPH_CON_S_V1_CONNECT_MSG;
 
 		/*
 		 * Received banner is good, exchange connection info.
@@ -2715,8 +2704,7 @@ more:
 		goto out;
 	}
 
-	if (con->state == CON_STATE_NEGOTIATING) {
-		dout("try_read negotiating\n");
+	if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
 		ret = read_partial_connect(con);
 		if (ret <= 0)
 			goto out;
@@ -2726,7 +2714,7 @@ more:
 		goto more;
 	}
 
-	WARN_ON(con->state != CON_STATE_OPEN);
+	WARN_ON(con->state != CEPH_CON_S_OPEN);
 
 	if (con->in_base_pos < 0) {
 		/*
@@ -2760,7 +2748,7 @@ more:
 			break;
 		case CEPH_MSGR_TAG_CLOSE:
 			con_close_socket(con);
-			con->state = CON_STATE_CLOSED;
+			con->state = CEPH_CON_S_CLOSED;
 			goto out;
 		default:
 			goto bad_tag;
@@ -2785,7 +2773,7 @@ more:
 		if (con->in_tag == CEPH_MSGR_TAG_READY)
 			goto more;
 		process_message(con);
-		if (con->state == CON_STATE_OPEN)
+		if (con->state == CEPH_CON_S_OPEN)
 			prepare_read_tag(con);
 		goto more;
 	}
@@ -2864,15 +2852,15 @@ static bool con_sock_closed(struct ceph_connection *con)
 		return false;
 
 #define CASE(x)								\
-	case CON_STATE_ ## x:						\
+	case CEPH_CON_S_ ## x:						\
 		con->error_msg = "socket closed (con state " #x ")";	\
 		break;
 
 	switch (con->state) {
 	CASE(CLOSED);
 	CASE(PREOPEN);
-	CASE(CONNECTING);
-	CASE(NEGOTIATING);
+	CASE(V1_BANNER);
+	CASE(V1_CONNECT_MSG);
 	CASE(OPEN);
 	CASE(STANDBY);
 	default:
@@ -2943,16 +2931,16 @@ static void ceph_con_workfn(struct work_struct *work)
 			dout("%s: con %p BACKOFF\n", __func__, con);
 			break;
 		}
-		if (con->state == CON_STATE_STANDBY) {
+		if (con->state == CEPH_CON_S_STANDBY) {
 			dout("%s: con %p STANDBY\n", __func__, con);
 			break;
 		}
-		if (con->state == CON_STATE_CLOSED) {
+		if (con->state == CEPH_CON_S_CLOSED) {
 			dout("%s: con %p CLOSED\n", __func__, con);
 			BUG_ON(con->sock);
 			break;
 		}
-		if (con->state == CON_STATE_PREOPEN) {
+		if (con->state == CEPH_CON_S_PREOPEN) {
 			dout("%s: con %p PREOPEN\n", __func__, con);
 			BUG_ON(con->sock);
 		}
@@ -3001,15 +2989,15 @@ static void con_fault(struct ceph_connection *con)
 		ceph_pr_addr(&con->peer_addr), con->error_msg);
 	con->error_msg = NULL;
 
-	WARN_ON(con->state != CON_STATE_CONNECTING &&
-	       con->state != CON_STATE_NEGOTIATING &&
-	       con->state != CON_STATE_OPEN);
+	WARN_ON(con->state != CEPH_CON_S_V1_BANNER &&
+	       con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	       con->state != CEPH_CON_S_OPEN);
 
 	ceph_con_reset_protocol(con);
 
 	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
-		con->state = CON_STATE_CLOSED;
+		con->state = CEPH_CON_S_CLOSED;
 		return;
 	}
 
@@ -3022,10 +3010,10 @@ static void con_fault(struct ceph_connection *con)
 	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
 		con_flag_clear(con, CON_FLAG_WRITE_PENDING);
-		con->state = CON_STATE_STANDBY;
+		con->state = CEPH_CON_S_STANDBY;
 	} else {
 		/* retry after a delay. */
-		con->state = CON_STATE_PREOPEN;
+		con->state = CEPH_CON_S_PREOPEN;
 		if (!con->delay) {
 			con->delay = BASE_DELAY_INTERVAL;
 		} else if (con->delay < MAX_DELAY_INTERVAL) {
@@ -3092,9 +3080,9 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
 static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
-	if (con->state == CON_STATE_STANDBY) {
+	if (con->state == CEPH_CON_S_STANDBY) {
 		dout("clear_standby %p and ++connect_seq\n", con);
-		con->state = CON_STATE_PREOPEN;
+		con->state = CEPH_CON_S_PREOPEN;
 		con->connect_seq++;
 		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
 		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
@@ -3115,7 +3103,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	mutex_lock(&con->mutex);
 
-	if (con->state == CON_STATE_CLOSED) {
+	if (con->state == CEPH_CON_S_CLOSED) {
 		dout("con_send %p closed, dropping %p\n", con, msg);
 		ceph_msg_put(msg);
 		mutex_unlock(&con->mutex);
@@ -3456,7 +3444,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con,
 	mutex_unlock(&con->mutex);
 	msg = con->ops->alloc_msg(con, hdr, skip);
 	mutex_lock(&con->mutex);
-	if (con->state != CON_STATE_OPEN) {
+	if (con->state != CEPH_CON_S_OPEN) {
 		if (msg)
 			ceph_msg_put(msg);
 		return -EAGAIN;

From 3fefd43e741a5b8d55aeb9115ff488ad2cad439b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 9 Nov 2020 14:56:36 +0100
Subject: [PATCH 208/230] libceph: rename and export con->flags bits

In preparation for msgr2, move the defines to the header file.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h | 13 +++++-
 net/ceph/messenger.c           | 77 +++++++++++++++-------------------
 2 files changed, 46 insertions(+), 44 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index e6be85b67c6c..b6962a0fd76f 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -249,6 +249,17 @@ struct ceph_msg {
 #define CEPH_CON_S_OPEN			5
 #define CEPH_CON_S_STANDBY		6
 
+/*
+ * ceph_connection flag bits
+ */
+#define CEPH_CON_F_LOSSYTX		0  /* we can close channel or drop
+					      messages on errors */
+#define CEPH_CON_F_KEEPALIVE_PENDING	1  /* we need to send a keepalive */
+#define CEPH_CON_F_WRITE_PENDING	2  /* we have data ready to send */
+#define CEPH_CON_F_SOCK_CLOSED		3  /* socket state changed to closed */
+#define CEPH_CON_F_BACKOFF		4  /* need to retry queuing delayed
+					      work */
+
 /* ceph connection fault delay defaults, for exponential backoff */
 #define BASE_DELAY_INTERVAL	(HZ / 4)
 #define MAX_DELAY_INTERVAL	(15 * HZ)
@@ -273,7 +284,7 @@ struct ceph_connection {
 	struct ceph_entity_addr peer_addr; /* peer address */
 	struct ceph_entity_addr peer_addr_for_me;
 
-	unsigned long flags;
+	unsigned long flags;  /* CEPH_CON_F_* */
 	const char *error_msg;  /* error message, if any */
 
 	struct ceph_entity_name peer_name; /* peer name */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index adeb69ba6747..ee87dc3af959 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -82,24 +82,14 @@
 #define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
 #define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
 
-/*
- * ceph_connection flag bits
- */
-#define CON_FLAG_LOSSYTX           0  /* we can close channel or drop
-				       * messages on errors */
-#define CON_FLAG_KEEPALIVE_PENDING 1  /* we need to send a keepalive */
-#define CON_FLAG_WRITE_PENDING	   2  /* we have data ready to send */
-#define CON_FLAG_SOCK_CLOSED	   3  /* socket state changed to closed */
-#define CON_FLAG_BACKOFF           4  /* need to retry queuing delayed work */
-
 static bool con_flag_valid(unsigned long con_flag)
 {
 	switch (con_flag) {
-	case CON_FLAG_LOSSYTX:
-	case CON_FLAG_KEEPALIVE_PENDING:
-	case CON_FLAG_WRITE_PENDING:
-	case CON_FLAG_SOCK_CLOSED:
-	case CON_FLAG_BACKOFF:
+	case CEPH_CON_F_LOSSYTX:
+	case CEPH_CON_F_KEEPALIVE_PENDING:
+	case CEPH_CON_F_WRITE_PENDING:
+	case CEPH_CON_F_SOCK_CLOSED:
+	case CEPH_CON_F_BACKOFF:
 		return true;
 	default:
 		return false;
@@ -380,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
+	if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
 		if (sk_stream_is_writeable(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -406,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		con_flag_set(con, CON_FLAG_SOCK_CLOSED);
+		con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
 		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
@@ -597,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con)
 	 * received a socket close event before we had the chance to
 	 * shut the socket down.
 	 */
-	con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
+	con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
 
 	con_sock_state_closed(con);
 	return rc;
@@ -666,10 +656,10 @@ void ceph_con_close(struct ceph_connection *con)
 	dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
 	con->state = CEPH_CON_S_CLOSED;
 
-	con_flag_clear(con, CON_FLAG_LOSSYTX);	/* so we retry next connect */
-	con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
-	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
-	con_flag_clear(con, CON_FLAG_BACKOFF);
+	con_flag_clear(con, CEPH_CON_F_LOSSYTX);  /* so we retry next connect */
+	con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+	con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	con_flag_clear(con, CEPH_CON_F_BACKOFF);
 
 	ceph_con_reset_protocol(con);
 	ceph_con_reset_session(con);
@@ -1365,7 +1355,7 @@ static void prepare_write_message(struct ceph_connection *con)
 		prepare_write_message_footer(con);
 	}
 
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1386,7 +1376,7 @@ static void prepare_write_ack(struct ceph_connection *con)
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1404,7 +1394,7 @@ static void prepare_write_seq(struct ceph_connection *con)
 	con_out_kvec_add(con, sizeof (con->out_temp_ack),
 			 &con->out_temp_ack);
 
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1425,7 +1415,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 	} else {
 		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
 	}
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1464,7 +1454,7 @@ static void prepare_write_banner(struct ceph_connection *con)
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 static void __prepare_write_connect(struct ceph_connection *con)
@@ -1475,7 +1465,7 @@ static void __prepare_write_connect(struct ceph_connection *con)
 				 con->auth->authorizer_buf);
 
 	con->out_more = 0;
-	con_flag_set(con, CON_FLAG_WRITE_PENDING);
+	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 static int prepare_write_connect(struct ceph_connection *con)
@@ -2236,7 +2226,7 @@ static int process_connect(struct ceph_connection *con)
 			le32_to_cpu(con->in_reply.connect_seq));
 
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			con_flag_set(con, CON_FLAG_LOSSYTX);
+			con_flag_set(con, CEPH_CON_F_LOSSYTX);
 
 		con->delay = 0;      /* reset backoff memory */
 
@@ -2637,7 +2627,8 @@ more:
 
 do_next:
 	if (con->state == CEPH_CON_S_OPEN) {
-		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+		if (con_flag_test_and_clear(con,
+				CEPH_CON_F_KEEPALIVE_PENDING)) {
 			prepare_write_keepalive(con);
 			goto more;
 		}
@@ -2653,7 +2644,7 @@ do_next:
 	}
 
 	/* Nothing to do! */
-	con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+	con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
 	dout("try_write nothing else to write.\n");
 	ret = 0;
 out:
@@ -2848,7 +2839,7 @@ static void cancel_con(struct ceph_connection *con)
 
 static bool con_sock_closed(struct ceph_connection *con)
 {
-	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+	if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
 		return false;
 
 #define CASE(x)								\
@@ -2875,7 +2866,7 @@ static bool con_backoff(struct ceph_connection *con)
 {
 	int ret;
 
-	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+	if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
 		return false;
 
 	ret = queue_con_delay(con, con->delay);
@@ -2883,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con)
 		dout("%s: con %p FAILED to back off %lu\n", __func__,
 			con, con->delay);
 		BUG_ON(ret == -ENOENT);
-		con_flag_set(con, CON_FLAG_BACKOFF);
+		con_flag_set(con, CEPH_CON_F_BACKOFF);
 	}
 
 	return true;
@@ -2995,7 +2986,7 @@ static void con_fault(struct ceph_connection *con)
 
 	ceph_con_reset_protocol(con);
 
-	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+	if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
 		con->state = CEPH_CON_S_CLOSED;
 		return;
@@ -3007,9 +2998,9 @@ static void con_fault(struct ceph_connection *con)
 	/* If there are no messages queued or keepalive pending, place
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
-	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+	    !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
-		con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+		con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
 		con->state = CEPH_CON_S_STANDBY;
 	} else {
 		/* retry after a delay. */
@@ -3021,7 +3012,7 @@ static void con_fault(struct ceph_connection *con)
 			if (con->delay > MAX_DELAY_INTERVAL)
 				con->delay = MAX_DELAY_INTERVAL;
 		}
-		con_flag_set(con, CON_FLAG_BACKOFF);
+		con_flag_set(con, CEPH_CON_F_BACKOFF);
 		queue_con(con);
 	}
 }
@@ -3084,8 +3075,8 @@ static void clear_standby(struct ceph_connection *con)
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->state = CEPH_CON_S_PREOPEN;
 		con->connect_seq++;
-		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
-		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+		WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+		WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
 	}
 }
 
@@ -3126,7 +3117,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
-	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+	if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_send);
@@ -3222,10 +3213,10 @@ void ceph_con_keepalive(struct ceph_connection *con)
 	dout("con_keepalive %p\n", con);
 	mutex_lock(&con->mutex);
 	clear_standby(con);
-	con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING);
+	con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
 	mutex_unlock(&con->mutex);
 
-	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+	if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0)
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_keepalive);

From 699921d9e68ff3d9f8645488c12f4689c6533d70 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 9 Nov 2020 14:37:06 +0100
Subject: [PATCH 209/230] libceph: export zero_page

In preparation for msgr2, make zero_page global.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  1 +
 net/ceph/messenger.c           | 17 +++++++++--------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index b6962a0fd76f..513ed5f90bff 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -343,6 +343,7 @@ struct ceph_connection {
 	unsigned long       delay;          /* current delay interval */
 };
 
+extern struct page *ceph_zero_page;
 
 extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index ee87dc3af959..d3880fbe8424 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -164,7 +164,7 @@ static void con_fault(struct ceph_connection *con);
 static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
 static atomic_t addr_str_seq = ATOMIC_INIT(0);
 
-static struct page *zero_page;		/* used in certain error cases */
+struct page *ceph_zero_page;		/* used in certain error cases */
 
 const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 {
@@ -234,9 +234,9 @@ static void _ceph_msgr_exit(void)
 		ceph_msgr_wq = NULL;
 	}
 
-	BUG_ON(zero_page == NULL);
-	put_page(zero_page);
-	zero_page = NULL;
+	BUG_ON(!ceph_zero_page);
+	put_page(ceph_zero_page);
+	ceph_zero_page = NULL;
 
 	ceph_msgr_slab_exit();
 }
@@ -246,9 +246,9 @@ int __init ceph_msgr_init(void)
 	if (ceph_msgr_slab_init())
 		return -ENOMEM;
 
-	BUG_ON(zero_page != NULL);
-	zero_page = ZERO_PAGE(0);
-	get_page(zero_page);
+	BUG_ON(ceph_zero_page);
+	ceph_zero_page = ZERO_PAGE(0);
+	get_page(ceph_zero_page);
 
 	/*
 	 * The number of active work items is limited by the number of
@@ -1645,7 +1645,8 @@ static int write_partial_skip(struct ceph_connection *con)
 
 		if (size == con->out_skip)
 			more = MSG_MORE;
-		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more);
+		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
+					more);
 		if (ret <= 0)
 			goto out;
 		con->out_skip -= ret;

From 6503e0b69c9d4d78b5450db01e79328f8ed4ef21 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 9 Nov 2020 16:29:47 +0100
Subject: [PATCH 210/230] libceph: export remaining protocol independent
 infrastructure

In preparation for msgr2, make all protocol independent functions
in messenger.c global.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  39 +++++++-
 net/ceph/messenger.c           | 157 ++++++++++++++++-----------------
 2 files changed, 113 insertions(+), 83 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 513ed5f90bff..93815f1a42b5 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -345,13 +345,50 @@ struct ceph_connection {
 
 extern struct page *ceph_zero_page;
 
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag);
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag);
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+				  unsigned long con_flag);
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+				unsigned long con_flag);
+
+void ceph_encode_my_addr(struct ceph_messenger *msgr);
+
+int ceph_tcp_connect(struct ceph_connection *con);
+int ceph_con_close_socket(struct ceph_connection *con);
+void ceph_con_reset_session(struct ceph_connection *con);
+
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt);
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq);
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq);
+
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+			       struct ceph_msg *msg, size_t length);
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+				size_t *page_offset, size_t *length,
+				bool *last_piece);
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes);
+
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+		     unsigned int length);
+
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr);
+int ceph_addr_port(const struct ceph_entity_addr *addr);
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p);
+
+void ceph_con_process_message(struct ceph_connection *con);
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+			  struct ceph_msg_header *hdr, int *skip);
+void ceph_con_get_out_msg(struct ceph_connection *con);
+
+
 extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
 
 extern int ceph_parse_ips(const char *c, const char *end,
 			  struct ceph_entity_addr *addr,
 			  int max_count, int *count);
 
-
 extern int ceph_msgr_init(void);
 extern void ceph_msgr_exit(void);
 extern void ceph_msgr_flush(void);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d3880fbe8424..85d20372f923 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -96,37 +96,37 @@ static bool con_flag_valid(unsigned long con_flag)
 	}
 }
 
-static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
 {
 	BUG_ON(!con_flag_valid(con_flag));
 
 	clear_bit(con_flag, &con->flags);
 }
 
-static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
 {
 	BUG_ON(!con_flag_valid(con_flag));
 
 	set_bit(con_flag, &con->flags);
 }
 
-static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
 {
 	BUG_ON(!con_flag_valid(con_flag));
 
 	return test_bit(con_flag, &con->flags);
 }
 
-static bool con_flag_test_and_clear(struct ceph_connection *con,
-					unsigned long con_flag)
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+				  unsigned long con_flag)
 {
 	BUG_ON(!con_flag_valid(con_flag));
 
 	return test_and_clear_bit(con_flag, &con->flags);
 }
 
-static bool con_flag_test_and_set(struct ceph_connection *con,
-					unsigned long con_flag)
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+				unsigned long con_flag)
 {
 	BUG_ON(!con_flag_valid(con_flag));
 
@@ -199,7 +199,7 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
 }
 EXPORT_SYMBOL(ceph_pr_addr);
 
-static void encode_my_addr(struct ceph_messenger *msgr)
+void ceph_encode_my_addr(struct ceph_messenger *msgr)
 {
 	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
 	ceph_encode_banner_addr(&msgr->my_enc_addr);
@@ -370,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk)
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
+	if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
 		if (sk_stream_is_writeable(sk)) {
 			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -396,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk)
 	case TCP_CLOSE_WAIT:
 		dout("%s TCP_CLOSE_WAIT\n", __func__);
 		con_sock_state_closing(con);
-		con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
+		ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
 		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
@@ -430,13 +430,15 @@ static void set_sock_callbacks(struct socket *sock,
 /*
  * initiate connection to a remote socket.
  */
-static int ceph_tcp_connect(struct ceph_connection *con)
+int ceph_tcp_connect(struct ceph_connection *con)
 {
 	struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
 	struct socket *sock;
 	unsigned int noio_flag;
 	int ret;
 
+	dout("%s con %p peer_addr %s\n", __func__, con,
+	     ceph_pr_addr(&con->peer_addr));
 	BUG_ON(con->sock);
 
 	/* sock_create_kern() allocates with GFP_KERNEL */
@@ -454,8 +456,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 
 	set_sock_callbacks(sock, con);
 
-	dout("connect %s\n", ceph_pr_addr(&con->peer_addr));
-
 	con_sock_state_connecting(con);
 	ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss),
 				 O_NONBLOCK);
@@ -570,11 +570,11 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
 /*
  * Shutdown/close the socket for the given connection.
  */
-static int con_close_socket(struct ceph_connection *con)
+int ceph_con_close_socket(struct ceph_connection *con)
 {
 	int rc = 0;
 
-	dout("con_close_socket on %p sock %p\n", con, con->sock);
+	dout("%s con %p sock %p\n", __func__, con, con->sock);
 	if (con->sock) {
 		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
 		sock_release(con->sock);
@@ -587,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con)
 	 * received a socket close event before we had the chance to
 	 * shut the socket down.
 	 */
-	con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
+	ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
 
 	con_sock_state_closed(con);
 	return rc;
@@ -597,7 +597,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
 
-	con_close_socket(con);
+	ceph_con_close_socket(con);
 	if (con->in_msg) {
 		WARN_ON(con->in_msg->con != con);
 		ceph_msg_put(con->in_msg);
@@ -631,7 +631,7 @@ static void ceph_msg_remove_list(struct list_head *head)
 	}
 }
 
-static void ceph_con_reset_session(struct ceph_connection *con)
+void ceph_con_reset_session(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
 
@@ -656,10 +656,11 @@ void ceph_con_close(struct ceph_connection *con)
 	dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
 	con->state = CEPH_CON_S_CLOSED;
 
-	con_flag_clear(con, CEPH_CON_F_LOSSYTX);  /* so we retry next connect */
-	con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
-	con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
-	con_flag_clear(con, CEPH_CON_F_BACKOFF);
+	ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX);  /* so we retry next
+							  connect */
+	ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
 
 	ceph_con_reset_protocol(con);
 	ceph_con_reset_session(con);
@@ -728,7 +729,7 @@ EXPORT_SYMBOL(ceph_con_init);
  * We maintain a global counter to order connection attempts.  Get
  * a unique seq greater than @gt.
  */
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
 {
 	u32 ret;
 
@@ -743,7 +744,7 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 /*
  * Discard messages that have been acked by the server.
  */
-static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
 {
 	struct ceph_msg *msg;
 	u64 seq;
@@ -768,8 +769,7 @@ static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
  * reconnect_seq.  This avoids gratuitously resending messages that
  * the server had received and handled prior to reconnect.
  */
-static void ceph_con_discard_requeued(struct ceph_connection *con,
-				      u64 reconnect_seq)
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
 {
 	struct ceph_msg *msg;
 	u64 seq;
@@ -1150,8 +1150,8 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
 	cursor->need_crc = true;
 }
 
-static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
-				      struct ceph_msg *msg, size_t length)
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+			       struct ceph_msg *msg, size_t length)
 {
 	BUG_ON(!length);
 	BUG_ON(length > msg->data_length);
@@ -1168,9 +1168,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
  * data item, and supply the page offset and length of that piece.
  * Indicate whether this is the last piece in this data item.
  */
-static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
-					size_t *page_offset, size_t *length,
-					bool *last_piece)
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+				size_t *page_offset, size_t *length,
+				bool *last_piece)
 {
 	struct page *page;
 
@@ -1209,8 +1209,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
  * Returns true if the result moves the cursor on to the next piece
  * of the data item.
  */
-static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
-				  size_t bytes)
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
 {
 	bool new_piece;
 
@@ -1284,8 +1283,6 @@ static void prepare_write_message_footer(struct ceph_connection *con)
 	con->out_msg_done = true;
 }
 
-static void ceph_con_get_out_msg(struct ceph_connection *con);
-
 /*
  * Prepare headers for the next outgoing message.
  */
@@ -1355,7 +1352,7 @@ static void prepare_write_message(struct ceph_connection *con)
 		prepare_write_message_footer(con);
 	}
 
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1376,7 +1373,7 @@ static void prepare_write_ack(struct ceph_connection *con)
 				&con->out_temp_ack);
 
 	con->out_more = 1;  /* more will follow.. eventually.. */
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1394,7 +1391,7 @@ static void prepare_write_seq(struct ceph_connection *con)
 	con_out_kvec_add(con, sizeof (con->out_temp_ack),
 			 &con->out_temp_ack);
 
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1415,7 +1412,7 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 	} else {
 		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
 	}
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 /*
@@ -1454,7 +1451,7 @@ static void prepare_write_banner(struct ceph_connection *con)
 					&con->msgr->my_enc_addr);
 
 	con->out_more = 0;
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 static void __prepare_write_connect(struct ceph_connection *con)
@@ -1465,12 +1462,12 @@ static void __prepare_write_connect(struct ceph_connection *con)
 				 con->auth->authorizer_buf);
 
 	con->out_more = 0;
-	con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 static int prepare_write_connect(struct ceph_connection *con)
 {
-	unsigned int global_seq = get_global_seq(con->msgr, 0);
+	unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
 	int proto;
 	int ret;
 
@@ -1549,9 +1546,8 @@ out:
 	return ret;  /* done! */
 }
 
-static u32 ceph_crc32c_page(u32 crc, struct page *page,
-				unsigned int page_offset,
-				unsigned int length)
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+		     unsigned int length)
 {
 	char *kaddr;
 
@@ -1813,7 +1809,7 @@ static int verify_hello(struct ceph_connection *con)
 	return 0;
 }
 
-static bool addr_is_blank(struct ceph_entity_addr *addr)
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
 {
 	struct sockaddr_storage ss = addr->in_addr; /* align */
 	struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
@@ -1829,7 +1825,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr)
 	}
 }
 
-static int addr_port(struct ceph_entity_addr *addr)
+int ceph_addr_port(const struct ceph_entity_addr *addr)
 {
 	switch (get_unaligned(&addr->in_addr.ss_family)) {
 	case AF_INET:
@@ -1840,7 +1836,7 @@ static int addr_port(struct ceph_entity_addr *addr)
 	return 0;
 }
 
-static void addr_set_port(struct ceph_entity_addr *addr, int p)
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
 {
 	switch (get_unaligned(&addr->in_addr.ss_family)) {
 	case AF_INET:
@@ -1998,7 +1994,7 @@ int ceph_parse_ips(const char *c, const char *end,
 			port = CEPH_MON_PORT;
 		}
 
-		addr_set_port(&addr[i], port);
+		ceph_addr_set_port(&addr[i], port);
 		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
 
 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
@@ -2037,7 +2033,7 @@ static int process_banner(struct ceph_connection *con)
 	 */
 	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
 		   sizeof(con->peer_addr)) != 0 &&
-	    !(addr_is_blank(&con->actual_peer_addr) &&
+	    !(ceph_addr_is_blank(&con->actual_peer_addr) &&
 	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
 		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
 			ceph_pr_addr(&con->peer_addr),
@@ -2051,12 +2047,12 @@ static int process_banner(struct ceph_connection *con)
 	/*
 	 * did we learn our address?
 	 */
-	if (addr_is_blank(my_addr)) {
+	if (ceph_addr_is_blank(my_addr)) {
 		memcpy(&my_addr->in_addr,
 		       &con->peer_addr_for_me.in_addr,
 		       sizeof(con->peer_addr_for_me.in_addr));
-		addr_set_port(my_addr, 0);
-		encode_my_addr(con->msgr);
+		ceph_addr_set_port(my_addr, 0);
+		ceph_encode_my_addr(con->msgr);
 		dout("process_banner learned my addr is %s\n",
 		     ceph_pr_addr(my_addr));
 	}
@@ -2192,8 +2188,8 @@ static int process_connect(struct ceph_connection *con)
 		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
 		     con->peer_global_seq,
 		     le32_to_cpu(con->in_reply.global_seq));
-		get_global_seq(con->msgr,
-			       le32_to_cpu(con->in_reply.global_seq));
+		ceph_get_global_seq(con->msgr,
+				    le32_to_cpu(con->in_reply.global_seq));
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
@@ -2227,7 +2223,7 @@ static int process_connect(struct ceph_connection *con)
 			le32_to_cpu(con->in_reply.connect_seq));
 
 		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			con_flag_set(con, CEPH_CON_F_LOSSYTX);
+			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
 
 		con->delay = 0;      /* reset backoff memory */
 
@@ -2351,9 +2347,6 @@ static int read_partial_msg_data(struct ceph_connection *con)
 /*
  * read (part of) a message.
  */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con,
-				 struct ceph_msg_header *hdr, int *skip);
-
 static int read_partial_message(struct ceph_connection *con)
 {
 	struct ceph_msg *m = con->in_msg;
@@ -2515,7 +2508,7 @@ static int read_partial_message(struct ceph_connection *con)
  * be careful not to do anything that waits on other incoming messages or it
  * may deadlock.
  */
-static void process_message(struct ceph_connection *con)
+void ceph_con_process_message(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->in_msg;
 
@@ -2628,7 +2621,7 @@ more:
 
 do_next:
 	if (con->state == CEPH_CON_S_OPEN) {
-		if (con_flag_test_and_clear(con,
+		if (ceph_con_flag_test_and_clear(con,
 				CEPH_CON_F_KEEPALIVE_PENDING)) {
 			prepare_write_keepalive(con);
 			goto more;
@@ -2645,7 +2638,7 @@ do_next:
 	}
 
 	/* Nothing to do! */
-	con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
 	dout("try_write nothing else to write.\n");
 	ret = 0;
 out:
@@ -2739,7 +2732,7 @@ more:
 			prepare_read_keepalive_ack(con);
 			break;
 		case CEPH_MSGR_TAG_CLOSE:
-			con_close_socket(con);
+			ceph_con_close_socket(con);
 			con->state = CEPH_CON_S_CLOSED;
 			goto out;
 		default:
@@ -2764,7 +2757,7 @@ more:
 		}
 		if (con->in_tag == CEPH_MSGR_TAG_READY)
 			goto more;
-		process_message(con);
+		ceph_con_process_message(con);
 		if (con->state == CEPH_CON_S_OPEN)
 			prepare_read_tag(con);
 		goto more;
@@ -2840,7 +2833,7 @@ static void cancel_con(struct ceph_connection *con)
 
 static bool con_sock_closed(struct ceph_connection *con)
 {
-	if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
+	if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
 		return false;
 
 #define CASE(x)								\
@@ -2867,7 +2860,7 @@ static bool con_backoff(struct ceph_connection *con)
 {
 	int ret;
 
-	if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
+	if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
 		return false;
 
 	ret = queue_con_delay(con, con->delay);
@@ -2875,7 +2868,7 @@ static bool con_backoff(struct ceph_connection *con)
 		dout("%s: con %p FAILED to back off %lu\n", __func__,
 			con, con->delay);
 		BUG_ON(ret == -ENOENT);
-		con_flag_set(con, CEPH_CON_F_BACKOFF);
+		ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
 	}
 
 	return true;
@@ -2987,7 +2980,7 @@ static void con_fault(struct ceph_connection *con)
 
 	ceph_con_reset_protocol(con);
 
-	if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
+	if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
 		dout("fault on LOSSYTX channel, marking CLOSED\n");
 		con->state = CEPH_CON_S_CLOSED;
 		return;
@@ -2999,9 +2992,9 @@ static void con_fault(struct ceph_connection *con)
 	/* If there are no messages queued or keepalive pending, place
 	 * the connection in a STANDBY state */
 	if (list_empty(&con->out_queue) &&
-	    !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+	    !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
 		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
-		con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+		ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
 		con->state = CEPH_CON_S_STANDBY;
 	} else {
 		/* retry after a delay. */
@@ -3013,7 +3006,7 @@ static void con_fault(struct ceph_connection *con)
 			if (con->delay > MAX_DELAY_INTERVAL)
 				con->delay = MAX_DELAY_INTERVAL;
 		}
-		con_flag_set(con, CEPH_CON_F_BACKOFF);
+		ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
 		queue_con(con);
 	}
 }
@@ -3023,7 +3016,7 @@ void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
 {
 	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
 	msgr->inst.addr.nonce = cpu_to_le32(nonce);
-	encode_my_addr(msgr);
+	ceph_encode_my_addr(msgr);
 }
 
 /*
@@ -3037,7 +3030,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 	if (myaddr) {
 		memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
 		       sizeof(msgr->inst.addr.in_addr));
-		addr_set_port(&msgr->inst.addr, 0);
+		ceph_addr_set_port(&msgr->inst.addr, 0);
 	}
 
 	msgr->inst.addr.type = 0;
@@ -3047,7 +3040,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 		get_random_bytes(&msgr->inst.addr.nonce,
 				 sizeof(msgr->inst.addr.nonce));
 	} while (!msgr->inst.addr.nonce);
-	encode_my_addr(msgr);
+	ceph_encode_my_addr(msgr);
 
 	atomic_set(&msgr->stopping, 0);
 	write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
@@ -3076,8 +3069,8 @@ static void clear_standby(struct ceph_connection *con)
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->state = CEPH_CON_S_PREOPEN;
 		con->connect_seq++;
-		WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
-		WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
+		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
 	}
 }
 
@@ -3118,7 +3111,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 
 	/* if there wasn't anything waiting to send before, queue
 	 * new work */
-	if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0)
+	if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_send);
@@ -3214,10 +3207,10 @@ void ceph_con_keepalive(struct ceph_connection *con)
 	dout("con_keepalive %p\n", con);
 	mutex_lock(&con->mutex);
 	clear_standby(con);
-	con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
+	ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
 	mutex_unlock(&con->mutex);
 
-	if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0)
+	if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
 		queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
@@ -3423,8 +3416,8 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
  * On error (ENOMEM, EAGAIN, ...),
  *  - con->in_msg == NULL
  */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con,
-				 struct ceph_msg_header *hdr, int *skip)
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+			  struct ceph_msg_header *hdr, int *skip)
 {
 	int middle_len = le32_to_cpu(hdr->middle_len);
 	struct ceph_msg *msg;
@@ -3470,7 +3463,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con,
 	return ret;
 }
 
-static void ceph_con_get_out_msg(struct ceph_connection *con)
+void ceph_con_get_out_msg(struct ceph_connection *con)
 {
 	struct ceph_msg *msg;
 

From 566050e17e53db283d4e26b73b4b50556f97ce7b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 12 Nov 2020 12:55:39 +0100
Subject: [PATCH 211/230] libceph: separate msgr1 protocol implementation

In preparation for msgr2, define internal messenger <-> protocol
interface (as opposed to external messenger <-> client interface, which
is struct ceph_connection_operations) consisting of try_read(),
try_write(), revoke(), revoke_incoming(), opened(), reset_session() and
reset_protocol() ops.  The semantics are exactly the same as they are
now.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |   8 ++
 net/ceph/messenger.c           | 138 +++++++++++++++++++++------------
 2 files changed, 96 insertions(+), 50 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 93815f1a42b5..8cc8b08eb3dd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -382,6 +382,14 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con,
 			  struct ceph_msg_header *hdr, int *skip);
 void ceph_con_get_out_msg(struct ceph_connection *con);
 
+int ceph_con_v1_try_read(struct ceph_connection *con);
+int ceph_con_v1_try_write(struct ceph_connection *con);
+void ceph_con_v1_revoke(struct ceph_connection *con);
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v1_opened(struct ceph_connection *con);
+void ceph_con_v1_reset_session(struct ceph_connection *con);
+void ceph_con_v1_reset_protocol(struct ceph_connection *con);
+
 
 extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 85d20372f923..4ca7d9b594c7 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -593,6 +593,11 @@ int ceph_con_close_socket(struct ceph_connection *con)
 	return rc;
 }
 
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+	con->out_skip = 0;
+}
+
 static void ceph_con_reset_protocol(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
@@ -609,7 +614,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
 		con->out_msg = NULL;
 	}
 
-	con->out_skip = 0;
+	ceph_con_v1_reset_protocol(con);
 }
 
 /*
@@ -631,6 +636,12 @@ static void ceph_msg_remove_list(struct list_head *head)
 	}
 }
 
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+	con->connect_seq = 0;
+	con->peer_global_seq = 0;
+}
+
 void ceph_con_reset_session(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
@@ -643,8 +654,7 @@ void ceph_con_reset_session(struct ceph_connection *con)
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 
-	con->connect_seq = 0;
-	con->peer_global_seq = 0;
+	ceph_con_v1_reset_session(con);
 }
 
 /*
@@ -692,12 +702,17 @@ void ceph_con_open(struct ceph_connection *con,
 }
 EXPORT_SYMBOL(ceph_con_open);
 
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+	return con->connect_seq;
+}
+
 /*
  * return true if this connection ever successfully opened
  */
 bool ceph_con_opened(struct ceph_connection *con)
 {
-	return con->connect_seq > 0;
+	return ceph_con_v1_opened(con);
 }
 
 /*
@@ -2552,7 +2567,7 @@ static int read_keepalive_ack(struct ceph_connection *con)
  * Write something to the socket.  Called in a worker thread when the
  * socket appears to be writeable and we have something ready to send.
  */
-static int try_write(struct ceph_connection *con)
+int ceph_con_v1_try_write(struct ceph_connection *con)
 {
 	int ret = 1;
 
@@ -2649,7 +2664,7 @@ out:
 /*
  * Read what we can from the socket.
  */
-static int try_read(struct ceph_connection *con)
+int ceph_con_v1_try_read(struct ceph_connection *con)
 {
 	int ret = -1;
 
@@ -2930,7 +2945,7 @@ static void ceph_con_workfn(struct work_struct *work)
 			BUG_ON(con->sock);
 		}
 
-		ret = try_read(con);
+		ret = ceph_con_v1_try_read(con);
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
@@ -2940,7 +2955,7 @@ static void ceph_con_workfn(struct work_struct *work)
 			break;
 		}
 
-		ret = try_write(con);
+		ret = ceph_con_v1_try_write(con);
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
@@ -3116,6 +3131,29 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 }
 EXPORT_SYMBOL(ceph_con_send);
 
+void ceph_con_v1_revoke(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	WARN_ON(con->out_skip);
+	/* footer */
+	if (con->out_msg_done) {
+		con->out_skip += con_out_kvec_skip(con);
+	} else {
+		WARN_ON(!msg->data_length);
+		con->out_skip += sizeof_footer(con);
+	}
+	/* data, middle, front */
+	if (msg->data_length)
+		con->out_skip += msg->cursor.total_resid;
+	if (msg->middle)
+		con->out_skip += con_out_kvec_skip(con);
+	con->out_skip += con_out_kvec_skip(con);
+
+	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+	     con->out_kvec_bytes, con->out_skip);
+}
+
 /*
  * Revoke a message that was previously queued for send
  */
@@ -3129,39 +3167,50 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 	}
 
 	mutex_lock(&con->mutex);
-	if (!list_empty(&msg->list_head)) {
-		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
-		list_del_init(&msg->list_head);
-		msg->hdr.seq = 0;
-
-		ceph_msg_put(msg);
+	if (list_empty(&msg->list_head)) {
+		WARN_ON(con->out_msg == msg);
+		dout("%s con %p msg %p not linked\n", __func__, con, msg);
+		mutex_unlock(&con->mutex);
+		return;
 	}
+
+	dout("%s con %p msg %p was linked\n", __func__, con, msg);
+	msg->hdr.seq = 0;
+	ceph_msg_remove(msg);
+
 	if (con->out_msg == msg) {
-		BUG_ON(con->out_skip);
-		/* footer */
-		if (con->out_msg_done) {
-			con->out_skip += con_out_kvec_skip(con);
-		} else {
-			BUG_ON(!msg->data_length);
-			con->out_skip += sizeof_footer(con);
-		}
-		/* data, middle, front */
-		if (msg->data_length)
-			con->out_skip += msg->cursor.total_resid;
-		if (msg->middle)
-			con->out_skip += con_out_kvec_skip(con);
-		con->out_skip += con_out_kvec_skip(con);
-
-		dout("%s %p msg %p - was sending, will write %d skip %d\n",
-		     __func__, con, msg, con->out_kvec_bytes, con->out_skip);
-		msg->hdr.seq = 0;
+		WARN_ON(con->state != CEPH_CON_S_OPEN);
+		dout("%s con %p msg %p was sending\n", __func__, con, msg);
+		ceph_con_v1_revoke(con);
+		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
-		ceph_msg_put(msg);
+	} else {
+		dout("%s con %p msg %p not current, out_msg %p\n", __func__,
+		     con, msg, con->out_msg);
 	}
-
 	mutex_unlock(&con->mutex);
 }
 
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+	unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
+	unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+
+	/* skip rest of message */
+	con->in_base_pos = con->in_base_pos -
+			sizeof(struct ceph_msg_header) -
+			front_len -
+			middle_len -
+			data_len -
+			sizeof(struct ceph_msg_footer);
+
+	con->in_tag = CEPH_MSGR_TAG_READY;
+	con->in_seq++;
+
+	dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos);
+}
+
 /*
  * Revoke a message that we may be reading data into
  */
@@ -3176,25 +3225,14 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
 
 	mutex_lock(&con->mutex);
 	if (con->in_msg == msg) {
-		unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
-		unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
-		unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
-
-		/* skip rest of message */
-		dout("%s %p msg %p revoked\n", __func__, con, msg);
-		con->in_base_pos = con->in_base_pos -
-				sizeof(struct ceph_msg_header) -
-				front_len -
-				middle_len -
-				data_len -
-				sizeof(struct ceph_msg_footer);
+		WARN_ON(con->state != CEPH_CON_S_OPEN);
+		dout("%s con %p msg %p was recving\n", __func__, con, msg);
+		ceph_con_v1_revoke_incoming(con);
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		con->in_seq++;
 	} else {
-		dout("%s %p in_msg %p msg %p no-op\n",
-		     __func__, con, con->in_msg, msg);
+		dout("%s con %p msg %p not current, in_msg %p\n", __func__,
+		     con, msg, con->in_msg);
 	}
 	mutex_unlock(&con->mutex);
 }

From 2f713615ddd9d805b6c5e79c52e0e11af99d2bf1 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 12 Nov 2020 15:48:06 +0100
Subject: [PATCH 212/230] libceph: move msgr1 protocol implementation to its
 own file

A pure move, no other changes.

Note that ceph_tcp_recv{msg,page}() and ceph_tcp_send{msg,page}()
helpers are also moved.  msgr2 will bring its own, more efficient,
variants based on iov_iter.  Switching msgr1 to them was considered
but decided against to avoid subtle regressions.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |    1 +
 net/ceph/Makefile              |    3 +-
 net/ceph/messenger.c           | 1495 -------------------------------
 net/ceph/messenger_v1.c        | 1502 ++++++++++++++++++++++++++++++++
 4 files changed, 1505 insertions(+), 1496 deletions(-)
 create mode 100644 net/ceph/messenger_v1.c

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 8cc8b08eb3dd..b5268127c55e 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -382,6 +382,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con,
 			  struct ceph_msg_header *hdr, int *skip);
 void ceph_con_get_out_msg(struct ceph_connection *con);
 
+/* messenger_v1.c */
 int ceph_con_v1_try_read(struct ceph_connection *con);
 int ceph_con_v1_try_write(struct ceph_connection *con);
 void ceph_con_v1_revoke(struct ceph_connection *con);
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index ce09bb4fb249..df02bd8d6c7b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	crypto.o armor.o \
 	auth_x.o \
 	ceph_strings.o ceph_hash.o \
-	pagevec.o snapshot.o string_table.o
+	pagevec.o snapshot.o string_table.o \
+	messenger_v1.o
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4ca7d9b594c7..544cfdbe52d6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -137,12 +137,6 @@ bool ceph_con_flag_test_and_set(struct ceph_connection *con,
 
 static struct kmem_cache	*ceph_msg_cache;
 
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
-
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
 #endif
@@ -477,96 +471,6 @@ int ceph_tcp_connect(struct ceph_connection *con)
 	return 0;
 }
 
-/*
- * If @buf is NULL, discard up to @len bytes.
- */
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
-	struct kvec iov = {buf, len};
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-	int r;
-
-	if (!buf)
-		msg.msg_flags |= MSG_TRUNC;
-
-	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
-	r = sock_recvmsg(sock, &msg, msg.msg_flags);
-	if (r == -EAGAIN)
-		r = 0;
-	return r;
-}
-
-static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
-		     int page_offset, size_t length)
-{
-	struct bio_vec bvec = {
-		.bv_page = page,
-		.bv_offset = page_offset,
-		.bv_len = length
-	};
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-	int r;
-
-	BUG_ON(page_offset + length > PAGE_SIZE);
-	iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
-	r = sock_recvmsg(sock, &msg, msg.msg_flags);
-	if (r == -EAGAIN)
-		r = 0;
-	return r;
-}
-
-/*
- * write something.  @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
-			    size_t kvlen, size_t len, bool more)
-{
-	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
-	int r;
-
-	if (more)
-		msg.msg_flags |= MSG_MORE;
-	else
-		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
-
-	r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
-	if (r == -EAGAIN)
-		r = 0;
-	return r;
-}
-
-/*
- * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
- */
-static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
-			     int offset, size_t size, int more)
-{
-	ssize_t (*sendpage)(struct socket *sock, struct page *page,
-			    int offset, size_t size, int flags);
-	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
-	int ret;
-
-	/*
-	 * sendpage cannot properly handle pages with page_count == 0,
-	 * we need to fall back to sendmsg if that's the case.
-	 *
-	 * Same goes for slab pages: skb_can_coalesce() allows
-	 * coalescing neighboring slab objects into a single frag which
-	 * triggers one of hardened usercopy checks.
-	 */
-	if (sendpage_ok(page))
-		sendpage = sock->ops->sendpage;
-	else
-		sendpage = sock_no_sendpage;
-
-	ret = sendpage(sock, page, offset, size, flags);
-	if (ret == -EAGAIN)
-		ret = 0;
-
-	return ret;
-}
-
 /*
  * Shutdown/close the socket for the given connection.
  */
@@ -593,11 +497,6 @@ int ceph_con_close_socket(struct ceph_connection *con)
 	return rc;
 }
 
-void ceph_con_v1_reset_protocol(struct ceph_connection *con)
-{
-	con->out_skip = 0;
-}
-
 static void ceph_con_reset_protocol(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
@@ -636,12 +535,6 @@ static void ceph_msg_remove_list(struct list_head *head)
 	}
 }
 
-void ceph_con_v1_reset_session(struct ceph_connection *con)
-{
-	con->connect_seq = 0;
-	con->peer_global_seq = 0;
-}
-
 void ceph_con_reset_session(struct ceph_connection *con)
 {
 	dout("%s con %p\n", __func__, con);
@@ -702,11 +595,6 @@ void ceph_con_open(struct ceph_connection *con,
 }
 EXPORT_SYMBOL(ceph_con_open);
 
-bool ceph_con_v1_opened(struct ceph_connection *con)
-{
-	return con->connect_seq;
-}
-
 /*
  * return true if this connection ever successfully opened
  */
@@ -739,7 +627,6 @@ void ceph_con_init(struct ceph_connection *con, void *private,
 }
 EXPORT_SYMBOL(ceph_con_init);
 
-
 /*
  * We maintain a global counter to order connection attempts.  Get
  * a unique seq greater than @gt.
@@ -805,50 +692,6 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
 	}
 }
 
-static void con_out_kvec_reset(struct ceph_connection *con)
-{
-	BUG_ON(con->out_skip);
-
-	con->out_kvec_left = 0;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_cur = &con->out_kvec[0];
-}
-
-static void con_out_kvec_add(struct ceph_connection *con,
-				size_t size, void *data)
-{
-	int index = con->out_kvec_left;
-
-	BUG_ON(con->out_skip);
-	BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
-
-	con->out_kvec[index].iov_len = size;
-	con->out_kvec[index].iov_base = data;
-	con->out_kvec_left++;
-	con->out_kvec_bytes += size;
-}
-
-/*
- * Chop off a kvec from the end.  Return residual number of bytes for
- * that kvec, i.e. how many bytes would have been written if the kvec
- * hadn't been nuked.
- */
-static int con_out_kvec_skip(struct ceph_connection *con)
-{
-	int off = con->out_kvec_cur - con->out_kvec;
-	int skip = 0;
-
-	if (con->out_kvec_bytes > 0) {
-		skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
-		BUG_ON(con->out_kvec_bytes < skip);
-		BUG_ON(!con->out_kvec_left);
-		con->out_kvec_bytes -= skip;
-		con->out_kvec_left--;
-	}
-
-	return skip;
-}
-
 #ifdef CONFIG_BLOCK
 
 /*
@@ -1260,307 +1103,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
 	cursor->need_crc = new_piece;
 }
 
-static size_t sizeof_footer(struct ceph_connection *con)
-{
-	return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
-	    sizeof(struct ceph_msg_footer) :
-	    sizeof(struct ceph_msg_footer_old);
-}
-
-static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
-{
-	/* Initialize data cursor */
-
-	ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
-}
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off.  Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->out_msg;
-
-	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
-
-	dout("prepare_write_message_footer %p\n", con);
-	con_out_kvec_add(con, sizeof_footer(con), &m->footer);
-	if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
-		if (con->ops->sign_message)
-			con->ops->sign_message(m);
-		else
-			m->footer.sig = 0;
-	} else {
-		m->old_footer.flags = m->footer.flags;
-	}
-	con->out_more = m->more_to_follow;
-	con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m;
-	u32 crc;
-
-	con_out_kvec_reset(con);
-	con->out_msg_done = false;
-
-	/* Sneak an ack in there first?  If we can get it into the same
-	 * TCP packet that's a good thing. */
-	if (con->in_seq > con->in_seq_acked) {
-		con->in_seq_acked = con->in_seq;
-		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con_out_kvec_add(con, sizeof (con->out_temp_ack),
-			&con->out_temp_ack);
-	}
-
-	ceph_con_get_out_msg(con);
-	m = con->out_msg;
-
-	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
-	     m, con->out_seq, le16_to_cpu(m->hdr.type),
-	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
-	     m->data_length);
-	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
-	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
-
-	/* tag + hdr + front + middle */
-	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-	con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
-	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
-
-	if (m->middle)
-		con_out_kvec_add(con, m->middle->vec.iov_len,
-			m->middle->vec.iov_base);
-
-	/* fill in hdr crc and finalize hdr */
-	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
-	con->out_msg->hdr.crc = cpu_to_le32(crc);
-	memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
-
-	/* fill in front and middle crc, footer */
-	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
-	con->out_msg->footer.front_crc = cpu_to_le32(crc);
-	if (m->middle) {
-		crc = crc32c(0, m->middle->vec.iov_base,
-				m->middle->vec.iov_len);
-		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
-	} else
-		con->out_msg->footer.middle_crc = 0;
-	dout("%s front_crc %u middle_crc %u\n", __func__,
-	     le32_to_cpu(con->out_msg->footer.front_crc),
-	     le32_to_cpu(con->out_msg->footer.middle_crc));
-	con->out_msg->footer.flags = 0;
-
-	/* is there a data payload? */
-	con->out_msg->footer.data_crc = 0;
-	if (m->data_length) {
-		prepare_message_data(con->out_msg, m->data_length);
-		con->out_more = 1;  /* data + footer will follow */
-	} else {
-		/* no, queue up footer too and be done */
-		prepare_write_message_footer(con);
-	}
-
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
-	dout("prepare_write_ack %p %llu -> %llu\n", con,
-	     con->in_seq_acked, con->in_seq);
-	con->in_seq_acked = con->in_seq;
-
-	con_out_kvec_reset(con);
-
-	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con_out_kvec_add(con, sizeof (con->out_temp_ack),
-				&con->out_temp_ack);
-
-	con->out_more = 1;  /* more will follow.. eventually.. */
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-/*
- * Prepare to share the seq during handshake
- */
-static void prepare_write_seq(struct ceph_connection *con)
-{
-	dout("prepare_write_seq %p %llu -> %llu\n", con,
-	     con->in_seq_acked, con->in_seq);
-	con->in_seq_acked = con->in_seq;
-
-	con_out_kvec_reset(con);
-
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con_out_kvec_add(con, sizeof (con->out_temp_ack),
-			 &con->out_temp_ack);
-
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
-	dout("prepare_write_keepalive %p\n", con);
-	con_out_kvec_reset(con);
-	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
-		struct timespec64 now;
-
-		ktime_get_real_ts64(&now);
-		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
-		ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
-		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
-				 &con->out_temp_keepalive2);
-	} else {
-		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
-	}
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-/*
- * Connection negotiation.
- */
-
-static int get_connect_authorizer(struct ceph_connection *con)
-{
-	struct ceph_auth_handshake *auth;
-	int auth_proto;
-
-	if (!con->ops->get_authorizer) {
-		con->auth = NULL;
-		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
-		con->out_connect.authorizer_len = 0;
-		return 0;
-	}
-
-	auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
-	if (IS_ERR(auth))
-		return PTR_ERR(auth);
-
-	con->auth = auth;
-	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
-	con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
-	return 0;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_connection *con)
-{
-	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
-	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
-					&con->msgr->my_enc_addr);
-
-	con->out_more = 0;
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-static void __prepare_write_connect(struct ceph_connection *con)
-{
-	con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
-	if (con->auth)
-		con_out_kvec_add(con, con->auth->authorizer_buf_len,
-				 con->auth->authorizer_buf);
-
-	con->out_more = 0;
-	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
-}
-
-static int prepare_write_connect(struct ceph_connection *con)
-{
-	unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
-	int proto;
-	int ret;
-
-	switch (con->peer_name.type) {
-	case CEPH_ENTITY_TYPE_MON:
-		proto = CEPH_MONC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_OSD:
-		proto = CEPH_OSDC_PROTOCOL;
-		break;
-	case CEPH_ENTITY_TYPE_MDS:
-		proto = CEPH_MDSC_PROTOCOL;
-		break;
-	default:
-		BUG();
-	}
-
-	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-	     con->connect_seq, global_seq, proto);
-
-	con->out_connect.features =
-	    cpu_to_le64(from_msgr(con->msgr)->supported_features);
-	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq = cpu_to_le32(global_seq);
-	con->out_connect.protocol_version = cpu_to_le32(proto);
-	con->out_connect.flags = 0;
-
-	ret = get_connect_authorizer(con);
-	if (ret)
-		return ret;
-
-	__prepare_write_connect(con);
-	return 0;
-}
-
-/*
- * write as much of pending kvecs to the socket as we can.
- *  1 -> done
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
-	int ret;
-
-	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-	while (con->out_kvec_bytes > 0) {
-		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-				       con->out_kvec_left, con->out_kvec_bytes,
-				       con->out_more);
-		if (ret <= 0)
-			goto out;
-		con->out_kvec_bytes -= ret;
-		if (con->out_kvec_bytes == 0)
-			break;            /* done */
-
-		/* account for full iov entries consumed */
-		while (ret >= con->out_kvec_cur->iov_len) {
-			BUG_ON(!con->out_kvec_left);
-			ret -= con->out_kvec_cur->iov_len;
-			con->out_kvec_cur++;
-			con->out_kvec_left--;
-		}
-		/* and for a partially-consumed entry */
-		if (ret) {
-			con->out_kvec_cur->iov_len -= ret;
-			con->out_kvec_cur->iov_base += ret;
-		}
-	}
-	con->out_kvec_left = 0;
-	ret = 1;
-out:
-	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-	     con->out_kvec_bytes, con->out_kvec_left, ret);
-	return ret;  /* done! */
-}
-
 u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
 		     unsigned int length)
 {
@@ -1573,256 +1115,6 @@ u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
 
 	return crc;
 }
-/*
- * Write as much message data payload as we can.  If we finish, queue
- * up the footer.
- *  1 -> done, footer is now queued in out_kvec[].
- *  0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_message_data(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->out_msg;
-	struct ceph_msg_data_cursor *cursor = &msg->cursor;
-	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
-	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
-	u32 crc;
-
-	dout("%s %p msg %p\n", __func__, con, msg);
-
-	if (!msg->num_data_items)
-		return -EINVAL;
-
-	/*
-	 * Iterate through each page that contains data to be
-	 * written, and send as much as possible for each.
-	 *
-	 * If we are calculating the data crc (the default), we will
-	 * need to map the page.  If we have no pages, they have
-	 * been revoked, so use the zero page.
-	 */
-	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
-	while (cursor->total_resid) {
-		struct page *page;
-		size_t page_offset;
-		size_t length;
-		int ret;
-
-		if (!cursor->resid) {
-			ceph_msg_data_advance(cursor, 0);
-			continue;
-		}
-
-		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
-		if (length == cursor->total_resid)
-			more = MSG_MORE;
-		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
-					more);
-		if (ret <= 0) {
-			if (do_datacrc)
-				msg->footer.data_crc = cpu_to_le32(crc);
-
-			return ret;
-		}
-		if (do_datacrc && cursor->need_crc)
-			crc = ceph_crc32c_page(crc, page, page_offset, length);
-		ceph_msg_data_advance(cursor, (size_t)ret);
-	}
-
-	dout("%s %p msg %p done\n", __func__, con, msg);
-
-	/* prepare and queue up footer, too */
-	if (do_datacrc)
-		msg->footer.data_crc = cpu_to_le32(crc);
-	else
-		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
-	con_out_kvec_reset(con);
-	prepare_write_message_footer(con);
-
-	return 1;	/* must return > 0 to indicate success */
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
-	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
-	int ret;
-
-	dout("%s %p %d left\n", __func__, con, con->out_skip);
-	while (con->out_skip > 0) {
-		size_t size = min(con->out_skip, (int) PAGE_SIZE);
-
-		if (size == con->out_skip)
-			more = MSG_MORE;
-		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
-					more);
-		if (ret <= 0)
-			goto out;
-		con->out_skip -= ret;
-	}
-	ret = 1;
-out:
-	return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
-	dout("prepare_read_banner %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
-	dout("prepare_read_connect %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
-	dout("prepare_read_ack %p\n", con);
-	con->in_base_pos = 0;
-}
-
-static void prepare_read_seq(struct ceph_connection *con)
-{
-	dout("prepare_read_seq %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_SEQ;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
-	dout("prepare_read_tag %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-static void prepare_read_keepalive_ack(struct ceph_connection *con)
-{
-	dout("prepare_read_keepalive_ack %p\n", con);
-	con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
-	dout("prepare_read_message %p\n", con);
-	BUG_ON(con->in_msg != NULL);
-	con->in_base_pos = 0;
-	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
-	return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
-			int end, int size, void *object)
-{
-	while (con->in_base_pos < end) {
-		int left = end - con->in_base_pos;
-		int have = size - left;
-		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
-		if (ret <= 0)
-			return ret;
-		con->in_base_pos += ret;
-	}
-	return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
-	int size;
-	int end;
-	int ret;
-
-	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
-	/* peer's banner */
-	size = strlen(CEPH_BANNER);
-	end = size;
-	ret = read_partial(con, end, size, con->in_banner);
-	if (ret <= 0)
-		goto out;
-
-	size = sizeof (con->actual_peer_addr);
-	end += size;
-	ret = read_partial(con, end, size, &con->actual_peer_addr);
-	if (ret <= 0)
-		goto out;
-	ceph_decode_banner_addr(&con->actual_peer_addr);
-
-	size = sizeof (con->peer_addr_for_me);
-	end += size;
-	ret = read_partial(con, end, size, &con->peer_addr_for_me);
-	if (ret <= 0)
-		goto out;
-	ceph_decode_banner_addr(&con->peer_addr_for_me);
-
-out:
-	return ret;
-}
-
-static int read_partial_connect(struct ceph_connection *con)
-{
-	int size;
-	int end;
-	int ret;
-
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
-	size = sizeof (con->in_reply);
-	end = size;
-	ret = read_partial(con, end, size, &con->in_reply);
-	if (ret <= 0)
-		goto out;
-
-	if (con->auth) {
-		size = le32_to_cpu(con->in_reply.authorizer_len);
-		if (size > con->auth->authorizer_reply_buf_len) {
-			pr_err("authorizer reply too big: %d > %zu\n", size,
-			       con->auth->authorizer_reply_buf_len);
-			ret = -EINVAL;
-			goto out;
-		}
-
-		end += size;
-		ret = read_partial(con, end, size,
-				   con->auth->authorizer_reply_buf);
-		if (ret <= 0)
-			goto out;
-	}
-
-	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-	     con, (int)con->in_reply.tag,
-	     le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
-out:
-	return ret;
-}
-
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
-	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
-		pr_err("connect to %s got bad banner\n",
-		       ceph_pr_addr(&con->peer_addr));
-		con->error_msg = "protocol error, bad banner";
-		return -1;
-	}
-	return 0;
-}
 
 bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
 {
@@ -2032,492 +1324,6 @@ bad:
 	return ret;
 }
 
-static int process_banner(struct ceph_connection *con)
-{
-	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
-
-	dout("process_banner on %p\n", con);
-
-	if (verify_hello(con) < 0)
-		return -1;
-
-	/*
-	 * Make sure the other end is who we wanted.  note that the other
-	 * end may not yet know their ip address, so if it's 0.0.0.0, give
-	 * them the benefit of the doubt.
-	 */
-	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
-		   sizeof(con->peer_addr)) != 0 &&
-	    !(ceph_addr_is_blank(&con->actual_peer_addr) &&
-	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
-			ceph_pr_addr(&con->peer_addr),
-			le32_to_cpu(con->peer_addr.nonce),
-			ceph_pr_addr(&con->actual_peer_addr),
-			le32_to_cpu(con->actual_peer_addr.nonce));
-		con->error_msg = "wrong peer at address";
-		return -1;
-	}
-
-	/*
-	 * did we learn our address?
-	 */
-	if (ceph_addr_is_blank(my_addr)) {
-		memcpy(&my_addr->in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
-		ceph_addr_set_port(my_addr, 0);
-		ceph_encode_my_addr(con->msgr);
-		dout("process_banner learned my addr is %s\n",
-		     ceph_pr_addr(my_addr));
-	}
-
-	return 0;
-}
-
-static int process_connect(struct ceph_connection *con)
-{
-	u64 sup_feat = from_msgr(con->msgr)->supported_features;
-	u64 req_feat = from_msgr(con->msgr)->required_features;
-	u64 server_feat = le64_to_cpu(con->in_reply.features);
-	int ret;
-
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
-	if (con->auth) {
-		int len = le32_to_cpu(con->in_reply.authorizer_len);
-
-		/*
-		 * Any connection that defines ->get_authorizer()
-		 * should also define ->add_authorizer_challenge() and
-		 * ->verify_authorizer_reply().
-		 *
-		 * See get_connect_authorizer().
-		 */
-		if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
-			ret = con->ops->add_authorizer_challenge(
-				    con, con->auth->authorizer_reply_buf, len);
-			if (ret < 0)
-				return ret;
-
-			con_out_kvec_reset(con);
-			__prepare_write_connect(con);
-			prepare_read_connect(con);
-			return 0;
-		}
-
-		if (len) {
-			ret = con->ops->verify_authorizer_reply(con);
-			if (ret < 0) {
-				con->error_msg = "bad authorize reply";
-				return ret;
-			}
-		}
-	}
-
-	switch (con->in_reply.tag) {
-	case CEPH_MSGR_TAG_FEATURES:
-		pr_err("%s%lld %s feature set mismatch,"
-		       " my %llx < server's %llx, missing %llx\n",
-		       ENTITY_NAME(con->peer_name),
-		       ceph_pr_addr(&con->peer_addr),
-		       sup_feat, server_feat, server_feat & ~sup_feat);
-		con->error_msg = "missing required protocol features";
-		return -1;
-
-	case CEPH_MSGR_TAG_BADPROTOVER:
-		pr_err("%s%lld %s protocol version mismatch,"
-		       " my %d != server's %d\n",
-		       ENTITY_NAME(con->peer_name),
-		       ceph_pr_addr(&con->peer_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
-		con->error_msg = "protocol version mismatch";
-		return -1;
-
-	case CEPH_MSGR_TAG_BADAUTHORIZER:
-		con->auth_retry++;
-		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-		     con->auth_retry);
-		if (con->auth_retry == 2) {
-			con->error_msg = "connect authorization failure";
-			return -1;
-		}
-		con_out_kvec_reset(con);
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			return ret;
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RESETSESSION:
-		/*
-		 * If we connected with a large connect_seq but the peer
-		 * has no record of a session with us (no connection, or
-		 * connect_seq == 0), they will send RESETSESION to indicate
-		 * that they must have reset their session, and may have
-		 * dropped messages.
-		 */
-		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_reply.connect_seq));
-		pr_info("%s%lld %s session reset\n",
-			ENTITY_NAME(con->peer_name),
-			ceph_pr_addr(&con->peer_addr));
-		ceph_con_reset_session(con);
-		con_out_kvec_reset(con);
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			return ret;
-		prepare_read_connect(con);
-
-		/* Tell ceph about it. */
-		mutex_unlock(&con->mutex);
-		if (con->ops->peer_reset)
-			con->ops->peer_reset(con);
-		mutex_lock(&con->mutex);
-		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
-			return -EAGAIN;
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_SESSION:
-		/*
-		 * If we sent a smaller connect_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_reply.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
-		con_out_kvec_reset(con);
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			return ret;
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_RETRY_GLOBAL:
-		/*
-		 * If we sent a smaller global_seq than the peer has, try
-		 * again with a larger value.
-		 */
-		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.global_seq));
-		ceph_get_global_seq(con->msgr,
-				    le32_to_cpu(con->in_reply.global_seq));
-		con_out_kvec_reset(con);
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			return ret;
-		prepare_read_connect(con);
-		break;
-
-	case CEPH_MSGR_TAG_SEQ:
-	case CEPH_MSGR_TAG_READY:
-		if (req_feat & ~server_feat) {
-			pr_err("%s%lld %s protocol feature mismatch,"
-			       " my required %llx > server's %llx, need %llx\n",
-			       ENTITY_NAME(con->peer_name),
-			       ceph_pr_addr(&con->peer_addr),
-			       req_feat, server_feat, req_feat & ~server_feat);
-			con->error_msg = "missing required protocol features";
-			return -1;
-		}
-
-		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
-		con->state = CEPH_CON_S_OPEN;
-		con->auth_retry = 0;    /* we authenticated; clear flag */
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
-		con->peer_features = server_feat;
-		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
-
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
-			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
-
-		con->delay = 0;      /* reset backoff memory */
-
-		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
-			prepare_write_seq(con);
-			prepare_read_seq(con);
-		} else {
-			prepare_read_tag(con);
-		}
-		break;
-
-	case CEPH_MSGR_TAG_WAIT:
-		/*
-		 * If there is a connection race (we are opening
-		 * connections to each other), one of us may just have
-		 * to WAIT.  This shouldn't happen if we are the
-		 * client.
-		 */
-		con->error_msg = "protocol error, got WAIT as client";
-		return -1;
-
-	default:
-		con->error_msg = "protocol error, garbage tag during connect";
-		return -1;
-	}
-	return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
-	int size = sizeof (con->in_temp_ack);
-	int end = size;
-
-	return read_partial(con, end, size, &con->in_temp_ack);
-}
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
-	u64 ack = le64_to_cpu(con->in_temp_ack);
-
-	if (con->in_tag == CEPH_MSGR_TAG_ACK)
-		ceph_con_discard_sent(con, ack);
-	else
-		ceph_con_discard_requeued(con, ack);
-
-	prepare_read_tag(con);
-}
-
-
-static int read_partial_message_section(struct ceph_connection *con,
-					struct kvec *section,
-					unsigned int sec_len, u32 *crc)
-{
-	int ret, left;
-
-	BUG_ON(!section);
-
-	while (section->iov_len < sec_len) {
-		BUG_ON(section->iov_base == NULL);
-		left = sec_len - section->iov_len;
-		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
-				       section->iov_len, left);
-		if (ret <= 0)
-			return ret;
-		section->iov_len += ret;
-	}
-	if (section->iov_len == sec_len)
-		*crc = crc32c(0, section->iov_base, section->iov_len);
-
-	return 1;
-}
-
-static int read_partial_msg_data(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->in_msg;
-	struct ceph_msg_data_cursor *cursor = &msg->cursor;
-	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
-	struct page *page;
-	size_t page_offset;
-	size_t length;
-	u32 crc = 0;
-	int ret;
-
-	if (!msg->num_data_items)
-		return -EIO;
-
-	if (do_datacrc)
-		crc = con->in_data_crc;
-	while (cursor->total_resid) {
-		if (!cursor->resid) {
-			ceph_msg_data_advance(cursor, 0);
-			continue;
-		}
-
-		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
-		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
-		if (ret <= 0) {
-			if (do_datacrc)
-				con->in_data_crc = crc;
-
-			return ret;
-		}
-
-		if (do_datacrc)
-			crc = ceph_crc32c_page(crc, page, page_offset, ret);
-		ceph_msg_data_advance(cursor, (size_t)ret);
-	}
-	if (do_datacrc)
-		con->in_data_crc = crc;
-
-	return 1;	/* must return > 0 to indicate success */
-}
-
-/*
- * read (part of) a message.
- */
-static int read_partial_message(struct ceph_connection *con)
-{
-	struct ceph_msg *m = con->in_msg;
-	int size;
-	int end;
-	int ret;
-	unsigned int front_len, middle_len, data_len;
-	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
-	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
-	u64 seq;
-	u32 crc;
-
-	dout("read_partial_message con %p msg %p\n", con, m);
-
-	/* header */
-	size = sizeof (con->in_hdr);
-	end = size;
-	ret = read_partial(con, end, size, &con->in_hdr);
-	if (ret <= 0)
-		return ret;
-
-	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
-	if (cpu_to_le32(crc) != con->in_hdr.crc) {
-		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
-		       crc, con->in_hdr.crc);
-		return -EBADMSG;
-	}
-
-	front_len = le32_to_cpu(con->in_hdr.front_len);
-	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
-		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
-		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
-	if (data_len > CEPH_MSG_MAX_DATA_LEN)
-		return -EIO;
-
-	/* verify seq# */
-	seq = le64_to_cpu(con->in_hdr.seq);
-	if ((s64)seq - (s64)con->in_seq < 1) {
-		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
-			ENTITY_NAME(con->peer_name),
-			ceph_pr_addr(&con->peer_addr),
-			seq, con->in_seq + 1);
-		con->in_base_pos = -front_len - middle_len - data_len -
-			sizeof_footer(con);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		return 1;
-	} else if ((s64)seq - (s64)con->in_seq > 1) {
-		pr_err("read_partial_message bad seq %lld expected %lld\n",
-		       seq, con->in_seq + 1);
-		con->error_msg = "bad message sequence # for incoming message";
-		return -EBADE;
-	}
-
-	/* allocate message? */
-	if (!con->in_msg) {
-		int skip = 0;
-
-		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
-		     front_len, data_len);
-		ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip);
-		if (ret < 0)
-			return ret;
-
-		BUG_ON(!con->in_msg ^ skip);
-		if (skip) {
-			/* skip this message */
-			dout("alloc_msg said skip message\n");
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof_footer(con);
-			con->in_tag = CEPH_MSGR_TAG_READY;
-			con->in_seq++;
-			return 1;
-		}
-
-		BUG_ON(!con->in_msg);
-		BUG_ON(con->in_msg->con != con);
-		m = con->in_msg;
-		m->front.iov_len = 0;    /* haven't read it yet */
-		if (m->middle)
-			m->middle->vec.iov_len = 0;
-
-		/* prepare for data payload, if any */
-
-		if (data_len)
-			prepare_message_data(con->in_msg, data_len);
-	}
-
-	/* front */
-	ret = read_partial_message_section(con, &m->front, front_len,
-					   &con->in_front_crc);
-	if (ret <= 0)
-		return ret;
-
-	/* middle */
-	if (m->middle) {
-		ret = read_partial_message_section(con, &m->middle->vec,
-						   middle_len,
-						   &con->in_middle_crc);
-		if (ret <= 0)
-			return ret;
-	}
-
-	/* (page) data */
-	if (data_len) {
-		ret = read_partial_msg_data(con);
-		if (ret <= 0)
-			return ret;
-	}
-
-	/* footer */
-	size = sizeof_footer(con);
-	end += size;
-	ret = read_partial(con, end, size, &m->footer);
-	if (ret <= 0)
-		return ret;
-
-	if (!need_sign) {
-		m->footer.flags = m->old_footer.flags;
-		m->footer.sig = 0;
-	}
-
-	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
-	     m, front_len, m->footer.front_crc, middle_len,
-	     m->footer.middle_crc, data_len, m->footer.data_crc);
-
-	/* crc ok? */
-	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
-		pr_err("read_partial_message %p front crc %u != exp. %u\n",
-		       m, con->in_front_crc, m->footer.front_crc);
-		return -EBADMSG;
-	}
-	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
-		pr_err("read_partial_message %p middle crc %u != exp %u\n",
-		       m, con->in_middle_crc, m->footer.middle_crc);
-		return -EBADMSG;
-	}
-	if (do_datacrc &&
-	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
-	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
-		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
-		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
-		return -EBADMSG;
-	}
-
-	if (need_sign && con->ops->check_message_signature &&
-	    con->ops->check_message_signature(m)) {
-		pr_err("read_partial_message %p signature check failed\n", m);
-		return -EBADMSG;
-	}
-
-	return 1; /* done! */
-}
-
 /*
  * Process message.  This happens in the worker thread.  The callback should
  * be careful not to do anything that waits on other incoming messages or it
@@ -2551,263 +1357,6 @@ void ceph_con_process_message(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 }
 
-static int read_keepalive_ack(struct ceph_connection *con)
-{
-	struct ceph_timespec ceph_ts;
-	size_t size = sizeof(ceph_ts);
-	int ret = read_partial(con, size, size, &ceph_ts);
-	if (ret <= 0)
-		return ret;
-	ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
-	prepare_read_tag(con);
-	return 1;
-}
-
-/*
- * Write something to the socket.  Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-int ceph_con_v1_try_write(struct ceph_connection *con)
-{
-	int ret = 1;
-
-	dout("try_write start %p state %d\n", con, con->state);
-	if (con->state != CEPH_CON_S_PREOPEN &&
-	    con->state != CEPH_CON_S_V1_BANNER &&
-	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
-	    con->state != CEPH_CON_S_OPEN)
-		return 0;
-
-	/* open the socket first? */
-	if (con->state == CEPH_CON_S_PREOPEN) {
-		BUG_ON(con->sock);
-		con->state = CEPH_CON_S_V1_BANNER;
-
-		con_out_kvec_reset(con);
-		prepare_write_banner(con);
-		prepare_read_banner(con);
-
-		BUG_ON(con->in_msg);
-		con->in_tag = CEPH_MSGR_TAG_READY;
-		dout("try_write initiating connect on %p new state %d\n",
-		     con, con->state);
-		ret = ceph_tcp_connect(con);
-		if (ret < 0) {
-			con->error_msg = "connect error";
-			goto out;
-		}
-	}
-
-more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
-	BUG_ON(!con->sock);
-
-	/* kvec data queued? */
-	if (con->out_kvec_left) {
-		ret = write_partial_kvec(con);
-		if (ret <= 0)
-			goto out;
-	}
-	if (con->out_skip) {
-		ret = write_partial_skip(con);
-		if (ret <= 0)
-			goto out;
-	}
-
-	/* msg pages? */
-	if (con->out_msg) {
-		if (con->out_msg_done) {
-			ceph_msg_put(con->out_msg);
-			con->out_msg = NULL;   /* we're done with this one */
-			goto do_next;
-		}
-
-		ret = write_partial_message_data(con);
-		if (ret == 1)
-			goto more;  /* we need to send the footer, too! */
-		if (ret == 0)
-			goto out;
-		if (ret < 0) {
-			dout("try_write write_partial_message_data err %d\n",
-			     ret);
-			goto out;
-		}
-	}
-
-do_next:
-	if (con->state == CEPH_CON_S_OPEN) {
-		if (ceph_con_flag_test_and_clear(con,
-				CEPH_CON_F_KEEPALIVE_PENDING)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
-		/* is anything else pending? */
-		if (!list_empty(&con->out_queue)) {
-			prepare_write_message(con);
-			goto more;
-		}
-		if (con->in_seq > con->in_seq_acked) {
-			prepare_write_ack(con);
-			goto more;
-		}
-	}
-
-	/* Nothing to do! */
-	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
-	dout("try_write nothing else to write.\n");
-	ret = 0;
-out:
-	dout("try_write done on %p ret %d\n", con, ret);
-	return ret;
-}
-
-/*
- * Read what we can from the socket.
- */
-int ceph_con_v1_try_read(struct ceph_connection *con)
-{
-	int ret = -1;
-
-more:
-	dout("try_read start %p state %d\n", con, con->state);
-	if (con->state != CEPH_CON_S_V1_BANNER &&
-	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
-	    con->state != CEPH_CON_S_OPEN)
-		return 0;
-
-	BUG_ON(!con->sock);
-
-	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-	     con->in_base_pos);
-
-	if (con->state == CEPH_CON_S_V1_BANNER) {
-		ret = read_partial_banner(con);
-		if (ret <= 0)
-			goto out;
-		ret = process_banner(con);
-		if (ret < 0)
-			goto out;
-
-		con->state = CEPH_CON_S_V1_CONNECT_MSG;
-
-		/*
-		 * Received banner is good, exchange connection info.
-		 * Do not reset out_kvec, as sending our banner raced
-		 * with receiving peer banner after connect completed.
-		 */
-		ret = prepare_write_connect(con);
-		if (ret < 0)
-			goto out;
-		prepare_read_connect(con);
-
-		/* Send connection info before awaiting response */
-		goto out;
-	}
-
-	if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
-		ret = read_partial_connect(con);
-		if (ret <= 0)
-			goto out;
-		ret = process_connect(con);
-		if (ret < 0)
-			goto out;
-		goto more;
-	}
-
-	WARN_ON(con->state != CEPH_CON_S_OPEN);
-
-	if (con->in_base_pos < 0) {
-		/*
-		 * skipping + discarding content.
-		 */
-		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
-		if (ret <= 0)
-			goto out;
-		dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
-		con->in_base_pos += ret;
-		if (con->in_base_pos)
-			goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_READY) {
-		/*
-		 * what's next?
-		 */
-		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
-		if (ret <= 0)
-			goto out;
-		dout("try_read got tag %d\n", (int)con->in_tag);
-		switch (con->in_tag) {
-		case CEPH_MSGR_TAG_MSG:
-			prepare_read_message(con);
-			break;
-		case CEPH_MSGR_TAG_ACK:
-			prepare_read_ack(con);
-			break;
-		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
-			prepare_read_keepalive_ack(con);
-			break;
-		case CEPH_MSGR_TAG_CLOSE:
-			ceph_con_close_socket(con);
-			con->state = CEPH_CON_S_CLOSED;
-			goto out;
-		default:
-			goto bad_tag;
-		}
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
-		ret = read_partial_message(con);
-		if (ret <= 0) {
-			switch (ret) {
-			case -EBADMSG:
-				con->error_msg = "bad crc/signature";
-				fallthrough;
-			case -EBADE:
-				ret = -EIO;
-				break;
-			case -EIO:
-				con->error_msg = "io error";
-				break;
-			}
-			goto out;
-		}
-		if (con->in_tag == CEPH_MSGR_TAG_READY)
-			goto more;
-		ceph_con_process_message(con);
-		if (con->state == CEPH_CON_S_OPEN)
-			prepare_read_tag(con);
-		goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK ||
-	    con->in_tag == CEPH_MSGR_TAG_SEQ) {
-		/*
-		 * the final handshake seq exchange is semantically
-		 * equivalent to an ACK
-		 */
-		ret = read_partial_ack(con);
-		if (ret <= 0)
-			goto out;
-		process_ack(con);
-		goto more;
-	}
-	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
-		ret = read_keepalive_ack(con);
-		if (ret <= 0)
-			goto out;
-		goto more;
-	}
-
-out:
-	dout("try_read done on %p ret %d\n", con, ret);
-	return ret;
-
-bad_tag:
-	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
-	con->error_msg = "protocol error, garbage tag";
-	ret = -1;
-	goto out;
-}
-
-
 /*
  * Atomically queue work on a connection after the specified delay.
  * Bump @con reference to avoid races with connection teardown.
@@ -3026,7 +1575,6 @@ static void con_fault(struct ceph_connection *con)
 	}
 }
 
-
 void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
 {
 	u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
@@ -3131,29 +1679,6 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
 }
 EXPORT_SYMBOL(ceph_con_send);
 
-void ceph_con_v1_revoke(struct ceph_connection *con)
-{
-	struct ceph_msg *msg = con->out_msg;
-
-	WARN_ON(con->out_skip);
-	/* footer */
-	if (con->out_msg_done) {
-		con->out_skip += con_out_kvec_skip(con);
-	} else {
-		WARN_ON(!msg->data_length);
-		con->out_skip += sizeof_footer(con);
-	}
-	/* data, middle, front */
-	if (msg->data_length)
-		con->out_skip += msg->cursor.total_resid;
-	if (msg->middle)
-		con->out_skip += con_out_kvec_skip(con);
-	con->out_skip += con_out_kvec_skip(con);
-
-	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
-	     con->out_kvec_bytes, con->out_skip);
-}
-
 /*
  * Revoke a message that was previously queued for send
  */
@@ -3191,26 +1716,6 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 	mutex_unlock(&con->mutex);
 }
 
-void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
-{
-	unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
-	unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
-
-	/* skip rest of message */
-	con->in_base_pos = con->in_base_pos -
-			sizeof(struct ceph_msg_header) -
-			front_len -
-			middle_len -
-			data_len -
-			sizeof(struct ceph_msg_footer);
-
-	con->in_tag = CEPH_MSGR_TAG_READY;
-	con->in_seq++;
-
-	dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos);
-}
-
 /*
  * Revoke a message that we may be reading data into
  */
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
new file mode 100644
index 000000000000..899038a9678e
--- /dev/null
+++ b/net/ceph/messenger_v1.c
@@ -0,0 +1,1502 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
+
+/*
+ * If @buf is NULL, discard up to @len bytes.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+	struct kvec iov = {buf, len};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int r;
+
+	if (!buf)
+		msg.msg_flags |= MSG_TRUNC;
+
+	iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len);
+	r = sock_recvmsg(sock, &msg, msg.msg_flags);
+	if (r == -EAGAIN)
+		r = 0;
+	return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+		     int page_offset, size_t length)
+{
+	struct bio_vec bvec = {
+		.bv_page = page,
+		.bv_offset = page_offset,
+		.bv_len = length
+	};
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int r;
+
+	BUG_ON(page_offset + length > PAGE_SIZE);
+	iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length);
+	r = sock_recvmsg(sock, &msg, msg.msg_flags);
+	if (r == -EAGAIN)
+		r = 0;
+	return r;
+}
+
+/*
+ * write something.  @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+			    size_t kvlen, size_t len, bool more)
+{
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+	int r;
+
+	if (more)
+		msg.msg_flags |= MSG_MORE;
+	else
+		msg.msg_flags |= MSG_EOR;  /* superfluous, but what the hell */
+
+	r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+	if (r == -EAGAIN)
+		r = 0;
+	return r;
+}
+
+/*
+ * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST
+ */
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+			     int offset, size_t size, int more)
+{
+	ssize_t (*sendpage)(struct socket *sock, struct page *page,
+			    int offset, size_t size, int flags);
+	int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+	int ret;
+
+	/*
+	 * sendpage cannot properly handle pages with page_count == 0,
+	 * we need to fall back to sendmsg if that's the case.
+	 *
+	 * Same goes for slab pages: skb_can_coalesce() allows
+	 * coalescing neighboring slab objects into a single frag which
+	 * triggers one of hardened usercopy checks.
+	 */
+	if (sendpage_ok(page))
+		sendpage = sock->ops->sendpage;
+	else
+		sendpage = sock_no_sendpage;
+
+	ret = sendpage(sock, page, offset, size, flags);
+	if (ret == -EAGAIN)
+		ret = 0;
+
+	return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+	BUG_ON(con->out_skip);
+
+	con->out_kvec_left = 0;
+	con->out_kvec_bytes = 0;
+	con->out_kvec_cur = &con->out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+				size_t size, void *data)
+{
+	int index = con->out_kvec_left;
+
+	BUG_ON(con->out_skip);
+	BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+
+	con->out_kvec[index].iov_len = size;
+	con->out_kvec[index].iov_base = data;
+	con->out_kvec_left++;
+	con->out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end.  Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+	int off = con->out_kvec_cur - con->out_kvec;
+	int skip = 0;
+
+	if (con->out_kvec_bytes > 0) {
+		skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+		BUG_ON(con->out_kvec_bytes < skip);
+		BUG_ON(!con->out_kvec_left);
+		con->out_kvec_bytes -= skip;
+		con->out_kvec_left--;
+	}
+
+	return skip;
+}
+
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+	return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+	    sizeof(struct ceph_msg_footer) :
+	    sizeof(struct ceph_msg_footer_old);
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+	/* Initialize data cursor */
+
+	ceph_msg_data_cursor_init(&msg->cursor, msg, data_len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off.  Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->out_msg;
+
+	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+	dout("prepare_write_message_footer %p\n", con);
+	con_out_kvec_add(con, sizeof_footer(con), &m->footer);
+	if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+		if (con->ops->sign_message)
+			con->ops->sign_message(m);
+		else
+			m->footer.sig = 0;
+	} else {
+		m->old_footer.flags = m->footer.flags;
+	}
+	con->out_more = m->more_to_follow;
+	con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m;
+	u32 crc;
+
+	con_out_kvec_reset(con);
+	con->out_msg_done = false;
+
+	/* Sneak an ack in there first?  If we can get it into the same
+	 * TCP packet that's a good thing. */
+	if (con->in_seq > con->in_seq_acked) {
+		con->in_seq_acked = con->in_seq;
+		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con_out_kvec_add(con, sizeof (con->out_temp_ack),
+			&con->out_temp_ack);
+	}
+
+	ceph_con_get_out_msg(con);
+	m = con->out_msg;
+
+	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+	     m, con->out_seq, le16_to_cpu(m->hdr.type),
+	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+	     m->data_length);
+	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+	/* tag + hdr + front + middle */
+	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+	con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
+	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+	if (m->middle)
+		con_out_kvec_add(con, m->middle->vec.iov_len,
+			m->middle->vec.iov_base);
+
+	/* fill in hdr crc and finalize hdr */
+	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+	con->out_msg->hdr.crc = cpu_to_le32(crc);
+	memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+
+	/* fill in front and middle crc, footer */
+	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+	con->out_msg->footer.front_crc = cpu_to_le32(crc);
+	if (m->middle) {
+		crc = crc32c(0, m->middle->vec.iov_base,
+				m->middle->vec.iov_len);
+		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
+	} else
+		con->out_msg->footer.middle_crc = 0;
+	dout("%s front_crc %u middle_crc %u\n", __func__,
+	     le32_to_cpu(con->out_msg->footer.front_crc),
+	     le32_to_cpu(con->out_msg->footer.middle_crc));
+	con->out_msg->footer.flags = 0;
+
+	/* is there a data payload? */
+	con->out_msg->footer.data_crc = 0;
+	if (m->data_length) {
+		prepare_message_data(con->out_msg, m->data_length);
+		con->out_more = 1;  /* data + footer will follow */
+	} else {
+		/* no, queue up footer too and be done */
+		prepare_write_message_footer(con);
+	}
+
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+	dout("prepare_write_ack %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con_out_kvec_reset(con);
+
+	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con_out_kvec_add(con, sizeof (con->out_temp_ack),
+				&con->out_temp_ack);
+
+	con->out_more = 1;  /* more will follow.. eventually.. */
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+	dout("prepare_write_seq %p %llu -> %llu\n", con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	con_out_kvec_reset(con);
+
+	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con_out_kvec_add(con, sizeof (con->out_temp_ack),
+			 &con->out_temp_ack);
+
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+	dout("prepare_write_keepalive %p\n", con);
+	con_out_kvec_reset(con);
+	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+		struct timespec64 now;
+
+		ktime_get_real_ts64(&now);
+		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+		ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
+		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
+				 &con->out_temp_keepalive2);
+	} else {
+		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+	}
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static int get_connect_authorizer(struct ceph_connection *con)
+{
+	struct ceph_auth_handshake *auth;
+	int auth_proto;
+
+	if (!con->ops->get_authorizer) {
+		con->auth = NULL;
+		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+		con->out_connect.authorizer_len = 0;
+		return 0;
+	}
+
+	auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
+	if (IS_ERR(auth))
+		return PTR_ERR(auth);
+
+	con->auth = auth;
+	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+	con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
+	return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+	con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+					&con->msgr->my_enc_addr);
+
+	con->out_more = 0;
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+	con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
+	if (con->auth)
+		con_out_kvec_add(con, con->auth->authorizer_buf_len,
+				 con->auth->authorizer_buf);
+
+	con->out_more = 0;
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+	unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
+	int proto;
+	int ret;
+
+	switch (con->peer_name.type) {
+	case CEPH_ENTITY_TYPE_MON:
+		proto = CEPH_MONC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_OSD:
+		proto = CEPH_OSDC_PROTOCOL;
+		break;
+	case CEPH_ENTITY_TYPE_MDS:
+		proto = CEPH_MDSC_PROTOCOL;
+		break;
+	default:
+		BUG();
+	}
+
+	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+	     con->connect_seq, global_seq, proto);
+
+	con->out_connect.features =
+	    cpu_to_le64(from_msgr(con->msgr)->supported_features);
+	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+	con->out_connect.global_seq = cpu_to_le32(global_seq);
+	con->out_connect.protocol_version = cpu_to_le32(proto);
+	con->out_connect.flags = 0;
+
+	ret = get_connect_authorizer(con);
+	if (ret)
+		return ret;
+
+	__prepare_write_connect(con);
+	return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ *  1 -> done
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+	while (con->out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+				       con->out_kvec_left, con->out_kvec_bytes,
+				       con->out_more);
+		if (ret <= 0)
+			goto out;
+		con->out_kvec_bytes -= ret;
+		if (con->out_kvec_bytes == 0)
+			break;            /* done */
+
+		/* account for full iov entries consumed */
+		while (ret >= con->out_kvec_cur->iov_len) {
+			BUG_ON(!con->out_kvec_left);
+			ret -= con->out_kvec_cur->iov_len;
+			con->out_kvec_cur++;
+			con->out_kvec_left--;
+		}
+		/* and for a partially-consumed entry */
+		if (ret) {
+			con->out_kvec_cur->iov_len -= ret;
+			con->out_kvec_cur->iov_base += ret;
+		}
+	}
+	con->out_kvec_left = 0;
+	ret = 1;
+out:
+	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	return ret;  /* done! */
+}
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_message_data(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+	u32 crc;
+
+	dout("%s %p msg %p\n", __func__, con, msg);
+
+	if (!msg->num_data_items)
+		return -EINVAL;
+
+	/*
+	 * Iterate through each page that contains data to be
+	 * written, and send as much as possible for each.
+	 *
+	 * If we are calculating the data crc (the default), we will
+	 * need to map the page.  If we have no pages, they have
+	 * been revoked, so use the zero page.
+	 */
+	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
+	while (cursor->total_resid) {
+		struct page *page;
+		size_t page_offset;
+		size_t length;
+		int ret;
+
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
+		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+		if (length == cursor->total_resid)
+			more = MSG_MORE;
+		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+					more);
+		if (ret <= 0) {
+			if (do_datacrc)
+				msg->footer.data_crc = cpu_to_le32(crc);
+
+			return ret;
+		}
+		if (do_datacrc && cursor->need_crc)
+			crc = ceph_crc32c_page(crc, page, page_offset, length);
+		ceph_msg_data_advance(cursor, (size_t)ret);
+	}
+
+	dout("%s %p msg %p done\n", __func__, con, msg);
+
+	/* prepare and queue up footer, too */
+	if (do_datacrc)
+		msg->footer.data_crc = cpu_to_le32(crc);
+	else
+		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con_out_kvec_reset(con);
+	prepare_write_message_footer(con);
+
+	return 1;	/* must return > 0 to indicate success */
+}
+
+/*
+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
+	int ret;
+
+	dout("%s %p %d left\n", __func__, con, con->out_skip);
+	while (con->out_skip > 0) {
+		size_t size = min(con->out_skip, (int) PAGE_SIZE);
+
+		if (size == con->out_skip)
+			more = MSG_MORE;
+		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
+					more);
+		if (ret <= 0)
+			goto out;
+		con->out_skip -= ret;
+	}
+	ret = 1;
+out:
+	return ret;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	dout("prepare_read_banner %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	dout("prepare_read_connect %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+	dout("prepare_read_seq %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_SEQ;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	dout("prepare_read_tag %p\n", con);
+	con->in_base_pos = 0;
+	con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_keepalive_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg != NULL);
+	con->in_base_pos = 0;
+	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
+	return 0;
+}
+
+static int read_partial(struct ceph_connection *con,
+			int end, int size, void *object)
+{
+	while (con->in_base_pos < end) {
+		int left = end - con->in_base_pos;
+		int have = size - left;
+		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+		if (ret <= 0)
+			return ret;
+		con->in_base_pos += ret;
+	}
+	return 1;
+}
+
+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int size;
+	int end;
+	int ret;
+
+	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+	/* peer's banner */
+	size = strlen(CEPH_BANNER);
+	end = size;
+	ret = read_partial(con, end, size, con->in_banner);
+	if (ret <= 0)
+		goto out;
+
+	size = sizeof (con->actual_peer_addr);
+	end += size;
+	ret = read_partial(con, end, size, &con->actual_peer_addr);
+	if (ret <= 0)
+		goto out;
+	ceph_decode_banner_addr(&con->actual_peer_addr);
+
+	size = sizeof (con->peer_addr_for_me);
+	end += size;
+	ret = read_partial(con, end, size, &con->peer_addr_for_me);
+	if (ret <= 0)
+		goto out;
+	ceph_decode_banner_addr(&con->peer_addr_for_me);
+
+out:
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int size;
+	int end;
+	int ret;
+
+	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+	size = sizeof (con->in_reply);
+	end = size;
+	ret = read_partial(con, end, size, &con->in_reply);
+	if (ret <= 0)
+		goto out;
+
+	if (con->auth) {
+		size = le32_to_cpu(con->in_reply.authorizer_len);
+		if (size > con->auth->authorizer_reply_buf_len) {
+			pr_err("authorizer reply too big: %d > %zu\n", size,
+			       con->auth->authorizer_reply_buf_len);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		end += size;
+		ret = read_partial(con, end, size,
+				   con->auth->authorizer_reply_buf);
+		if (ret <= 0)
+			goto out;
+	}
+
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, (int)con->in_reply.tag,
+	     le32_to_cpu(con->in_reply.connect_seq),
+	     le32_to_cpu(con->in_reply.global_seq));
+out:
+	return ret;
+}
+
+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+		pr_err("connect to %s got bad banner\n",
+		       ceph_pr_addr(&con->peer_addr));
+		con->error_msg = "protocol error, bad banner";
+		return -1;
+	}
+	return 0;
+}
+
+static int process_banner(struct ceph_connection *con)
+{
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	/*
+	 * Make sure the other end is who we wanted.  note that the other
+	 * end may not yet know their ip address, so if it's 0.0.0.0, give
+	 * them the benefit of the doubt.
+	 */
+	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+		   sizeof(con->peer_addr)) != 0 &&
+	    !(ceph_addr_is_blank(&con->actual_peer_addr) &&
+	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+			ceph_pr_addr(&con->peer_addr),
+			le32_to_cpu(con->peer_addr.nonce),
+			ceph_pr_addr(&con->actual_peer_addr),
+			le32_to_cpu(con->actual_peer_addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/*
+	 * did we learn our address?
+	 */
+	if (ceph_addr_is_blank(my_addr)) {
+		memcpy(&my_addr->in_addr,
+		       &con->peer_addr_for_me.in_addr,
+		       sizeof(con->peer_addr_for_me.in_addr));
+		ceph_addr_set_port(my_addr, 0);
+		ceph_encode_my_addr(con->msgr);
+		dout("process_banner learned my addr is %s\n",
+		     ceph_pr_addr(my_addr));
+	}
+
+	return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = from_msgr(con->msgr)->supported_features;
+	u64 req_feat = from_msgr(con->msgr)->required_features;
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
+	int ret;
+
+	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+	if (con->auth) {
+		int len = le32_to_cpu(con->in_reply.authorizer_len);
+
+		/*
+		 * Any connection that defines ->get_authorizer()
+		 * should also define ->add_authorizer_challenge() and
+		 * ->verify_authorizer_reply().
+		 *
+		 * See get_connect_authorizer().
+		 */
+		if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+			ret = con->ops->add_authorizer_challenge(
+				    con, con->auth->authorizer_reply_buf, len);
+			if (ret < 0)
+				return ret;
+
+			con_out_kvec_reset(con);
+			__prepare_write_connect(con);
+			prepare_read_connect(con);
+			return 0;
+		}
+
+		if (len) {
+			ret = con->ops->verify_authorizer_reply(con);
+			if (ret < 0) {
+				con->error_msg = "bad authorize reply";
+				return ret;
+			}
+		}
+	}
+
+	switch (con->in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr),
+		       le32_to_cpu(con->out_connect.protocol_version),
+		       le32_to_cpu(con->in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->auth_retry);
+		if (con->auth_retry == 2) {
+			con->error_msg = "connect authorization failure";
+			return -1;
+		}
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->in_reply.connect_seq));
+		pr_info("%s%lld %s session reset\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr));
+		ceph_con_reset_session(con);
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+
+		/* Tell ceph about it. */
+		mutex_unlock(&con->mutex);
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
+			return -EAGAIN;
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+		     le32_to_cpu(con->out_connect.connect_seq),
+		     le32_to_cpu(con->in_reply.connect_seq));
+		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.global_seq));
+		ceph_get_global_seq(con->msgr,
+				    le32_to_cpu(con->in_reply.global_seq));
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_SEQ:
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       ceph_pr_addr(&con->peer_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			return -1;
+		}
+
+		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+		con->state = CEPH_CON_S_OPEN;
+		con->auth_retry = 0;    /* we authenticated; clear flag */
+		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+		con->connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->peer_global_seq,
+		     le32_to_cpu(con->in_reply.connect_seq),
+		     con->connect_seq);
+		WARN_ON(con->connect_seq !=
+			le32_to_cpu(con->in_reply.connect_seq));
+
+		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+
+		con->delay = 0;      /* reset backoff memory */
+
+		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+			prepare_write_seq(con);
+			prepare_read_seq(con);
+		} else {
+			prepare_read_tag(con);
+		}
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		con->error_msg = "protocol error, got WAIT as client";
+		return -1;
+
+	default:
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	int size = sizeof (con->in_temp_ack);
+	int end = size;
+
+	return read_partial(con, end, size, &con->in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	u64 ack = le64_to_cpu(con->in_temp_ack);
+
+	if (con->in_tag == CEPH_MSGR_TAG_ACK)
+		ceph_con_discard_sent(con, ack);
+	else
+		ceph_con_discard_requeued(con, ack);
+
+	prepare_read_tag(con);
+}
+
+static int read_partial_message_section(struct ceph_connection *con,
+					struct kvec *section,
+					unsigned int sec_len, u32 *crc)
+{
+	int ret, left;
+
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		BUG_ON(section->iov_base == NULL);
+		left = sec_len - section->iov_len;
+		ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+				       section->iov_len, left);
+		if (ret <= 0)
+			return ret;
+		section->iov_len += ret;
+	}
+	if (section->iov_len == sec_len)
+		*crc = crc32c(0, section->iov_base, section->iov_len);
+
+	return 1;
+}
+
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->in_msg;
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	struct page *page;
+	size_t page_offset;
+	size_t length;
+	u32 crc = 0;
+	int ret;
+
+	if (!msg->num_data_items)
+		return -EIO;
+
+	if (do_datacrc)
+		crc = con->in_data_crc;
+	while (cursor->total_resid) {
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
+		page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
+		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+		if (ret <= 0) {
+			if (do_datacrc)
+				con->in_data_crc = crc;
+
+			return ret;
+		}
+
+		if (do_datacrc)
+			crc = ceph_crc32c_page(crc, page, page_offset, ret);
+		ceph_msg_data_advance(cursor, (size_t)ret);
+	}
+	if (do_datacrc)
+		con->in_data_crc = crc;
+
+	return 1;	/* must return > 0 to indicate success */
+}
+
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	int size;
+	int end;
+	int ret;
+	unsigned int front_len, middle_len, data_len;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+	u64 seq;
+	u32 crc;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	size = sizeof (con->in_hdr);
+	end = size;
+	ret = read_partial(con, end, size, &con->in_hdr);
+	if (ret <= 0)
+		return ret;
+
+	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
+	if (cpu_to_le32(crc) != con->in_hdr.crc) {
+		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+		       crc, con->in_hdr.crc);
+		return -EBADMSG;
+	}
+
+	front_len = le32_to_cpu(con->in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr),
+			seq, con->in_seq + 1);
+		con->in_base_pos = -front_len - middle_len - data_len -
+			sizeof_footer(con);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		return 1;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADE;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		int skip = 0;
+
+		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		     front_len, data_len);
+		ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip);
+		if (ret < 0)
+			return ret;
+
+		BUG_ON(!con->in_msg ^ skip);
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			con->in_base_pos = -front_len - middle_len - data_len -
+				sizeof_footer(con);
+			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 1;
+		}
+
+		BUG_ON(!con->in_msg);
+		BUG_ON(con->in_msg->con != con);
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		/* prepare for data payload, if any */
+
+		if (data_len)
+			prepare_message_data(con->in_msg, data_len);
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec,
+						   middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+
+	/* (page) data */
+	if (data_len) {
+		ret = read_partial_msg_data(con);
+		if (ret <= 0)
+			return ret;
+	}
+
+	/* footer */
+	size = sizeof_footer(con);
+	end += size;
+	ret = read_partial(con, end, size, &m->footer);
+	if (ret <= 0)
+		return ret;
+
+	if (!need_sign) {
+		m->footer.flags = m->old_footer.flags;
+		m->footer.sig = 0;
+	}
+
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok? */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, m->footer.front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, m->footer.middle_crc);
+		return -EBADMSG;
+	}
+	if (do_datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	if (need_sign && con->ops->check_message_signature &&
+	    con->ops->check_message_signature(m)) {
+		pr_err("read_partial_message %p signature check failed\n", m);
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+	struct ceph_timespec ceph_ts;
+	size_t size = sizeof(ceph_ts);
+	int ret = read_partial(con, size, size, &ceph_ts);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
+	prepare_read_tag(con);
+	return 1;
+}
+
+/*
+ * Read what we can from the socket.
+ */
+int ceph_con_v1_try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+more:
+	dout("try_read start %p state %d\n", con, con->state);
+	if (con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
+		return 0;
+
+	BUG_ON(!con->sock);
+
+	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+	     con->in_base_pos);
+
+	if (con->state == CEPH_CON_S_V1_BANNER) {
+		ret = read_partial_banner(con);
+		if (ret <= 0)
+			goto out;
+		ret = process_banner(con);
+		if (ret < 0)
+			goto out;
+
+		con->state = CEPH_CON_S_V1_CONNECT_MSG;
+
+		/*
+		 * Received banner is good, exchange connection info.
+		 * Do not reset out_kvec, as sending our banner raced
+		 * with receiving peer banner after connect completed.
+		 */
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			goto out;
+		prepare_read_connect(con);
+
+		/* Send connection info before awaiting response */
+		goto out;
+	}
+
+	if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto out;
+		ret = process_connect(con);
+		if (ret < 0)
+			goto out;
+		goto more;
+	}
+
+	WARN_ON(con->state != CEPH_CON_S_OPEN);
+
+	if (con->in_base_pos < 0) {
+		/*
+		 * skipping + discarding content.
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
+		if (ret <= 0)
+			goto out;
+		dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
+		con->in_base_pos += ret;
+		if (con->in_base_pos)
+			goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		if (ret <= 0)
+			goto out;
+		dout("try_read got tag %d\n", (int)con->in_tag);
+		switch (con->in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+			prepare_read_keepalive_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			ceph_con_close_socket(con);
+			con->state = CEPH_CON_S_CLOSED;
+			goto out;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc/signature";
+				fallthrough;
+			case -EBADE:
+				ret = -EIO;
+				break;
+			case -EIO:
+				con->error_msg = "io error";
+				break;
+			}
+			goto out;
+		}
+		if (con->in_tag == CEPH_MSGR_TAG_READY)
+			goto more;
+		ceph_con_process_message(con);
+		if (con->state == CEPH_CON_S_OPEN)
+			prepare_read_tag(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_ACK ||
+	    con->in_tag == CEPH_MSGR_TAG_SEQ) {
+		/*
+		 * the final handshake seq exchange is semantically
+		 * equivalent to an ACK
+		 */
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto out;
+		process_ack(con);
+		goto more;
+	}
+	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+		ret = read_keepalive_ack(con);
+		if (ret <= 0)
+			goto out;
+		goto more;
+	}
+
+out:
+	dout("try_read done on %p ret %d\n", con, ret);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+int ceph_con_v1_try_write(struct ceph_connection *con)
+{
+	int ret = 1;
+
+	dout("try_write start %p state %d\n", con, con->state);
+	if (con->state != CEPH_CON_S_PREOPEN &&
+	    con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
+		return 0;
+
+	/* open the socket first? */
+	if (con->state == CEPH_CON_S_PREOPEN) {
+		BUG_ON(con->sock);
+		con->state = CEPH_CON_S_V1_BANNER;
+
+		con_out_kvec_reset(con);
+		prepare_write_banner(con);
+		prepare_read_banner(con);
+
+		BUG_ON(con->in_msg);
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %d\n",
+		     con, con->state);
+		ret = ceph_tcp_connect(con);
+		if (ret < 0) {
+			con->error_msg = "connect error";
+			goto out;
+		}
+	}
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+	BUG_ON(!con->sock);
+
+	/* kvec data queued? */
+	if (con->out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto out;
+	}
+	if (con->out_skip) {
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto out;
+	}
+
+	/* msg pages? */
+	if (con->out_msg) {
+		if (con->out_msg_done) {
+			ceph_msg_put(con->out_msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_message_data(con);
+		if (ret == 1)
+			goto more;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto out;
+		if (ret < 0) {
+			dout("try_write write_partial_message_data err %d\n",
+			     ret);
+			goto out;
+		}
+	}
+
+do_next:
+	if (con->state == CEPH_CON_S_OPEN) {
+		if (ceph_con_flag_test_and_clear(con,
+				CEPH_CON_F_KEEPALIVE_PENDING)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+		/* is anything else pending? */
+		if (!list_empty(&con->out_queue)) {
+			prepare_write_message(con);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {
+			prepare_write_ack(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	dout("try_write nothing else to write.\n");
+	ret = 0;
+out:
+	dout("try_write done on %p ret %d\n", con, ret);
+	return ret;
+}
+
+void ceph_con_v1_revoke(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	WARN_ON(con->out_skip);
+	/* footer */
+	if (con->out_msg_done) {
+		con->out_skip += con_out_kvec_skip(con);
+	} else {
+		WARN_ON(!msg->data_length);
+		con->out_skip += sizeof_footer(con);
+	}
+	/* data, middle, front */
+	if (msg->data_length)
+		con->out_skip += msg->cursor.total_resid;
+	if (msg->middle)
+		con->out_skip += con_out_kvec_skip(con);
+	con->out_skip += con_out_kvec_skip(con);
+
+	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+	     con->out_kvec_bytes, con->out_skip);
+}
+
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+	unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
+	unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+
+	/* skip rest of message */
+	con->in_base_pos = con->in_base_pos -
+			sizeof(struct ceph_msg_header) -
+			front_len -
+			middle_len -
+			data_len -
+			sizeof(struct ceph_msg_footer);
+
+	con->in_tag = CEPH_MSGR_TAG_READY;
+	con->in_seq++;
+
+	dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos);
+}
+
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+	return con->connect_seq;
+}
+
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+	con->connect_seq = 0;
+	con->peer_global_seq = 0;
+}
+
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+	con->out_skip = 0;
+}

From a56dd9bf47220c3206f27075af8bdfb219a2a3cf Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 12 Nov 2020 16:31:41 +0100
Subject: [PATCH 213/230] libceph: move msgr1 protocol specific fields to its
 own struct

A couple whitespace fixups, no functional changes.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/messenger.h |  76 +++---
 net/ceph/messenger.c           |   8 +-
 net/ceph/messenger_v1.c        | 420 +++++++++++++++++----------------
 3 files changed, 257 insertions(+), 247 deletions(-)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index b5268127c55e..54a64e8dfce6 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -264,6 +264,43 @@ struct ceph_msg {
 #define BASE_DELAY_INTERVAL	(HZ / 4)
 #define MAX_DELAY_INTERVAL	(15 * HZ)
 
+struct ceph_connection_v1_info {
+	struct kvec out_kvec[8],         /* sending header/footer data */
+		*out_kvec_cur;
+	int out_kvec_left;   /* kvec's left in out_kvec */
+	int out_skip;        /* skip this many bytes */
+	int out_kvec_bytes;  /* total bytes left */
+	bool out_more;       /* there is more data after the kvecs */
+	bool out_msg_done;
+
+	struct ceph_auth_handshake *auth;
+	int auth_retry;       /* true if we need a newer authorizer */
+
+	/* connection negotiation temps */
+	u8 in_banner[CEPH_BANNER_MAX_LEN];
+	struct ceph_entity_addr actual_peer_addr;
+	struct ceph_entity_addr peer_addr_for_me;
+	struct ceph_msg_connect out_connect;
+	struct ceph_msg_connect_reply in_reply;
+
+	int in_base_pos;     /* bytes read */
+
+	/* message in temps */
+	u8 in_tag;           /* protocol control byte */
+	struct ceph_msg_header in_hdr;
+	__le64 in_temp_ack;  /* for reading an ack */
+
+	/* message out temps */
+	struct ceph_msg_header out_hdr;
+	__le64 out_temp_ack;  /* for writing an ack */
+	struct ceph_timespec out_temp_keepalive2;  /* for writing keepalive2
+						      stamp */
+
+	u32 connect_seq;      /* identify the most recent connection
+				 attempt for this session */
+	u32 peer_global_seq;  /* peer's global seq for this connection */
+};
+
 /*
  * A single connection with another host.
  *
@@ -281,21 +318,13 @@ struct ceph_connection {
 	int state;  /* CEPH_CON_S_* */
 	atomic_t sock_state;
 	struct socket *sock;
-	struct ceph_entity_addr peer_addr; /* peer address */
-	struct ceph_entity_addr peer_addr_for_me;
 
 	unsigned long flags;  /* CEPH_CON_F_* */
 	const char *error_msg;  /* error message, if any */
 
 	struct ceph_entity_name peer_name; /* peer name */
-
+	struct ceph_entity_addr peer_addr; /* peer address */
 	u64 peer_features;
-	u32 connect_seq;      /* identify the most recent connection
-				 attempt for this connection, client */
-	u32 peer_global_seq;  /* peer's global seq for this connection */
-
-	struct ceph_auth_handshake *auth;
-	int auth_retry;       /* true if we need a newer authorizer */
 
 	struct mutex mutex;
 
@@ -306,41 +335,18 @@ struct ceph_connection {
 
 	u64 in_seq, in_seq_acked;  /* last message received, acked */
 
-	/* connection negotiation temps */
-	char in_banner[CEPH_BANNER_MAX_LEN];
-	struct ceph_msg_connect out_connect;
-	struct ceph_msg_connect_reply in_reply;
-	struct ceph_entity_addr actual_peer_addr;
-
-	/* message out temps */
-	struct ceph_msg_header out_hdr;
+	struct ceph_msg *in_msg;
 	struct ceph_msg *out_msg;        /* sending message (== tail of
 					    out_sent) */
-	bool out_msg_done;
 
-	struct kvec out_kvec[8],         /* sending header/footer data */
-		*out_kvec_cur;
-	int out_kvec_left;   /* kvec's left in out_kvec */
-	int out_skip;        /* skip this many bytes */
-	int out_kvec_bytes;  /* total bytes left */
-	int out_more;        /* there is more data after the kvecs */
-	__le64 out_temp_ack; /* for writing an ack */
-	struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
-						     stamp */
-
-	/* message in temps */
-	struct ceph_msg_header in_hdr;
-	struct ceph_msg *in_msg;
 	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */
 
-	char in_tag;         /* protocol control byte */
-	int in_base_pos;     /* bytes read */
-	__le64 in_temp_ack;  /* for reading an ack */
-
 	struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */
 
 	struct delayed_work work;	    /* send|recv work */
 	unsigned long       delay;          /* current delay interval */
+
+	struct ceph_connection_v1_info v1;
 };
 
 extern struct page *ceph_zero_page;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 544cfdbe52d6..4fb3c33a7b03 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1448,11 +1448,11 @@ static void con_fault_finish(struct ceph_connection *con)
 	 * in case we faulted due to authentication, invalidate our
 	 * current tickets so that we can get new ones.
 	 */
-	if (con->auth_retry) {
-		dout("auth_retry %d, invalidating\n", con->auth_retry);
+	if (con->v1.auth_retry) {
+		dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
 		if (con->ops->invalidate_authorizer)
 			con->ops->invalidate_authorizer(con);
-		con->auth_retry = 0;
+		con->v1.auth_retry = 0;
 	}
 
 	if (con->ops->fault)
@@ -1631,7 +1631,7 @@ static void clear_standby(struct ceph_connection *con)
 	if (con->state == CEPH_CON_S_STANDBY) {
 		dout("clear_standby %p and ++connect_seq\n", con);
 		con->state = CEPH_CON_S_PREOPEN;
-		con->connect_seq++;
+		con->v1.connect_seq++;
 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
 		WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
 	}
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 899038a9678e..04f653b3c897 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -110,25 +110,25 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
 
 static void con_out_kvec_reset(struct ceph_connection *con)
 {
-	BUG_ON(con->out_skip);
+	BUG_ON(con->v1.out_skip);
 
-	con->out_kvec_left = 0;
-	con->out_kvec_bytes = 0;
-	con->out_kvec_cur = &con->out_kvec[0];
+	con->v1.out_kvec_left = 0;
+	con->v1.out_kvec_bytes = 0;
+	con->v1.out_kvec_cur = &con->v1.out_kvec[0];
 }
 
 static void con_out_kvec_add(struct ceph_connection *con,
 				size_t size, void *data)
 {
-	int index = con->out_kvec_left;
+	int index = con->v1.out_kvec_left;
 
-	BUG_ON(con->out_skip);
-	BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+	BUG_ON(con->v1.out_skip);
+	BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
 
-	con->out_kvec[index].iov_len = size;
-	con->out_kvec[index].iov_base = data;
-	con->out_kvec_left++;
-	con->out_kvec_bytes += size;
+	con->v1.out_kvec[index].iov_len = size;
+	con->v1.out_kvec[index].iov_base = data;
+	con->v1.out_kvec_left++;
+	con->v1.out_kvec_bytes += size;
 }
 
 /*
@@ -138,15 +138,14 @@ static void con_out_kvec_add(struct ceph_connection *con,
  */
 static int con_out_kvec_skip(struct ceph_connection *con)
 {
-	int off = con->out_kvec_cur - con->out_kvec;
 	int skip = 0;
 
-	if (con->out_kvec_bytes > 0) {
-		skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
-		BUG_ON(con->out_kvec_bytes < skip);
-		BUG_ON(!con->out_kvec_left);
-		con->out_kvec_bytes -= skip;
-		con->out_kvec_left--;
+	if (con->v1.out_kvec_bytes > 0) {
+		skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
+		BUG_ON(con->v1.out_kvec_bytes < skip);
+		BUG_ON(!con->v1.out_kvec_left);
+		con->v1.out_kvec_bytes -= skip;
+		con->v1.out_kvec_left--;
 	}
 
 	return skip;
@@ -186,8 +185,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
 	} else {
 		m->old_footer.flags = m->footer.flags;
 	}
-	con->out_more = m->more_to_follow;
-	con->out_msg_done = true;
+	con->v1.out_more = m->more_to_follow;
+	con->v1.out_msg_done = true;
 }
 
 /*
@@ -199,16 +198,16 @@ static void prepare_write_message(struct ceph_connection *con)
 	u32 crc;
 
 	con_out_kvec_reset(con);
-	con->out_msg_done = false;
+	con->v1.out_msg_done = false;
 
 	/* Sneak an ack in there first?  If we can get it into the same
 	 * TCP packet that's a good thing. */
 	if (con->in_seq > con->in_seq_acked) {
 		con->in_seq_acked = con->in_seq;
 		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-		con_out_kvec_add(con, sizeof (con->out_temp_ack),
-			&con->out_temp_ack);
+		con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+		con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+			&con->v1.out_temp_ack);
 	}
 
 	ceph_con_get_out_msg(con);
@@ -223,7 +222,7 @@ static void prepare_write_message(struct ceph_connection *con)
 
 	/* tag + hdr + front + middle */
 	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-	con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
+	con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
 	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
 
 	if (m->middle)
@@ -233,7 +232,7 @@ static void prepare_write_message(struct ceph_connection *con)
 	/* fill in hdr crc and finalize hdr */
 	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
 	con->out_msg->hdr.crc = cpu_to_le32(crc);
-	memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+	memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr));
 
 	/* fill in front and middle crc, footer */
 	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
@@ -253,7 +252,7 @@ static void prepare_write_message(struct ceph_connection *con)
 	con->out_msg->footer.data_crc = 0;
 	if (m->data_length) {
 		prepare_message_data(con->out_msg, m->data_length);
-		con->out_more = 1;  /* data + footer will follow */
+		con->v1.out_more = 1;  /* data + footer will follow */
 	} else {
 		/* no, queue up footer too and be done */
 		prepare_write_message_footer(con);
@@ -275,11 +274,11 @@ static void prepare_write_ack(struct ceph_connection *con)
 
 	con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
 
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con_out_kvec_add(con, sizeof (con->out_temp_ack),
-				&con->out_temp_ack);
+	con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+			 &con->v1.out_temp_ack);
 
-	con->out_more = 1;  /* more will follow.. eventually.. */
+	con->v1.out_more = 1;  /* more will follow.. eventually.. */
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
@@ -294,9 +293,9 @@ static void prepare_write_seq(struct ceph_connection *con)
 
 	con_out_kvec_reset(con);
 
-	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
-	con_out_kvec_add(con, sizeof (con->out_temp_ack),
-			 &con->out_temp_ack);
+	con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+	con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+			 &con->v1.out_temp_ack);
 
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
@@ -313,9 +312,9 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 
 		ktime_get_real_ts64(&now);
 		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
-		ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
-		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
-				 &con->out_temp_keepalive2);
+		ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
+		con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
+				 &con->v1.out_temp_keepalive2);
 	} else {
 		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
 	}
@@ -332,19 +331,20 @@ static int get_connect_authorizer(struct ceph_connection *con)
 	int auth_proto;
 
 	if (!con->ops->get_authorizer) {
-		con->auth = NULL;
-		con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
-		con->out_connect.authorizer_len = 0;
+		con->v1.auth = NULL;
+		con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+		con->v1.out_connect.authorizer_len = 0;
 		return 0;
 	}
 
-	auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
+	auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
 	if (IS_ERR(auth))
 		return PTR_ERR(auth);
 
-	con->auth = auth;
-	con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
-	con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
+	con->v1.auth = auth;
+	con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+	con->v1.out_connect.authorizer_len =
+		cpu_to_le32(auth->authorizer_buf_len);
 	return 0;
 }
 
@@ -357,18 +357,19 @@ static void prepare_write_banner(struct ceph_connection *con)
 	con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
 					&con->msgr->my_enc_addr);
 
-	con->out_more = 0;
+	con->v1.out_more = 0;
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
 static void __prepare_write_connect(struct ceph_connection *con)
 {
-	con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
-	if (con->auth)
-		con_out_kvec_add(con, con->auth->authorizer_buf_len,
-				 con->auth->authorizer_buf);
+	con_out_kvec_add(con, sizeof(con->v1.out_connect),
+			 &con->v1.out_connect);
+	if (con->v1.auth)
+		con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
+				 con->v1.auth->authorizer_buf);
 
-	con->out_more = 0;
+	con->v1.out_more = 0;
 	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
 }
 
@@ -393,15 +394,15 @@ static int prepare_write_connect(struct ceph_connection *con)
 	}
 
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
-	     con->connect_seq, global_seq, proto);
+	     con->v1.connect_seq, global_seq, proto);
 
-	con->out_connect.features =
-	    cpu_to_le64(from_msgr(con->msgr)->supported_features);
-	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
-	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
-	con->out_connect.global_seq = cpu_to_le32(global_seq);
-	con->out_connect.protocol_version = cpu_to_le32(proto);
-	con->out_connect.flags = 0;
+	con->v1.out_connect.features =
+		cpu_to_le64(from_msgr(con->msgr)->supported_features);
+	con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+	con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
+	con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
+	con->v1.out_connect.protocol_version = cpu_to_le32(proto);
+	con->v1.out_connect.flags = 0;
 
 	ret = get_connect_authorizer(con);
 	if (ret)
@@ -421,35 +422,36 @@ static int write_partial_kvec(struct ceph_connection *con)
 {
 	int ret;
 
-	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
-	while (con->out_kvec_bytes > 0) {
-		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
-				       con->out_kvec_left, con->out_kvec_bytes,
-				       con->out_more);
+	dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
+	while (con->v1.out_kvec_bytes > 0) {
+		ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
+				       con->v1.out_kvec_left,
+				       con->v1.out_kvec_bytes,
+				       con->v1.out_more);
 		if (ret <= 0)
 			goto out;
-		con->out_kvec_bytes -= ret;
-		if (con->out_kvec_bytes == 0)
+		con->v1.out_kvec_bytes -= ret;
+		if (!con->v1.out_kvec_bytes)
 			break;            /* done */
 
 		/* account for full iov entries consumed */
-		while (ret >= con->out_kvec_cur->iov_len) {
-			BUG_ON(!con->out_kvec_left);
-			ret -= con->out_kvec_cur->iov_len;
-			con->out_kvec_cur++;
-			con->out_kvec_left--;
+		while (ret >= con->v1.out_kvec_cur->iov_len) {
+			BUG_ON(!con->v1.out_kvec_left);
+			ret -= con->v1.out_kvec_cur->iov_len;
+			con->v1.out_kvec_cur++;
+			con->v1.out_kvec_left--;
 		}
 		/* and for a partially-consumed entry */
 		if (ret) {
-			con->out_kvec_cur->iov_len -= ret;
-			con->out_kvec_cur->iov_base += ret;
+			con->v1.out_kvec_cur->iov_len -= ret;
+			con->v1.out_kvec_cur->iov_base += ret;
 		}
 	}
-	con->out_kvec_left = 0;
+	con->v1.out_kvec_left = 0;
 	ret = 1;
 out:
 	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
-	     con->out_kvec_bytes, con->out_kvec_left, ret);
+	     con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
 	return ret;  /* done! */
 }
 
@@ -530,17 +532,17 @@ static int write_partial_skip(struct ceph_connection *con)
 	int more = MSG_MORE | MSG_SENDPAGE_NOTLAST;
 	int ret;
 
-	dout("%s %p %d left\n", __func__, con, con->out_skip);
-	while (con->out_skip > 0) {
-		size_t size = min(con->out_skip, (int) PAGE_SIZE);
+	dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
+	while (con->v1.out_skip > 0) {
+		size_t size = min(con->v1.out_skip, (int)PAGE_SIZE);
 
-		if (size == con->out_skip)
+		if (size == con->v1.out_skip)
 			more = MSG_MORE;
 		ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size,
 					more);
 		if (ret <= 0)
 			goto out;
-		con->out_skip -= ret;
+		con->v1.out_skip -= ret;
 	}
 	ret = 1;
 out:
@@ -553,39 +555,39 @@ out:
 static void prepare_read_banner(struct ceph_connection *con)
 {
 	dout("prepare_read_banner %p\n", con);
-	con->in_base_pos = 0;
+	con->v1.in_base_pos = 0;
 }
 
 static void prepare_read_connect(struct ceph_connection *con)
 {
 	dout("prepare_read_connect %p\n", con);
-	con->in_base_pos = 0;
+	con->v1.in_base_pos = 0;
 }
 
 static void prepare_read_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_ack %p\n", con);
-	con->in_base_pos = 0;
+	con->v1.in_base_pos = 0;
 }
 
 static void prepare_read_seq(struct ceph_connection *con)
 {
 	dout("prepare_read_seq %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_SEQ;
+	con->v1.in_base_pos = 0;
+	con->v1.in_tag = CEPH_MSGR_TAG_SEQ;
 }
 
 static void prepare_read_tag(struct ceph_connection *con)
 {
 	dout("prepare_read_tag %p\n", con);
-	con->in_base_pos = 0;
-	con->in_tag = CEPH_MSGR_TAG_READY;
+	con->v1.in_base_pos = 0;
+	con->v1.in_tag = CEPH_MSGR_TAG_READY;
 }
 
 static void prepare_read_keepalive_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_keepalive_ack %p\n", con);
-	con->in_base_pos = 0;
+	con->v1.in_base_pos = 0;
 }
 
 /*
@@ -595,7 +597,7 @@ static int prepare_read_message(struct ceph_connection *con)
 {
 	dout("prepare_read_message %p\n", con);
 	BUG_ON(con->in_msg != NULL);
-	con->in_base_pos = 0;
+	con->v1.in_base_pos = 0;
 	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
 	return 0;
 }
@@ -603,13 +605,13 @@ static int prepare_read_message(struct ceph_connection *con)
 static int read_partial(struct ceph_connection *con,
 			int end, int size, void *object)
 {
-	while (con->in_base_pos < end) {
-		int left = end - con->in_base_pos;
+	while (con->v1.in_base_pos < end) {
+		int left = end - con->v1.in_base_pos;
 		int have = size - left;
 		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
 		if (ret <= 0)
 			return ret;
-		con->in_base_pos += ret;
+		con->v1.in_base_pos += ret;
 	}
 	return 1;
 }
@@ -623,28 +625,28 @@ static int read_partial_banner(struct ceph_connection *con)
 	int end;
 	int ret;
 
-	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+	dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
 
 	/* peer's banner */
 	size = strlen(CEPH_BANNER);
 	end = size;
-	ret = read_partial(con, end, size, con->in_banner);
+	ret = read_partial(con, end, size, con->v1.in_banner);
 	if (ret <= 0)
 		goto out;
 
-	size = sizeof (con->actual_peer_addr);
+	size = sizeof(con->v1.actual_peer_addr);
 	end += size;
-	ret = read_partial(con, end, size, &con->actual_peer_addr);
+	ret = read_partial(con, end, size, &con->v1.actual_peer_addr);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_banner_addr(&con->actual_peer_addr);
+	ceph_decode_banner_addr(&con->v1.actual_peer_addr);
 
-	size = sizeof (con->peer_addr_for_me);
+	size = sizeof(con->v1.peer_addr_for_me);
 	end += size;
-	ret = read_partial(con, end, size, &con->peer_addr_for_me);
+	ret = read_partial(con, end, size, &con->v1.peer_addr_for_me);
 	if (ret <= 0)
 		goto out;
-	ceph_decode_banner_addr(&con->peer_addr_for_me);
+	ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
 
 out:
 	return ret;
@@ -656,34 +658,34 @@ static int read_partial_connect(struct ceph_connection *con)
 	int end;
 	int ret;
 
-	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+	dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
 
-	size = sizeof (con->in_reply);
+	size = sizeof(con->v1.in_reply);
 	end = size;
-	ret = read_partial(con, end, size, &con->in_reply);
+	ret = read_partial(con, end, size, &con->v1.in_reply);
 	if (ret <= 0)
 		goto out;
 
-	if (con->auth) {
-		size = le32_to_cpu(con->in_reply.authorizer_len);
-		if (size > con->auth->authorizer_reply_buf_len) {
+	if (con->v1.auth) {
+		size = le32_to_cpu(con->v1.in_reply.authorizer_len);
+		if (size > con->v1.auth->authorizer_reply_buf_len) {
 			pr_err("authorizer reply too big: %d > %zu\n", size,
-			       con->auth->authorizer_reply_buf_len);
+			       con->v1.auth->authorizer_reply_buf_len);
 			ret = -EINVAL;
 			goto out;
 		}
 
 		end += size;
 		ret = read_partial(con, end, size,
-				   con->auth->authorizer_reply_buf);
+				   con->v1.auth->authorizer_reply_buf);
 		if (ret <= 0)
 			goto out;
 	}
 
 	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
-	     con, (int)con->in_reply.tag,
-	     le32_to_cpu(con->in_reply.connect_seq),
-	     le32_to_cpu(con->in_reply.global_seq));
+	     con, con->v1.in_reply.tag,
+	     le32_to_cpu(con->v1.in_reply.connect_seq),
+	     le32_to_cpu(con->v1.in_reply.global_seq));
 out:
 	return ret;
 }
@@ -693,7 +695,7 @@ out:
  */
 static int verify_hello(struct ceph_connection *con)
 {
-	if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+	if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
 		pr_err("connect to %s got bad banner\n",
 		       ceph_pr_addr(&con->peer_addr));
 		con->error_msg = "protocol error, bad banner";
@@ -716,15 +718,15 @@ static int process_banner(struct ceph_connection *con)
 	 * end may not yet know their ip address, so if it's 0.0.0.0, give
 	 * them the benefit of the doubt.
 	 */
-	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+	if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr,
 		   sizeof(con->peer_addr)) != 0 &&
-	    !(ceph_addr_is_blank(&con->actual_peer_addr) &&
-	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+	    !(ceph_addr_is_blank(&con->v1.actual_peer_addr) &&
+	      con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) {
 		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
 			ceph_pr_addr(&con->peer_addr),
 			le32_to_cpu(con->peer_addr.nonce),
-			ceph_pr_addr(&con->actual_peer_addr),
-			le32_to_cpu(con->actual_peer_addr.nonce));
+			ceph_pr_addr(&con->v1.actual_peer_addr),
+			le32_to_cpu(con->v1.actual_peer_addr.nonce));
 		con->error_msg = "wrong peer at address";
 		return -1;
 	}
@@ -734,8 +736,8 @@ static int process_banner(struct ceph_connection *con)
 	 */
 	if (ceph_addr_is_blank(my_addr)) {
 		memcpy(&my_addr->in_addr,
-		       &con->peer_addr_for_me.in_addr,
-		       sizeof(con->peer_addr_for_me.in_addr));
+		       &con->v1.peer_addr_for_me.in_addr,
+		       sizeof(con->v1.peer_addr_for_me.in_addr));
 		ceph_addr_set_port(my_addr, 0);
 		ceph_encode_my_addr(con->msgr);
 		dout("process_banner learned my addr is %s\n",
@@ -749,13 +751,13 @@ static int process_connect(struct ceph_connection *con)
 {
 	u64 sup_feat = from_msgr(con->msgr)->supported_features;
 	u64 req_feat = from_msgr(con->msgr)->required_features;
-	u64 server_feat = le64_to_cpu(con->in_reply.features);
+	u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
 	int ret;
 
-	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+	dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
 
-	if (con->auth) {
-		int len = le32_to_cpu(con->in_reply.authorizer_len);
+	if (con->v1.auth) {
+		int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
 
 		/*
 		 * Any connection that defines ->get_authorizer()
@@ -764,9 +766,10 @@ static int process_connect(struct ceph_connection *con)
 		 *
 		 * See get_connect_authorizer().
 		 */
-		if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+		if (con->v1.in_reply.tag ==
+				CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
 			ret = con->ops->add_authorizer_challenge(
-				    con, con->auth->authorizer_reply_buf, len);
+				con, con->v1.auth->authorizer_reply_buf, len);
 			if (ret < 0)
 				return ret;
 
@@ -785,7 +788,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 	}
 
-	switch (con->in_reply.tag) {
+	switch (con->v1.in_reply.tag) {
 	case CEPH_MSGR_TAG_FEATURES:
 		pr_err("%s%lld %s feature set mismatch,"
 		       " my %llx < server's %llx, missing %llx\n",
@@ -800,16 +803,16 @@ static int process_connect(struct ceph_connection *con)
 		       " my %d != server's %d\n",
 		       ENTITY_NAME(con->peer_name),
 		       ceph_pr_addr(&con->peer_addr),
-		       le32_to_cpu(con->out_connect.protocol_version),
-		       le32_to_cpu(con->in_reply.protocol_version));
+		       le32_to_cpu(con->v1.out_connect.protocol_version),
+		       le32_to_cpu(con->v1.in_reply.protocol_version));
 		con->error_msg = "protocol version mismatch";
 		return -1;
 
 	case CEPH_MSGR_TAG_BADAUTHORIZER:
-		con->auth_retry++;
+		con->v1.auth_retry++;
 		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
-		     con->auth_retry);
-		if (con->auth_retry == 2) {
+		     con->v1.auth_retry);
+		if (con->v1.auth_retry == 2) {
 			con->error_msg = "connect authorization failure";
 			return -1;
 		}
@@ -829,7 +832,7 @@ static int process_connect(struct ceph_connection *con)
 		 * dropped messages.
 		 */
 		dout("process_connect got RESET peer seq %u\n",
-		     le32_to_cpu(con->in_reply.connect_seq));
+		     le32_to_cpu(con->v1.in_reply.connect_seq));
 		pr_info("%s%lld %s session reset\n",
 			ENTITY_NAME(con->peer_name),
 			ceph_pr_addr(&con->peer_addr));
@@ -855,9 +858,9 @@ static int process_connect(struct ceph_connection *con)
 		 * again with a larger value.
 		 */
 		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
-		     le32_to_cpu(con->out_connect.connect_seq),
-		     le32_to_cpu(con->in_reply.connect_seq));
-		con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
+		     le32_to_cpu(con->v1.out_connect.connect_seq),
+		     le32_to_cpu(con->v1.in_reply.connect_seq));
+		con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
@@ -871,10 +874,10 @@ static int process_connect(struct ceph_connection *con)
 		 * again with a larger value.
 		 */
 		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.global_seq));
+		     con->v1.peer_global_seq,
+		     le32_to_cpu(con->v1.in_reply.global_seq));
 		ceph_get_global_seq(con->msgr,
-				    le32_to_cpu(con->in_reply.global_seq));
+				    le32_to_cpu(con->v1.in_reply.global_seq));
 		con_out_kvec_reset(con);
 		ret = prepare_write_connect(con);
 		if (ret < 0)
@@ -896,23 +899,24 @@ static int process_connect(struct ceph_connection *con)
 
 		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
 		con->state = CEPH_CON_S_OPEN;
-		con->auth_retry = 0;    /* we authenticated; clear flag */
-		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
-		con->connect_seq++;
+		con->v1.auth_retry = 0;    /* we authenticated; clear flag */
+		con->v1.peer_global_seq =
+			le32_to_cpu(con->v1.in_reply.global_seq);
+		con->v1.connect_seq++;
 		con->peer_features = server_feat;
 		dout("process_connect got READY gseq %d cseq %d (%d)\n",
-		     con->peer_global_seq,
-		     le32_to_cpu(con->in_reply.connect_seq),
-		     con->connect_seq);
-		WARN_ON(con->connect_seq !=
-			le32_to_cpu(con->in_reply.connect_seq));
+		     con->v1.peer_global_seq,
+		     le32_to_cpu(con->v1.in_reply.connect_seq),
+		     con->v1.connect_seq);
+		WARN_ON(con->v1.connect_seq !=
+			le32_to_cpu(con->v1.in_reply.connect_seq));
 
-		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+		if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
 			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
 
 		con->delay = 0;      /* reset backoff memory */
 
-		if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+		if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
 			prepare_write_seq(con);
 			prepare_read_seq(con);
 		} else {
@@ -942,10 +946,10 @@ static int process_connect(struct ceph_connection *con)
  */
 static int read_partial_ack(struct ceph_connection *con)
 {
-	int size = sizeof (con->in_temp_ack);
+	int size = sizeof(con->v1.in_temp_ack);
 	int end = size;
 
-	return read_partial(con, end, size, &con->in_temp_ack);
+	return read_partial(con, end, size, &con->v1.in_temp_ack);
 }
 
 /*
@@ -953,9 +957,9 @@ static int read_partial_ack(struct ceph_connection *con)
  */
 static void process_ack(struct ceph_connection *con)
 {
-	u64 ack = le64_to_cpu(con->in_temp_ack);
+	u64 ack = le64_to_cpu(con->v1.in_temp_ack);
 
-	if (con->in_tag == CEPH_MSGR_TAG_ACK)
+	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK)
 		ceph_con_discard_sent(con, ack);
 	else
 		ceph_con_discard_requeued(con, ack);
@@ -1045,39 +1049,39 @@ static int read_partial_message(struct ceph_connection *con)
 	dout("read_partial_message con %p msg %p\n", con, m);
 
 	/* header */
-	size = sizeof (con->in_hdr);
+	size = sizeof(con->v1.in_hdr);
 	end = size;
-	ret = read_partial(con, end, size, &con->in_hdr);
+	ret = read_partial(con, end, size, &con->v1.in_hdr);
 	if (ret <= 0)
 		return ret;
 
-	crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
-	if (cpu_to_le32(crc) != con->in_hdr.crc) {
+	crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
+	if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
 		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
-		       crc, con->in_hdr.crc);
+		       crc, con->v1.in_hdr.crc);
 		return -EBADMSG;
 	}
 
-	front_len = le32_to_cpu(con->in_hdr.front_len);
+	front_len = le32_to_cpu(con->v1.in_hdr.front_len);
 	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
 		return -EIO;
-	middle_len = le32_to_cpu(con->in_hdr.middle_len);
+	middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
 	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
 		return -EIO;
-	data_len = le32_to_cpu(con->in_hdr.data_len);
+	data_len = le32_to_cpu(con->v1.in_hdr.data_len);
 	if (data_len > CEPH_MSG_MAX_DATA_LEN)
 		return -EIO;
 
 	/* verify seq# */
-	seq = le64_to_cpu(con->in_hdr.seq);
+	seq = le64_to_cpu(con->v1.in_hdr.seq);
 	if ((s64)seq - (s64)con->in_seq < 1) {
 		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
 			ENTITY_NAME(con->peer_name),
 			ceph_pr_addr(&con->peer_addr),
 			seq, con->in_seq + 1);
-		con->in_base_pos = -front_len - middle_len - data_len -
-			sizeof_footer(con);
-		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->v1.in_base_pos = -front_len - middle_len - data_len -
+				      sizeof_footer(con);
+		con->v1.in_tag = CEPH_MSGR_TAG_READY;
 		return 1;
 	} else if ((s64)seq - (s64)con->in_seq > 1) {
 		pr_err("read_partial_message bad seq %lld expected %lld\n",
@@ -1090,9 +1094,9 @@ static int read_partial_message(struct ceph_connection *con)
 	if (!con->in_msg) {
 		int skip = 0;
 
-		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+		dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
 		     front_len, data_len);
-		ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip);
+		ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
 		if (ret < 0)
 			return ret;
 
@@ -1100,9 +1104,9 @@ static int read_partial_message(struct ceph_connection *con)
 		if (skip) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
-			con->in_base_pos = -front_len - middle_len - data_len -
-				sizeof_footer(con);
-			con->in_tag = CEPH_MSGR_TAG_READY;
+			con->v1.in_base_pos = -front_len - middle_len -
+					      data_len - sizeof_footer(con);
+			con->v1.in_tag = CEPH_MSGR_TAG_READY;
 			con->in_seq++;
 			return 1;
 		}
@@ -1214,8 +1218,8 @@ more:
 
 	BUG_ON(!con->sock);
 
-	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
-	     con->in_base_pos);
+	dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
+	     con->v1.in_base_pos);
 
 	if (con->state == CEPH_CON_S_V1_BANNER) {
 		ret = read_partial_banner(con);
@@ -1253,27 +1257,27 @@ more:
 
 	WARN_ON(con->state != CEPH_CON_S_OPEN);
 
-	if (con->in_base_pos < 0) {
+	if (con->v1.in_base_pos < 0) {
 		/*
 		 * skipping + discarding content.
 		 */
-		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
+		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
 		if (ret <= 0)
 			goto out;
-		dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
-		con->in_base_pos += ret;
-		if (con->in_base_pos)
+		dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
+		con->v1.in_base_pos += ret;
+		if (con->v1.in_base_pos)
 			goto more;
 	}
-	if (con->in_tag == CEPH_MSGR_TAG_READY) {
+	if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
 		/*
 		 * what's next?
 		 */
-		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+		ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
 		if (ret <= 0)
 			goto out;
-		dout("try_read got tag %d\n", (int)con->in_tag);
-		switch (con->in_tag) {
+		dout("try_read got tag %d\n", con->v1.in_tag);
+		switch (con->v1.in_tag) {
 		case CEPH_MSGR_TAG_MSG:
 			prepare_read_message(con);
 			break;
@@ -1291,7 +1295,7 @@ more:
 			goto bad_tag;
 		}
 	}
-	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+	if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
 		ret = read_partial_message(con);
 		if (ret <= 0) {
 			switch (ret) {
@@ -1307,15 +1311,15 @@ more:
 			}
 			goto out;
 		}
-		if (con->in_tag == CEPH_MSGR_TAG_READY)
+		if (con->v1.in_tag == CEPH_MSGR_TAG_READY)
 			goto more;
 		ceph_con_process_message(con);
 		if (con->state == CEPH_CON_S_OPEN)
 			prepare_read_tag(con);
 		goto more;
 	}
-	if (con->in_tag == CEPH_MSGR_TAG_ACK ||
-	    con->in_tag == CEPH_MSGR_TAG_SEQ) {
+	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
+	    con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
 		/*
 		 * the final handshake seq exchange is semantically
 		 * equivalent to an ACK
@@ -1326,7 +1330,7 @@ more:
 		process_ack(con);
 		goto more;
 	}
-	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+	if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
 		ret = read_keepalive_ack(con);
 		if (ret <= 0)
 			goto out;
@@ -1338,7 +1342,7 @@ out:
 	return ret;
 
 bad_tag:
-	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+	pr_err("try_read bad tag %d\n", con->v1.in_tag);
 	con->error_msg = "protocol error, garbage tag";
 	ret = -1;
 	goto out;
@@ -1369,7 +1373,7 @@ int ceph_con_v1_try_write(struct ceph_connection *con)
 		prepare_read_banner(con);
 
 		BUG_ON(con->in_msg);
-		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->v1.in_tag = CEPH_MSGR_TAG_READY;
 		dout("try_write initiating connect on %p new state %d\n",
 		     con, con->state);
 		ret = ceph_tcp_connect(con);
@@ -1380,16 +1384,16 @@ int ceph_con_v1_try_write(struct ceph_connection *con)
 	}
 
 more:
-	dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+	dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
 	BUG_ON(!con->sock);
 
 	/* kvec data queued? */
-	if (con->out_kvec_left) {
+	if (con->v1.out_kvec_left) {
 		ret = write_partial_kvec(con);
 		if (ret <= 0)
 			goto out;
 	}
-	if (con->out_skip) {
+	if (con->v1.out_skip) {
 		ret = write_partial_skip(con);
 		if (ret <= 0)
 			goto out;
@@ -1397,7 +1401,7 @@ more:
 
 	/* msg pages? */
 	if (con->out_msg) {
-		if (con->out_msg_done) {
+		if (con->v1.out_msg_done) {
 			ceph_msg_put(con->out_msg);
 			con->out_msg = NULL;   /* we're done with this one */
 			goto do_next;
@@ -1446,57 +1450,57 @@ void ceph_con_v1_revoke(struct ceph_connection *con)
 {
 	struct ceph_msg *msg = con->out_msg;
 
-	WARN_ON(con->out_skip);
+	WARN_ON(con->v1.out_skip);
 	/* footer */
-	if (con->out_msg_done) {
-		con->out_skip += con_out_kvec_skip(con);
+	if (con->v1.out_msg_done) {
+		con->v1.out_skip += con_out_kvec_skip(con);
 	} else {
 		WARN_ON(!msg->data_length);
-		con->out_skip += sizeof_footer(con);
+		con->v1.out_skip += sizeof_footer(con);
 	}
 	/* data, middle, front */
 	if (msg->data_length)
-		con->out_skip += msg->cursor.total_resid;
+		con->v1.out_skip += msg->cursor.total_resid;
 	if (msg->middle)
-		con->out_skip += con_out_kvec_skip(con);
-	con->out_skip += con_out_kvec_skip(con);
+		con->v1.out_skip += con_out_kvec_skip(con);
+	con->v1.out_skip += con_out_kvec_skip(con);
 
 	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
-	     con->out_kvec_bytes, con->out_skip);
+	     con->v1.out_kvec_bytes, con->v1.out_skip);
 }
 
 void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
 {
-	unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
-	unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
-	unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+	unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+	unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+	unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
 
 	/* skip rest of message */
-	con->in_base_pos = con->in_base_pos -
+	con->v1.in_base_pos = con->v1.in_base_pos -
 			sizeof(struct ceph_msg_header) -
 			front_len -
 			middle_len -
 			data_len -
 			sizeof(struct ceph_msg_footer);
 
-	con->in_tag = CEPH_MSGR_TAG_READY;
+	con->v1.in_tag = CEPH_MSGR_TAG_READY;
 	con->in_seq++;
 
-	dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos);
+	dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
 }
 
 bool ceph_con_v1_opened(struct ceph_connection *con)
 {
-	return con->connect_seq;
+	return con->v1.connect_seq;
 }
 
 void ceph_con_v1_reset_session(struct ceph_connection *con)
 {
-	con->connect_seq = 0;
-	con->peer_global_seq = 0;
+	con->v1.connect_seq = 0;
+	con->v1.peer_global_seq = 0;
 }
 
 void ceph_con_v1_reset_protocol(struct ceph_connection *con)
 {
-	con->out_skip = 0;
+	con->v1.out_skip = 0;
 }

From f79e25b087b80eef47eef4c8b0763eb1a583a357 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 27 Nov 2020 17:18:27 +0100
Subject: [PATCH 214/230] libceph: more insight into ticket expiry and
 invalidation

Make it clear that "need" is a union of "missing" and "have, but up
for renewal" and dout when the ticket goes missing due to expiry or
invalidation by client.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/auth_x.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index b52732337ca6..512984d9bbfc 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
 static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 {
 	struct ceph_x_info *xi = ac->private;
-	int need;
+	int missing;
+	int need;  /* missing + need renewal */
 
 	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return (ac->want_keys & xi->have_keys) == ac->want_keys;
+	missing = ac->want_keys & ~xi->have_keys;
+	WARN_ON((need & missing) != missing);
+	dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__,
+	     ac->want_keys, xi->have_keys, missing, !missing);
+	return !missing;
 }
 
 static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
@@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
 	int need;
 
 	ceph_x_validate_tickets(ac, &need);
-	dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
-	     ac->want_keys, need, xi->have_keys);
-	return need != 0;
+	dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__,
+	     ac->want_keys, xi->have_keys, need, !!need);
+	return !!need;
 }
 
 static int ceph_x_encrypt_offset(void)
@@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 		}
 	}
 	au->service = th->service;
+	WARN_ON(!th->secret_id);
 	au->secret_id = th->secret_id;
 
 	msg_a = au->buf->vec.iov_base;
@@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th)
 
 static bool have_key(struct ceph_x_ticket_handler *th)
 {
-	if (th->have_key) {
-		if (ktime_get_real_seconds() >= th->expires)
-			th->have_key = false;
+	if (th->have_key && ktime_get_real_seconds() >= th->expires) {
+		dout("ticket %d (%s) secret_id %llu expired\n", th->service,
+		     ceph_entity_type_name(th->service), th->secret_id);
+		th->have_key = false;
 	}
 
 	return th->have_key;
@@ -494,9 +499,8 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 		return PTR_ERR(th);
 
 	ceph_x_validate_tickets(ac, &need);
-
-	dout("build_request want %x have %x need %x\n",
-	     ac->want_keys, xi->have_keys, need);
+	dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys,
+	     xi->have_keys, need);
 
 	if (need & CEPH_ENTITY_TYPE_AUTH) {
 		struct ceph_x_authenticate *auth = (void *)(head + 1);
@@ -785,8 +789,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
 	struct ceph_x_ticket_handler *th;
 
 	th = get_ticket_handler(ac, peer_type);
-	if (!IS_ERR(th))
+	if (IS_ERR(th))
+		return;
+
+	if (th->have_key) {
+		dout("ticket %d (%s) secret_id %llu invalidated\n",
+		     th->service, ceph_entity_type_name(th->service),
+		     th->secret_id);
 		th->have_key = false;
+	}
 }
 
 static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,

From 6610fff2782a4a793069a5dd395883a91c76e7d4 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 12 Oct 2020 15:05:09 +0200
Subject: [PATCH 215/230] libceph: safer en/decoding of cephx requests and
 replies

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/auth_x.c | 47 ++++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 512984d9bbfc..425508d4dafd 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -200,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
 	dout(" decrypted %d bytes\n", ret);
 	dend = dp + ret;
 
-	tkt_struct_v = ceph_decode_8(&dp);
+	ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
 	if (tkt_struct_v != 1)
 		goto bad;
 
@@ -208,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
 	if (ret)
 		goto out;
 
+	ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad);
 	ceph_decode_timespec64(&validity, dp);
 	dp += sizeof(struct ceph_timespec);
 	new_expires = ktime_get_real_seconds() + validity.tv_sec;
@@ -491,6 +492,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 	struct ceph_x_info *xi = ac->private;
 	int need;
 	struct ceph_x_request_header *head = buf;
+	void *p;
 	int ret;
 	struct ceph_x_ticket_handler *th =
 		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
@@ -504,12 +506,12 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 
 	if (need & CEPH_ENTITY_TYPE_AUTH) {
 		struct ceph_x_authenticate *auth = (void *)(head + 1);
-		void *p = auth + 1;
 		void *enc_buf = xi->auth_authorizer.enc_buf;
 		struct ceph_x_challenge_blob *blob = enc_buf +
 							ceph_x_encrypt_offset();
 		u64 *u;
 
+		p = auth + 1;
 		if (p > end)
 			return -ERANGE;
 
@@ -542,35 +544,35 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 	}
 
 	if (need) {
-		void *p = head + 1;
-		struct ceph_x_service_ticket_request *req;
-
-		if (p > end)
-			return -ERANGE;
-		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
+		dout(" get_principal_session_key\n");
 		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
 		if (ret)
 			return ret;
-		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
-				 xi->auth_authorizer.buf->vec.iov_len);
 
-		req = p;
-		req->keys = cpu_to_le32(need);
-		p += sizeof(*req);
+		p = buf;
+		ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY,
+				    e_range);
+		ceph_encode_copy_safe(&p, end,
+			xi->auth_authorizer.buf->vec.iov_base,
+			xi->auth_authorizer.buf->vec.iov_len, e_range);
+		ceph_encode_8_safe(&p, end, 1, e_range);
+		ceph_encode_32_safe(&p, end, need, e_range);
 		return p - buf;
 	}
 
 	return 0;
+
+e_range:
+	return -ERANGE;
 }
 
 static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 			       void *buf, void *end)
 {
 	struct ceph_x_info *xi = ac->private;
-	struct ceph_x_reply_header *head = buf;
 	struct ceph_x_ticket_handler *th;
 	int len = end - buf;
+	void *p;
 	int op;
 	int ret;
 
@@ -591,22 +593,22 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 		return -EAGAIN;
 	}
 
-	op = le16_to_cpu(head->op);
-	result = le32_to_cpu(head->result);
+	p = buf;
+	ceph_decode_16_safe(&p, end, op, e_inval);
+	ceph_decode_32_safe(&p, end, result, e_inval);
 	dout("handle_reply op %d result %d\n", op, result);
 	switch (op) {
 	case CEPHX_GET_AUTH_SESSION_KEY:
 		/* verify auth key */
-		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
-					       buf + sizeof(*head), end);
+		ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
 		break;
 
 	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
 		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
 		if (IS_ERR(th))
 			return PTR_ERR(th);
-		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
-					       buf + sizeof(*head), end);
+
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key, p, end);
 		break;
 
 	default:
@@ -617,6 +619,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 	if (ac->want_keys == xi->have_keys)
 		return 0;
 	return -EAGAIN;
+
+e_inval:
+	return -EINVAL;
 }
 
 static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)

From 285ea34fc876aa0a2c5e65d310c4a41269e2e5f2 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 26 Oct 2020 16:47:20 +0100
Subject: [PATCH 216/230] libceph, ceph: incorporate nautilus cephx changes

- request service tickets together with auth ticket.  Currently we get
  auth ticket via CEPHX_GET_AUTH_SESSION_KEY op and then request service
  tickets via CEPHX_GET_PRINCIPAL_SESSION_KEY op in a separate message.
  Since nautilus, desired service tickets are shared togther with auth
  ticket in CEPHX_GET_AUTH_SESSION_KEY reply.

- propagate session key and connection secret, if any.  In preparation
  for msgr2, update handle_reply() and verify_authorizer_reply() auth
  ops to propagate session key and connection secret.  Since nautilus,
  if secure mode is negotiated, connection secret is shared either in
  CEPHX_GET_AUTH_SESSION_KEY reply (for mons) or in a final authorizer
  reply (for osds and mdses).

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c       |   5 +-
 include/linux/ceph/auth.h  |  16 ++-
 net/ceph/auth.c            |  12 +-
 net/ceph/auth_none.c       |   4 +-
 net/ceph/auth_x.c          | 217 +++++++++++++++++++++++++++++--------
 net/ceph/auth_x_protocol.h |   3 +-
 net/ceph/crypto.h          |   3 +
 net/ceph/osd_client.c      |   5 +-
 8 files changed, 211 insertions(+), 54 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a256d95ec99a..278fe67e2617 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5178,8 +5178,11 @@ static int verify_authorizer_reply(struct ceph_connection *con)
 	struct ceph_mds_session *s = con->private;
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
 
-	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
+	return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+		auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+		NULL, NULL, NULL, NULL);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 6728c2ee0205..d9e7d0bcdaf1 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -53,7 +53,9 @@ struct ceph_auth_client_ops {
 	 */
 	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
 	int (*handle_reply)(struct ceph_auth_client *ac, int result,
-			    void *buf, void *end);
+			    void *buf, void *end, u8 *session_key,
+			    int *session_key_len, u8 *con_secret,
+			    int *con_secret_len);
 
 	/*
 	 * Create authorizer for connecting to a service, and verify
@@ -69,7 +71,10 @@ struct ceph_auth_client_ops {
 					void *challenge_buf,
 					int challenge_buf_len);
 	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
-				       struct ceph_authorizer *a);
+				       struct ceph_authorizer *a,
+				       void *reply, int reply_len,
+				       u8 *session_key, int *session_key_len,
+				       u8 *con_secret, int *con_secret_len);
 	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
 				      int peer_type);
 
@@ -126,8 +131,11 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
 				       struct ceph_authorizer *a,
 				       void *challenge_buf,
 				       int challenge_buf_len);
-extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
-					     struct ceph_authorizer *a);
+int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
+				      struct ceph_authorizer *a,
+				      void *reply, int reply_len,
+				      u8 *session_key, int *session_key_len,
+				      u8 *con_secret, int *con_secret_len);
 extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac,
 					    int peer_type);
 
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index fbeee068ea14..40d3d95344d9 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -240,7 +240,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 		ac->negotiating = false;
 	}
 
-	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+	ret = ac->ops->handle_reply(ac, result, payload, payload_end,
+				    NULL, NULL, NULL, NULL);
 	if (ret == -EAGAIN) {
 		ret = ceph_build_auth_request(ac, reply_buf, reply_len);
 	} else if (ret) {
@@ -332,13 +333,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
 EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
 
 int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
-				      struct ceph_authorizer *a)
+				      struct ceph_authorizer *a,
+				      void *reply, int reply_len,
+				      u8 *session_key, int *session_key_len,
+				      u8 *con_secret, int *con_secret_len)
 {
 	int ret = 0;
 
 	mutex_lock(&ac->mutex);
 	if (ac->ops && ac->ops->verify_authorizer_reply)
-		ret = ac->ops->verify_authorizer_reply(ac, a);
+		ret = ac->ops->verify_authorizer_reply(ac, a,
+			reply, reply_len, session_key, session_key_len,
+			con_secret, con_secret_len);
 	mutex_unlock(&ac->mutex);
 	return ret;
 }
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index edb7042479ed..af8ae507e861 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
  * authenticate state, so nothing happens here.
  */
 static int handle_reply(struct ceph_auth_client *ac, int result,
-			void *buf, void *end)
+			void *buf, void *end, u8 *session_key,
+			int *session_key_len, u8 *con_secret,
+			int *con_secret_len)
 {
 	struct ceph_auth_none_info *xi = ac->private;
 
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index 425508d4dafd..a265792642dc 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -269,22 +269,21 @@ out:
 
 static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 				    struct ceph_crypto_key *secret,
-				    void *buf, void *end)
+				    void **p, void *end)
 {
-	void *p = buf;
 	u8 reply_struct_v;
 	u32 num;
 	int ret;
 
-	ceph_decode_8_safe(&p, end, reply_struct_v, bad);
+	ceph_decode_8_safe(p, end, reply_struct_v, bad);
 	if (reply_struct_v != 1)
 		return -EINVAL;
 
-	ceph_decode_32_safe(&p, end, num, bad);
+	ceph_decode_32_safe(p, end, num, bad);
 	dout("%d tickets\n", num);
 
 	while (num--) {
-		ret = process_one_ticket(ac, secret, &p, end);
+		ret = process_one_ticket(ac, secret, p, end);
 		if (ret)
 			return ret;
 	}
@@ -527,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 		if (ret < 0)
 			return ret;
 
-		auth->struct_v = 1;
+		auth->struct_v = 2;  /* nautilus+ */
 		auth->key = 0;
 		for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
 			auth->key ^= *(__le64 *)u;
@@ -540,6 +539,10 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 		if (ret < 0)
 			return ret;
 
+		/* nautilus+: request service tickets at the same time */
+		need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH;
+		WARN_ON(!need);
+		ceph_encode_32_safe(&p, end, need, e_range);
 		return p - buf;
 	}
 
@@ -566,8 +569,82 @@ e_range:
 	return -ERANGE;
 }
 
+static int handle_auth_session_key(struct ceph_auth_client *ac,
+				   void **p, void *end,
+				   u8 *session_key, int *session_key_len,
+				   u8 *con_secret, int *con_secret_len)
+{
+	struct ceph_x_info *xi = ac->private;
+	struct ceph_x_ticket_handler *th;
+	void *dp, *dend;
+	int len;
+	int ret;
+
+	/* AUTH ticket */
+	ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+	if (ret)
+		return ret;
+
+	if (*p == end) {
+		/* pre-nautilus (or didn't request service tickets!) */
+		WARN_ON(session_key || con_secret);
+		return 0;
+	}
+
+	th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
+	if (session_key) {
+		memcpy(session_key, th->session_key.key, th->session_key.len);
+		*session_key_len = th->session_key.len;
+	}
+
+	/* connection secret */
+	ceph_decode_32_safe(p, end, len, e_inval);
+	dout("%s connection secret blob len %d\n", __func__, len);
+	if (len > 0) {
+		dp = *p + ceph_x_encrypt_offset();
+		ret = ceph_x_decrypt(&th->session_key, p, *p + len);
+		if (ret < 0)
+			return ret;
+
+		dout("%s decrypted %d bytes\n", __func__, ret);
+		dend = dp + ret;
+
+		ceph_decode_32_safe(&dp, dend, len, e_inval);
+		if (len > CEPH_MAX_CON_SECRET_LEN) {
+			pr_err("connection secret too big %d\n", len);
+			return -EINVAL;
+		}
+
+		dout("%s connection secret len %d\n", __func__, len);
+		if (con_secret) {
+			memcpy(con_secret, dp, len);
+			*con_secret_len = len;
+		}
+	}
+
+	/* service tickets */
+	ceph_decode_32_safe(p, end, len, e_inval);
+	dout("%s service tickets blob len %d\n", __func__, len);
+	if (len > 0) {
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+					       p, *p + len);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
 static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
-			       void *buf, void *end)
+			       void *buf, void *end,
+			       u8 *session_key, int *session_key_len,
+			       u8 *con_secret, int *con_secret_len)
 {
 	struct ceph_x_info *xi = ac->private;
 	struct ceph_x_ticket_handler *th;
@@ -599,8 +676,10 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 	dout("handle_reply op %d result %d\n", op, result);
 	switch (op) {
 	case CEPHX_GET_AUTH_SESSION_KEY:
-		/* verify auth key */
-		ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+		/* AUTH ticket + [connection secret] + service tickets */
+		ret = handle_auth_session_key(ac, &p, end, session_key,
+					      session_key_len, con_secret,
+					      con_secret_len);
 		break;
 
 	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
@@ -608,7 +687,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 		if (IS_ERR(th))
 			return PTR_ERR(th);
 
-		ret = ceph_x_proc_ticket_reply(ac, &th->session_key, p, end);
+		/* service tickets */
+		ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end);
 		break;
 
 	default:
@@ -687,40 +767,44 @@ static int ceph_x_update_authorizer(
 	return 0;
 }
 
-static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
-				       void *challenge_buf,
-				       int challenge_buf_len,
-				       u64 *server_challenge)
+/*
+ * CephXAuthorizeChallenge
+ */
+static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret,
+					void *challenge, int challenge_len,
+					u64 *server_challenge)
 {
-	struct ceph_x_authorize_challenge *ch =
-	    challenge_buf + sizeof(struct ceph_x_encrypt_header);
+	void *dp, *dend;
 	int ret;
 
 	/* no leading len */
-	ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
-			       challenge_buf_len);
+	ret = __ceph_x_decrypt(secret, challenge, challenge_len);
 	if (ret < 0)
 		return ret;
-	if (ret < sizeof(*ch)) {
-		pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
-		return -EINVAL;
-	}
 
-	*server_challenge = le64_to_cpu(ch->server_challenge);
+	dout("%s decrypted %d bytes\n", __func__, ret);
+	dp = challenge + sizeof(struct ceph_x_encrypt_header);
+	dend = dp + ret;
+
+	ceph_decode_skip_8(&dp, dend, e_inval);  /* struct_v */
+	ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval);
+	dout("%s server_challenge %llu\n", __func__, *server_challenge);
 	return 0;
+
+e_inval:
+	return -EINVAL;
 }
 
 static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
 					   struct ceph_authorizer *a,
-					   void *challenge_buf,
-					   int challenge_buf_len)
+					   void *challenge, int challenge_len)
 {
 	struct ceph_x_authorizer *au = (void *)a;
 	u64 server_challenge;
 	int ret;
 
-	ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
-					  &server_challenge);
+	ret = decrypt_authorizer_challenge(&au->session_key, challenge,
+					   challenge_len, &server_challenge);
 	if (ret) {
 		pr_err("failed to decrypt authorize challenge: %d", ret);
 		return ret;
@@ -735,29 +819,76 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
 	return 0;
 }
 
-static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
-					  struct ceph_authorizer *a)
+/*
+ * CephXAuthorizeReply
+ */
+static int decrypt_authorizer_reply(struct ceph_crypto_key *secret,
+				    void **p, void *end, u64 *nonce_plus_one,
+				    u8 *con_secret, int *con_secret_len)
 {
-	struct ceph_x_authorizer *au = (void *)a;
-	void *p = au->enc_buf;
-	struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+	void *dp, *dend;
+	u8 struct_v;
+	int len;
 	int ret;
 
-	ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
+	dp = *p + ceph_x_encrypt_offset();
+	ret = ceph_x_decrypt(secret, p, end);
 	if (ret < 0)
 		return ret;
-	if (ret < sizeof(*reply)) {
-		pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
-		return -EINVAL;
+
+	dout("%s decrypted %d bytes\n", __func__, ret);
+	dend = dp + ret;
+
+	ceph_decode_8_safe(&dp, dend, struct_v, e_inval);
+	ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval);
+	dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one);
+	if (struct_v >= 2) {
+		ceph_decode_32_safe(&dp, dend, len, e_inval);
+		if (len > CEPH_MAX_CON_SECRET_LEN) {
+			pr_err("connection secret too big %d\n", len);
+			return -EINVAL;
+		}
+
+		dout("%s connection secret len %d\n", __func__, len);
+		if (con_secret) {
+			memcpy(con_secret, dp, len);
+			*con_secret_len = len;
+		}
 	}
 
-	if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
-		ret = -EPERM;
-	else
-		ret = 0;
-	dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
-	     au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
-	return ret;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+					  struct ceph_authorizer *a,
+					  void *reply, int reply_len,
+					  u8 *session_key, int *session_key_len,
+					  u8 *con_secret, int *con_secret_len)
+{
+	struct ceph_x_authorizer *au = (void *)a;
+	u64 nonce_plus_one;
+	int ret;
+
+	if (session_key) {
+		memcpy(session_key, au->session_key.key, au->session_key.len);
+		*session_key_len = au->session_key.len;
+	}
+
+	ret = decrypt_authorizer_reply(&au->session_key, &reply,
+				       reply + reply_len, &nonce_plus_one,
+				       con_secret, con_secret_len);
+	if (ret)
+		return ret;
+
+	if (nonce_plus_one != au->nonce + 1) {
+		pr_err("failed to authenticate server\n");
+		return -EPERM;
+	}
+
+	return 0;
 }
 
 static void ceph_x_reset(struct ceph_auth_client *ac)
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 24b0b74564d0..792fcb974dc3 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -38,7 +38,8 @@ struct ceph_x_authenticate {
 	__u8 struct_v;
 	__le64 client_challenge;
 	__le64 key;
-	/* ticket blob */
+	/* old_ticket blob */
+	/* nautilus+: other_keys */
 } __attribute__ ((packed));
 
 struct ceph_x_service_ticket_request {
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 96ef4d860bc9..13bd526349fa 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -5,6 +5,9 @@
 #include <linux/ceph/types.h>
 #include <linux/ceph/buffer.h>
 
+#define CEPH_KEY_LEN			16
+#define CEPH_MAX_CON_SECRET_LEN		64
+
 /*
  * cryptographic secret
  */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 7901ab6c79fd..8966eae543d3 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5623,8 +5623,11 @@ static int verify_authorizer_reply(struct ceph_connection *con)
 	struct ceph_osd *o = con->private;
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &o->o_auth;
 
-	return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
+	return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+		auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+		NULL, NULL, NULL, NULL);
 }
 
 static int invalidate_authorizer(struct ceph_connection *con)

From 59711f9ec219bf5245a8e95989803fb503adc52d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 26 Oct 2020 17:01:53 +0100
Subject: [PATCH 217/230] libceph: amend cephx init_protocol() and
 build_request()

In msgr2, initial authentication happens with an exchange of msgr2
control frames -- MAuth message and struct ceph_mon_request_header
aren't used.  Make that optional.

Stop reporting cephx protocol as "x".  Use "cephx" instead.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/ceph_fs.h |  1 +
 net/ceph/auth.c              | 63 ++++++++++++++++++++----------------
 net/ceph/ceph_strings.c      | 14 ++++++++
 3 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index d44d98033d58..6d986e52000b 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -95,6 +95,7 @@ struct ceph_dir_layout {
 
 #define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
 
+const char *ceph_auth_proto_name(int proto);
 
 /*********************************************
  * message layer
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 40d3d95344d9..deaf267f8942 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -21,15 +21,18 @@ static u32 supported_protocols[] = {
 	CEPH_AUTH_CEPHX
 };
 
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+static int init_protocol(struct ceph_auth_client *ac, int proto)
 {
-	switch (protocol) {
+	dout("%s proto %d\n", __func__, proto);
+
+	switch (proto) {
 	case CEPH_AUTH_NONE:
 		return ceph_auth_none_init(ac);
 	case CEPH_AUTH_CEPHX:
 		return ceph_x_init(ac);
 	default:
-		return -ENOENT;
+		pr_err("bad auth protocol %d\n", proto);
+		return -EINVAL;
 	}
 }
 
@@ -145,31 +148,35 @@ bad:
 	goto out;
 }
 
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
-				   void *msg_buf, size_t msg_len)
+static int build_request(struct ceph_auth_client *ac, bool add_header,
+			 void *buf, int buf_len)
 {
-	struct ceph_mon_request_header *monhdr = msg_buf;
-	void *p = monhdr + 1;
-	void *end = msg_buf + msg_len;
+	void *end = buf + buf_len;
+	void *p;
 	int ret;
 
-	monhdr->have_version = 0;
-	monhdr->session_mon = cpu_to_le16(-1);
-	monhdr->session_mon_tid = 0;
-
-	ceph_encode_32(&p, ac->protocol);
+	p = buf;
+	if (add_header) {
+		/* struct ceph_mon_request_header + protocol */
+		ceph_encode_64_safe(&p, end, 0, e_range);
+		ceph_encode_16_safe(&p, end, -1, e_range);
+		ceph_encode_64_safe(&p, end, 0, e_range);
+		ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+	}
 
+	ceph_encode_need(&p, end, sizeof(u32), e_range);
 	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
 	if (ret < 0) {
-		pr_err("error %d building auth method %s request\n", ret,
-		       ac->ops->name);
-		goto out;
+		pr_err("auth protocol '%s' building request failed: %d\n",
+		       ceph_auth_proto_name(ac->protocol), ret);
+		return ret;
 	}
 	dout(" built request %d bytes\n", ret);
 	ceph_encode_32(&p, ret);
-	ret = p + ret - msg_buf;
-out:
-	return ret;
+	return p + ret - buf;
+
+e_range:
+	return -ERANGE;
 }
 
 /*
@@ -229,10 +236,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 			ac->ops = NULL;
 		}
 		if (ac->protocol != protocol) {
-			ret = ceph_auth_init_protocol(ac, protocol);
+			ret = init_protocol(ac, protocol);
 			if (ret) {
-				pr_err("error %d on auth protocol %d init\n",
-				       ret, protocol);
+				pr_err("auth protocol '%s' init failed: %d\n",
+				       ceph_auth_proto_name(protocol), ret);
 				goto out;
 			}
 		}
@@ -242,11 +249,11 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
 
 	ret = ac->ops->handle_reply(ac, result, payload, payload_end,
 				    NULL, NULL, NULL, NULL);
-	if (ret == -EAGAIN) {
-		ret = ceph_build_auth_request(ac, reply_buf, reply_len);
-	} else if (ret) {
-		pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
-	}
+	if (ret == -EAGAIN)
+		ret = build_request(ac, true, reply_buf, reply_len);
+	else if (ret)
+		pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+		       ceph_auth_proto_name(ac->protocol), result);
 
 out:
 	mutex_unlock(&ac->mutex);
@@ -265,7 +272,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
 
 	mutex_lock(&ac->mutex);
 	if (ac->ops->should_authenticate(ac))
-		ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+		ret = build_request(ac, true, msg_buf, msg_len);
 	mutex_unlock(&ac->mutex);
 	return ret;
 }
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 10e01494993c..69cd391e02a6 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -18,6 +18,20 @@ const char *ceph_entity_type_name(int type)
 }
 EXPORT_SYMBOL(ceph_entity_type_name);
 
+const char *ceph_auth_proto_name(int proto)
+{
+	switch (proto) {
+	case CEPH_AUTH_UNKNOWN:
+		return "unknown";
+	case CEPH_AUTH_NONE:
+		return "none";
+	case CEPH_AUTH_CEPHX:
+		return "cephx";
+	default:
+		return "???";
+	}
+}
+
 const char *ceph_osd_op_name(int op)
 {
 	switch (op) {

From c1c0ce78f479cf4d7dfe72c4c1cabbf0bc0730c9 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Mon, 26 Oct 2020 17:05:44 +0100
Subject: [PATCH 218/230] libceph: drop ac->ops->name field

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/auth.h | 2 --
 net/ceph/auth_none.c      | 1 -
 net/ceph/auth_x.c         | 1 -
 3 files changed, 4 deletions(-)

diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index d9e7d0bcdaf1..5f64f66309fa 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -32,8 +32,6 @@ struct ceph_auth_handshake {
 };
 
 struct ceph_auth_client_ops {
-	const char *name;
-
 	/*
 	 * true if we are authenticated and can connect to
 	 * services.
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index af8ae507e861..70e86e462250 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -118,7 +118,6 @@ static int ceph_auth_none_create_authorizer(
 }
 
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
-	.name = "none",
 	.reset = reset,
 	.destroy = destroy,
 	.is_authenticated = is_authenticated,
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a265792642dc..9815cfe42af0 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -1058,7 +1058,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
 }
 
 static const struct ceph_auth_client_ops ceph_x_ops = {
-	.name = "x",
 	.is_authenticated = ceph_x_is_authenticated,
 	.should_authenticate = ceph_x_should_authenticate,
 	.build_request = ceph_x_build_request,

From 8921f25116af3081fb56871feb93f2dcaf52c722 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 14 Oct 2020 15:53:39 +0200
Subject: [PATCH 219/230] libceph: factor out finish_auth()

In preparation for msgr2, factor out finish_auth() so it is suitable
for both existing MAuth message based authentication and upcoming msgr2
authentication exchange.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 net/ceph/mon_client.c | 52 +++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index c4cf2529d08b..ebfecf8d0918 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1194,30 +1194,22 @@ static void finish_hunting(struct ceph_mon_client *monc)
 	}
 }
 
-static void handle_auth_reply(struct ceph_mon_client *monc,
-			      struct ceph_msg *msg)
+static void finish_auth(struct ceph_mon_client *monc, int auth_err,
+			bool was_authed)
 {
-	int ret;
-	int was_auth = 0;
+	dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed);
+	WARN_ON(auth_err > 0);
 
-	mutex_lock(&monc->mutex);
-	was_auth = ceph_auth_is_authenticated(monc->auth);
 	monc->pending_auth = 0;
-	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
-				     msg->front.iov_len,
-				     monc->m_auth->front.iov_base,
-				     monc->m_auth->front_alloc_len);
-	if (ret > 0) {
-		__send_prepared_auth_request(monc, ret);
-		goto out;
+	if (auth_err) {
+		monc->client->auth_err = auth_err;
+		wake_up_all(&monc->client->auth_wq);
+		return;
 	}
 
-	finish_hunting(monc);
-
-	if (ret < 0) {
-		monc->client->auth_err = ret;
-	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
-		dout("authenticated, starting session\n");
+	if (!was_authed && ceph_auth_is_authenticated(monc->auth)) {
+		dout("%s authenticated, starting session global_id %llu\n",
+		     __func__, monc->auth->global_id);
 
 		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
 		monc->client->msgr.inst.name.num =
@@ -1229,11 +1221,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		pr_info("mon%d %s session established\n", monc->cur_mon,
 			ceph_pr_addr(&monc->con.peer_addr));
 	}
+}
 
-out:
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	bool was_authed;
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	was_authed = ceph_auth_is_authenticated(monc->auth);
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_alloc_len);
+	if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else {
+		finish_auth(monc, ret, was_authed);
+		finish_hunting(monc);
+	}
 	mutex_unlock(&monc->mutex);
-	if (monc->client->auth_err < 0)
-		wake_up_all(&monc->client->auth_wq);
 }
 
 static int __validate_auth(struct ceph_mon_client *monc)

From a5cbd5fc22d5043a8a76e15d75d031fe24d1f69c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 30 Oct 2020 13:30:51 +0100
Subject: [PATCH 220/230] libceph, ceph: get and handle cluster maps with
 addrvecs

In preparation for msgr2, make the cluster send us maps with addrvecs
including both LEGACY and MSGR2 addrs instead of a single LEGACY addr.
This means advertising support for SERVER_NAUTILUS and also some older
features: SERVER_MIMIC, MONENC and MONNAMES.

MONNAMES and MONENC are actually pre-argonaut, we just never updated
ceph_monmap_decode() for them.  Decoding is unconditional, see commit
23c625ce3065 ("libceph: assume argonaut on the server side").

SERVER_MIMIC doesn't bear any meaning for the kernel client.

Since ceph_decode_entity_addrvec() is guarded by encoding version
checks (and in msgr2 case it is guarded implicitly by the fact that
server is speaking msgr2), we assume MSG_ADDR2 for it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c               |   2 +-
 fs/ceph/mdsmap.c                   |  21 +++--
 include/linux/ceph/ceph_features.h |  11 ++-
 include/linux/ceph/decode.h        |   4 +
 include/linux/ceph/mdsmap.h        |   2 +-
 include/linux/ceph/osdmap.h        |   4 +-
 net/ceph/decode.c                  |  56 +++++++++++
 net/ceph/mon_client.c              | 145 +++++++++++++++++++++--------
 net/ceph/osd_client.c              |   4 +-
 net/ceph/osdmap.c                  |  45 ++++++---
 10 files changed, 222 insertions(+), 72 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 278fe67e2617..afd22815fbda 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 		return;
 	}
 
-	newmap = ceph_mdsmap_decode(&p, end);
+	newmap = ceph_mdsmap_decode(&p, end, false);
 	if (IS_ERR(newmap)) {
 		err = PTR_ERR(newmap);
 		goto bad_unlock;
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 1096d1d3a84c..abd9af7727ad 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -114,7 +114,7 @@ bad:
  * Ignore any fields we don't care about (there are quite a few of
  * them).
  */
-struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2)
 {
 	struct ceph_mdsmap *m;
 	const void *start = *p;
@@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 		namelen = ceph_decode_32(p);  /* skip mds name */
 		*p += namelen;
 
-		ceph_decode_need(p, end,
-				 4*sizeof(u32) + sizeof(u64) +
-				 sizeof(addr) + sizeof(struct ceph_timespec),
-				 bad);
-		mds = ceph_decode_32(p);
-		inc = ceph_decode_32(p);
-		state = ceph_decode_32(p);
+		ceph_decode_32_safe(p, end, mds, bad);
+		ceph_decode_32_safe(p, end, inc, bad);
+		ceph_decode_32_safe(p, end, state, bad);
 		*p += sizeof(u64);		/* state_seq */
-		err = ceph_decode_entity_addr(p, end, &addr);
+		if (info_v >= 8)
+			err = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+		else
+			err = ceph_decode_entity_addr(p, end, &addr);
 		if (err)
 			goto corrupt;
-		ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
+
+		ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since),
+				      bad);
 		laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0;
 		*p += sizeof(u32);
 		ceph_decode_32_safe(p, end, namelen, bad);
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index 999636d53cf2..3a47acd9cc14 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -8,7 +8,8 @@
  * feature.  Base case is 1 (first use).
  */
 #define CEPH_FEATURE_INCARNATION_1 (0ull)
-#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57)              // SERVER_JEWEL
+#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC
 
 #define DEFINE_CEPH_FEATURE(bit, incarnation, name)			\
 	static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<<bit);		\
@@ -75,7 +76,7 @@
 DEFINE_CEPH_FEATURE( 0, 1, UID)
 DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
 DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
-
+DEFINE_CEPH_FEATURE( 2, 3, SERVER_NAUTILUS)
 DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
 DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
 DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
@@ -114,7 +115,7 @@ DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
 DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
 DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
 DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
-DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_MIMIC)
 DEFINE_CEPH_FEATURE(29, 1, MDSENC)
 DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
 DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS)  // deprecate me
@@ -177,13 +178,16 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
  */
 #define CEPH_FEATURES_SUPPORTED_DEFAULT		\
 	(CEPH_FEATURE_NOSRCADDR |		\
+	 CEPH_FEATURE_SERVER_NAUTILUS |		\
 	 CEPH_FEATURE_FLOCK |			\
 	 CEPH_FEATURE_SUBSCRIBE2 |		\
+	 CEPH_FEATURE_MONNAMES |		\
 	 CEPH_FEATURE_RECONNECT_SEQ |		\
 	 CEPH_FEATURE_DIRLAYOUTHASH |		\
 	 CEPH_FEATURE_PGID64 |			\
 	 CEPH_FEATURE_PGPOOL3 |			\
 	 CEPH_FEATURE_OSDENC |			\
+	 CEPH_FEATURE_MONENC |			\
 	 CEPH_FEATURE_CRUSH_TUNABLES |		\
 	 CEPH_FEATURE_SERVER_LUMINOUS |		\
 	 CEPH_FEATURE_RESEND_ON_SPLIT |		\
@@ -193,6 +197,7 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin
 	 CEPH_FEATURE_MSG_AUTH |		\
 	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
 	 CEPH_FEATURE_REPLY_CREATE_INODE |	\
+	 CEPH_FEATURE_SERVER_MIMIC |		\
 	 CEPH_FEATURE_MDSENC |			\
 	 CEPH_FEATURE_OSDHASHPSPOOL |		\
 	 CEPH_FEATURE_OSD_CACHEPOOL |		\
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 450384fe487c..9a934e04f841 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -220,6 +220,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
  */
 #define CEPH_ENTITY_ADDR_TYPE_NONE	0
 #define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
+#define CEPH_ENTITY_ADDR_TYPE_MSGR2	__cpu_to_le32(2)
 
 static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
 {
@@ -239,6 +240,9 @@ static inline void ceph_decode_banner_addr(struct ceph_entity_addr *a)
 
 extern int ceph_decode_entity_addr(void **p, void *end,
 				   struct ceph_entity_addr *addr);
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+			       struct ceph_entity_addr *addr);
+
 /*
  * encoders
  */
diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 35d385296fbb..523fd0452856 100644
--- a/include/linux/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
@@ -64,7 +64,7 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
 }
 
 extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
-extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
+struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2);
 extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
 extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m);
 
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index cad9acfbc320..5553019c3f07 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -251,8 +251,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 }
 
 struct ceph_osdmap *ceph_osdmap_alloc(void);
-extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2);
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
 					     struct ceph_osdmap *map);
 extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
 
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index eea529595a7a..6429b6713507 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/ceph/decode.h>
 
@@ -82,3 +83,58 @@ bad:
 }
 EXPORT_SYMBOL(ceph_decode_entity_addr);
 
+/*
+ * Return addr of desired type (MSGR2 or LEGACY) or error.
+ * Make sure there is only one match.
+ *
+ * Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+			       struct ceph_entity_addr *addr)
+{
+	__le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+				 CEPH_ENTITY_ADDR_TYPE_LEGACY;
+	struct ceph_entity_addr tmp_addr;
+	int addr_cnt;
+	bool found;
+	u8 marker;
+	int ret;
+	int i;
+
+	ceph_decode_8_safe(p, end, marker, e_inval);
+	if (marker != 2) {
+		pr_err("bad addrvec marker %d\n", marker);
+		return -EINVAL;
+	}
+
+	ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+
+	found = false;
+	for (i = 0; i < addr_cnt; i++) {
+		ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+		if (ret)
+			return ret;
+
+		if (tmp_addr.type == my_type) {
+			if (found) {
+				pr_err("another match of type %d in addrvec\n",
+				       le32_to_cpu(my_type));
+				return -EINVAL;
+			}
+
+			memcpy(addr, &tmp_addr, sizeof(*addr));
+			found = true;
+		}
+	}
+	if (!found && addr_cnt != 0) {
+		pr_err("no match of type %d in addrvec\n",
+		       le32_to_cpu(my_type));
+		return -ENOENT;
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ebfecf8d0918..a9754a7fa78c 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
 
 static int __validate_auth(struct ceph_mon_client *monc);
 
+static int decode_mon_info(void **p, void *end, bool msgr2,
+			   struct ceph_entity_addr *addr)
+{
+	void *mon_info_end;
+	u32 struct_len;
+	u8 struct_v;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	mon_info_end = *p + struct_len;
+	ceph_decode_skip_string(p, end, e_inval);  /* skip mon name */
+	ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+	if (ret)
+		return ret;
+
+	*p = mon_info_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
 /*
  * Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
  */
-static struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
 {
-	struct ceph_monmap *m = NULL;
-	int i, err = -EINVAL;
+	struct ceph_monmap *monmap = NULL;
 	struct ceph_fsid fsid;
-	u32 epoch, num_mon;
-	u32 len;
+	u32 struct_len;
+	int blob_len;
+	int num_mon;
+	u8 struct_v;
+	u32 epoch;
+	int ret;
+	int i;
 
-	ceph_decode_32_safe(&p, end, len, bad);
-	ceph_decode_need(&p, end, len, bad);
+	ceph_decode_32_safe(p, end, blob_len, e_inval);
+	ceph_decode_need(p, end, blob_len, e_inval);
 
-	dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p));
-	p += sizeof(u16);  /* skip version */
+	ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+	if (ret)
+		goto fail;
 
-	ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
-	ceph_decode_copy(&p, &fsid, sizeof(fsid));
-	epoch = ceph_decode_32(&p);
+	dout("%s struct_v %d\n", __func__, struct_v);
+	ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+	ceph_decode_32_safe(p, end, epoch, e_inval);
+	if (struct_v >= 6) {
+		u32 feat_struct_len;
+		u8 feat_struct_v;
 
-	num_mon = ceph_decode_32(&p);
+		*p += sizeof(struct ceph_timespec);  /* skip last_changed */
+		*p += sizeof(struct ceph_timespec);  /* skip created */
 
-	if (num_mon > CEPH_MAX_MON)
-		goto bad;
-	m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
-	if (m == NULL)
-		return ERR_PTR(-ENOMEM);
-	m->fsid = fsid;
-	m->epoch = epoch;
-	m->num_mon = num_mon;
-	for (i = 0; i < num_mon; ++i) {
-		struct ceph_entity_inst *inst = &m->mon_inst[i];
+		ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+					  &feat_struct_v, &feat_struct_len);
+		if (ret)
+			goto fail;
 
-		/* copy name portion */
-		ceph_decode_copy_safe(&p, end, &inst->name,
-					sizeof(inst->name), bad);
-		err = ceph_decode_entity_addr(&p, end, &inst->addr);
-		if (err)
-			goto bad;
+		*p += feat_struct_len;  /* skip persistent_features */
+
+		ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+					  &feat_struct_v, &feat_struct_len);
+		if (ret)
+			goto fail;
+
+		*p += feat_struct_len;  /* skip optional_features */
 	}
-	dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
-	     m->num_mon);
-	for (i = 0; i < m->num_mon; i++)
-		dout("monmap_decode  mon%d is %s\n", i,
-		     ceph_pr_addr(&m->mon_inst[i].addr));
-	return m;
-bad:
-	dout("monmap_decode failed with %d\n", err);
-	kfree(m);
-	return ERR_PTR(err);
+	ceph_decode_32_safe(p, end, num_mon, e_inval);
+
+	dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+	     num_mon);
+	if (num_mon > CEPH_MAX_MON)
+		goto e_inval;
+
+	monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+	if (!monmap) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	monmap->fsid = fsid;
+	monmap->epoch = epoch;
+	monmap->num_mon = num_mon;
+
+	/* legacy_mon_addr map or mon_info map */
+	for (i = 0; i < num_mon; i++) {
+		struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+		ceph_decode_skip_string(p, end, e_inval);  /* skip mon name */
+		inst->name.type = CEPH_ENTITY_TYPE_MON;
+		inst->name.num = cpu_to_le64(i);
+
+		if (struct_v >= 6)
+			ret = decode_mon_info(p, end, msgr2, &inst->addr);
+		else
+			ret = ceph_decode_entity_addr(p, end, &inst->addr);
+		if (ret)
+			goto fail;
+
+		dout("%s mon%d addr %s\n", __func__, i,
+		     ceph_pr_addr(&inst->addr));
+	}
+
+	return monmap;
+
+e_inval:
+	ret = -EINVAL;
+fail:
+	kfree(monmap);
+	return ERR_PTR(ret);
 }
 
 /*
@@ -476,7 +541,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	p = msg->front.iov_base;
 	end = p + msg->front.iov_len;
 
-	monmap = ceph_monmap_decode(p, end);
+	monmap = ceph_monmap_decode(&p, end, false);
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 8966eae543d3..51be5a7482fc 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3918,9 +3918,9 @@ static int handle_one_map(struct ceph_osd_client *osdc,
 	set_pool_was_full(osdc);
 
 	if (incremental)
-		newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+		newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap);
 	else
-		newmap = ceph_osdmap_decode(&p, end);
+		newmap = ceph_osdmap_decode(&p, end, false);
 	if (IS_ERR(newmap))
 		return PTR_ERR(newmap);
 
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index fa08c15be0c0..2b1dd252f231 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
 /*
  * decode a full map.
  */
-static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+static int osdmap_decode(void **p, void *end, bool msgr2,
+			 struct ceph_osdmap *map)
 {
 	u8 struct_v;
 	u32 epoch = 0;
@@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
 		goto e_inval;
 
 	for (i = 0; i < map->max_osd; i++) {
-		err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]);
+		struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+		if (struct_v >= 8)
+			err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+		else
+			err = ceph_decode_entity_addr(p, end, addr);
 		if (err)
 			goto bad;
+
+		dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
 	}
 
 	/* pg_temp */
@@ -1790,7 +1798,7 @@ bad:
 /*
  * Allocate and decode a full map.
  */
-struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
 {
 	struct ceph_osdmap *map;
 	int ret;
@@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
 	if (!map)
 		return ERR_PTR(-ENOMEM);
 
-	ret = osdmap_decode(p, end, map);
+	ret = osdmap_decode(p, end, msgr2, map);
 	if (ret) {
 		ceph_osdmap_destroy(map);
 		return ERR_PTR(ret);
@@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
  *     new_state: { osd=6, xorstate=EXISTS } # clear osd_state
  */
 static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
-				      struct ceph_osdmap *map)
+				      bool msgr2, struct ceph_osdmap *map)
 {
 	void *new_up_client;
 	void *new_state;
 	void *new_weight_end;
 	u32 len;
+	int ret;
 	int i;
 
 	new_up_client = *p;
@@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 		struct ceph_entity_addr addr;
 
 		ceph_decode_skip_32(p, end, e_inval);
-		if (ceph_decode_entity_addr(p, end, &addr))
-			goto e_inval;
+		if (struct_v >= 7)
+			ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+		else
+			ret = ceph_decode_entity_addr(p, end, &addr);
+		if (ret)
+			return ret;
 	}
 
 	new_state = *p;
@@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 	while (len--) {
 		s32 osd;
 		u32 xorstate;
-		int ret;
 
 		osd = ceph_decode_32(p);
 		if (struct_v >= 5)
@@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
 
 		osd = ceph_decode_32(p);
 		BUG_ON(osd >= map->max_osd);
-		if (ceph_decode_entity_addr(p, end, &addr))
-			goto e_inval;
+		if (struct_v >= 7)
+			ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+		else
+			ret = ceph_decode_entity_addr(p, end, &addr);
+		if (ret)
+			return ret;
+
+		dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
 		pr_info("osd%d up\n", osd);
 		map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
 		map->osd_addr[osd] = addr;
@@ -1927,7 +1946,7 @@ e_inval:
 /*
  * decode and apply an incremental map update.
  */
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
 					     struct ceph_osdmap *map)
 {
 	struct ceph_fsid fsid;
@@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	if (len > 0) {
 		dout("apply_incremental full map len %d, %p to %p\n",
 		     len, *p, end);
-		return ceph_osdmap_decode(p, min(*p+len, end));
+		return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
 	}
 
 	/* new crush? */
@@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 	}
 
 	/* new_up_client, new_state, new_weight */
-	err = decode_new_up_state_weight(p, end, struct_v, map);
+	err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
 	if (err)
 		goto bad;
 

From 313771e80fd253d4b5472e61a2d12b03c5293aa9 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Wed, 25 Nov 2020 14:41:59 +0100
Subject: [PATCH 221/230] libceph, rbd: ignore addr->type while comparing in
 some cases

For libceph, this ensures that libceph instance sharing (share option)
continues to work.  For rbd, this avoids blocklisting alive lock owners
(locker addr is always LEGACY, while watcher addr is ANY in nautilus).

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c       | 8 ++++++--
 include/linux/ceph/msgr.h | 9 ++++++++-
 net/ceph/mon_client.c     | 6 ++++--
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f84128abade3..bec85c054522 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3957,8 +3957,12 @@ static int find_watcher(struct rbd_device *rbd_dev,
 
 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
 	for (i = 0; i < num_watchers; i++) {
-		if (!memcmp(&watchers[i].addr, &locker->info.addr,
-			    sizeof(locker->info.addr)) &&
+		/*
+		 * Ignore addr->type while comparing.  This mimics
+		 * entity_addr_t::get_legacy_str() + strcmp().
+		 */
+		if (ceph_addr_equal_no_type(&watchers[i].addr,
+					    &locker->info.addr) &&
 		    watchers[i].cookie == cookie) {
 			struct rbd_client_id cid = {
 				.gid = le64_to_cpu(watchers[i].name.num),
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 46939485f2c3..9a897a60f20b 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -52,11 +52,18 @@ extern const char *ceph_entity_type_name(int type);
  * entity_addr -- network address
  */
 struct ceph_entity_addr {
-	__le32 type;
+	__le32 type;  /* CEPH_ENTITY_ADDR_TYPE_* */
 	__le32 nonce;  /* unique id for process (e.g. pid) */
 	struct sockaddr_storage in_addr;
 } __attribute__ ((packed));
 
+static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs,
+					   const struct ceph_entity_addr *rhs)
+{
+	return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) &&
+	       lhs->nonce == rhs->nonce;
+}
+
 struct ceph_entity_inst {
 	struct ceph_entity_name name;
 	struct ceph_entity_addr addr;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index a9754a7fa78c..f5f090b4e409 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -161,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
 {
 	int i;
 
-	for (i = 0; i < m->num_mon; i++)
-		if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+	for (i = 0; i < m->num_mon; i++) {
+		if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr))
 			return 1;
+	}
+
 	return 0;
 }
 

From 00498b994113a871a556f7ff24a4cf8a00611700 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 19 Nov 2020 16:04:58 +0100
Subject: [PATCH 222/230] libceph: introduce connection modes and ms_mode
 option

msgr2 supports two connection modes: crc (plain) and secure (on-wire
encryption).  Connection mode is picked by server based on input from
client.

Introduce ms_mode option:

  ms_mode=legacy        - msgr1 (default)
  ms_mode=crc           - crc mode, if denied fail
  ms_mode=secure        - secure mode, if denied fail
  ms_mode=prefer-crc    - crc mode, if denied agree to secure mode
  ms_mode=prefer-secure - secure mode, if denied agree to crc mode

ms_mode affects all connections, we don't separate connections to mons
like it's done in userspace with ms_client_mode vs ms_mon_client_mode.

For now the default is legacy, to be flipped to prefer-crc after some
time.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/auth.h    |  8 +++--
 include/linux/ceph/ceph_fs.h |  6 ++++
 include/linux/ceph/libceph.h |  1 +
 net/ceph/auth.c              | 12 ++++---
 net/ceph/ceph_common.c       | 63 ++++++++++++++++++++++++++++++++++++
 net/ceph/ceph_strings.c      | 14 ++++++++
 net/ceph/mon_client.c        |  4 +--
 7 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 5f64f66309fa..6fc058fe9efa 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -98,11 +98,15 @@ struct ceph_auth_client {
 	const struct ceph_crypto_key *key;     /* our secret key */
 	unsigned want_keys;     /* which services we want */
 
+	int preferred_mode;	/* CEPH_CON_MODE_* */
+	int fallback_mode;	/* ditto */
+
 	struct mutex mutex;
 };
 
-extern struct ceph_auth_client *ceph_auth_init(const char *name,
-					       const struct ceph_crypto_key *key);
+struct ceph_auth_client *ceph_auth_init(const char *name,
+					const struct ceph_crypto_key *key,
+					const int *con_modes);
 extern void ceph_auth_destroy(struct ceph_auth_client *ac);
 
 extern void ceph_auth_reset(struct ceph_auth_client *ac);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 6d986e52000b..ce22d5469670 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -93,9 +93,15 @@ struct ceph_dir_layout {
 #define CEPH_AUTH_NONE	 	0x1
 #define CEPH_AUTH_CEPHX	 	0x2
 
+/* msgr2 protocol modes */
+#define CEPH_CON_MODE_UNKNOWN	0x0
+#define CEPH_CON_MODE_CRC	0x1
+#define CEPH_CON_MODE_SECURE	0x2
+
 #define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
 
 const char *ceph_auth_proto_name(int proto);
+const char *ceph_con_mode_name(int mode);
 
 /*********************************************
  * message layer
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index eb5a7ca13f9c..8765a5ad267a 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -53,6 +53,7 @@ struct ceph_options {
 	unsigned long osd_keepalive_timeout;	/* jiffies */
 	unsigned long osd_request_timeout;	/* jiffies */
 	u32 read_from_replica;  /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */
+	int con_modes[2];  /* CEPH_CON_MODE_* */
 
 	/*
 	 * any type that can't be simply compared or doesn't need
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index deaf267f8942..4a0f32b32cc6 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -39,13 +39,13 @@ static int init_protocol(struct ceph_auth_client *ac, int proto)
 /*
  * setup, teardown.
  */
-struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+struct ceph_auth_client *ceph_auth_init(const char *name,
+					const struct ceph_crypto_key *key,
+					const int *con_modes)
 {
 	struct ceph_auth_client *ac;
 	int ret;
 
-	dout("auth_init name '%s'\n", name);
-
 	ret = -ENOMEM;
 	ac = kzalloc(sizeof(*ac), GFP_NOFS);
 	if (!ac)
@@ -57,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
 		ac->name = name;
 	else
 		ac->name = CEPH_AUTH_NAME_DEFAULT;
-	dout("auth_init name %s\n", ac->name);
 	ac->key = key;
+	ac->preferred_mode = con_modes[0];
+	ac->fallback_mode = con_modes[1];
+
+	dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__,
+	     ac->name, ac->preferred_mode, ac->fallback_mode);
 	return ac;
 
 out:
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4e7edd707a14..271287c5ec12 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -265,6 +265,7 @@ enum {
 	Opt_ip,
 	Opt_crush_location,
 	Opt_read_from_replica,
+	Opt_ms_mode,
 	/* string args above */
 	Opt_share,
 	Opt_crc,
@@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = {
 	{}
 };
 
+enum ceph_ms_mode {
+	Opt_ms_mode_legacy,
+	Opt_ms_mode_crc,
+	Opt_ms_mode_secure,
+	Opt_ms_mode_prefer_crc,
+	Opt_ms_mode_prefer_secure
+};
+
+static const struct constant_table ceph_param_ms_mode[] = {
+	{"legacy",		Opt_ms_mode_legacy},
+	{"crc",			Opt_ms_mode_crc},
+	{"secure",		Opt_ms_mode_secure},
+	{"prefer-crc",		Opt_ms_mode_prefer_crc},
+	{"prefer-secure",	Opt_ms_mode_prefer_secure},
+	{}
+};
+
 static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag	("abort_on_full",		Opt_abort_on_full),
 	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
@@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
 			 fs_param_deprecated, NULL),
 	fsparam_enum	("read_from_replica",		Opt_read_from_replica,
 			 ceph_param_read_from_replica),
+	fsparam_enum	("ms_mode",			Opt_ms_mode,
+			 ceph_param_ms_mode),
 	fsparam_string	("secret",			Opt_secret),
 	fsparam_flag_no ("share",			Opt_share),
 	fsparam_flag_no ("tcp_nodelay",			Opt_tcp_nodelay),
@@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void)
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
 	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
 	opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
+	opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+	opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
 	return opt;
 }
 EXPORT_SYMBOL(ceph_alloc_options);
@@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 			BUG();
 		}
 		break;
+	case Opt_ms_mode:
+		switch (result.uint_32) {
+		case Opt_ms_mode_legacy:
+			opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+			opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+			break;
+		case Opt_ms_mode_crc:
+			opt->con_modes[0] = CEPH_CON_MODE_CRC;
+			opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+			break;
+		case Opt_ms_mode_secure:
+			opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+			opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+			break;
+		case Opt_ms_mode_prefer_crc:
+			opt->con_modes[0] = CEPH_CON_MODE_CRC;
+			opt->con_modes[1] = CEPH_CON_MODE_SECURE;
+			break;
+		case Opt_ms_mode_prefer_secure:
+			opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+			opt->con_modes[1] = CEPH_CON_MODE_CRC;
+			break;
+		default:
+			BUG();
+		}
+		break;
 
 	case Opt_osdtimeout:
 		warn_plog(&log, "Ignoring osdtimeout");
@@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 	} else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
 		seq_puts(m, "read_from_replica=localize,");
 	}
+	if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) {
+		if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+		    opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+			seq_puts(m, "ms_mode=crc,");
+		} else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+			   opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+			seq_puts(m, "ms_mode=secure,");
+		} else if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+			   opt->con_modes[1] == CEPH_CON_MODE_SECURE) {
+			seq_puts(m, "ms_mode=prefer-crc,");
+		} else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+			   opt->con_modes[1] == CEPH_CON_MODE_CRC) {
+			seq_puts(m, "ms_mode=prefer-secure,");
+		}
+	}
 
 	if (opt->flags & CEPH_OPT_FSID)
 		seq_printf(m, "fsid=%pU,", &opt->fsid);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 69cd391e02a6..355fea272120 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -32,6 +32,20 @@ const char *ceph_auth_proto_name(int proto)
 	}
 }
 
+const char *ceph_con_mode_name(int mode)
+{
+	switch (mode) {
+	case CEPH_CON_MODE_UNKNOWN:
+		return "unknown";
+	case CEPH_CON_MODE_CRC:
+		return "crc";
+	case CEPH_CON_MODE_SECURE:
+		return "secure";
+	default:
+		return "???";
+	}
+}
+
 const char *ceph_osd_op_name(int op)
 {
 	switch (op) {
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index f5f090b4e409..792a8c4164d7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1156,8 +1156,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
 
 	/* connection */
 	/* authentication */
-	monc->auth = ceph_auth_init(cl->options->name,
-				    cl->options->key);
+	monc->auth = ceph_auth_init(cl->options->name, cl->options->key,
+				    cl->options->con_modes);
 	if (IS_ERR(monc->auth)) {
 		err = PTR_ERR(monc->auth);
 		goto out_monmap;

From cd1a677cad994021b19665ed476aea63f5d54f31 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 19 Nov 2020 16:59:08 +0100
Subject: [PATCH 223/230] libceph, ceph: implement msgr2.1 protocol (crc and
 secure modes)

Implement msgr2.1 wire protocol, available since nautilus 14.2.11
and octopus 15.2.5.  msgr2.0 wire protocol is not implemented -- it
has several security, integrity and robustness issues and therefore
considered deprecated.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c           |   80 +-
 include/linux/ceph/auth.h      |   36 +-
 include/linux/ceph/ceph_fs.h   |    4 +
 include/linux/ceph/decode.h    |    4 +
 include/linux/ceph/libceph.h   |    9 +-
 include/linux/ceph/messenger.h |  136 +-
 include/linux/ceph/msgr.h      |   48 +
 net/ceph/Kconfig               |    3 +
 net/ceph/Makefile              |    2 +-
 net/ceph/auth.c                |  309 +++
 net/ceph/decode.c              |   45 +
 net/ceph/messenger.c           |   68 +-
 net/ceph/messenger_v2.c        | 3443 ++++++++++++++++++++++++++++++++
 net/ceph/mon_client.c          |  115 +-
 net/ceph/osd_client.c          |   85 +-
 15 files changed, 4356 insertions(+), 31 deletions(-)
 create mode 100644 net/ceph/messenger_v2.c

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index afd22815fbda..740d63d0fc50 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 		return;
 	}
 
-	newmap = ceph_mdsmap_decode(&p, end, false);
+	newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
 	if (IS_ERR(newmap)) {
 		err = PTR_ERR(newmap);
 		goto bad_unlock;
@@ -5196,6 +5196,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
 }
 
+static int mds_get_auth_request(struct ceph_connection *con,
+				void *buf, int *buf_len,
+				void **authorizer, int *authorizer_len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
+	int ret;
+
+	ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+				       buf, buf_len);
+	if (ret)
+		return ret;
+
+	*authorizer = auth->authorizer_buf;
+	*authorizer_len = auth->authorizer_buf_len;
+	return 0;
+}
+
+static int mds_handle_auth_reply_more(struct ceph_connection *con,
+				      void *reply, int reply_len,
+				      void *buf, int *buf_len,
+				      void **authorizer, int *authorizer_len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
+	int ret;
+
+	ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+					      buf, buf_len);
+	if (ret)
+		return ret;
+
+	*authorizer = auth->authorizer_buf;
+	*authorizer_len = auth->authorizer_buf_len;
+	return 0;
+}
+
+static int mds_handle_auth_done(struct ceph_connection *con,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &s->s_auth;
+
+	return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+					       session_key, session_key_len,
+					       con_secret, con_secret_len);
+}
+
+static int mds_handle_auth_bad_method(struct ceph_connection *con,
+				      int used_proto, int result,
+				      const int *allowed_protos, int proto_cnt,
+				      const int *allowed_modes, int mode_cnt)
+{
+	struct ceph_mds_session *s = con->private;
+	struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
+	int ret;
+
+	if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
+					    used_proto, result,
+					    allowed_protos, proto_cnt,
+					    allowed_modes, mode_cnt)) {
+		ret = ceph_monc_validate_auth(monc);
+		if (ret)
+			return ret;
+	}
+
+	return -EACCES;
+}
+
 static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
 				struct ceph_msg_header *hdr, int *skip)
 {
@@ -5245,6 +5319,10 @@ static const struct ceph_connection_operations mds_con_ops = {
 	.alloc_msg = mds_alloc_msg,
 	.sign_message = mds_sign_message,
 	.check_message_signature = mds_check_message_signature,
+	.get_auth_request = mds_get_auth_request,
+	.handle_auth_reply_more = mds_handle_auth_reply_more,
+	.handle_auth_done = mds_handle_auth_done,
+	.handle_auth_bad_method = mds_handle_auth_bad_method,
 };
 
 /* eof */
diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 6fc058fe9efa..3fbe72ebd779 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -120,8 +120,12 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end);
 
 extern int ceph_build_auth(struct ceph_auth_client *ac,
 		    void *msg_buf, size_t msg_len);
-
 extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
+
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+			       struct ceph_auth_handshake *auth,
+			       int peer_type, bool force_new,
+			       int *proto, int *pref_mode, int *fallb_mode);
 extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
 				       int peer_type,
 				       struct ceph_auth_handshake *auth);
@@ -157,4 +161,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth,
 		return auth->check_message_signature(auth, msg);
 	return 0;
 }
+
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len);
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+				int reply_len, void *buf, int buf_len);
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+				 int used_proto, int result,
+				 const int *allowed_protos, int proto_cnt,
+				 const int *allowed_modes, int mode_cnt);
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+			     struct ceph_auth_handshake *auth,
+			     int peer_type, void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+				    struct ceph_auth_handshake *auth,
+				    void *reply, int reply_len,
+				    void *buf, int *buf_len);
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+				    struct ceph_auth_handshake *auth,
+				    void *reply, int reply_len,
+				    u8 *session_key, int *session_key_len,
+				    u8 *con_secret, int *con_secret_len);
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+				     int peer_type, int used_proto, int result,
+				     const int *allowed_protos, int proto_cnt,
+				     const int *allowed_modes, int mode_cnt);
+
 #endif
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ce22d5469670..e41a811026f6 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -93,6 +93,10 @@ struct ceph_dir_layout {
 #define CEPH_AUTH_NONE	 	0x1
 #define CEPH_AUTH_CEPHX	 	0x2
 
+#define CEPH_AUTH_MODE_NONE		0
+#define CEPH_AUTH_MODE_AUTHORIZER	1
+#define CEPH_AUTH_MODE_MON		10
+
 /* msgr2 protocol modes */
 #define CEPH_CON_MODE_UNKNOWN	0x0
 #define CEPH_CON_MODE_CRC	0x1
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 9a934e04f841..04f3ace5787b 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -221,6 +221,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv,
 #define CEPH_ENTITY_ADDR_TYPE_NONE	0
 #define CEPH_ENTITY_ADDR_TYPE_LEGACY	__cpu_to_le32(1)
 #define CEPH_ENTITY_ADDR_TYPE_MSGR2	__cpu_to_le32(2)
+#define CEPH_ENTITY_ADDR_TYPE_ANY	__cpu_to_le32(3)
 
 static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a)
 {
@@ -243,6 +244,9 @@ extern int ceph_decode_entity_addr(void **p, void *end,
 int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
 			       struct ceph_entity_addr *addr);
 
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr);
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr);
+
 /*
  * encoders
  */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 8765a5ad267a..eb9008bb3992 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -31,10 +31,10 @@
 #define CEPH_OPT_FSID             (1<<0)
 #define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
 #define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
-#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes */
+#define CEPH_OPT_NOCRC            (1<<3) /* no data crc on writes (msgr1) */
 #define CEPH_OPT_NOMSGAUTH	  (1<<4) /* don't require msg signing feat */
 #define CEPH_OPT_TCP_NODELAY	  (1<<5) /* TCP_NODELAY on TCP sockets */
-#define CEPH_OPT_NOMSGSIGN	  (1<<6) /* don't sign msgs */
+#define CEPH_OPT_NOMSGSIGN	  (1<<6) /* don't sign msgs (msgr1) */
 #define CEPH_OPT_ABORT_ON_FULL	  (1<<7) /* abort w/ ENOSPC when full */
 
 #define CEPH_OPT_DEFAULT   (CEPH_OPT_TCP_NODELAY)
@@ -84,6 +84,7 @@ struct ceph_options {
 #define CEPH_MONC_HUNT_BACKOFF		2
 #define CEPH_MONC_HUNT_MAX_MULT		10
 
+#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024)
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
 
@@ -152,6 +153,10 @@ struct ceph_client {
 
 #define from_msgr(ms)	container_of(ms, struct ceph_client, msgr)
 
+static inline bool ceph_msgr2(struct ceph_client *client)
+{
+	return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN;
+}
 
 /*
  * snapshots
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 54a64e8dfce6..0e6e9ad3c3bf 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -3,6 +3,7 @@
 #define __FS_CEPH_MESSENGER_H
 
 #include <linux/bvec.h>
+#include <linux/crypto.h>
 #include <linux/kref.h>
 #include <linux/mutex.h>
 #include <linux/net.h>
@@ -52,6 +53,23 @@ struct ceph_connection_operations {
 
 	int (*sign_message) (struct ceph_msg *msg);
 	int (*check_message_signature) (struct ceph_msg *msg);
+
+	/* msgr2 authentication exchange */
+	int (*get_auth_request)(struct ceph_connection *con,
+				void *buf, int *buf_len,
+				void **authorizer, int *authorizer_len);
+	int (*handle_auth_reply_more)(struct ceph_connection *con,
+				      void *reply, int reply_len,
+				      void *buf, int *buf_len,
+				      void **authorizer, int *authorizer_len);
+	int (*handle_auth_done)(struct ceph_connection *con,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len);
+	int (*handle_auth_bad_method)(struct ceph_connection *con,
+				      int used_proto, int result,
+				      const int *allowed_protos, int proto_cnt,
+				      const int *allowed_modes, int mode_cnt);
 };
 
 /* use format string %s%lld */
@@ -246,8 +264,15 @@ struct ceph_msg {
 #define CEPH_CON_S_PREOPEN		2
 #define CEPH_CON_S_V1_BANNER		3
 #define CEPH_CON_S_V1_CONNECT_MSG	4
-#define CEPH_CON_S_OPEN			5
-#define CEPH_CON_S_STANDBY		6
+#define CEPH_CON_S_V2_BANNER_PREFIX	5
+#define CEPH_CON_S_V2_BANNER_PAYLOAD	6
+#define CEPH_CON_S_V2_HELLO		7
+#define CEPH_CON_S_V2_AUTH		8
+#define CEPH_CON_S_V2_AUTH_SIGNATURE	9
+#define CEPH_CON_S_V2_SESSION_CONNECT	10
+#define CEPH_CON_S_V2_SESSION_RECONNECT	11
+#define CEPH_CON_S_OPEN			12
+#define CEPH_CON_S_STANDBY		13
 
 /*
  * ceph_connection flag bits
@@ -301,6 +326,99 @@ struct ceph_connection_v1_info {
 	u32 peer_global_seq;  /* peer's global seq for this connection */
 };
 
+#define CEPH_CRC_LEN			4
+#define CEPH_GCM_KEY_LEN		16
+#define CEPH_GCM_IV_LEN			sizeof(struct ceph_gcm_nonce)
+#define CEPH_GCM_BLOCK_LEN		16
+#define CEPH_GCM_TAG_LEN		16
+
+#define CEPH_PREAMBLE_LEN		32
+#define CEPH_PREAMBLE_INLINE_LEN	48
+#define CEPH_PREAMBLE_PLAIN_LEN		CEPH_PREAMBLE_LEN
+#define CEPH_PREAMBLE_SECURE_LEN	(CEPH_PREAMBLE_LEN +		\
+					 CEPH_PREAMBLE_INLINE_LEN +	\
+					 CEPH_GCM_TAG_LEN)
+#define CEPH_EPILOGUE_PLAIN_LEN		(1 + 3 * CEPH_CRC_LEN)
+#define CEPH_EPILOGUE_SECURE_LEN	(CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN)
+
+#define CEPH_FRAME_MAX_SEGMENT_COUNT	4
+
+struct ceph_frame_desc {
+	int fd_tag;  /* FRAME_TAG_* */
+	int fd_seg_cnt;
+	int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT];  /* logical */
+	int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT];
+};
+
+struct ceph_gcm_nonce {
+	__le32 fixed;
+	__le64 counter __packed;
+};
+
+struct ceph_connection_v2_info {
+	struct iov_iter in_iter;
+	struct kvec in_kvecs[5];  /* recvmsg */
+	struct bio_vec in_bvec;  /* recvmsg (in_cursor) */
+	int in_kvec_cnt;
+	int in_state;  /* IN_S_* */
+
+	struct iov_iter out_iter;
+	struct kvec out_kvecs[8];  /* sendmsg */
+	struct bio_vec out_bvec;  /* sendpage (out_cursor, out_zero),
+				     sendmsg (out_enc_pages) */
+	int out_kvec_cnt;
+	int out_state;  /* OUT_S_* */
+
+	int out_zero;  /* # of zero bytes to send */
+	bool out_iter_sendpage;  /* use sendpage if possible */
+
+	struct ceph_frame_desc in_desc;
+	struct ceph_msg_data_cursor in_cursor;
+	struct ceph_msg_data_cursor out_cursor;
+
+	struct crypto_shash *hmac_tfm;  /* post-auth signature */
+	struct crypto_aead *gcm_tfm;  /* on-wire encryption */
+	struct aead_request *gcm_req;
+	struct crypto_wait gcm_wait;
+	struct ceph_gcm_nonce in_gcm_nonce;
+	struct ceph_gcm_nonce out_gcm_nonce;
+
+	struct page **out_enc_pages;
+	int out_enc_page_cnt;
+	int out_enc_resid;
+	int out_enc_i;
+
+	int con_mode;  /* CEPH_CON_MODE_* */
+
+	void *conn_bufs[16];
+	int conn_buf_cnt;
+
+	struct kvec in_sign_kvecs[8];
+	struct kvec out_sign_kvecs[8];
+	int in_sign_kvec_cnt;
+	int out_sign_kvec_cnt;
+
+	u64 client_cookie;
+	u64 server_cookie;
+	u64 global_seq;
+	u64 connect_seq;
+	u64 peer_global_seq;
+
+	u8 in_buf[CEPH_PREAMBLE_SECURE_LEN];
+	u8 out_buf[CEPH_PREAMBLE_SECURE_LEN];
+	struct {
+		u8 late_status;  /* FRAME_LATE_STATUS_* */
+		union {
+			struct {
+				u32 front_crc;
+				u32 middle_crc;
+				u32 data_crc;
+			} __packed;
+			u8 pad[CEPH_GCM_BLOCK_LEN - 1];
+		};
+	} out_epil;
+};
+
 /*
  * A single connection with another host.
  *
@@ -346,7 +464,10 @@ struct ceph_connection {
 	struct delayed_work work;	    /* send|recv work */
 	unsigned long       delay;          /* current delay interval */
 
-	struct ceph_connection_v1_info v1;
+	union {
+		struct ceph_connection_v1_info v1;
+		struct ceph_connection_v2_info v2;
+	};
 };
 
 extern struct page *ceph_zero_page;
@@ -397,6 +518,15 @@ bool ceph_con_v1_opened(struct ceph_connection *con);
 void ceph_con_v1_reset_session(struct ceph_connection *con);
 void ceph_con_v1_reset_protocol(struct ceph_connection *con);
 
+/* messenger_v2.c */
+int ceph_con_v2_try_read(struct ceph_connection *con);
+int ceph_con_v2_try_write(struct ceph_connection *con);
+void ceph_con_v2_revoke(struct ceph_connection *con);
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con);
+bool ceph_con_v2_opened(struct ceph_connection *con);
+void ceph_con_v2_reset_session(struct ceph_connection *con);
+void ceph_con_v2_reset_protocol(struct ceph_connection *con);
+
 
 extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr);
 
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 9a897a60f20b..f5e02f6c0655 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -14,9 +14,39 @@
  * constant.
  */
 #define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_LEN 9
 #define CEPH_BANNER_MAX_LEN 30
 
 
+/*
+ * messenger V2 connection banner prefix.
+ * The full banner string should have the form: "ceph v2\n<le16>"
+ * the 2 bytes are the length of the remaining banner.
+ */
+#define CEPH_BANNER_V2 "ceph v2\n"
+#define CEPH_BANNER_V2_LEN 8
+#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16))
+
+/*
+ * messenger V2 features
+ */
+#define CEPH_MSGR2_INCARNATION_1 (0ull)
+
+#define DEFINE_MSGR2_FEATURE(bit, incarnation, name)               \
+	static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \
+	static const uint64_t CEPH_MSGR2_FEATUREMASK_##name =            \
+			(1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation);
+
+#define HAVE_MSGR2_FEATURE(x, name) \
+	(((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name))
+
+DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1)   // msgr2.1
+
+#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1)
+
+#define CEPH_MSGR2_REQUIRED_FEATURES  (CEPH_MSGR2_FEATURE_REVISION_1)
+
+
 /*
  * Rollover-safe type and comparator for 32-bit sequence numbers.
  * Comparator returns -1, 0, or 1.
@@ -158,6 +188,24 @@ struct ceph_msg_header {
 	__le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
 
+struct ceph_msg_header2 {
+	__le64 seq;       /* message seq# for this session */
+	__le64 tid;       /* transaction id */
+	__le16 type;      /* message type */
+	__le16 priority;  /* priority.  higher value == higher priority */
+	__le16 version;   /* version of message encoding */
+
+	__le32 data_pre_padding_len;
+	__le16 data_off;  /* sender: include full offset;
+			     receiver: mask against ~PAGE_MASK */
+
+	__le64 ack_seq;
+	__u8 flags;
+	/* oldest code we think can decode this.  unknown if zero. */
+	__le16 compat_version;
+	__le16 reserved;
+} __attribute__ ((packed));
+
 #define CEPH_MSG_PRIO_LOW     64
 #define CEPH_MSG_PRIO_DEFAULT 127
 #define CEPH_MSG_PRIO_HIGH    196
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index f36f9a3a4e20..c5c4eef3a9ff 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -5,6 +5,9 @@ config CEPH_LIB
 	select LIBCRC32C
 	select CRYPTO_AES
 	select CRYPTO_CBC
+	select CRYPTO_GCM
+	select CRYPTO_HMAC
+	select CRYPTO_SHA256
 	select CRYPTO
 	select KEYS
 	default n
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index df02bd8d6c7b..8802a0c0155d 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -15,4 +15,4 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	auth_x.o \
 	ceph_strings.o ceph_hash.o \
 	pagevec.o snapshot.o string_table.o \
-	messenger_v1.o
+	messenger_v1.o messenger_v2.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 4a0f32b32cc6..6b315c8212b1 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -293,6 +293,39 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
 }
 EXPORT_SYMBOL(ceph_auth_is_authenticated);
 
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+			       struct ceph_auth_handshake *auth,
+			       int peer_type, bool force_new,
+			       int *proto, int *pref_mode, int *fallb_mode)
+{
+	int ret;
+
+	mutex_lock(&ac->mutex);
+	if (force_new && auth->authorizer) {
+		ceph_auth_destroy_authorizer(auth->authorizer);
+		auth->authorizer = NULL;
+	}
+	if (!auth->authorizer)
+		ret = ac->ops->create_authorizer(ac, peer_type, auth);
+	else if (ac->ops->update_authorizer)
+		ret = ac->ops->update_authorizer(ac, peer_type, auth);
+	else
+		ret = 0;
+	if (ret)
+		goto out;
+
+	*proto = ac->protocol;
+	if (pref_mode && fallb_mode) {
+		*pref_mode = ac->preferred_mode;
+		*fallb_mode = ac->fallback_mode;
+	}
+
+out:
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
+
 int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
 				int peer_type,
 				struct ceph_auth_handshake *auth)
@@ -369,3 +402,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
 	mutex_unlock(&ac->mutex);
 }
 EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++) {
+		if (arr[i] == val)
+			return true;
+	}
+
+	return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+	WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+	if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+		ceph_encode_32_safe(p, end, 2, e_range);
+		ceph_encode_32_safe(p, end, pref_mode, e_range);
+		ceph_encode_32_safe(p, end, fallb_mode, e_range);
+	} else {
+		ceph_encode_32_safe(p, end, 1, e_range);
+		ceph_encode_32_safe(p, end, pref_mode, e_range);
+	}
+
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+	int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+	void *end = buf + buf_len;
+	void *lenp;
+	void *p;
+	int ret;
+
+	mutex_lock(&ac->mutex);
+	if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+		ret = init_protocol(ac, proto);
+		if (ret) {
+			pr_err("auth protocol '%s' init failed: %d\n",
+			       ceph_auth_proto_name(proto), ret);
+			goto out;
+		}
+	} else {
+		WARN_ON(ac->protocol != proto);
+		ac->ops->reset(ac);
+	}
+
+	p = buf;
+	ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+	ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+	if (ret)
+		goto out;
+
+	lenp = p;
+	p += 4;  /* space for len */
+
+	ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+	ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+	if (ret)
+		goto out;
+
+	ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+	ceph_encode_32(&lenp, p - lenp - 4);
+	ret = p - buf;
+
+out:
+	mutex_unlock(&ac->mutex);
+	return ret;
+
+e_range:
+	ret = -ERANGE;
+	goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+				int reply_len, void *buf, int buf_len)
+{
+	int ret;
+
+	mutex_lock(&ac->mutex);
+	ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+				    NULL, NULL, NULL, NULL);
+	if (ret == -EAGAIN)
+		ret = build_request(ac, false, buf, buf_len);
+	else
+		WARN_ON(ret >= 0);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len)
+{
+	int ret;
+
+	mutex_lock(&ac->mutex);
+	if (global_id && ac->global_id != global_id) {
+		dout("%s global_id %llu -> %llu\n", __func__, ac->global_id,
+		     global_id);
+		ac->global_id = global_id;
+	}
+
+	ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+				    session_key, session_key_len,
+				    con_secret, con_secret_len);
+	mutex_unlock(&ac->mutex);
+	return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+				 int used_proto, int result,
+				 const int *allowed_protos, int proto_cnt,
+				 const int *allowed_modes, int mode_cnt)
+{
+	mutex_lock(&ac->mutex);
+	WARN_ON(used_proto != ac->protocol);
+
+	if (result == -EOPNOTSUPP) {
+		if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+			pr_err("auth protocol '%s' not allowed\n",
+			       ceph_auth_proto_name(ac->protocol));
+			goto not_allowed;
+		}
+		if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+		    (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+		     !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+			pr_err("preferred mode '%s' not allowed\n",
+			       ceph_con_mode_name(ac->preferred_mode));
+			if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+				pr_err("no fallback mode\n");
+			else
+				pr_err("fallback mode '%s' not allowed\n",
+				       ceph_con_mode_name(ac->fallback_mode));
+			goto not_allowed;
+		}
+	}
+
+	WARN_ON(result == -EOPNOTSUPP || result >= 0);
+	pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+	       ceph_auth_proto_name(ac->protocol), result);
+
+	mutex_unlock(&ac->mutex);
+	return true;
+
+not_allowed:
+	mutex_unlock(&ac->mutex);
+	return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+			     struct ceph_auth_handshake *auth,
+			     int peer_type, void *buf, int *buf_len)
+{
+	void *end = buf + *buf_len;
+	int pref_mode, fallb_mode;
+	int proto;
+	void *p;
+	int ret;
+
+	ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+					 &pref_mode, &fallb_mode);
+	if (ret)
+		return ret;
+
+	p = buf;
+	ceph_encode_32_safe(&p, end, proto, e_range);
+	ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+	if (ret)
+		return ret;
+
+	ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+	*buf_len = p - buf;
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+				    struct ceph_auth_handshake *auth,
+				    void *reply, int reply_len,
+				    void *buf, int *buf_len)
+{
+	void *end = buf + *buf_len;
+	void *p;
+	int ret;
+
+	ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+						 reply, reply_len);
+	if (ret)
+		return ret;
+
+	p = buf;
+	ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+	*buf_len = p - buf;
+	return 0;
+
+e_range:
+	return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+				    struct ceph_auth_handshake *auth,
+				    void *reply, int reply_len,
+				    u8 *session_key, int *session_key_len,
+				    u8 *con_secret, int *con_secret_len)
+{
+	return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+		reply, reply_len, session_key, session_key_len,
+		con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+				     int peer_type, int used_proto, int result,
+				     const int *allowed_protos, int proto_cnt,
+				     const int *allowed_modes, int mode_cnt)
+{
+	mutex_lock(&ac->mutex);
+	WARN_ON(used_proto != ac->protocol);
+
+	if (result == -EOPNOTSUPP) {
+		if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+			pr_err("auth protocol '%s' not allowed by %s\n",
+			       ceph_auth_proto_name(ac->protocol),
+			       ceph_entity_type_name(peer_type));
+			goto not_allowed;
+		}
+		if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+		    (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+		     !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+			pr_err("preferred mode '%s' not allowed by %s\n",
+			       ceph_con_mode_name(ac->preferred_mode),
+			       ceph_entity_type_name(peer_type));
+			if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+				pr_err("no fallback mode\n");
+			else
+				pr_err("fallback mode '%s' not allowed by %s\n",
+				       ceph_con_mode_name(ac->fallback_mode),
+				       ceph_entity_type_name(peer_type));
+			goto not_allowed;
+		}
+	}
+
+	WARN_ON(result == -EOPNOTSUPP || result >= 0);
+	pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+	       ceph_auth_proto_name(ac->protocol),
+	       ceph_entity_type_name(peer_type), result);
+
+	if (ac->ops->invalidate_authorizer)
+		ac->ops->invalidate_authorizer(ac, peer_type);
+
+	mutex_unlock(&ac->mutex);
+	return true;
+
+not_allowed:
+	mutex_unlock(&ac->mutex);
+	return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
index 6429b6713507..b44f7651be04 100644
--- a/net/ceph/decode.c
+++ b/net/ceph/decode.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/ceph/ceph_debug.h>
 
+#include <linux/inet.h>
+
 #include <linux/ceph/decode.h>
 
 static int
@@ -138,3 +140,46 @@ e_inval:
 	return -EINVAL;
 }
 EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in sin;
+		struct sockaddr_in6 sin6;
+	} u;
+
+	switch (family) {
+	case AF_INET:
+		return sizeof(u.sin);
+	case AF_INET6:
+		return sizeof(u.sin6);
+	default:
+		return sizeof(u);
+	}
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+	sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+	int addr_len = get_sockaddr_encoding_len(family);
+
+	return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+	sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+	int addr_len = get_sockaddr_encoding_len(family);
+
+	ceph_encode_8(p, 1);  /* marker */
+	ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+				     sizeof(addr->nonce) +
+				     sizeof(u32) + addr_len);
+	ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+	ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+	ceph_encode_32(p, addr_len);
+	ceph_encode_16(p, family);
+	ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4fb3c33a7b03..57d043b382ed 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -195,8 +195,11 @@ EXPORT_SYMBOL(ceph_pr_addr);
 
 void ceph_encode_my_addr(struct ceph_messenger *msgr)
 {
-	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
-	ceph_encode_banner_addr(&msgr->my_enc_addr);
+	if (!ceph_msgr2(from_msgr(msgr))) {
+		memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+		       sizeof(msgr->my_enc_addr));
+		ceph_encode_banner_addr(&msgr->my_enc_addr);
+	}
 }
 
 /*
@@ -513,7 +516,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con)
 		con->out_msg = NULL;
 	}
 
-	ceph_con_v1_reset_protocol(con);
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		ceph_con_v2_reset_protocol(con);
+	else
+		ceph_con_v1_reset_protocol(con);
 }
 
 /*
@@ -526,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
 
 	ceph_msg_put(msg);
 }
+
 static void ceph_msg_remove_list(struct list_head *head)
 {
 	while (!list_empty(head)) {
@@ -547,7 +554,10 @@ void ceph_con_reset_session(struct ceph_connection *con)
 	con->in_seq = 0;
 	con->in_seq_acked = 0;
 
-	ceph_con_v1_reset_session(con);
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		ceph_con_v2_reset_session(con);
+	else
+		ceph_con_v1_reset_session(con);
 }
 
 /*
@@ -600,6 +610,9 @@ EXPORT_SYMBOL(ceph_con_open);
  */
 bool ceph_con_opened(struct ceph_connection *con)
 {
+	if (ceph_msgr2(from_msgr(con->msgr)))
+		return ceph_con_v2_opened(con);
+
 	return ceph_con_v1_opened(con);
 }
 
@@ -1302,7 +1315,16 @@ int ceph_parse_ips(const char *c, const char *end,
 		}
 
 		ceph_addr_set_port(&addr[i], port);
+		/*
+		 * We want the type to be set according to ms_mode
+		 * option, but options are normally parsed after mon
+		 * addresses.  Rather than complicating parsing, set
+		 * to LEGACY and override in build_initial_monmap()
+		 * for mon addresses and ceph_messenger_init() for
+		 * ip option.
+		 */
 		addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+		addr[i].nonce = 0;
 
 		dout("parse_ips got %s\n", ceph_pr_addr(&addr[i]));
 
@@ -1410,6 +1432,13 @@ static bool con_sock_closed(struct ceph_connection *con)
 	CASE(PREOPEN);
 	CASE(V1_BANNER);
 	CASE(V1_CONNECT_MSG);
+	CASE(V2_BANNER_PREFIX);
+	CASE(V2_BANNER_PAYLOAD);
+	CASE(V2_HELLO);
+	CASE(V2_AUTH);
+	CASE(V2_AUTH_SIGNATURE);
+	CASE(V2_SESSION_CONNECT);
+	CASE(V2_SESSION_RECONNECT);
 	CASE(OPEN);
 	CASE(STANDBY);
 	default:
@@ -1494,7 +1523,10 @@ static void ceph_con_workfn(struct work_struct *work)
 			BUG_ON(con->sock);
 		}
 
-		ret = ceph_con_v1_try_read(con);
+		if (ceph_msgr2(from_msgr(con->msgr)))
+			ret = ceph_con_v2_try_read(con);
+		else
+			ret = ceph_con_v1_try_read(con);
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
@@ -1504,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work)
 			break;
 		}
 
-		ret = ceph_con_v1_try_write(con);
+		if (ceph_msgr2(from_msgr(con->msgr)))
+			ret = ceph_con_v2_try_write(con);
+		else
+			ret = ceph_con_v1_try_write(con);
 		if (ret < 0) {
 			if (ret == -EAGAIN)
 				continue;
@@ -1538,9 +1573,8 @@ static void con_fault(struct ceph_connection *con)
 		ceph_pr_addr(&con->peer_addr), con->error_msg);
 	con->error_msg = NULL;
 
-	WARN_ON(con->state != CEPH_CON_S_V1_BANNER &&
-	       con->state != CEPH_CON_S_V1_CONNECT_MSG &&
-	       con->state != CEPH_CON_S_OPEN);
+	WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+		con->state == CEPH_CON_S_CLOSED);
 
 	ceph_con_reset_protocol(con);
 
@@ -1596,7 +1630,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 		ceph_addr_set_port(&msgr->inst.addr, 0);
 	}
 
-	msgr->inst.addr.type = 0;
+	/*
+	 * Since nautilus, clients are identified using type ANY.
+	 * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+	 */
+	msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
 
 	/* generate a random non-zero nonce */
 	do {
@@ -1706,7 +1744,10 @@ void ceph_msg_revoke(struct ceph_msg *msg)
 	if (con->out_msg == msg) {
 		WARN_ON(con->state != CEPH_CON_S_OPEN);
 		dout("%s con %p msg %p was sending\n", __func__, con, msg);
-		ceph_con_v1_revoke(con);
+		if (ceph_msgr2(from_msgr(con->msgr)))
+			ceph_con_v2_revoke(con);
+		else
+			ceph_con_v1_revoke(con);
 		ceph_msg_put(con->out_msg);
 		con->out_msg = NULL;
 	} else {
@@ -1732,7 +1773,10 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
 	if (con->in_msg == msg) {
 		WARN_ON(con->state != CEPH_CON_S_OPEN);
 		dout("%s con %p msg %p was recving\n", __func__, con, msg);
-		ceph_con_v1_revoke_incoming(con);
+		if (ceph_msgr2(from_msgr(con->msgr)))
+			ceph_con_v2_revoke_incoming(con);
+		else
+			ceph_con_v1_revoke_incoming(con);
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
 	} else {
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..5e38c847317b
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3443 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/algapi.h>  /* for crypto_memneq() */
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h"  /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+#define FRAME_TAG_HELLO			1
+#define FRAME_TAG_AUTH_REQUEST		2
+#define FRAME_TAG_AUTH_BAD_METHOD	3
+#define FRAME_TAG_AUTH_REPLY_MORE	4
+#define FRAME_TAG_AUTH_REQUEST_MORE	5
+#define FRAME_TAG_AUTH_DONE		6
+#define FRAME_TAG_AUTH_SIGNATURE	7
+#define FRAME_TAG_CLIENT_IDENT		8
+#define FRAME_TAG_SERVER_IDENT		9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT	11
+#define FRAME_TAG_SESSION_RESET		12
+#define FRAME_TAG_SESSION_RETRY		13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL	14
+#define FRAME_TAG_SESSION_RECONNECT_OK	15
+#define FRAME_TAG_WAIT			16
+#define FRAME_TAG_MESSAGE		17
+#define FRAME_TAG_KEEPALIVE2		18
+#define FRAME_TAG_KEEPALIVE2_ACK	19
+#define FRAME_TAG_ACK			20
+
+#define FRAME_LATE_STATUS_ABORTED	0x1
+#define FRAME_LATE_STATUS_COMPLETE	0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK	0xf
+
+#define IN_S_HANDLE_PREAMBLE		1
+#define IN_S_HANDLE_CONTROL		2
+#define IN_S_HANDLE_CONTROL_REMAINDER	3
+#define IN_S_PREPARE_READ_DATA		4
+#define IN_S_PREPARE_READ_DATA_CONT	5
+#define IN_S_HANDLE_EPILOGUE		6
+#define IN_S_FINISH_SKIP		7
+
+#define OUT_S_QUEUE_DATA		1
+#define OUT_S_QUEUE_DATA_CONT		2
+#define OUT_S_QUEUE_ENC_PAGE		3
+#define OUT_S_QUEUE_ZEROS		4
+#define OUT_S_FINISH_MESSAGE		5
+#define OUT_S_GET_NEXT			6
+
+#define CTRL_BODY(p)	((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p)	((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p)	(FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p)	(MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
+
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+	int ret;
+
+	msg.msg_iter = *it;
+	while (iov_iter_count(it)) {
+		ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
+				ret = 0;
+			return ret;
+		}
+
+		iov_iter_advance(it, ret);
+	}
+
+	WARN_ON(msg_data_left(&msg));
+	return 1;
+}
+
+/*
+ * Read as much as possible.
+ *
+ * Return:
+ *   1 - done, nothing (else) to read
+ *   0 - socket is empty, need to wait
+ *  <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p %s %zu\n", __func__, con,
+	     iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need",
+	     iov_iter_count(&con->v2.in_iter));
+	ret = do_recvmsg(con->sock, &con->v2.in_iter);
+	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+	     iov_iter_count(&con->v2.in_iter));
+	return ret;
+}
+
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+	int ret;
+
+	msg.msg_iter = *it;
+	while (iov_iter_count(it)) {
+		ret = sock_sendmsg(sock, &msg);
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
+				ret = 0;
+			return ret;
+		}
+
+		iov_iter_advance(it, ret);
+	}
+
+	WARN_ON(msg_data_left(&msg));
+	return 1;
+}
+
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+	struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+	struct bio_vec bv;
+	int ret;
+
+	if (WARN_ON(!iov_iter_is_bvec(it)))
+		return -EINVAL;
+
+	while (iov_iter_count(it)) {
+		/* iov_iter_iovec() for ITER_BVEC */
+		bv.bv_page = it->bvec->bv_page;
+		bv.bv_offset = it->bvec->bv_offset + it->iov_offset;
+		bv.bv_len = min(iov_iter_count(it),
+				it->bvec->bv_len - it->iov_offset);
+
+		/*
+		 * sendpage cannot properly handle pages with
+		 * page_count == 0, we need to fall back to sendmsg if
+		 * that's the case.
+		 *
+		 * Same goes for slab pages: skb_can_coalesce() allows
+		 * coalescing neighboring slab objects into a single frag
+		 * which triggers one of hardened usercopy checks.
+		 */
+		if (sendpage_ok(bv.bv_page)) {
+			ret = sock->ops->sendpage(sock, bv.bv_page,
+						  bv.bv_offset, bv.bv_len,
+						  CEPH_MSG_FLAGS);
+		} else {
+			iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len);
+			ret = sock_sendmsg(sock, &msg);
+		}
+		if (ret <= 0) {
+			if (ret == -EAGAIN)
+				ret = 0;
+			return ret;
+		}
+
+		iov_iter_advance(it, ret);
+	}
+
+	return 1;
+}
+
+/*
+ * Write as much as possible.  The socket is expected to be corked,
+ * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here.
+ *
+ * Return:
+ *   1 - done, nothing (else) to write
+ *   0 - socket is full, need to wait
+ *  <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+	     iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage);
+	if (con->v2.out_iter_sendpage)
+		ret = do_try_sendpage(con->sock, &con->v2.out_iter);
+	else
+		ret = do_sendmsg(con->sock, &con->v2.out_iter);
+	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+	     iov_iter_count(&con->v2.out_iter));
+	return ret;
+}
+
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+	con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf;
+	con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len;
+	con->v2.in_kvec_cnt++;
+
+	con->v2.in_iter.nr_segs++;
+	con->v2.in_iter.count += len;
+}
+
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+	WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+	con->v2.in_kvec_cnt = 0;
+	iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0);
+}
+
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+	WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+	con->v2.in_bvec = *bv;
+	iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+	WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+	dout("%s con %p len %d\n", __func__, con, len);
+	iov_iter_discard(&con->v2.in_iter, READ, len);
+}
+
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+	WARN_ON(con->v2.out_zero);
+
+	con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf;
+	con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len;
+	con->v2.out_kvec_cnt++;
+
+	con->v2.out_iter.nr_segs++;
+	con->v2.out_iter.count += len;
+}
+
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+	WARN_ON(iov_iter_count(&con->v2.out_iter));
+	WARN_ON(con->v2.out_zero);
+
+	con->v2.out_kvec_cnt = 0;
+
+	iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0);
+	con->v2.out_iter_sendpage = false;
+}
+
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+			 bool zerocopy)
+{
+	WARN_ON(iov_iter_count(&con->v2.out_iter));
+	WARN_ON(con->v2.out_zero);
+
+	con->v2.out_bvec = *bv;
+	con->v2.out_iter_sendpage = zerocopy;
+	iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+		      con->v2.out_bvec.bv_len);
+}
+
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+	WARN_ON(iov_iter_count(&con->v2.out_iter));
+	WARN_ON(!con->v2.out_zero);
+
+	con->v2.out_bvec.bv_page = ceph_zero_page;
+	con->v2.out_bvec.bv_offset = 0;
+	con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE);
+	con->v2.out_iter_sendpage = true;
+	iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1,
+		      con->v2.out_bvec.bv_len);
+}
+
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+	dout("%s con %p len %d\n", __func__, con, len);
+	con->v2.out_zero += len;
+}
+
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+	void *buf;
+
+	dout("%s con %p len %d\n", __func__, con, len);
+
+	if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+		return NULL;
+
+	buf = ceph_kvmalloc(len, GFP_NOIO);
+	if (!buf)
+		return NULL;
+
+	con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+	return buf;
+}
+
+static void free_conn_bufs(struct ceph_connection *con)
+{
+	while (con->v2.conn_buf_cnt)
+		kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]);
+}
+
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+	con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf;
+	con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len;
+	con->v2.in_sign_kvec_cnt++;
+}
+
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+	con->v2.in_sign_kvec_cnt = 0;
+}
+
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+	con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf;
+	con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len;
+	con->v2.out_sign_kvec_cnt++;
+}
+
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+	con->v2.out_sign_kvec_cnt = 0;
+}
+
+static bool con_secure(struct ceph_connection *con)
+{
+	return con->v2.con_mode == CEPH_CON_MODE_SECURE;
+}
+
+static int front_len(const struct ceph_msg *msg)
+{
+	return le32_to_cpu(msg->hdr.front_len);
+}
+
+static int middle_len(const struct ceph_msg *msg)
+{
+	return le32_to_cpu(msg->hdr.middle_len);
+}
+
+static int data_len(const struct ceph_msg *msg)
+{
+	return le32_to_cpu(msg->hdr.data_len);
+}
+
+static bool need_padding(int len)
+{
+	return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padded_len(int len)
+{
+	return ALIGN(len, CEPH_GCM_BLOCK_LEN);
+}
+
+static int padding_len(int len)
+{
+	return padded_len(len) - len;
+}
+
+/* preamble + control segment */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+	int head_len;
+	int rem_len;
+
+	if (secure) {
+		head_len = CEPH_PREAMBLE_SECURE_LEN;
+		if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+			rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+			head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN;
+		}
+	} else {
+		head_len = CEPH_PREAMBLE_PLAIN_LEN;
+		if (ctrl_len)
+			head_len += ctrl_len + CEPH_CRC_LEN;
+	}
+	return head_len;
+}
+
+/* front, middle and data segments + epilogue */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+			     bool secure)
+{
+	if (!front_len && !middle_len && !data_len)
+		return 0;
+
+	if (!secure)
+		return front_len + middle_len + data_len +
+		       CEPH_EPILOGUE_PLAIN_LEN;
+
+	return padded_len(front_len) + padded_len(middle_len) +
+	       padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+}
+
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+	return __tail_onwire_len(front_len(msg), middle_len(msg),
+				 data_len(msg), secure);
+}
+
+/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */
+#define MESSAGE_HEAD_PLAIN_LEN	(CEPH_PREAMBLE_PLAIN_LEN +		\
+				 sizeof(struct ceph_msg_header2) +	\
+				 CEPH_CRC_LEN)
+
+static const int frame_aligns[] = {
+	sizeof(void *),
+	sizeof(void *),
+	sizeof(void *),
+	PAGE_SIZE
+};
+
+/*
+ * Discards trailing empty segments, unless there is just one segment.
+ * A frame always has at least one (possibly empty) segment.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+	int i;
+
+	for (i = len_cnt - 1; i >= 0; i--) {
+		if (lens[i])
+			return i + 1;
+	}
+
+	return 1;
+}
+
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+			    const int *lens, int len_cnt)
+{
+	int i;
+
+	memset(desc, 0, sizeof(*desc));
+
+	desc->fd_tag = tag;
+	desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+	BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+	for (i = 0; i < desc->fd_seg_cnt; i++) {
+		desc->fd_lens[i] = lens[i];
+		desc->fd_aligns[i] = frame_aligns[i];
+	}
+}
+
+/*
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+	void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+	void *start = p;
+	int i;
+
+	memset(p, 0, CEPH_PREAMBLE_LEN);
+
+	ceph_encode_8(&p, desc->fd_tag);
+	ceph_encode_8(&p, desc->fd_seg_cnt);
+	for (i = 0; i < desc->fd_seg_cnt; i++) {
+		ceph_encode_32(&p, desc->fd_lens[i]);
+		ceph_encode_16(&p, desc->fd_aligns[i]);
+	}
+
+	put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
+
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+	void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+	u32 crc, expected_crc;
+	int i;
+
+	crc = crc32c(0, p, crcp - p);
+	expected_crc = get_unaligned_le32(crcp);
+	if (crc != expected_crc) {
+		pr_err("bad preamble crc, calculated %u, expected %u\n",
+		       crc, expected_crc);
+		return -EBADMSG;
+	}
+
+	memset(desc, 0, sizeof(*desc));
+
+	desc->fd_tag = ceph_decode_8(&p);
+	desc->fd_seg_cnt = ceph_decode_8(&p);
+	if (desc->fd_seg_cnt < 1 ||
+	    desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+		pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+		return -EINVAL;
+	}
+	for (i = 0; i < desc->fd_seg_cnt; i++) {
+		desc->fd_lens[i] = ceph_decode_32(&p);
+		desc->fd_aligns[i] = ceph_decode_16(&p);
+	}
+
+	/*
+	 * This would fire for FRAME_TAG_WAIT (it has one empty
+	 * segment), but we should never get it as client.
+	 */
+	if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+		pr_err("last segment empty\n");
+		return -EINVAL;
+	}
+
+	if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+		pr_err("control segment too big %d\n", desc->fd_lens[0]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+		pr_err("front segment too big %d\n", desc->fd_lens[1]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+		pr_err("middle segment too big %d\n", desc->fd_lens[2]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+		pr_err("data segment too big %d\n", desc->fd_lens[3]);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+	con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+						 FRAME_LATE_STATUS_COMPLETE;
+	cpu_to_le32s(&con->v2.out_epil.front_crc);
+	cpu_to_le32s(&con->v2.out_epil.middle_crc);
+	cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+	memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+	con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+						 FRAME_LATE_STATUS_COMPLETE;
+}
+
+static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
+			   u32 *data_crc)
+{
+	u8 late_status;
+
+	late_status = ceph_decode_8(&p);
+	if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
+			FRAME_LATE_STATUS_COMPLETE) {
+		/* we should never get an aborted message as client */
+		pr_err("bad late_status 0x%x\n", late_status);
+		return -EINVAL;
+	}
+
+	if (front_crc && middle_crc && data_crc) {
+		*front_crc = ceph_decode_32(&p);
+		*middle_crc = ceph_decode_32(&p);
+		*data_crc = ceph_decode_32(&p);
+	}
+
+	return 0;
+}
+
+static void fill_header(struct ceph_msg_header *hdr,
+			const struct ceph_msg_header2 *hdr2,
+			int front_len, int middle_len, int data_len,
+			const struct ceph_entity_name *peer_name)
+{
+	hdr->seq = hdr2->seq;
+	hdr->tid = hdr2->tid;
+	hdr->type = hdr2->type;
+	hdr->priority = hdr2->priority;
+	hdr->version = hdr2->version;
+	hdr->front_len = cpu_to_le32(front_len);
+	hdr->middle_len = cpu_to_le32(middle_len);
+	hdr->data_len = cpu_to_le32(data_len);
+	hdr->data_off = hdr2->data_off;
+	hdr->src = *peer_name;
+	hdr->compat_version = hdr2->compat_version;
+	hdr->reserved = 0;
+	hdr->crc = 0;
+}
+
+static void fill_header2(struct ceph_msg_header2 *hdr2,
+			 const struct ceph_msg_header *hdr, u64 ack_seq)
+{
+	hdr2->seq = hdr->seq;
+	hdr2->tid = hdr->tid;
+	hdr2->type = hdr->type;
+	hdr2->priority = hdr->priority;
+	hdr2->version = hdr->version;
+	hdr2->data_pre_padding_len = 0;
+	hdr2->data_off = hdr->data_off;
+	hdr2->ack_seq = cpu_to_le64(ack_seq);
+	hdr2->flags = 0;
+	hdr2->compat_version = hdr->compat_version;
+	hdr2->reserved = 0;
+}
+
+static int verify_control_crc(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	u32 crc, expected_crc;
+
+	WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
+	WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
+
+	crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
+	expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
+	if (crc != expected_crc) {
+		pr_err("bad control crc, calculated %u, expected %u\n",
+		       crc, expected_crc);
+		return -EBADMSG;
+	}
+
+	return 0;
+}
+
+static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
+				u32 middle_crc, u32 data_crc)
+{
+	if (front_len(con->in_msg)) {
+		con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
+					   front_len(con->in_msg));
+	} else {
+		WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
+		con->in_front_crc = -1;
+	}
+
+	if (middle_len(con->in_msg))
+		con->in_middle_crc = crc32c(-1,
+					    con->in_msg->middle->vec.iov_base,
+					    middle_len(con->in_msg));
+	else if (data_len(con->in_msg))
+		con->in_middle_crc = -1;
+	else
+		con->in_middle_crc = 0;
+
+	if (!data_len(con->in_msg))
+		con->in_data_crc = 0;
+
+	dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
+	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+
+	if (con->in_front_crc != front_crc) {
+		pr_err("bad front crc, calculated %u, expected %u\n",
+		       con->in_front_crc, front_crc);
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != middle_crc) {
+		pr_err("bad middle crc, calculated %u, expected %u\n",
+		       con->in_middle_crc, middle_crc);
+		return -EBADMSG;
+	}
+	if (con->in_data_crc != data_crc) {
+		pr_err("bad data crc, calculated %u, expected %u\n",
+		       con->in_data_crc, data_crc);
+		return -EBADMSG;
+	}
+
+	return 0;
+}
+
+static int setup_crypto(struct ceph_connection *con,
+			u8 *session_key, int session_key_len,
+			u8 *con_secret, int con_secret_len)
+{
+	unsigned int noio_flag;
+	void *p;
+	int ret;
+
+	dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
+	     __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
+	WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req);
+
+	if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
+	    con->v2.con_mode != CEPH_CON_MODE_SECURE) {
+		pr_err("bad con_mode %d\n", con->v2.con_mode);
+		return -EINVAL;
+	}
+
+	if (!session_key_len) {
+		WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
+		WARN_ON(con_secret_len);
+		return 0;  /* auth_none */
+	}
+
+	noio_flag = memalloc_noio_save();
+	con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
+	memalloc_noio_restore(noio_flag);
+	if (IS_ERR(con->v2.hmac_tfm)) {
+		ret = PTR_ERR(con->v2.hmac_tfm);
+		con->v2.hmac_tfm = NULL;
+		pr_err("failed to allocate hmac tfm context: %d\n", ret);
+		return ret;
+	}
+
+	WARN_ON((unsigned long)session_key &
+		crypto_shash_alignmask(con->v2.hmac_tfm));
+	ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key,
+				  session_key_len);
+	if (ret) {
+		pr_err("failed to set hmac key: %d\n", ret);
+		return ret;
+	}
+
+	if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
+		WARN_ON(con_secret_len);
+		return 0;  /* auth_x, plain mode */
+	}
+
+	if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
+		pr_err("con_secret too small %d\n", con_secret_len);
+		return -EINVAL;
+	}
+
+	noio_flag = memalloc_noio_save();
+	con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+	memalloc_noio_restore(noio_flag);
+	if (IS_ERR(con->v2.gcm_tfm)) {
+		ret = PTR_ERR(con->v2.gcm_tfm);
+		con->v2.gcm_tfm = NULL;
+		pr_err("failed to allocate gcm tfm context: %d\n", ret);
+		return ret;
+	}
+
+	p = con_secret;
+	WARN_ON((unsigned long)p & crypto_aead_alignmask(con->v2.gcm_tfm));
+	ret = crypto_aead_setkey(con->v2.gcm_tfm, p, CEPH_GCM_KEY_LEN);
+	if (ret) {
+		pr_err("failed to set gcm key: %d\n", ret);
+		return ret;
+	}
+
+	p += CEPH_GCM_KEY_LEN;
+	WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
+	ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
+	if (ret) {
+		pr_err("failed to set gcm tag size: %d\n", ret);
+		return ret;
+	}
+
+	con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
+	if (!con->v2.gcm_req) {
+		pr_err("failed to allocate gcm request\n");
+		return -ENOMEM;
+	}
+
+	crypto_init_wait(&con->v2.gcm_wait);
+	aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  crypto_req_done, &con->v2.gcm_wait);
+
+	memcpy(&con->v2.in_gcm_nonce, p, CEPH_GCM_IV_LEN);
+	memcpy(&con->v2.out_gcm_nonce, p + CEPH_GCM_IV_LEN, CEPH_GCM_IV_LEN);
+	return 0;  /* auth_x, secure mode */
+}
+
+static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs,
+		       int kvec_cnt, u8 *hmac)
+{
+	SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm);  /* tfm arg is ignored */
+	int ret;
+	int i;
+
+	dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con,
+	     con->v2.hmac_tfm, kvec_cnt);
+
+	if (!con->v2.hmac_tfm) {
+		memset(hmac, 0, SHA256_DIGEST_SIZE);
+		return 0;  /* auth_none */
+	}
+
+	desc->tfm = con->v2.hmac_tfm;
+	ret = crypto_shash_init(desc);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < kvec_cnt; i++) {
+		WARN_ON((unsigned long)kvecs[i].iov_base &
+			crypto_shash_alignmask(con->v2.hmac_tfm));
+		ret = crypto_shash_update(desc, kvecs[i].iov_base,
+					  kvecs[i].iov_len);
+		if (ret)
+			return ret;
+	}
+
+	ret = crypto_shash_final(desc, hmac);
+	if (ret)
+		return ret;
+
+	shash_desc_zero(desc);
+	return 0;  /* auth_x, both plain and secure modes */
+}
+
+static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
+{
+	u64 counter;
+
+	counter = le64_to_cpu(nonce->counter);
+	nonce->counter = cpu_to_le64(counter + 1);
+}
+
+static int gcm_crypt(struct ceph_connection *con, bool encrypt,
+		     struct scatterlist *src, struct scatterlist *dst,
+		     int src_len)
+{
+	struct ceph_gcm_nonce *nonce;
+	int ret;
+
+	nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
+
+	aead_request_set_ad(con->v2.gcm_req, 0);  /* no AAD */
+	aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
+	ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
+					crypto_aead_decrypt(con->v2.gcm_req),
+			      &con->v2.gcm_wait);
+	if (ret)
+		return ret;
+
+	gcm_inc_nonce(nonce);
+	return 0;
+}
+
+static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
+			struct bio_vec *bv)
+{
+	struct page *page;
+	size_t off, len;
+
+	WARN_ON(!cursor->total_resid);
+
+	/* skip zero-length data items */
+	while (!cursor->resid)
+		ceph_msg_data_advance(cursor, 0);
+
+	/* get a piece of data, cursor isn't advanced */
+	page = ceph_msg_data_next(cursor, &off, &len, NULL);
+
+	bv->bv_page = page;
+	bv->bv_offset = off;
+	bv->bv_len = len;
+}
+
+static int calc_sg_cnt(void *buf, int buf_len)
+{
+	int sg_cnt;
+
+	if (!buf_len)
+		return 0;
+
+	sg_cnt = need_padding(buf_len) ? 1 : 0;
+	if (is_vmalloc_addr(buf)) {
+		WARN_ON(offset_in_page(buf));
+		sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
+	} else {
+		sg_cnt++;
+	}
+
+	return sg_cnt;
+}
+
+static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
+{
+	int data_len = cursor->total_resid;
+	struct bio_vec bv;
+	int sg_cnt;
+
+	if (!data_len)
+		return 0;
+
+	sg_cnt = need_padding(data_len) ? 1 : 0;
+	do {
+		get_bvec_at(cursor, &bv);
+		sg_cnt++;
+
+		ceph_msg_data_advance(cursor, bv.bv_len);
+	} while (cursor->total_resid);
+
+	return sg_cnt;
+}
+
+static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
+{
+	void *end = buf + buf_len;
+	struct page *page;
+	int len;
+	void *p;
+
+	if (!buf_len)
+		return;
+
+	if (is_vmalloc_addr(buf)) {
+		p = buf;
+		do {
+			page = vmalloc_to_page(p);
+			len = min_t(int, end - p, PAGE_SIZE);
+			WARN_ON(!page || !len || offset_in_page(p));
+			sg_set_page(*sg, page, len, 0);
+			*sg = sg_next(*sg);
+			p += len;
+		} while (p != end);
+	} else {
+		sg_set_buf(*sg, buf, buf_len);
+		*sg = sg_next(*sg);
+	}
+
+	if (need_padding(buf_len)) {
+		sg_set_buf(*sg, pad, padding_len(buf_len));
+		*sg = sg_next(*sg);
+	}
+}
+
+static void init_sgs_cursor(struct scatterlist **sg,
+			    struct ceph_msg_data_cursor *cursor, u8 *pad)
+{
+	int data_len = cursor->total_resid;
+	struct bio_vec bv;
+
+	if (!data_len)
+		return;
+
+	do {
+		get_bvec_at(cursor, &bv);
+		sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+		*sg = sg_next(*sg);
+
+		ceph_msg_data_advance(cursor, bv.bv_len);
+	} while (cursor->total_resid);
+
+	if (need_padding(data_len)) {
+		sg_set_buf(*sg, pad, padding_len(data_len));
+		*sg = sg_next(*sg);
+	}
+}
+
+static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
+			     u8 *front_pad, u8 *middle_pad, u8 *data_pad,
+			     void *epilogue, bool add_tag)
+{
+	struct ceph_msg_data_cursor cursor;
+	struct scatterlist *cur_sg;
+	int sg_cnt;
+	int ret;
+
+	if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+		return 0;
+
+	sg_cnt = 1;  /* epilogue + [auth tag] */
+	if (front_len(msg))
+		sg_cnt += calc_sg_cnt(msg->front.iov_base,
+				      front_len(msg));
+	if (middle_len(msg))
+		sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
+				      middle_len(msg));
+	if (data_len(msg)) {
+		ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+		sg_cnt += calc_sg_cnt_cursor(&cursor);
+	}
+
+	ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	cur_sg = sgt->sgl;
+	if (front_len(msg))
+		init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
+			 front_pad);
+	if (middle_len(msg))
+		init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
+			 middle_pad);
+	if (data_len(msg)) {
+		ceph_msg_data_cursor_init(&cursor, msg, data_len(msg));
+		init_sgs_cursor(&cur_sg, &cursor, data_pad);
+	}
+
+	WARN_ON(!sg_is_last(cur_sg));
+	sg_set_buf(cur_sg, epilogue,
+		   CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
+	return 0;
+}
+
+static int decrypt_preamble(struct ceph_connection *con)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
+	return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
+}
+
+static int decrypt_control_remainder(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+	int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
+	struct scatterlist sgs[2];
+
+	WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
+	WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
+
+	sg_init_table(sgs, 2);
+	sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
+	sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
+
+	return gcm_crypt(con, false, sgs, sgs,
+			 padded_len(rem_len) + CEPH_GCM_TAG_LEN);
+}
+
+static int decrypt_message(struct ceph_connection *con)
+{
+	struct sg_table sgt = {};
+	int ret;
+
+	ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
+			MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
+			con->v2.in_buf, true);
+	if (ret)
+		goto out;
+
+	ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl,
+			tail_onwire_len(con->in_msg, true));
+
+out:
+	sg_free_table(&sgt);
+	return ret;
+}
+
+static int prepare_banner(struct ceph_connection *con)
+{
+	int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
+	void *buf, *p;
+
+	buf = alloc_conn_buf(con, buf_len);
+	if (!buf)
+		return -ENOMEM;
+
+	p = buf;
+	ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
+	ceph_encode_16(&p, sizeof(u64) + sizeof(u64));
+	ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES);
+	ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES);
+	WARN_ON(p != buf + buf_len);
+
+	add_out_kvec(con, buf, buf_len);
+	add_out_sign_kvec(con, buf, buf_len);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+/*
+ * base:
+ *   preamble
+ *   control body (ctrl_len bytes)
+ *   space for control crc
+ *
+ * extdata (optional):
+ *   control body (extdata_len bytes)
+ *
+ * Compute control crc and gather base and extdata into:
+ *
+ *   preamble
+ *   control body (ctrl_len + extdata_len bytes)
+ *   control crc
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static void prepare_head_plain(struct ceph_connection *con, void *base,
+			       int ctrl_len, void *extdata, int extdata_len,
+			       bool to_be_signed)
+{
+	int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
+	void *crcp = base + base_len - CEPH_CRC_LEN;
+	u32 crc;
+
+	crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
+	if (extdata_len)
+		crc = crc32c(crc, extdata, extdata_len);
+	put_unaligned_le32(crc, crcp);
+
+	if (!extdata_len) {
+		add_out_kvec(con, base, base_len);
+		if (to_be_signed)
+			add_out_sign_kvec(con, base, base_len);
+		return;
+	}
+
+	add_out_kvec(con, base, crcp - base);
+	add_out_kvec(con, extdata, extdata_len);
+	add_out_kvec(con, crcp, CEPH_CRC_LEN);
+	if (to_be_signed) {
+		add_out_sign_kvec(con, base, crcp - base);
+		add_out_sign_kvec(con, extdata, extdata_len);
+		add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
+	}
+}
+
+static int prepare_head_secure_small(struct ceph_connection *con,
+				     void *base, int ctrl_len)
+{
+	struct scatterlist sg;
+	int ret;
+
+	/* inline buffer padding? */
+	if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
+		memset(CTRL_BODY(base) + ctrl_len, 0,
+		       CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
+
+	sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
+	ret = gcm_crypt(con, true, &sg, &sg,
+			CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
+	if (ret)
+		return ret;
+
+	add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
+	return 0;
+}
+
+/*
+ * base:
+ *   preamble
+ *   control body (ctrl_len bytes)
+ *   space for padding, if needed
+ *   space for control remainder auth tag
+ *   space for preamble auth tag
+ *
+ * Encrypt preamble and the inline portion, then encrypt the remainder
+ * and gather into:
+ *
+ *   preamble
+ *   control body (48 bytes)
+ *   preamble auth tag
+ *   control body (ctrl_len - 48 bytes)
+ *   zero padding, if needed
+ *   control remainder auth tag
+ *
+ * Preamble should already be encoded at the start of base.
+ */
+static int prepare_head_secure_big(struct ceph_connection *con,
+				   void *base, int ctrl_len)
+{
+	int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+	void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
+	void *rem_tag = rem + padded_len(rem_len);
+	void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
+	struct scatterlist sgs[2];
+	int ret;
+
+	sg_init_table(sgs, 2);
+	sg_set_buf(&sgs[0], base, rem - base);
+	sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
+	ret = gcm_crypt(con, true, sgs, sgs, rem - base);
+	if (ret)
+		return ret;
+
+	/* control remainder padding? */
+	if (need_padding(rem_len))
+		memset(rem + rem_len, 0, padding_len(rem_len));
+
+	sg_init_one(&sgs[0], rem, pmbl_tag - rem);
+	ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
+	if (ret)
+		return ret;
+
+	add_out_kvec(con, base, rem - base);
+	add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
+	add_out_kvec(con, rem, pmbl_tag - rem);
+	return 0;
+}
+
+static int __prepare_control(struct ceph_connection *con, int tag,
+			     void *base, int ctrl_len, void *extdata,
+			     int extdata_len, bool to_be_signed)
+{
+	int total_len = ctrl_len + extdata_len;
+	struct ceph_frame_desc desc;
+	int ret;
+
+	dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
+	     total_len, ctrl_len, extdata_len);
+
+	/* extdata may be vmalloc'ed but not base */
+	if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
+		return -EINVAL;
+
+	init_frame_desc(&desc, tag, &total_len, 1);
+	encode_preamble(&desc, base);
+
+	if (con_secure(con)) {
+		if (WARN_ON(extdata_len || to_be_signed))
+			return -EINVAL;
+
+		if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+			/* fully inlined, inline buffer may need padding */
+			ret = prepare_head_secure_small(con, base, ctrl_len);
+		else
+			/* partially inlined, inline buffer is full */
+			ret = prepare_head_secure_big(con, base, ctrl_len);
+		if (ret)
+			return ret;
+	} else {
+		prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
+				   to_be_signed);
+	}
+
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+static int prepare_control(struct ceph_connection *con, int tag,
+			   void *base, int ctrl_len)
+{
+	return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false);
+}
+
+static int prepare_hello(struct ceph_connection *con)
+{
+	void *buf, *p;
+	int ctrl_len;
+
+	ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
+	buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+	if (!buf)
+		return -ENOMEM;
+
+	p = CTRL_BODY(buf);
+	ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_entity_addr(&p, &con->peer_addr);
+	WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+	return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len,
+				 NULL, 0, true);
+}
+
+/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
+#define AUTH_BUF_LEN	(512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
+
+static int prepare_auth_request(struct ceph_connection *con)
+{
+	void *authorizer, *authorizer_copy;
+	int ctrl_len, authorizer_len;
+	void *buf;
+	int ret;
+
+	ctrl_len = AUTH_BUF_LEN;
+	buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+	if (!buf)
+		return -ENOMEM;
+
+	mutex_unlock(&con->mutex);
+	ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
+					 &authorizer, &authorizer_len);
+	mutex_lock(&con->mutex);
+	if (con->state != CEPH_CON_S_V2_HELLO) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
+	if (ret)
+		return ret;
+
+	authorizer_copy = alloc_conn_buf(con, authorizer_len);
+	if (!authorizer_copy)
+		return -ENOMEM;
+
+	memcpy(authorizer_copy, authorizer, authorizer_len);
+
+	return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
+				 authorizer_copy, authorizer_len, true);
+}
+
+static int prepare_auth_request_more(struct ceph_connection *con,
+				     void *reply, int reply_len)
+{
+	int ctrl_len, authorizer_len;
+	void *authorizer;
+	void *buf;
+	int ret;
+
+	ctrl_len = AUTH_BUF_LEN;
+	buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+	if (!buf)
+		return -ENOMEM;
+
+	mutex_unlock(&con->mutex);
+	ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
+					       CTRL_BODY(buf), &ctrl_len,
+					       &authorizer, &authorizer_len);
+	mutex_lock(&con->mutex);
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
+	if (ret)
+		return ret;
+
+	return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
+				 ctrl_len, authorizer, authorizer_len, true);
+}
+
+static int prepare_auth_signature(struct ceph_connection *con)
+{
+	void *buf;
+	int ret;
+
+	buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false));
+	if (!buf)
+		return -ENOMEM;
+
+	ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+			  CTRL_BODY(buf));
+	if (ret)
+		return ret;
+
+	return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf,
+			       SHA256_DIGEST_SIZE);
+}
+
+static int prepare_client_ident(struct ceph_connection *con)
+{
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+	struct ceph_client *client = from_msgr(con->msgr);
+	u64 global_id = ceph_client_gid(client);
+	void *buf, *p;
+	int ctrl_len;
+
+	WARN_ON(con->v2.server_cookie);
+	WARN_ON(con->v2.connect_seq);
+	WARN_ON(con->v2.peer_global_seq);
+
+	if (!con->v2.client_cookie) {
+		do {
+			get_random_bytes(&con->v2.client_cookie,
+					 sizeof(con->v2.client_cookie));
+		} while (!con->v2.client_cookie);
+		dout("%s con %p generated cookie 0x%llx\n", __func__, con,
+		     con->v2.client_cookie);
+	} else {
+		dout("%s con %p cookie already set 0x%llx\n", __func__, con,
+		     con->v2.client_cookie);
+	}
+
+	dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
+	     __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+	     ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
+	     global_id, con->v2.global_seq, client->supported_features,
+	     client->required_features, con->v2.client_cookie);
+
+	ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
+		   ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
+	buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+	if (!buf)
+		return -ENOMEM;
+
+	p = CTRL_BODY(buf);
+	ceph_encode_8(&p, 2);  /* addrvec marker */
+	ceph_encode_32(&p, 1);  /* addr_cnt */
+	ceph_encode_entity_addr(&p, my_addr);
+	ceph_encode_entity_addr(&p, &con->peer_addr);
+	ceph_encode_64(&p, global_id);
+	ceph_encode_64(&p, con->v2.global_seq);
+	ceph_encode_64(&p, client->supported_features);
+	ceph_encode_64(&p, client->required_features);
+	ceph_encode_64(&p, 0);  /* flags */
+	ceph_encode_64(&p, con->v2.client_cookie);
+	WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+	return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
+}
+
+static int prepare_session_reconnect(struct ceph_connection *con)
+{
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+	void *buf, *p;
+	int ctrl_len;
+
+	WARN_ON(!con->v2.client_cookie);
+	WARN_ON(!con->v2.server_cookie);
+	WARN_ON(!con->v2.connect_seq);
+	WARN_ON(!con->v2.peer_global_seq);
+
+	dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
+	     __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+	     con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
+	     con->v2.connect_seq, con->in_seq);
+
+	ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
+	buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+	if (!buf)
+		return -ENOMEM;
+
+	p = CTRL_BODY(buf);
+	ceph_encode_8(&p, 2);  /* entity_addrvec_t marker */
+	ceph_encode_32(&p, 1);  /* my_addrs len */
+	ceph_encode_entity_addr(&p, my_addr);
+	ceph_encode_64(&p, con->v2.client_cookie);
+	ceph_encode_64(&p, con->v2.server_cookie);
+	ceph_encode_64(&p, con->v2.global_seq);
+	ceph_encode_64(&p, con->v2.connect_seq);
+	ceph_encode_64(&p, con->in_seq);
+	WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+	return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
+}
+
+static int prepare_keepalive2(struct ceph_connection *con)
+{
+	struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf);
+	struct timespec64 now;
+
+	ktime_get_real_ts64(&now);
+	dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec,
+	     now.tv_nsec);
+
+	ceph_encode_timespec64(ts, &now);
+
+	reset_out_kvecs(con);
+	return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
+			       sizeof(struct ceph_timespec));
+}
+
+static int prepare_ack(struct ceph_connection *con)
+{
+	void *p;
+
+	dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	p = CTRL_BODY(con->v2.out_buf);
+	ceph_encode_64(&p, con->in_seq_acked);
+
+	reset_out_kvecs(con);
+	return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
+}
+
+static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+	dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
+	     con->out_msg, aborted, con->v2.out_epil.front_crc,
+	     con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
+
+	encode_epilogue_plain(con, aborted);
+	add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN);
+}
+
+/*
+ * For "used" empty segments, crc is -1.  For unused (trailing)
+ * segments, crc is 0.
+ */
+static void prepare_message_plain(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	prepare_head_plain(con, con->v2.out_buf,
+			   sizeof(struct ceph_msg_header2), NULL, 0, false);
+
+	if (!front_len(msg) && !middle_len(msg)) {
+		if (!data_len(msg)) {
+			/*
+			 * Empty message: once the head is written,
+			 * we are done -- there is no epilogue.
+			 */
+			con->v2.out_state = OUT_S_FINISH_MESSAGE;
+			return;
+		}
+
+		con->v2.out_epil.front_crc = -1;
+		con->v2.out_epil.middle_crc = -1;
+		con->v2.out_state = OUT_S_QUEUE_DATA;
+		return;
+	}
+
+	if (front_len(msg)) {
+		con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
+						    front_len(msg));
+		add_out_kvec(con, msg->front.iov_base, front_len(msg));
+	} else {
+		/* middle (at least) is there, checked above */
+		con->v2.out_epil.front_crc = -1;
+	}
+
+	if (middle_len(msg)) {
+		con->v2.out_epil.middle_crc =
+			crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
+		add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+	} else {
+		con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
+	}
+
+	if (data_len(msg)) {
+		con->v2.out_state = OUT_S_QUEUE_DATA;
+	} else {
+		con->v2.out_epil.data_crc = 0;
+		prepare_epilogue_plain(con, false);
+		con->v2.out_state = OUT_S_FINISH_MESSAGE;
+	}
+}
+
+/*
+ * Unfortunately the kernel crypto API doesn't support streaming
+ * (piecewise) operation for AEAD algorithms, so we can't get away
+ * with a fixed size buffer and a couple sgs.  Instead, we have to
+ * allocate pages for the entire tail of the message (currently up
+ * to ~32M) and two sgs arrays (up to ~256K each)...
+ */
+static int prepare_message_secure(struct ceph_connection *con)
+{
+	void *zerop = page_address(ceph_zero_page);
+	struct sg_table enc_sgt = {};
+	struct sg_table sgt = {};
+	struct page **enc_pages;
+	int enc_page_cnt;
+	int tail_len;
+	int ret;
+
+	ret = prepare_head_secure_small(con, con->v2.out_buf,
+					sizeof(struct ceph_msg_header2));
+	if (ret)
+		return ret;
+
+	tail_len = tail_onwire_len(con->out_msg, true);
+	if (!tail_len) {
+		/*
+		 * Empty message: once the head is written,
+		 * we are done -- there is no epilogue.
+		 */
+		con->v2.out_state = OUT_S_FINISH_MESSAGE;
+		return 0;
+	}
+
+	encode_epilogue_secure(con, false);
+	ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop,
+				&con->v2.out_epil, false);
+	if (ret)
+		goto out;
+
+	enc_page_cnt = calc_pages_for(0, tail_len);
+	enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+	if (IS_ERR(enc_pages)) {
+		ret = PTR_ERR(enc_pages);
+		goto out;
+	}
+
+	WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
+	con->v2.out_enc_pages = enc_pages;
+	con->v2.out_enc_page_cnt = enc_page_cnt;
+	con->v2.out_enc_resid = tail_len;
+	con->v2.out_enc_i = 0;
+
+	ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
+					0, tail_len, GFP_NOIO);
+	if (ret)
+		goto out;
+
+	ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
+			tail_len - CEPH_GCM_TAG_LEN);
+	if (ret)
+		goto out;
+
+	dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
+	     con->out_msg, sgt.orig_nents, enc_page_cnt);
+	con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
+
+out:
+	sg_free_table(&sgt);
+	sg_free_table(&enc_sgt);
+	return ret;
+}
+
+static int prepare_message(struct ceph_connection *con)
+{
+	int lens[] = {
+		sizeof(struct ceph_msg_header2),
+		front_len(con->out_msg),
+		middle_len(con->out_msg),
+		data_len(con->out_msg)
+	};
+	struct ceph_frame_desc desc;
+	int ret;
+
+	dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
+	     con->out_msg, lens[0], lens[1], lens[2], lens[3]);
+
+	if (con->in_seq > con->in_seq_acked) {
+		dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+		     con->in_seq_acked, con->in_seq);
+		con->in_seq_acked = con->in_seq;
+	}
+
+	reset_out_kvecs(con);
+	init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4);
+	encode_preamble(&desc, con->v2.out_buf);
+	fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr,
+		     con->in_seq_acked);
+
+	if (con_secure(con)) {
+		ret = prepare_message_secure(con);
+		if (ret)
+			return ret;
+	} else {
+		prepare_message_plain(con);
+	}
+
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+static int prepare_read_banner_prefix(struct ceph_connection *con)
+{
+	void *buf;
+
+	buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN);
+	if (!buf)
+		return -ENOMEM;
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+	add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN);
+	con->state = CEPH_CON_S_V2_BANNER_PREFIX;
+	return 0;
+}
+
+static int prepare_read_banner_payload(struct ceph_connection *con,
+				       int payload_len)
+{
+	void *buf;
+
+	buf = alloc_conn_buf(con, payload_len);
+	if (!buf)
+		return -ENOMEM;
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, buf, payload_len);
+	add_in_sign_kvec(con, buf, payload_len);
+	con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
+	return 0;
+}
+
+static void prepare_read_preamble(struct ceph_connection *con)
+{
+	reset_in_kvecs(con);
+	add_in_kvec(con, con->v2.in_buf,
+		    con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN :
+				      CEPH_PREAMBLE_PLAIN_LEN);
+	con->v2.in_state = IN_S_HANDLE_PREAMBLE;
+}
+
+static int prepare_read_control(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	int head_len;
+	void *buf;
+
+	reset_in_kvecs(con);
+	if (con->state == CEPH_CON_S_V2_HELLO ||
+	    con->state == CEPH_CON_S_V2_AUTH) {
+		head_len = head_onwire_len(ctrl_len, false);
+		buf = alloc_conn_buf(con, head_len);
+		if (!buf)
+			return -ENOMEM;
+
+		/* preserve preamble */
+		memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
+
+		add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
+		add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
+		add_in_sign_kvec(con, buf, head_len);
+	} else {
+		if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+			buf = alloc_conn_buf(con, ctrl_len);
+			if (!buf)
+				return -ENOMEM;
+
+			add_in_kvec(con, buf, ctrl_len);
+		} else {
+			add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
+		}
+		add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
+	}
+	con->v2.in_state = IN_S_HANDLE_CONTROL;
+	return 0;
+}
+
+static int prepare_read_control_remainder(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+	void *buf;
+
+	buf = alloc_conn_buf(con, ctrl_len);
+	if (!buf)
+		return -ENOMEM;
+
+	memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len);
+	add_in_kvec(con, con->v2.in_buf,
+		    padding_len(rem_len) + CEPH_GCM_TAG_LEN);
+	con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
+	return 0;
+}
+
+static void prepare_read_data(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	if (!con_secure(con))
+		con->in_data_crc = -1;
+	ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
+				  data_len(con->in_msg));
+
+	get_bvec_at(&con->v2.in_cursor, &bv);
+	set_in_bvec(con, &bv);
+	con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+}
+
+static void prepare_read_data_cont(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	if (!con_secure(con))
+		con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+						    con->v2.in_bvec.bv_page,
+						    con->v2.in_bvec.bv_offset,
+						    con->v2.in_bvec.bv_len);
+
+	ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
+	if (con->v2.in_cursor.total_resid) {
+		get_bvec_at(&con->v2.in_cursor, &bv);
+		set_in_bvec(con, &bv);
+		WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
+		return;
+	}
+
+	/*
+	 * We've read all data.  Prepare to read data padding (if any)
+	 * and epilogue.
+	 */
+	reset_in_kvecs(con);
+	if (con_secure(con)) {
+		if (need_padding(data_len(con->in_msg)))
+			add_in_kvec(con, DATA_PAD(con->v2.in_buf),
+				    padding_len(data_len(con->in_msg)));
+		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN);
+	} else {
+		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+	}
+	con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+static void __finish_skip(struct ceph_connection *con)
+{
+	con->in_seq++;
+	prepare_read_preamble(con);
+}
+
+static void prepare_skip_message(struct ceph_connection *con)
+{
+	struct ceph_frame_desc *desc = &con->v2.in_desc;
+	int tail_len;
+
+	dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1],
+	     desc->fd_lens[2], desc->fd_lens[3]);
+
+	tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2],
+				     desc->fd_lens[3], con_secure(con));
+	if (!tail_len) {
+		__finish_skip(con);
+	} else {
+		set_in_skip(con, tail_len);
+		con->v2.in_state = IN_S_FINISH_SKIP;
+	}
+}
+
+static int process_banner_prefix(struct ceph_connection *con)
+{
+	int payload_len;
+	void *p;
+
+	WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
+
+	p = con->v2.in_kvecs[0].iov_base;
+	if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) {
+		if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN))
+			con->error_msg = "server is speaking msgr1 protocol";
+		else
+			con->error_msg = "protocol error, bad banner";
+		return -EINVAL;
+	}
+
+	p += CEPH_BANNER_V2_LEN;
+	payload_len = ceph_decode_16(&p);
+	dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+	return prepare_read_banner_payload(con, payload_len);
+}
+
+static int process_banner_payload(struct ceph_connection *con)
+{
+	void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
+	u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
+	u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
+	u64 server_feat, server_req_feat;
+	void *p;
+	int ret;
+
+	p = con->v2.in_kvecs[0].iov_base;
+	ceph_decode_64_safe(&p, end, server_feat, bad);
+	ceph_decode_64_safe(&p, end, server_req_feat, bad);
+
+	dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
+	     __func__, con, server_feat, server_req_feat);
+
+	if (req_feat & ~server_feat) {
+		pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+		       server_feat, req_feat & ~server_feat);
+		con->error_msg = "missing required protocol features";
+		return -EINVAL;
+	}
+	if (server_req_feat & ~feat) {
+		pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+		       feat, server_req_feat & ~feat);
+		con->error_msg = "missing required protocol features";
+		return -EINVAL;
+	}
+
+	/* no reset_out_kvecs() as our banner may still be pending */
+	ret = prepare_hello(con);
+	if (ret) {
+		pr_err("prepare_hello failed: %d\n", ret);
+		return ret;
+	}
+
+	con->state = CEPH_CON_S_V2_HELLO;
+	prepare_read_preamble(con);
+	return 0;
+
+bad:
+	pr_err("failed to decode banner payload\n");
+	return -EINVAL;
+}
+
+static int process_hello(struct ceph_connection *con, void *p, void *end)
+{
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+	struct ceph_entity_addr addr_for_me;
+	u8 entity_type;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_HELLO) {
+		con->error_msg = "protocol error, unexpected hello";
+		return -EINVAL;
+	}
+
+	ceph_decode_8_safe(&p, end, entity_type, bad);
+	ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
+	if (ret) {
+		pr_err("failed to decode addr_for_me: %d\n", ret);
+		return ret;
+	}
+
+	dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
+	     entity_type, ceph_pr_addr(&addr_for_me));
+
+	if (entity_type != con->peer_name.type) {
+		pr_err("bad peer type, want %d, got %d\n",
+		       con->peer_name.type, entity_type);
+		con->error_msg = "wrong peer at address";
+		return -EINVAL;
+	}
+
+	/*
+	 * Set our address to the address our first peer (i.e. monitor)
+	 * sees that we are connecting from.  If we are behind some sort
+	 * of NAT and want to be identified by some private (not NATed)
+	 * address, ip option should be used.
+	 */
+	if (ceph_addr_is_blank(my_addr)) {
+		memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
+		       sizeof(my_addr->in_addr));
+		ceph_addr_set_port(my_addr, 0);
+		dout("%s con %p set my addr %s, as seen by peer %s\n",
+		     __func__, con, ceph_pr_addr(my_addr),
+		     ceph_pr_addr(&con->peer_addr));
+	} else {
+		dout("%s con %p my addr already set %s\n",
+		     __func__, con, ceph_pr_addr(my_addr));
+	}
+
+	WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
+	WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
+	WARN_ON(!my_addr->nonce);
+
+	/* no reset_out_kvecs() as our hello may still be pending */
+	ret = prepare_auth_request(con);
+	if (ret) {
+		if (ret != -EAGAIN)
+			pr_err("prepare_auth_request failed: %d\n", ret);
+		return ret;
+	}
+
+	con->state = CEPH_CON_S_V2_AUTH;
+	return 0;
+
+bad:
+	pr_err("failed to decode hello\n");
+	return -EINVAL;
+}
+
+static int process_auth_bad_method(struct ceph_connection *con,
+				   void *p, void *end)
+{
+	int allowed_protos[8], allowed_modes[8];
+	int allowed_proto_cnt, allowed_mode_cnt;
+	int used_proto, result;
+	int ret;
+	int i;
+
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		con->error_msg = "protocol error, unexpected auth_bad_method";
+		return -EINVAL;
+	}
+
+	ceph_decode_32_safe(&p, end, used_proto, bad);
+	ceph_decode_32_safe(&p, end, result, bad);
+	dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
+	     result);
+
+	ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
+	if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
+		pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
+		return -EINVAL;
+	}
+	for (i = 0; i < allowed_proto_cnt; i++) {
+		ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
+		dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
+		     i, allowed_protos[i]);
+	}
+
+	ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
+	if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
+		pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
+		return -EINVAL;
+	}
+	for (i = 0; i < allowed_mode_cnt; i++) {
+		ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
+		dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
+		     i, allowed_modes[i]);
+	}
+
+	mutex_unlock(&con->mutex);
+	ret = con->ops->handle_auth_bad_method(con, used_proto, result,
+					       allowed_protos,
+					       allowed_proto_cnt,
+					       allowed_modes,
+					       allowed_mode_cnt);
+	mutex_lock(&con->mutex);
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
+	return ret;
+
+bad:
+	pr_err("failed to decode auth_bad_method\n");
+	return -EINVAL;
+}
+
+static int process_auth_reply_more(struct ceph_connection *con,
+				   void *p, void *end)
+{
+	int payload_len;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		con->error_msg = "protocol error, unexpected auth_reply_more";
+		return -EINVAL;
+	}
+
+	ceph_decode_32_safe(&p, end, payload_len, bad);
+	ceph_decode_need(&p, end, payload_len, bad);
+
+	dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+	reset_out_kvecs(con);
+	ret = prepare_auth_request_more(con, p, payload_len);
+	if (ret) {
+		if (ret != -EAGAIN)
+			pr_err("prepare_auth_request_more failed: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+
+bad:
+	pr_err("failed to decode auth_reply_more\n");
+	return -EINVAL;
+}
+
+static int process_auth_done(struct ceph_connection *con, void *p, void *end)
+{
+	u8 session_key[CEPH_KEY_LEN];
+	u8 con_secret[CEPH_MAX_CON_SECRET_LEN];
+	int session_key_len, con_secret_len;
+	int payload_len;
+	u64 global_id;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		con->error_msg = "protocol error, unexpected auth_done";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, global_id, bad);
+	ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
+	ceph_decode_32_safe(&p, end, payload_len, bad);
+
+	dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
+	     __func__, con, global_id, con->v2.con_mode, payload_len);
+
+	mutex_unlock(&con->mutex);
+	session_key_len = 0;
+	con_secret_len = 0;
+	ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
+					 session_key, &session_key_len,
+					 con_secret, &con_secret_len);
+	mutex_lock(&con->mutex);
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
+	if (ret)
+		return ret;
+
+	ret = setup_crypto(con, session_key, session_key_len, con_secret,
+			   con_secret_len);
+	if (ret)
+		return ret;
+
+	reset_out_kvecs(con);
+	ret = prepare_auth_signature(con);
+	if (ret) {
+		pr_err("prepare_auth_signature failed: %d\n", ret);
+		return ret;
+	}
+
+	con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
+	return 0;
+
+bad:
+	pr_err("failed to decode auth_done\n");
+	return -EINVAL;
+}
+
+static int process_auth_signature(struct ceph_connection *con,
+				  void *p, void *end)
+{
+	u8 hmac[SHA256_DIGEST_SIZE];
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
+		con->error_msg = "protocol error, unexpected auth_signature";
+		return -EINVAL;
+	}
+
+	ret = hmac_sha256(con, con->v2.out_sign_kvecs,
+			  con->v2.out_sign_kvec_cnt, hmac);
+	if (ret)
+		return ret;
+
+	ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
+	if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
+		con->error_msg = "integrity error, bad auth signature";
+		return -EBADMSG;
+	}
+
+	dout("%s con %p auth signature ok\n", __func__, con);
+
+	/* no reset_out_kvecs() as our auth_signature may still be pending */
+	if (!con->v2.server_cookie) {
+		ret = prepare_client_ident(con);
+		if (ret) {
+			pr_err("prepare_client_ident failed: %d\n", ret);
+			return ret;
+		}
+
+		con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+	} else {
+		ret = prepare_session_reconnect(con);
+		if (ret) {
+			pr_err("prepare_session_reconnect failed: %d\n", ret);
+			return ret;
+		}
+
+		con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
+	}
+
+	return 0;
+
+bad:
+	pr_err("failed to decode auth_signature\n");
+	return -EINVAL;
+}
+
+static int process_server_ident(struct ceph_connection *con,
+				void *p, void *end)
+{
+	struct ceph_client *client = from_msgr(con->msgr);
+	u64 features, required_features;
+	struct ceph_entity_addr addr;
+	u64 global_seq;
+	u64 global_id;
+	u64 cookie;
+	u64 flags;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+		con->error_msg = "protocol error, unexpected server_ident";
+		return -EINVAL;
+	}
+
+	ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
+	if (ret) {
+		pr_err("failed to decode server addrs: %d\n", ret);
+		return ret;
+	}
+
+	ceph_decode_64_safe(&p, end, global_id, bad);
+	ceph_decode_64_safe(&p, end, global_seq, bad);
+	ceph_decode_64_safe(&p, end, features, bad);
+	ceph_decode_64_safe(&p, end, required_features, bad);
+	ceph_decode_64_safe(&p, end, flags, bad);
+	ceph_decode_64_safe(&p, end, cookie, bad);
+
+	dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
+	     __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
+	     global_id, global_seq, features, required_features, flags, cookie);
+
+	/* is this who we intended to talk to? */
+	if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
+		pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
+		       ceph_pr_addr(&con->peer_addr),
+		       le32_to_cpu(con->peer_addr.nonce),
+		       ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
+		con->error_msg = "wrong peer at address";
+		return -EINVAL;
+	}
+
+	if (client->required_features & ~features) {
+		pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+		       features, client->required_features & ~features);
+		con->error_msg = "missing required protocol features";
+		return -EINVAL;
+	}
+
+	/*
+	 * Both name->type and name->num are set in ceph_con_open() but
+	 * name->num may be bogus in the initial monmap.  name->type is
+	 * verified in handle_hello().
+	 */
+	WARN_ON(!con->peer_name.type);
+	con->peer_name.num = cpu_to_le64(global_id);
+	con->v2.peer_global_seq = global_seq;
+	con->peer_features = features;
+	WARN_ON(required_features & ~client->supported_features);
+	con->v2.server_cookie = cookie;
+
+	if (flags & CEPH_MSG_CONNECT_LOSSY) {
+		ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+		WARN_ON(con->v2.server_cookie);
+	} else {
+		WARN_ON(!con->v2.server_cookie);
+	}
+
+	clear_in_sign_kvecs(con);
+	clear_out_sign_kvecs(con);
+	free_conn_bufs(con);
+	con->delay = 0;  /* reset backoff memory */
+
+	con->state = CEPH_CON_S_OPEN;
+	con->v2.out_state = OUT_S_GET_NEXT;
+	return 0;
+
+bad:
+	pr_err("failed to decode server_ident\n");
+	return -EINVAL;
+}
+
+static int process_ident_missing_features(struct ceph_connection *con,
+					  void *p, void *end)
+{
+	struct ceph_client *client = from_msgr(con->msgr);
+	u64 missing_features;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+		con->error_msg = "protocol error, unexpected ident_missing_features";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, missing_features, bad);
+	pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+	       client->supported_features, missing_features);
+	con->error_msg = "missing required protocol features";
+	return -EINVAL;
+
+bad:
+	pr_err("failed to decode ident_missing_features\n");
+	return -EINVAL;
+}
+
+static int process_session_reconnect_ok(struct ceph_connection *con,
+					void *p, void *end)
+{
+	u64 seq;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+		con->error_msg = "protocol error, unexpected session_reconnect_ok";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, seq, bad);
+
+	dout("%s con %p seq %llu\n", __func__, con, seq);
+	ceph_con_discard_requeued(con, seq);
+
+	clear_in_sign_kvecs(con);
+	clear_out_sign_kvecs(con);
+	free_conn_bufs(con);
+	con->delay = 0;  /* reset backoff memory */
+
+	con->state = CEPH_CON_S_OPEN;
+	con->v2.out_state = OUT_S_GET_NEXT;
+	return 0;
+
+bad:
+	pr_err("failed to decode session_reconnect_ok\n");
+	return -EINVAL;
+}
+
+static int process_session_retry(struct ceph_connection *con,
+				 void *p, void *end)
+{
+	u64 connect_seq;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+		con->error_msg = "protocol error, unexpected session_retry";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, connect_seq, bad);
+
+	dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
+	WARN_ON(connect_seq <= con->v2.connect_seq);
+	con->v2.connect_seq = connect_seq + 1;
+
+	free_conn_bufs(con);
+
+	reset_out_kvecs(con);
+	ret = prepare_session_reconnect(con);
+	if (ret) {
+		pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+
+bad:
+	pr_err("failed to decode session_retry\n");
+	return -EINVAL;
+}
+
+static int process_session_retry_global(struct ceph_connection *con,
+					void *p, void *end)
+{
+	u64 global_seq;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+		con->error_msg = "protocol error, unexpected session_retry_global";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, global_seq, bad);
+
+	dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
+	WARN_ON(global_seq <= con->v2.global_seq);
+	con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
+
+	free_conn_bufs(con);
+
+	reset_out_kvecs(con);
+	ret = prepare_session_reconnect(con);
+	if (ret) {
+		pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
+		return ret;
+	}
+
+	return 0;
+
+bad:
+	pr_err("failed to decode session_retry_global\n");
+	return -EINVAL;
+}
+
+static int process_session_reset(struct ceph_connection *con,
+				 void *p, void *end)
+{
+	bool full;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+		con->error_msg = "protocol error, unexpected session_reset";
+		return -EINVAL;
+	}
+
+	ceph_decode_8_safe(&p, end, full, bad);
+	if (!full) {
+		con->error_msg = "protocol error, bad session_reset";
+		return -EINVAL;
+	}
+
+	pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
+		ceph_pr_addr(&con->peer_addr));
+	ceph_con_reset_session(con);
+
+	mutex_unlock(&con->mutex);
+	if (con->ops->peer_reset)
+		con->ops->peer_reset(con);
+	mutex_lock(&con->mutex);
+	if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	free_conn_bufs(con);
+
+	reset_out_kvecs(con);
+	ret = prepare_client_ident(con);
+	if (ret) {
+		pr_err("prepare_client_ident (rst) failed: %d\n", ret);
+		return ret;
+	}
+
+	con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+	return 0;
+
+bad:
+	pr_err("failed to decode session_reset\n");
+	return -EINVAL;
+}
+
+static int process_keepalive2_ack(struct ceph_connection *con,
+				  void *p, void *end)
+{
+	if (con->state != CEPH_CON_S_OPEN) {
+		con->error_msg = "protocol error, unexpected keepalive2_ack";
+		return -EINVAL;
+	}
+
+	ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
+	ceph_decode_timespec64(&con->last_keepalive_ack, p);
+
+	dout("%s con %p timestamp %lld.%09ld\n", __func__, con,
+	     con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec);
+
+	return 0;
+
+bad:
+	pr_err("failed to decode keepalive2_ack\n");
+	return -EINVAL;
+}
+
+static int process_ack(struct ceph_connection *con, void *p, void *end)
+{
+	u64 seq;
+
+	if (con->state != CEPH_CON_S_OPEN) {
+		con->error_msg = "protocol error, unexpected ack";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, seq, bad);
+
+	dout("%s con %p seq %llu\n", __func__, con, seq);
+	ceph_con_discard_sent(con, seq);
+	return 0;
+
+bad:
+	pr_err("failed to decode ack\n");
+	return -EINVAL;
+}
+
+static int process_control(struct ceph_connection *con, void *p, void *end)
+{
+	int tag = con->v2.in_desc.fd_tag;
+	int ret;
+
+	dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p));
+
+	switch (tag) {
+	case FRAME_TAG_HELLO:
+		ret = process_hello(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_BAD_METHOD:
+		ret = process_auth_bad_method(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_REPLY_MORE:
+		ret = process_auth_reply_more(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_DONE:
+		ret = process_auth_done(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_SIGNATURE:
+		ret = process_auth_signature(con, p, end);
+		break;
+	case FRAME_TAG_SERVER_IDENT:
+		ret = process_server_ident(con, p, end);
+		break;
+	case FRAME_TAG_IDENT_MISSING_FEATURES:
+		ret = process_ident_missing_features(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RECONNECT_OK:
+		ret = process_session_reconnect_ok(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RETRY:
+		ret = process_session_retry(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RETRY_GLOBAL:
+		ret = process_session_retry_global(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RESET:
+		ret = process_session_reset(con, p, end);
+		break;
+	case FRAME_TAG_KEEPALIVE2_ACK:
+		ret = process_keepalive2_ack(con, p, end);
+		break;
+	case FRAME_TAG_ACK:
+		ret = process_ack(con, p, end);
+		break;
+	default:
+		pr_err("bad tag %d\n", tag);
+		con->error_msg = "protocol error, bad tag";
+		return -EINVAL;
+	}
+	if (ret) {
+		dout("%s con %p error %d\n", __func__, con, ret);
+		return ret;
+	}
+
+	prepare_read_preamble(con);
+	return 0;
+}
+
+/*
+ * Return:
+ *   1 - con->in_msg set, read message
+ *   0 - skip message
+ *  <0 - error
+ */
+static int process_message_header(struct ceph_connection *con,
+				  void *p, void *end)
+{
+	struct ceph_frame_desc *desc = &con->v2.in_desc;
+	struct ceph_msg_header2 *hdr2 = p;
+	struct ceph_msg_header hdr;
+	int skip;
+	int ret;
+	u64 seq;
+
+	/* verify seq# */
+	seq = le64_to_cpu(hdr2->seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr),
+			seq, con->in_seq + 1);
+		return 0;
+	}
+	if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADE;
+	}
+
+	ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
+
+	fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
+		    desc->fd_lens[3], &con->peer_name);
+	ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
+	if (ret)
+		return ret;
+
+	WARN_ON(!con->in_msg ^ skip);
+	if (skip)
+		return 0;
+
+	WARN_ON(!con->in_msg);
+	WARN_ON(con->in_msg->con != con);
+	return 1;
+}
+
+static int process_message(struct ceph_connection *con)
+{
+	ceph_con_process_message(con);
+
+	/*
+	 * We could have been closed by ceph_con_close() because
+	 * ceph_con_process_message() temporarily drops con->mutex.
+	 */
+	if (con->state != CEPH_CON_S_OPEN) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		return -EAGAIN;
+	}
+
+	prepare_read_preamble(con);
+	return 0;
+}
+
+static int __handle_control(struct ceph_connection *con, void *p)
+{
+	void *end = p + con->v2.in_desc.fd_lens[0];
+	struct ceph_msg *msg;
+	int ret;
+
+	if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
+		return process_control(con, p, end);
+
+	ret = process_message_header(con, p, end);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		prepare_skip_message(con);
+		return 0;
+	}
+
+	msg = con->in_msg;  /* set in process_message_header() */
+	if (!front_len(msg) && !middle_len(msg)) {
+		if (!data_len(msg))
+			return process_message(con);
+
+		prepare_read_data(con);
+		return 0;
+	}
+
+	reset_in_kvecs(con);
+	if (front_len(msg)) {
+		WARN_ON(front_len(msg) > msg->front_alloc_len);
+		add_in_kvec(con, msg->front.iov_base, front_len(msg));
+		msg->front.iov_len = front_len(msg);
+
+		if (con_secure(con) && need_padding(front_len(msg)))
+			add_in_kvec(con, FRONT_PAD(con->v2.in_buf),
+				    padding_len(front_len(msg)));
+	} else {
+		msg->front.iov_len = 0;
+	}
+	if (middle_len(msg)) {
+		WARN_ON(middle_len(msg) > msg->middle->alloc_len);
+		add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+		msg->middle->vec.iov_len = middle_len(msg);
+
+		if (con_secure(con) && need_padding(middle_len(msg)))
+			add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf),
+				    padding_len(middle_len(msg)));
+	} else if (msg->middle) {
+		msg->middle->vec.iov_len = 0;
+	}
+
+	if (data_len(msg)) {
+		con->v2.in_state = IN_S_PREPARE_READ_DATA;
+	} else {
+		add_in_kvec(con, con->v2.in_buf,
+			    con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN :
+					      CEPH_EPILOGUE_PLAIN_LEN);
+		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+	}
+	return 0;
+}
+
+static int handle_preamble(struct ceph_connection *con)
+{
+	struct ceph_frame_desc *desc = &con->v2.in_desc;
+	int ret;
+
+	if (con_secure(con)) {
+		ret = decrypt_preamble(con);
+		if (ret) {
+			if (ret == -EBADMSG)
+				con->error_msg = "integrity error, bad preamble auth tag";
+			return ret;
+		}
+	}
+
+	ret = decode_preamble(con->v2.in_buf, desc);
+	if (ret) {
+		if (ret == -EBADMSG)
+			con->error_msg = "integrity error, bad crc";
+		else
+			con->error_msg = "protocol error, bad preamble";
+		return ret;
+	}
+
+	dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
+	     con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
+	     desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
+
+	if (!con_secure(con))
+		return prepare_read_control(con);
+
+	if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
+		return prepare_read_control_remainder(con);
+
+	return __handle_control(con, CTRL_BODY(con->v2.in_buf));
+}
+
+static int handle_control(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	void *buf;
+	int ret;
+
+	WARN_ON(con_secure(con));
+
+	ret = verify_control_crc(con);
+	if (ret) {
+		con->error_msg = "integrity error, bad crc";
+		return ret;
+	}
+
+	if (con->state == CEPH_CON_S_V2_AUTH) {
+		buf = alloc_conn_buf(con, ctrl_len);
+		if (!buf)
+			return -ENOMEM;
+
+		memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
+		return __handle_control(con, buf);
+	}
+
+	return __handle_control(con, con->v2.in_kvecs[0].iov_base);
+}
+
+static int handle_control_remainder(struct ceph_connection *con)
+{
+	int ret;
+
+	WARN_ON(!con_secure(con));
+
+	ret = decrypt_control_remainder(con);
+	if (ret) {
+		if (ret == -EBADMSG)
+			con->error_msg = "integrity error, bad control remainder auth tag";
+		return ret;
+	}
+
+	return __handle_control(con, con->v2.in_kvecs[0].iov_base -
+				     CEPH_PREAMBLE_INLINE_LEN);
+}
+
+static int handle_epilogue(struct ceph_connection *con)
+{
+	u32 front_crc, middle_crc, data_crc;
+	int ret;
+
+	if (con_secure(con)) {
+		ret = decrypt_message(con);
+		if (ret) {
+			if (ret == -EBADMSG)
+				con->error_msg = "integrity error, bad epilogue auth tag";
+			return ret;
+		}
+
+		/* just late_status */
+		ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
+		if (ret) {
+			con->error_msg = "protocol error, bad epilogue";
+			return ret;
+		}
+	} else {
+		ret = decode_epilogue(con->v2.in_buf, &front_crc,
+				      &middle_crc, &data_crc);
+		if (ret) {
+			con->error_msg = "protocol error, bad epilogue";
+			return ret;
+		}
+
+		ret = verify_epilogue_crcs(con, front_crc, middle_crc,
+					   data_crc);
+		if (ret) {
+			con->error_msg = "integrity error, bad crc";
+			return ret;
+		}
+	}
+
+	return process_message(con);
+}
+
+static void finish_skip(struct ceph_connection *con)
+{
+	dout("%s con %p\n", __func__, con);
+
+	if (con_secure(con))
+		gcm_inc_nonce(&con->v2.in_gcm_nonce);
+
+	__finish_skip(con);
+}
+
+static int populate_in_iter(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
+	     con->v2.in_state);
+	WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+	if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
+		ret = process_banner_prefix(con);
+	} else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
+		ret = process_banner_payload(con);
+	} else if ((con->state >= CEPH_CON_S_V2_HELLO &&
+		    con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
+		   con->state == CEPH_CON_S_OPEN) {
+		switch (con->v2.in_state) {
+		case IN_S_HANDLE_PREAMBLE:
+			ret = handle_preamble(con);
+			break;
+		case IN_S_HANDLE_CONTROL:
+			ret = handle_control(con);
+			break;
+		case IN_S_HANDLE_CONTROL_REMAINDER:
+			ret = handle_control_remainder(con);
+			break;
+		case IN_S_PREPARE_READ_DATA:
+			prepare_read_data(con);
+			ret = 0;
+			break;
+		case IN_S_PREPARE_READ_DATA_CONT:
+			prepare_read_data_cont(con);
+			ret = 0;
+			break;
+		case IN_S_HANDLE_EPILOGUE:
+			ret = handle_epilogue(con);
+			break;
+		case IN_S_FINISH_SKIP:
+			finish_skip(con);
+			ret = 0;
+			break;
+		default:
+			WARN(1, "bad in_state %d", con->v2.in_state);
+			return -EINVAL;
+		}
+	} else {
+		WARN(1, "bad state %d", con->state);
+		return -EINVAL;
+	}
+	if (ret) {
+		dout("%s con %p error %d\n", __func__, con, ret);
+		return ret;
+	}
+
+	if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+		return -ENODATA;
+	dout("%s con %p populated %zu\n", __func__, con,
+	     iov_iter_count(&con->v2.in_iter));
+	return 1;
+}
+
+int ceph_con_v2_try_read(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p state %d need %zu\n", __func__, con, con->state,
+	     iov_iter_count(&con->v2.in_iter));
+
+	if (con->state == CEPH_CON_S_PREOPEN)
+		return 0;
+
+	/*
+	 * We should always have something pending here.  If not,
+	 * avoid calling populate_in_iter() as if we read something
+	 * (ceph_tcp_recv() would immediately return 1).
+	 */
+	if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+		return -ENODATA;
+
+	for (;;) {
+		ret = ceph_tcp_recv(con);
+		if (ret <= 0)
+			return ret;
+
+		ret = populate_in_iter(con);
+		if (ret <= 0) {
+			if (ret && ret != -EAGAIN && !con->error_msg)
+				con->error_msg = "read processing error";
+			return ret;
+		}
+	}
+}
+
+static void queue_data(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	con->v2.out_epil.data_crc = -1;
+	ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg,
+				  data_len(con->out_msg));
+
+	get_bvec_at(&con->v2.out_cursor, &bv);
+	set_out_bvec(con, &bv, true);
+	con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
+}
+
+static void queue_data_cont(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	con->v2.out_epil.data_crc = ceph_crc32c_page(
+		con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+		con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
+
+	ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
+	if (con->v2.out_cursor.total_resid) {
+		get_bvec_at(&con->v2.out_cursor, &bv);
+		set_out_bvec(con, &bv, true);
+		WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
+		return;
+	}
+
+	/*
+	 * We've written all data.  Queue epilogue.  Once it's written,
+	 * we are done.
+	 */
+	reset_out_kvecs(con);
+	prepare_epilogue_plain(con, false);
+	con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_enc_page(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+
+	dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
+	     con->v2.out_enc_resid);
+	WARN_ON(!con->v2.out_enc_resid);
+
+	bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i];
+	bv.bv_offset = 0;
+	bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE);
+
+	set_out_bvec(con, &bv, false);
+	con->v2.out_enc_i++;
+	con->v2.out_enc_resid -= bv.bv_len;
+
+	if (con->v2.out_enc_resid) {
+		WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
+		return;
+	}
+
+	/*
+	 * We've queued the last piece of ciphertext (ending with
+	 * epilogue) + auth tag.  Once it's written, we are done.
+	 */
+	WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
+	con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void queue_zeros(struct ceph_connection *con)
+{
+	dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
+
+	if (con->v2.out_zero) {
+		set_out_bvec_zero(con);
+		con->v2.out_zero -= con->v2.out_bvec.bv_len;
+		con->v2.out_state = OUT_S_QUEUE_ZEROS;
+		return;
+	}
+
+	/*
+	 * We've zero-filled everything up to epilogue.  Queue epilogue
+	 * with late_status set to ABORTED and crcs adjusted for zeros.
+	 * Once it's written, we are done patching up for the revoke.
+	 */
+	reset_out_kvecs(con);
+	prepare_epilogue_plain(con, true);
+	con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+static void finish_message(struct ceph_connection *con)
+{
+	dout("%s con %p msg %p\n", __func__, con, con->out_msg);
+
+	/* we end up here both plain and secure modes */
+	if (con->v2.out_enc_pages) {
+		WARN_ON(!con->v2.out_enc_page_cnt);
+		ceph_release_page_vector(con->v2.out_enc_pages,
+					 con->v2.out_enc_page_cnt);
+		con->v2.out_enc_pages = NULL;
+		con->v2.out_enc_page_cnt = 0;
+	}
+	/* message may have been revoked */
+	if (con->out_msg) {
+		ceph_msg_put(con->out_msg);
+		con->out_msg = NULL;
+	}
+
+	con->v2.out_state = OUT_S_GET_NEXT;
+}
+
+static int populate_out_iter(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
+	     con->v2.out_state);
+	WARN_ON(iov_iter_count(&con->v2.out_iter));
+
+	if (con->state != CEPH_CON_S_OPEN) {
+		WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
+			con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
+		goto nothing_pending;
+	}
+
+	switch (con->v2.out_state) {
+	case OUT_S_QUEUE_DATA:
+		WARN_ON(!con->out_msg);
+		queue_data(con);
+		goto populated;
+	case OUT_S_QUEUE_DATA_CONT:
+		WARN_ON(!con->out_msg);
+		queue_data_cont(con);
+		goto populated;
+	case OUT_S_QUEUE_ENC_PAGE:
+		queue_enc_page(con);
+		goto populated;
+	case OUT_S_QUEUE_ZEROS:
+		WARN_ON(con->out_msg);  /* revoked */
+		queue_zeros(con);
+		goto populated;
+	case OUT_S_FINISH_MESSAGE:
+		finish_message(con);
+		break;
+	case OUT_S_GET_NEXT:
+		break;
+	default:
+		WARN(1, "bad out_state %d", con->v2.out_state);
+		return -EINVAL;
+	}
+
+	WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
+	if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+		ret = prepare_keepalive2(con);
+		if (ret) {
+			pr_err("prepare_keepalive2 failed: %d\n", ret);
+			return ret;
+		}
+	} else if (!list_empty(&con->out_queue)) {
+		ceph_con_get_out_msg(con);
+		ret = prepare_message(con);
+		if (ret) {
+			pr_err("prepare_message failed: %d\n", ret);
+			return ret;
+		}
+	} else if (con->in_seq > con->in_seq_acked) {
+		ret = prepare_ack(con);
+		if (ret) {
+			pr_err("prepare_ack failed: %d\n", ret);
+			return ret;
+		}
+	} else {
+		goto nothing_pending;
+	}
+
+populated:
+	if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
+		return -ENODATA;
+	dout("%s con %p populated %zu\n", __func__, con,
+	     iov_iter_count(&con->v2.out_iter));
+	return 1;
+
+nothing_pending:
+	WARN_ON(iov_iter_count(&con->v2.out_iter));
+	dout("%s con %p nothing pending\n", __func__, con);
+	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+int ceph_con_v2_try_write(struct ceph_connection *con)
+{
+	int ret;
+
+	dout("%s con %p state %d have %zu\n", __func__, con, con->state,
+	     iov_iter_count(&con->v2.out_iter));
+
+	/* open the socket first? */
+	if (con->state == CEPH_CON_S_PREOPEN) {
+		WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
+
+		/*
+		 * Always bump global_seq.  Bump connect_seq only if
+		 * there is a session (i.e. we are reconnecting and will
+		 * send session_reconnect instead of client_ident).
+		 */
+		con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
+		if (con->v2.server_cookie)
+			con->v2.connect_seq++;
+
+		ret = prepare_read_banner_prefix(con);
+		if (ret) {
+			pr_err("prepare_read_banner_prefix failed: %d\n", ret);
+			con->error_msg = "connect error";
+			return ret;
+		}
+
+		reset_out_kvecs(con);
+		ret = prepare_banner(con);
+		if (ret) {
+			pr_err("prepare_banner failed: %d\n", ret);
+			con->error_msg = "connect error";
+			return ret;
+		}
+
+		ret = ceph_tcp_connect(con);
+		if (ret) {
+			pr_err("ceph_tcp_connect failed: %d\n", ret);
+			con->error_msg = "connect error";
+			return ret;
+		}
+	}
+
+	if (!iov_iter_count(&con->v2.out_iter)) {
+		ret = populate_out_iter(con);
+		if (ret <= 0) {
+			if (ret && ret != -EAGAIN && !con->error_msg)
+				con->error_msg = "write processing error";
+			return ret;
+		}
+	}
+
+	tcp_sock_set_cork(con->sock->sk, true);
+	for (;;) {
+		ret = ceph_tcp_send(con);
+		if (ret <= 0)
+			break;
+
+		ret = populate_out_iter(con);
+		if (ret <= 0) {
+			if (ret && ret != -EAGAIN && !con->error_msg)
+				con->error_msg = "write processing error";
+			break;
+		}
+	}
+
+	tcp_sock_set_cork(con->sock->sk, false);
+	return ret;
+}
+
+static u32 crc32c_zeros(u32 crc, int zero_len)
+{
+	int len;
+
+	while (zero_len) {
+		len = min(zero_len, (int)PAGE_SIZE);
+		crc = crc32c(crc, page_address(ceph_zero_page), len);
+		zero_len -= len;
+	}
+
+	return crc;
+}
+
+static void prepare_zero_front(struct ceph_connection *con, int resid)
+{
+	int sent;
+
+	WARN_ON(!resid || resid > front_len(con->out_msg));
+	sent = front_len(con->out_msg) - resid;
+	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+	if (sent) {
+		con->v2.out_epil.front_crc =
+			crc32c(-1, con->out_msg->front.iov_base, sent);
+		con->v2.out_epil.front_crc =
+			crc32c_zeros(con->v2.out_epil.front_crc, resid);
+	} else {
+		con->v2.out_epil.front_crc = crc32c_zeros(-1, resid);
+	}
+
+	con->v2.out_iter.count -= resid;
+	out_zero_add(con, resid);
+}
+
+static void prepare_zero_middle(struct ceph_connection *con, int resid)
+{
+	int sent;
+
+	WARN_ON(!resid || resid > middle_len(con->out_msg));
+	sent = middle_len(con->out_msg) - resid;
+	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+	if (sent) {
+		con->v2.out_epil.middle_crc =
+			crc32c(-1, con->out_msg->middle->vec.iov_base, sent);
+		con->v2.out_epil.middle_crc =
+			crc32c_zeros(con->v2.out_epil.middle_crc, resid);
+	} else {
+		con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid);
+	}
+
+	con->v2.out_iter.count -= resid;
+	out_zero_add(con, resid);
+}
+
+static void prepare_zero_data(struct ceph_connection *con)
+{
+	dout("%s con %p\n", __func__, con);
+	con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg));
+	out_zero_add(con, data_len(con->out_msg));
+}
+
+static void revoke_at_queue_data(struct ceph_connection *con)
+{
+	int boundary;
+	int resid;
+
+	WARN_ON(!data_len(con->out_msg));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+	resid = iov_iter_count(&con->v2.out_iter);
+
+	boundary = front_len(con->out_msg) + middle_len(con->out_msg);
+	if (resid > boundary) {
+		resid -= boundary;
+		WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+		dout("%s con %p was sending head\n", __func__, con);
+		if (front_len(con->out_msg))
+			prepare_zero_front(con, front_len(con->out_msg));
+		if (middle_len(con->out_msg))
+			prepare_zero_middle(con, middle_len(con->out_msg));
+		prepare_zero_data(con);
+		WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+		con->v2.out_state = OUT_S_QUEUE_ZEROS;
+		return;
+	}
+
+	boundary = middle_len(con->out_msg);
+	if (resid > boundary) {
+		resid -= boundary;
+		dout("%s con %p was sending front\n", __func__, con);
+		prepare_zero_front(con, resid);
+		if (middle_len(con->out_msg))
+			prepare_zero_middle(con, middle_len(con->out_msg));
+		prepare_zero_data(con);
+		queue_zeros(con);
+		return;
+	}
+
+	WARN_ON(!resid);
+	dout("%s con %p was sending middle\n", __func__, con);
+	prepare_zero_middle(con, resid);
+	prepare_zero_data(con);
+	queue_zeros(con);
+}
+
+static void revoke_at_queue_data_cont(struct ceph_connection *con)
+{
+	int sent, resid;  /* current piece of data */
+
+	WARN_ON(!data_len(con->out_msg));
+	WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
+	resid = iov_iter_count(&con->v2.out_iter);
+	WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
+	sent = con->v2.out_bvec.bv_len - resid;
+	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+	if (sent) {
+		con->v2.out_epil.data_crc = ceph_crc32c_page(
+			con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+			con->v2.out_bvec.bv_offset, sent);
+		ceph_msg_data_advance(&con->v2.out_cursor, sent);
+	}
+	WARN_ON(resid > con->v2.out_cursor.total_resid);
+	con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
+						con->v2.out_cursor.total_resid);
+
+	con->v2.out_iter.count -= resid;
+	out_zero_add(con, con->v2.out_cursor.total_resid);
+	queue_zeros(con);
+}
+
+static void revoke_at_finish_message(struct ceph_connection *con)
+{
+	int boundary;
+	int resid;
+
+	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+	resid = iov_iter_count(&con->v2.out_iter);
+
+	if (!front_len(con->out_msg) && !middle_len(con->out_msg) &&
+	    !data_len(con->out_msg)) {
+		WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
+		dout("%s con %p was sending head (empty message) - noop\n",
+		     __func__, con);
+		return;
+	}
+
+	boundary = front_len(con->out_msg) + middle_len(con->out_msg) +
+		   CEPH_EPILOGUE_PLAIN_LEN;
+	if (resid > boundary) {
+		resid -= boundary;
+		WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+		dout("%s con %p was sending head\n", __func__, con);
+		if (front_len(con->out_msg))
+			prepare_zero_front(con, front_len(con->out_msg));
+		if (middle_len(con->out_msg))
+			prepare_zero_middle(con, middle_len(con->out_msg));
+		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+		WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+		con->v2.out_state = OUT_S_QUEUE_ZEROS;
+		return;
+	}
+
+	boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+	if (resid > boundary) {
+		resid -= boundary;
+		dout("%s con %p was sending front\n", __func__, con);
+		prepare_zero_front(con, resid);
+		if (middle_len(con->out_msg))
+			prepare_zero_middle(con, middle_len(con->out_msg));
+		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+		queue_zeros(con);
+		return;
+	}
+
+	boundary = CEPH_EPILOGUE_PLAIN_LEN;
+	if (resid > boundary) {
+		resid -= boundary;
+		dout("%s con %p was sending middle\n", __func__, con);
+		prepare_zero_middle(con, resid);
+		con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+		queue_zeros(con);
+		return;
+	}
+
+	WARN_ON(!resid);
+	dout("%s con %p was sending epilogue - noop\n", __func__, con);
+}
+
+void ceph_con_v2_revoke(struct ceph_connection *con)
+{
+	WARN_ON(con->v2.out_zero);
+
+	if (con_secure(con)) {
+		WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
+			con->v2.out_state != OUT_S_FINISH_MESSAGE);
+		dout("%s con %p secure - noop\n", __func__, con);
+		return;
+	}
+
+	switch (con->v2.out_state) {
+	case OUT_S_QUEUE_DATA:
+		revoke_at_queue_data(con);
+		break;
+	case OUT_S_QUEUE_DATA_CONT:
+		revoke_at_queue_data_cont(con);
+		break;
+	case OUT_S_FINISH_MESSAGE:
+		revoke_at_finish_message(con);
+		break;
+	default:
+		WARN(1, "bad out_state %d", con->v2.out_state);
+		break;
+	}
+}
+
+static void revoke_at_prepare_read_data(struct ceph_connection *con)
+{
+	int remaining;  /* data + [data padding] + epilogue */
+	int resid;
+
+	WARN_ON(!data_len(con->in_msg));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+	resid = iov_iter_count(&con->v2.in_iter);
+	WARN_ON(!resid);
+
+	if (con_secure(con))
+		remaining = padded_len(data_len(con->in_msg)) +
+			    CEPH_EPILOGUE_SECURE_LEN;
+	else
+		remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+
+	dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
+	     remaining);
+	con->v2.in_iter.count -= resid;
+	set_in_skip(con, resid + remaining);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
+{
+	int recved, resid;  /* current piece of data */
+	int remaining;  /* [data padding] + epilogue */
+
+	WARN_ON(!data_len(con->in_msg));
+	WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+	resid = iov_iter_count(&con->v2.in_iter);
+	WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+	recved = con->v2.in_bvec.bv_len - resid;
+	dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
+
+	if (recved)
+		ceph_msg_data_advance(&con->v2.in_cursor, recved);
+	WARN_ON(resid > con->v2.in_cursor.total_resid);
+
+	if (con_secure(con))
+		remaining = padding_len(data_len(con->in_msg)) +
+			    CEPH_EPILOGUE_SECURE_LEN;
+	else
+		remaining = CEPH_EPILOGUE_PLAIN_LEN;
+
+	dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
+	     con->v2.in_cursor.total_resid, remaining);
+	con->v2.in_iter.count -= resid;
+	set_in_skip(con, con->v2.in_cursor.total_resid + remaining);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_handle_epilogue(struct ceph_connection *con)
+{
+	int resid;
+
+	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+	resid = iov_iter_count(&con->v2.in_iter);
+	WARN_ON(!resid);
+
+	dout("%s con %p resid %d\n", __func__, con, resid);
+	con->v2.in_iter.count -= resid;
+	set_in_skip(con, resid);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
+{
+	switch (con->v2.in_state) {
+	case IN_S_PREPARE_READ_DATA:
+		revoke_at_prepare_read_data(con);
+		break;
+	case IN_S_PREPARE_READ_DATA_CONT:
+		revoke_at_prepare_read_data_cont(con);
+		break;
+	case IN_S_HANDLE_EPILOGUE:
+		revoke_at_handle_epilogue(con);
+		break;
+	default:
+		WARN(1, "bad in_state %d", con->v2.in_state);
+		break;
+	}
+}
+
+bool ceph_con_v2_opened(struct ceph_connection *con)
+{
+	return con->v2.peer_global_seq;
+}
+
+void ceph_con_v2_reset_session(struct ceph_connection *con)
+{
+	con->v2.client_cookie = 0;
+	con->v2.server_cookie = 0;
+	con->v2.global_seq = 0;
+	con->v2.connect_seq = 0;
+	con->v2.peer_global_seq = 0;
+}
+
+void ceph_con_v2_reset_protocol(struct ceph_connection *con)
+{
+	iov_iter_truncate(&con->v2.in_iter, 0);
+	iov_iter_truncate(&con->v2.out_iter, 0);
+	con->v2.out_zero = 0;
+
+	clear_in_sign_kvecs(con);
+	clear_out_sign_kvecs(con);
+	free_conn_bufs(con);
+
+	if (con->v2.out_enc_pages) {
+		WARN_ON(!con->v2.out_enc_page_cnt);
+		ceph_release_page_vector(con->v2.out_enc_pages,
+					 con->v2.out_enc_page_cnt);
+		con->v2.out_enc_pages = NULL;
+		con->v2.out_enc_page_cnt = 0;
+	}
+
+	con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
+
+	if (con->v2.hmac_tfm) {
+		crypto_free_shash(con->v2.hmac_tfm);
+		con->v2.hmac_tfm = NULL;
+	}
+	if (con->v2.gcm_req) {
+		aead_request_free(con->v2.gcm_req);
+		con->v2.gcm_req = NULL;
+	}
+	if (con->v2.gcm_tfm) {
+		crypto_free_aead(con->v2.gcm_tfm);
+		con->v2.gcm_tfm = NULL;
+	}
+}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 792a8c4164d7..b9d54ed9f338 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -257,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc)
 		      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
 	/*
-	 * send an initial keepalive to ensure our timestamp is valid
-	 * by the time we are in an OPENED state
+	 * Queue a keepalive to ensure that in case of an early fault
+	 * the messenger doesn't put us into STANDBY state and instead
+	 * retries.  This also ensures that our timestamp is valid by
+	 * the time we finish hunting and delayed_work() checks it.
 	 */
 	ceph_con_keepalive(&monc->con);
+	if (ceph_msgr2(monc->client)) {
+		monc->pending_auth = 1;
+		return;
+	}
 
 	/* initiate authentication handshake */
 	ret = ceph_auth_build_hello(monc->auth,
@@ -543,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
 	p = msg->front.iov_base;
 	end = p + msg->front.iov_len;
 
-	monmap = ceph_monmap_decode(&p, end, false);
+	monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client));
 	if (IS_ERR(monmap)) {
 		pr_err("problem decoding monmap, %d\n",
 		       (int)PTR_ERR(monmap));
@@ -1119,8 +1125,9 @@ static void delayed_work(struct work_struct *work)
  */
 static int build_initial_monmap(struct ceph_mon_client *monc)
 {
+	__le32 my_type = ceph_msgr2(monc->client) ?
+		CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY;
 	struct ceph_options *opt = monc->client->options;
-	struct ceph_entity_addr *mon_addr = opt->mon_addr;
 	int num_mon = opt->num_mon;
 	int i;
 
@@ -1129,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
 			       GFP_KERNEL);
 	if (!monc->monmap)
 		return -ENOMEM;
+
 	for (i = 0; i < num_mon; i++) {
-		monc->monmap->mon_inst[i].addr = mon_addr[i];
-		monc->monmap->mon_inst[i].addr.nonce = 0;
-		monc->monmap->mon_inst[i].name.type =
-			CEPH_ENTITY_TYPE_MON;
-		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+		struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
+
+		memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr,
+		       sizeof(inst->addr.in_addr));
+		inst->addr.type = my_type;
+		inst->addr.nonce = 0;
+		inst->name.type = CEPH_ENTITY_TYPE_MON;
+		inst->name.num = cpu_to_le64(i);
 	}
 	monc->monmap->num_mon = num_mon;
 	return 0;
@@ -1337,6 +1348,88 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_validate_auth);
 
+static int mon_get_auth_request(struct ceph_connection *con,
+				void *buf, int *buf_len,
+				void **authorizer, int *authorizer_len)
+{
+	struct ceph_mon_client *monc = con->private;
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = ceph_auth_get_request(monc->auth, buf, *buf_len);
+	mutex_unlock(&monc->mutex);
+	if (ret < 0)
+		return ret;
+
+	*buf_len = ret;
+	*authorizer = NULL;
+	*authorizer_len = 0;
+	return 0;
+}
+
+static int mon_handle_auth_reply_more(struct ceph_connection *con,
+				      void *reply, int reply_len,
+				      void *buf, int *buf_len,
+				      void **authorizer, int *authorizer_len)
+{
+	struct ceph_mon_client *monc = con->private;
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len,
+					  buf, *buf_len);
+	mutex_unlock(&monc->mutex);
+	if (ret < 0)
+		return ret;
+
+	*buf_len = ret;
+	*authorizer = NULL;
+	*authorizer_len = 0;
+	return 0;
+}
+
+static int mon_handle_auth_done(struct ceph_connection *con,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len)
+{
+	struct ceph_mon_client *monc = con->private;
+	bool was_authed;
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	WARN_ON(!monc->hunting);
+	was_authed = ceph_auth_is_authenticated(monc->auth);
+	ret = ceph_auth_handle_reply_done(monc->auth, global_id,
+					  reply, reply_len,
+					  session_key, session_key_len,
+					  con_secret, con_secret_len);
+	finish_auth(monc, ret, was_authed);
+	if (!ret)
+		finish_hunting(monc);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
+static int mon_handle_auth_bad_method(struct ceph_connection *con,
+				      int used_proto, int result,
+				      const int *allowed_protos, int proto_cnt,
+				      const int *allowed_modes, int mode_cnt)
+{
+	struct ceph_mon_client *monc = con->private;
+	bool was_authed;
+
+	mutex_lock(&monc->mutex);
+	WARN_ON(!monc->hunting);
+	was_authed = ceph_auth_is_authenticated(monc->auth);
+	ceph_auth_handle_bad_method(monc->auth, used_proto, result,
+				    allowed_protos, proto_cnt,
+				    allowed_modes, mode_cnt);
+	finish_auth(monc, -EACCES, was_authed);
+	mutex_unlock(&monc->mutex);
+	return 0;
+}
+
 /*
  * handle incoming message
  */
@@ -1487,4 +1580,8 @@ static const struct ceph_connection_operations mon_con_ops = {
 	.dispatch = dispatch,
 	.fault = mon_fault,
 	.alloc_msg = mon_alloc_msg,
+	.get_auth_request = mon_get_auth_request,
+	.handle_auth_reply_more = mon_handle_auth_reply_more,
+	.handle_auth_done = mon_handle_auth_done,
+	.handle_auth_bad_method = mon_handle_auth_bad_method,
 };
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 51be5a7482fc..662b52e52651 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc,
 	set_pool_was_full(osdc);
 
 	if (incremental)
-		newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap);
+		newmap = osdmap_apply_incremental(&p, end,
+						  ceph_msgr2(osdc->client),
+						  osdc->osdmap);
 	else
-		newmap = ceph_osdmap_decode(&p, end, false);
+		newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
 	if (IS_ERR(newmap))
 		return PTR_ERR(newmap);
 
@@ -5575,6 +5577,7 @@ static void put_osd_con(struct ceph_connection *con)
 /*
  * authentication
  */
+
 /*
  * Note: returned pointer is the address of a structure that's
  * managed separately.  Caller must *not* attempt to free it.
@@ -5640,6 +5643,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
 	return ceph_monc_validate_auth(&osdc->client->monc);
 }
 
+static int osd_get_auth_request(struct ceph_connection *con,
+				void *buf, int *buf_len,
+				void **authorizer, int *authorizer_len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &o->o_auth;
+	int ret;
+
+	ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+				       buf, buf_len);
+	if (ret)
+		return ret;
+
+	*authorizer = auth->authorizer_buf;
+	*authorizer_len = auth->authorizer_buf_len;
+	return 0;
+}
+
+static int osd_handle_auth_reply_more(struct ceph_connection *con,
+				      void *reply, int reply_len,
+				      void *buf, int *buf_len,
+				      void **authorizer, int *authorizer_len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &o->o_auth;
+	int ret;
+
+	ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+					      buf, buf_len);
+	if (ret)
+		return ret;
+
+	*authorizer = auth->authorizer_buf;
+	*authorizer_len = auth->authorizer_buf_len;
+	return 0;
+}
+
+static int osd_handle_auth_done(struct ceph_connection *con,
+				u64 global_id, void *reply, int reply_len,
+				u8 *session_key, int *session_key_len,
+				u8 *con_secret, int *con_secret_len)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+	struct ceph_auth_handshake *auth = &o->o_auth;
+
+	return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+					       session_key, session_key_len,
+					       con_secret, con_secret_len);
+}
+
+static int osd_handle_auth_bad_method(struct ceph_connection *con,
+				      int used_proto, int result,
+				      const int *allowed_protos, int proto_cnt,
+				      const int *allowed_modes, int mode_cnt)
+{
+	struct ceph_osd *o = con->private;
+	struct ceph_mon_client *monc = &o->o_osdc->client->monc;
+	int ret;
+
+	if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD,
+					    used_proto, result,
+					    allowed_protos, proto_cnt,
+					    allowed_modes, mode_cnt)) {
+		ret = ceph_monc_validate_auth(monc);
+		if (ret)
+			return ret;
+	}
+
+	return -EACCES;
+}
+
 static void osd_reencode_message(struct ceph_msg *msg)
 {
 	int type = le16_to_cpu(msg->hdr.type);
@@ -5677,4 +5754,8 @@ static const struct ceph_connection_operations osd_con_ops = {
 	.sign_message = osd_sign_message,
 	.check_message_signature = osd_check_message_signature,
 	.fault = osd_fault,
+	.get_auth_request = osd_get_auth_request,
+	.handle_auth_reply_more = osd_handle_auth_reply_more,
+	.handle_auth_done = osd_handle_auth_done,
+	.handle_auth_bad_method = osd_handle_auth_bad_method,
 };

From ce287162d9738fe03a3731904710b6700ba686aa Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 19 Nov 2020 19:13:58 +0100
Subject: [PATCH 224/230] libceph, ceph: make use of
 __ceph_auth_get_authorizer() in msgr1

This shouldn't cause any functional changes.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/ceph/mds_client.c  | 21 +++++----------------
 net/ceph/osd_client.c | 21 +++++----------------
 2 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 740d63d0fc50..98c15ff2e599 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -5141,23 +5141,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_mds_client *mdsc = s->s_mdsc;
 	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
 	struct ceph_auth_handshake *auth = &s->s_auth;
+	int ret;
 
-	if (force_new && auth->authorizer) {
-		ceph_auth_destroy_authorizer(auth->authorizer);
-		auth->authorizer = NULL;
-	}
-	if (!auth->authorizer) {
-		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
-						      auth);
-		if (ret)
-			return ERR_PTR(ret);
-	} else {
-		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
-						      auth);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-	*proto = ac->protocol;
+	ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
+					 force_new, proto, NULL, NULL);
+	if (ret)
+		return ERR_PTR(ret);
 
 	return auth;
 }
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 662b52e52651..61229c5e22cb 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -5589,23 +5589,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
 	struct ceph_osd_client *osdc = o->o_osdc;
 	struct ceph_auth_client *ac = osdc->client->monc.auth;
 	struct ceph_auth_handshake *auth = &o->o_auth;
+	int ret;
 
-	if (force_new && auth->authorizer) {
-		ceph_auth_destroy_authorizer(auth->authorizer);
-		auth->authorizer = NULL;
-	}
-	if (!auth->authorizer) {
-		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
-						      auth);
-		if (ret)
-			return ERR_PTR(ret);
-	} else {
-		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
-						     auth);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-	*proto = ac->protocol;
+	ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+					 force_new, proto, NULL, NULL);
+	if (ret)
+		return ERR_PTR(ret);
 
 	return auth;
 }

From 2f0df6cfa325d7106b8a65bc0e02db1086e3f73b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 19 Nov 2020 20:00:10 +0100
Subject: [PATCH 225/230] libceph: drop ceph_auth_{create,update}_authorizer()

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/auth.h |  6 ------
 net/ceph/auth.c           | 28 ----------------------------
 2 files changed, 34 deletions(-)

diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h
index 3fbe72ebd779..71b5d481c653 100644
--- a/include/linux/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -126,13 +126,7 @@ int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
 			       struct ceph_auth_handshake *auth,
 			       int peer_type, bool force_new,
 			       int *proto, int *pref_mode, int *fallb_mode);
-extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
-				       int peer_type,
-				       struct ceph_auth_handshake *auth);
 void ceph_auth_destroy_authorizer(struct ceph_authorizer *a);
-extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
-				       int peer_type,
-				       struct ceph_auth_handshake *a);
 int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
 				       struct ceph_authorizer *a,
 				       void *challenge_buf,
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index 6b315c8212b1..eb261aa5fe18 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -326,40 +326,12 @@ out:
 }
 EXPORT_SYMBOL(__ceph_auth_get_authorizer);
 
-int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
-				int peer_type,
-				struct ceph_auth_handshake *auth)
-{
-	int ret = 0;
-
-	mutex_lock(&ac->mutex);
-	if (ac->ops && ac->ops->create_authorizer)
-		ret = ac->ops->create_authorizer(ac, peer_type, auth);
-	mutex_unlock(&ac->mutex);
-	return ret;
-}
-EXPORT_SYMBOL(ceph_auth_create_authorizer);
-
 void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
 {
 	a->destroy(a);
 }
 EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
 
-int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
-				int peer_type,
-				struct ceph_auth_handshake *a)
-{
-	int ret = 0;
-
-	mutex_lock(&ac->mutex);
-	if (ac->ops && ac->ops->update_authorizer)
-		ret = ac->ops->update_authorizer(ac, peer_type, a);
-	mutex_unlock(&ac->mutex);
-	return ret;
-}
-EXPORT_SYMBOL(ceph_auth_update_authorizer);
-
 int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
 				       struct ceph_authorizer *a,
 				       void *challenge_buf,

From 7be9b38afafbfcc58ede3be66bfc4ea415b3d5f1 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 16 Dec 2020 12:25:13 +0000
Subject: [PATCH 226/230] NFSv4.2: fix error return on memory allocation
 failure

Currently when an alloc_page fails the error return is not set in
variable err and a garbage initialized value is returned. Fix this
by setting err to -ENOMEM before taking the error return path.

Addresses-Coverity: ("Uninitialized scalar variable")
Fixes: a1f26739ccdc ("NFSv4.2: improve page handling for GETXATTR")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs42proc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index b9836e2ce4a2..f3fd935620fc 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -1301,6 +1301,7 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
 		pages[i] = alloc_page(GFP_KERNEL);
 		if (!pages[i]) {
 			np = i + 1;
+			err = -ENOMEM;
 			goto out;
 		}
 	}

From 3316fb80a0b4c1fef03a3eb1a7f0651e2133c429 Mon Sep 17 00:00:00 2001
From: Zheng Yongjun <zhengyongjun3@huawei.com>
Date: Fri, 11 Dec 2020 16:41:58 +0800
Subject: [PATCH 227/230] fs/lockd: convert comma to semicolon

Replace a comma between expression statements by a semicolon.

Signed-off-by: Zheng Yongjun <zhengyongjun3@huawei.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/lockd/host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 771c289f6df7..f802223e71ab 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -163,7 +163,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 	host->h_nsmhandle  = nsm;
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	host->net	   = ni->net;
-	host->h_cred	   = get_cred(ni->cred),
+	host->h_cred	   = get_cred(ni->cred);
 	strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
 
 out:

From cac1d3a2b8f7f0817ac4feab76f5d3b12e4b02d7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 16 Dec 2020 16:31:26 -0500
Subject: [PATCH 228/230] NFSv4/pnfs: Add tracing for the deviceid cache

Add tracepoints to allow debugging of the deviceid cache.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/nfs4proc.c  |  2 ++
 fs/nfs/nfs4trace.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/pnfs_dev.c  | 23 +++++++++-----
 3 files changed, 92 insertions(+), 8 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 61a07dcb963d..8e03a647c61a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -9662,6 +9662,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 	if (res.notification != args.notify_types)
 		pdev->nocache = 1;
 
+	trace_nfs4_getdeviceinfo(server, &pdev->dev_id, status);
+
 	dprintk("<-- %s status=%d\n", __func__, status);
 
 	return status;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 484c1da96dea..48d761e593fb 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2189,6 +2189,81 @@ DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done);
 DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist);
 DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist);
 
+DECLARE_EVENT_CLASS(nfs4_deviceid_event,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			const struct nfs4_deviceid *deviceid
+		),
+
+		TP_ARGS(clp, deviceid),
+
+		TP_STRUCT__entry(
+			__string(dstaddr, clp->cl_hostname)
+			__array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+		),
+
+		TP_fast_assign(
+			__assign_str(dstaddr, clp->cl_hostname);
+			memcpy(__entry->deviceid, deviceid->data,
+			       NFS4_DEVICEID4_SIZE);
+		),
+
+		TP_printk(
+			"deviceid=%s, dstaddr=%s",
+			__print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+			__get_str(dstaddr)
+		)
+);
+#define DEFINE_PNFS_DEVICEID_EVENT(name) \
+	DEFINE_EVENT(nfs4_deviceid_event, name, \
+			TP_PROTO(const struct nfs_client *clp, \
+				const struct nfs4_deviceid *deviceid \
+			), \
+			TP_ARGS(clp, deviceid))
+DEFINE_PNFS_DEVICEID_EVENT(nfs4_deviceid_free);
+
+DECLARE_EVENT_CLASS(nfs4_deviceid_status,
+		TP_PROTO(
+			const struct nfs_server *server,
+			const struct nfs4_deviceid *deviceid,
+			int status
+		),
+
+		TP_ARGS(server, deviceid, status),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(int, status)
+			__string(dstaddr, server->nfs_client->cl_hostname)
+			__array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+		),
+
+		TP_fast_assign(
+			__entry->dev = server->s_dev;
+			__entry->status = status;
+			__assign_str(dstaddr, server->nfs_client->cl_hostname);
+			memcpy(__entry->deviceid, deviceid->data,
+			       NFS4_DEVICEID4_SIZE);
+		),
+
+		TP_printk(
+			"dev=%02x:%02x: deviceid=%s, dstaddr=%s, status=%d",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			__print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+			__get_str(dstaddr),
+			__entry->status
+		)
+);
+#define DEFINE_PNFS_DEVICEID_STATUS(name) \
+	DEFINE_EVENT(nfs4_deviceid_status, name, \
+			TP_PROTO(const struct nfs_server *server, \
+				const struct nfs4_deviceid *deviceid, \
+				int status \
+			), \
+			TP_ARGS(server, deviceid, status))
+DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo);
+DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid);
+
 DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
 		TP_PROTO(
 			const struct nfs_pgio_header *hdr
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 537b80d693f1..ddbbf4fcda86 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -34,6 +34,8 @@
 #include "internal.h"
 #include "pnfs.h"
 
+#include "nfs4trace.h"
+
 #define NFSDBG_FACILITY		NFSDBG_PNFS
 
 /*
@@ -192,24 +194,28 @@ nfs4_find_get_deviceid(struct nfs_server *server,
 
 	d = __nfs4_find_get_deviceid(server, id, hash);
 	if (d)
-		return d;
+		goto found;
 
 	new = nfs4_get_device_info(server, id, cred, gfp_mask);
-	if (!new)
+	if (!new) {
+		trace_nfs4_find_deviceid(server, id, -ENOENT);
 		return new;
+	}
 
 	spin_lock(&nfs4_deviceid_lock);
 	d = __nfs4_find_get_deviceid(server, id, hash);
 	if (d) {
 		spin_unlock(&nfs4_deviceid_lock);
 		server->pnfs_curr_ld->free_deviceid_node(new);
-		return d;
+	} else {
+		atomic_inc(&new->ref);
+		hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+		spin_unlock(&nfs4_deviceid_lock);
+		d = new;
 	}
-	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
-	atomic_inc(&new->ref);
-	spin_unlock(&nfs4_deviceid_lock);
-
-	return new;
+found:
+	trace_nfs4_find_deviceid(server, id, 0);
+	return d;
 }
 EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
 
@@ -278,6 +284,7 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
 	}
 	if (!atomic_dec_and_test(&d->ref))
 		return false;
+	trace_nfs4_deviceid_free(d->nfs_client, &d->deviceid);
 	d->ld->free_deviceid_node(d);
 	return true;
 }

From 9bfffea3524b49d0268d01f8e7967f06c4d0a942 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 16 Dec 2020 14:01:07 -0500
Subject: [PATCH 229/230] pNFS/flexfiles: Avoid spurious layout returns in
 ff_layout_choose_ds_for_read

The callers of ff_layout_choose_ds_for_read() should decide whether or
not they want to return the layout on error. Sometimes, we may just want
to retry from the beginning.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b7c26836b2cb..439169301d56 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -740,16 +740,12 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
 	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
 	struct nfs4_ff_layout_mirror *mirror;
 	struct nfs4_pnfs_ds *ds;
-	bool fail_return = false;
 	u32 idx;
 
 	/* mirrors are initially sorted by efficiency */
 	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
-		if (idx+1 == fls->mirror_array_cnt)
-			fail_return = !check_device;
-
 		mirror = FF_LAYOUT_COMP(lseg, idx);
-		ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return);
+		ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
 		if (!ds)
 			continue;
 

From 52104f274e2d7f134d34bab11cada8913d4544e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 16 Dec 2020 17:17:45 -0500
Subject: [PATCH 230/230] NFS/pNFS: Fix a typo in ff_layout_resend_pnfs_read()

Don't bump the index twice.

Fixes: 563c53e73b8b ("NFS: Fix flexfiles read failover")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/flexfilelayout/flexfilelayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 439169301d56..43ed743d10ea 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1052,7 +1052,7 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
 	u32 idx = hdr->pgio_mirror_idx + 1;
 	u32 new_idx = 0;
 
-	if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx))
+	if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
 		ff_layout_send_layouterror(hdr->lseg);
 	else
 		pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);