diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd index b44183880935..55285c136cf0 100644 --- a/Documentation/ABI/stable/sysfs-driver-dma-idxd +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -77,6 +77,13 @@ Contact: dmaengine@vger.kernel.org Description: The operation capability bit mask specify the operation types supported by the this device. +What: /sys/bus/dsa/devices/dsa/pasid_enabled +Date: Oct 27, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate if PASID (process address space identifier) is + enabled or not for this device. + What: /sys/bus/dsa/devices/dsa/state Date: Oct 25, 2019 KernelVersion: 5.6.0 @@ -122,6 +129,13 @@ KernelVersion: 5.10.0 Contact: dmaengine@vger.kernel.org Description: The last executed device administrative command's status/error. +What: /sys/bus/dsa/devices/wq./block_on_fault +Date: Oct 27, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate block on fault is allowed or not for the work queue + to support on demand paging. + What: /sys/bus/dsa/devices/wq./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 @@ -190,6 +204,13 @@ Contact: dmaengine@vger.kernel.org Description: The max batch size for this workqueue. Cannot exceed device max batch size. Configurable parameter. +What: /sys/bus/dsa/devices/wq./ats_disable +Date: Nov 13, 2020 +KernelVersion: 5.11.0 +Contact: dmaengine@vger.kernel.org +Description: Indicate whether ATS disable is turned on for the workqueue. + 0 indicates ATS is on, and 1 indicates ATS is off for the workqueue. 
+ What: /sys/bus/dsa/devices/engine./group_id Date: Oct 25, 2019 KernelVersion: 5.6.0 diff --git a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml index 372679dbd216..b6e1ebfaf366 100644 --- a/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml +++ b/Documentation/devicetree/bindings/dma/allwinner,sun50i-a64-dma.yaml @@ -21,6 +21,7 @@ properties: compatible: oneOf: - const: allwinner,sun50i-a64-dma + - const: allwinner,sun50i-a100-dma - const: allwinner,sun50i-h6-dma - items: - const: allwinner,sun8i-r40-dma @@ -56,7 +57,9 @@ required: if: properties: compatible: - const: allwinner,sun50i-h6-dma + enum: + - allwinner,sun50i-a100-dma + - allwinner,sun50i-h6-dma then: properties: diff --git a/Documentation/devicetree/bindings/dma/atmel-xdma.txt b/Documentation/devicetree/bindings/dma/atmel-xdma.txt index 4dc398e1a371..510b7f25ba24 100644 --- a/Documentation/devicetree/bindings/dma/atmel-xdma.txt +++ b/Documentation/devicetree/bindings/dma/atmel-xdma.txt @@ -2,7 +2,8 @@ * XDMA Controller Required properties: -- compatible: Should be "atmel,sama5d4-dma" or "microchip,sam9x60-dma". +- compatible: Should be "atmel,sama5d4-dma", "microchip,sam9x60-dma" or + "microchip,sama7g5-dma". - reg: Should contain DMA registers location and length. - interrupts: Should contain DMA interrupt. 
- #dma-cells: Must be <1>, used to represent the number of integer cells in diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt index 2117db0ce4f2..fef9c1eeb264 100644 --- a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt +++ b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt @@ -4,6 +4,7 @@ Required properties: - compatible should contain: * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA * "mediatek,mt6577-uart-dma" for MT6577 and all of the above + * "mediatek,mt8516-uart-dma", "mediatek,mt6577-uart-dma" for MT8516 SoC - reg: The base address of the APDMA register bank. diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml new file mode 100644 index 000000000000..f8142adf9aea --- /dev/null +++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dma/qcom,gpi.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm Technologies Inc GPI DMA controller + +maintainers: + - Vinod Koul + +description: | + QCOM GPI DMA controller provides DMA capabilities for + peripheral buses such as I2C, UART, and SPI. + +allOf: + - $ref: "dma-controller.yaml#" + +properties: + compatible: + enum: + - qcom,sdm845-gpi-dma + + reg: + maxItems: 1 + + interrupts: + description: + Interrupt lines for each GPI instance + maxItems: 13 + + "#dma-cells": + const: 3 + description: > + DMA clients must use the format described in dma.txt, giving a phandle + to the DMA controller plus the following 3 integer cells: + - channel: if set to 0xffffffff, any available channel will be allocated + for the client. Otherwise, the exact channel specified will be used. + - seid: serial id of the client as defined in the SoC documentation. 
+ - client: type of the client as defined in dt-bindings/dma/qcom-gpi.h + + iommus: + maxItems: 1 + + dma-channels: + maximum: 31 + + dma-channel-mask: + maxItems: 1 + +required: + - compatible + - reg + - interrupts + - "#dma-cells" + - iommus + - dma-channels + - dma-channel-mask + +additionalProperties: false + +examples: + - | + #include + #include + gpi_dma0: dma-controller@800000 { + compatible = "qcom,sdm845-gpi-dma"; + #dma-cells = <3>; + reg = <0x00800000 0x60000>; + iommus = <&apps_smmu 0x0016 0x0>; + dma-channels = <13>; + dma-channel-mask = <0xfa>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + ; + }; + +... diff --git a/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml new file mode 100644 index 000000000000..b15f68c499cb --- /dev/null +++ b/Documentation/devicetree/bindings/dma/ti/k3-bcdma.yaml @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dma/ti/k3-bcdma.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments K3 DMSS BCDMA Device Tree Bindings + +maintainers: + - Peter Ujfalusi + +description: | + The Block Copy DMA (BCDMA) is intended to perform similar functions as the TR + mode channels of K3 UDMA-P. + BCDMA includes block copy channels and Split channels. + + Block copy channels mainly used for memory to memory transfers, but with + optional triggers a block copy channel can service peripherals by accessing + directly to memory mapped registers or area. + + Split channels can be used to service PSI-L based peripherals. + The peripherals can be PSI-L native or legacy, non PSI-L native peripherals + with PDMAs. PDMA is tasked to act as a bridge between the PSI-L fabric and the + legacy peripheral. + + PDMAs can be configured via BCDMA split channel's peer registers to match with + the configuration of the legacy peripheral. 
+ +allOf: + - $ref: /schemas/dma/dma-controller.yaml# + +properties: + compatible: + const: ti,am64-dmss-bcdma + + "#dma-cells": + const: 3 + description: | + cell 1: type of the BCDMA channel to be used to service the peripheral: + 0 - split channel + 1 - block copy channel using global trigger 1 + 2 - block copy channel using global trigger 2 + 3 - block copy channel using local trigger + + cell 2: parameter for the channel: + if cell 1 is 0 (split channel): + PSI-L thread ID of the remote (to BCDMA) end. + Valid ranges for thread ID depends on the data movement direction: + for source thread IDs (rx): 0 - 0x7fff + for destination thread IDs (tx): 0x8000 - 0xffff + + Please refer to the device documentation for the PSI-L thread map and + also the PSI-L peripheral chapter for the correct thread ID. + if cell 1 is 1 or 2 (block copy channel using global trigger): + Unused, ignored + + The trigger must be configured for the channel externally to BCDMA, + channels using global triggers should not be requested directly, but + via DMA event router. 
+ if cell 1 is 3 (block copy channel using local trigger): + bchan number of the locally triggered channel + + cell 3: ASEL value for the channel + + reg: + maxItems: 5 + + reg-names: + items: + - const: gcfg + - const: bchanrt + - const: rchanrt + - const: tchanrt + - const: ringrt + + msi-parent: true + + ti,asel: + $ref: /schemas/types.yaml#/definitions/uint32 + description: ASEL value for non slave channels + + ti,sci-rm-range-bchan: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of BCDMA block-copy channel resource subtypes for resource + allocation for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + + ti,sci-rm-range-tchan: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of BCDMA split tx channel resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + + ti,sci-rm-range-rchan: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of BCDMA split rx channel resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + +required: + - compatible + - "#dma-cells" + - reg + - reg-names + - msi-parent + - ti,sci + - ti,sci-dev-id + - ti,sci-rm-range-bchan + - ti,sci-rm-range-tchan + - ti,sci-rm-range-rchan + +unevaluatedProperties: false + +examples: + - |+ + cbass_main { + #address-cells = <2>; + #size-cells = <2>; + + main_dmss { + compatible = "simple-mfd"; + #address-cells = <2>; + #size-cells = <2>; + dma-ranges; + ranges; + + ti,sci-dev-id = <25>; + + main_bcdma: dma-controller@485c0100 { + compatible = "ti,am64-dmss-bcdma"; + + reg = <0x0 0x485c0100 0x0 0x100>, + <0x0 0x4c000000 0x0 0x20000>, + <0x0 0x4a820000 0x0 0x20000>, + <0x0 0x4aa40000 0x0 0x20000>, + <0x0 0x4bc00000 0x0 0x100000>; + reg-names = "gcfg", "bchanrt", "rchanrt", "tchanrt", "ringrt"; + msi-parent = 
<&inta_main_dmss>; + #dma-cells = <3>; + + ti,sci = <&dmsc>; + ti,sci-dev-id = <26>; + + ti,sci-rm-range-bchan = <0x20>; /* BLOCK_COPY_CHAN */ + ti,sci-rm-range-rchan = <0x21>; /* SPLIT_TR_RX_CHAN */ + ti,sci-rm-range-tchan = <0x22>; /* SPLIT_TR_TX_CHAN */ + }; + }; + }; diff --git a/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml new file mode 100644 index 000000000000..b13ab60cd740 --- /dev/null +++ b/Documentation/devicetree/bindings/dma/ti/k3-pktdma.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dma/ti/k3-pktdma.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Texas Instruments K3 DMSS PKTDMA Device Tree Bindings + +maintainers: + - Peter Ujfalusi + +description: | + The Packet DMA (PKTDMA) is intended to perform similar functions as the packet + mode channels of K3 UDMA-P. + PKTDMA only includes Split channels to service PSI-L based peripherals. + + The peripherals can be PSI-L native or legacy, non PSI-L native peripherals + with PDMAs. PDMA is tasked to act as a bridge between the PSI-L fabric and the + legacy peripheral. + + PDMAs can be configured via PKTDMA split channel's peer registers to match + with the configuration of the legacy peripheral. + +allOf: + - $ref: /schemas/dma/dma-controller.yaml# + +properties: + compatible: + const: ti,am64-dmss-pktdma + + "#dma-cells": + const: 2 + description: | + The first cell is the PSI-L thread ID of the remote (to PKTDMA) end. + Valid ranges for thread ID depends on the data movement direction: + for source thread IDs (rx): 0 - 0x7fff + for destination thread IDs (tx): 0x8000 - 0xffff + + Please refer to the device documentation for the PSI-L thread map and also + the PSI-L peripheral chapter for the correct thread ID. 
+ + The second cell is the ASEL value for the channel + + reg: + maxItems: 4 + + reg-names: + items: + - const: gcfg + - const: rchanrt + - const: tchanrt + - const: ringrt + + msi-parent: true + + ti,sci-rm-range-tchan: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of PKTDMA split tx channel resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + + ti,sci-rm-range-tflow: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of PKTDMA split tx flow resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + + ti,sci-rm-range-rchan: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of PKTDMA split rx channel resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + + ti,sci-rm-range-rflow: + $ref: /schemas/types.yaml#/definitions/uint32-array + description: | + Array of PKTDMA split rx flow resource subtypes for resource allocation + for this host + minItems: 1 + # Should be enough + maxItems: 255 + items: + maximum: 0x3f + +required: + - compatible + - "#dma-cells" + - reg + - reg-names + - msi-parent + - ti,sci + - ti,sci-dev-id + - ti,sci-rm-range-tchan + - ti,sci-rm-range-tflow + - ti,sci-rm-range-rchan + - ti,sci-rm-range-rflow + +unevaluatedProperties: false + +examples: + - |+ + cbass_main { + #address-cells = <2>; + #size-cells = <2>; + + main_dmss { + compatible = "simple-mfd"; + #address-cells = <2>; + #size-cells = <2>; + dma-ranges; + ranges; + + ti,sci-dev-id = <25>; + + main_pktdma: dma-controller@485c0000 { + compatible = "ti,am64-dmss-pktdma"; + + reg = <0x0 0x485c0000 0x0 0x100>, + <0x0 0x4a800000 0x0 0x20000>, + <0x0 0x4aa00000 0x0 0x40000>, + <0x0 0x4b800000 0x0 0x400000>; + reg-names = "gcfg", "rchanrt", "tchanrt", "ringrt"; + 
msi-parent = <&inta_main_dmss>; + #dma-cells = <2>; + + ti,sci = <&dmsc>; + ti,sci-dev-id = <30>; + + ti,sci-rm-range-tchan = <0x23>, /* UNMAPPED_TX_CHAN */ + <0x24>, /* CPSW_TX_CHAN */ + <0x25>, /* SAUL_TX_0_CHAN */ + <0x26>, /* SAUL_TX_1_CHAN */ + <0x27>, /* ICSSG_0_TX_CHAN */ + <0x28>; /* ICSSG_1_TX_CHAN */ + ti,sci-rm-range-tflow = <0x10>, /* RING_UNMAPPED_TX_CHAN */ + <0x11>, /* RING_CPSW_TX_CHAN */ + <0x12>, /* RING_SAUL_TX_0_CHAN */ + <0x13>, /* RING_SAUL_TX_1_CHAN */ + <0x14>, /* RING_ICSSG_0_TX_CHAN */ + <0x15>; /* RING_ICSSG_1_TX_CHAN */ + ti,sci-rm-range-rchan = <0x29>, /* UNMAPPED_RX_CHAN */ + <0x2b>, /* CPSW_RX_CHAN */ + <0x2d>, /* SAUL_RX_0_CHAN */ + <0x2f>, /* SAUL_RX_1_CHAN */ + <0x31>, /* SAUL_RX_2_CHAN */ + <0x33>, /* SAUL_RX_3_CHAN */ + <0x35>, /* ICSSG_0_RX_CHAN */ + <0x37>; /* ICSSG_1_RX_CHAN */ + ti,sci-rm-range-rflow = <0x2a>, /* FLOW_UNMAPPED_RX_CHAN */ + <0x2c>, /* FLOW_CPSW_RX_CHAN */ + <0x2e>, /* FLOW_SAUL_RX_0/1_CHAN */ + <0x32>, /* FLOW_SAUL_RX_2/3_CHAN */ + <0x36>, /* FLOW_ICSSG_0_RX_CHAN */ + <0x38>; /* FLOW_ICSSG_1_RX_CHAN */ + }; + }; + }; diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml new file mode 100644 index 000000000000..6608545ea66f --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mailbox/arm,mhuv2.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: ARM MHUv2 Mailbox Controller + +maintainers: + - Tushar Khandelwal + - Viresh Kumar + +description: | + The Arm Message Handling Unit (MHU) Version 2 is a mailbox controller that has + between 1 and 124 channel windows (each 32-bit wide) to provide unidirectional + communication with remote processor(s), where the number of channel windows + are implementation dependent. 
+ + Given the unidirectional nature of the controller, an MHUv2 mailbox may only + be written to or read from. If a pair of MHU controllers is implemented + between two processing elements to provide bidirectional communication, these + must be specified as two separate mailboxes. + + If the interrupts property is present in the device tree node, then it's treated as + a "receiver" mailbox, otherwise a "sender". + + An MHU controller must be specified along with the supported transport + protocols. The transport protocols determine the method of data transmission + as well as the number of provided mailbox channels. + + Following are the possible transport protocols. + + - Data-transfer: Each transfer is made of one or more words, using one or more + channel windows. + + - Doorbell: Each transfer is made up of a single bit flag, using any one of the + bits in a channel window. A channel window can support up to 32 doorbells + and the entire window shall be used in doorbell protocol. Optionally, data + may be transmitted through a shared memory region, wherein the MHU is used + strictly as an interrupt generation mechanism but that is out of the scope + of these bindings. + +# We need a select here so we don't match all nodes with 'arm,primecell' +select: + properties: + compatible: + contains: + enum: + - arm,mhuv2-tx + - arm,mhuv2-rx + required: + - compatible + +properties: + compatible: + oneOf: + - description: Sender mode + items: + - const: arm,mhuv2-tx + - const: arm,primecell + + - description: Receiver-mode + items: + - const: arm,mhuv2-rx + - const: arm,primecell + + reg: + maxItems: 1 + + interrupts: + description: | + The MHUv2 controller always implements an interrupt in the "receiver" + mode, while the interrupt in the "sender" mode was not available in the + version MHUv2.0, but the later versions do have it. 
+ maxItems: 1 + + clocks: + maxItems: 1 + + clock-names: + maxItems: 1 + + arm,mhuv2-protocols: + $ref: /schemas/types.yaml#/definitions/uint32-matrix + description: | + The MHUv2 controller may contain up to 124 channel windows (each 32-bit + wide). The hardware and the DT bindings allow any combination of those to + be used for various transport protocols. + + This property allows a platform to describe how these channel windows are + used in various transport protocols. The entries in this property shall be + present as an array of tuples, where each tuple describes details about + one of the transport protocols being implemented over some channel + window(s). + + The first field of a tuple signifies the transfer protocol: 0 is reserved + for doorbell protocol, and 1 is reserved for data-transfer protocol. + Using any other value in the first field of a tuple makes it invalid. + + The second field of a tuple signifies the number of channel windows where + the protocol would be used and should be set to a non-zero value. For + doorbell protocol this field signifies the number of 32-bit channel + windows that implement the doorbell protocol. For data-transfer protocol, + this field signifies the number of 32-bit channel windows that implement + the data-transfer protocol. + + The total number of channel windows specified here shouldn't be more than + the ones implemented by the platform, though one can specify a lesser number + of windows here than what the platform implements. + + mhu: mailbox@2b1f0000 { + ... + + arm,mhuv2-protocols = <0 2>, <1 1>, <1 5>, <1 7>; + } + + The above example defines the protocols of an ARM MHUv2 mailbox + controller, where a total of 15 channel windows are used. The first two + windows are used in doorbell protocol (64 doorbells), followed by 1, 5 and + 7 windows (separately) used in data-transfer protocol. 
+ + minItems: 1 + maxItems: 124 + items: + items: + - enum: [ 0, 1 ] + - minimum: 0 + maximum: 124 + + + '#mbox-cells': + description: | + It is always set to 2. The first argument in the consumers 'mboxes' + property represents the channel window group, which may be used in + doorbell, or data-transfer protocol, and the second argument (only + relevant in doorbell protocol, should be 0 otherwise) represents the + doorbell number within the 32 bit wide channel window. + + From the example given above for arm,mhuv2-protocols, here is how a client + node can reference them. + + mboxes = <&mhu 0 5>; // Channel Window Group 0, doorbell 5. + mboxes = <&mhu 1 7>; // Channel Window Group 1, doorbell 7. + mboxes = <&mhu 2 0>; // Channel Window Group 2, data transfer protocol with 1 window. + mboxes = <&mhu 3 0>; // Channel Window Group 3, data transfer protocol with 5 windows. + mboxes = <&mhu 4 0>; // Channel Window Group 4, data transfer protocol with 7 windows. + + const: 2 + +if: + # Interrupt is compulsory for receiver + properties: + compatible: + contains: + const: arm,mhuv2-rx +then: + required: + - interrupts + +required: + - compatible + - reg + - '#mbox-cells' + - arm,mhuv2-protocols + +additionalProperties: false + +examples: + # Multiple transport protocols implemented by the mailbox controllers + - | + soc { + #address-cells = <2>; + #size-cells = <2>; + + mhu_tx: mailbox@2b1f0000 { + #mbox-cells = <2>; + compatible = "arm,mhuv2-tx", "arm,primecell"; + reg = <0 0x2b1f0000 0 0x1000>; + clocks = <&clock 0>; + clock-names = "apb_pclk"; + interrupts = <0 45 4>; + arm,mhuv2-protocols = <1 5>, <1 2>, <1 5>, <1 7>, <0 2>; + }; + + mhu_rx: mailbox@2b1f1000 { + #mbox-cells = <2>; + compatible = "arm,mhuv2-rx", "arm,primecell"; + reg = <0 0x2b1f1000 0 0x1000>; + clocks = <&clock 0>; + clock-names = "apb_pclk"; + interrupts = <0 46 4>; + arm,mhuv2-protocols = <1 1>, <1 7>, <0 2>; + }; + + mhu_client: scb@2e000000 { + compatible = "fujitsu,mb86s70-scb-1.0"; + reg = <0 
0x2e000000 0 0x4000>; + + mboxes = + //data-transfer protocol with 5 windows, mhu-tx + <&mhu_tx 2 0>, + //data-transfer protocol with 7 windows, mhu-tx + <&mhu_tx 3 0>, + //doorbell protocol channel 4, doorbell 27, mhu-tx + <&mhu_tx 4 27>, + //data-transfer protocol with 1 window, mhu-rx + <&mhu_rx 0 0>; + }; + }; diff --git a/Documentation/driver-api/dmaengine/client.rst b/Documentation/driver-api/dmaengine/client.rst index 09a3f66dcd26..bfd057b21a00 100644 --- a/Documentation/driver-api/dmaengine/client.rst +++ b/Documentation/driver-api/dmaengine/client.rst @@ -120,7 +120,9 @@ The details of these operations are: .. code-block:: c - nr_sg = dma_map_sg(chan->device->dev, sgl, sg_len); + struct device *dma_dev = dmaengine_get_dma_device(chan); + + nr_sg = dma_map_sg(dma_dev, sgl, sg_len); if (nr_sg == 0) /* error */ diff --git a/MAINTAINERS b/MAINTAINERS index 552cc3f2626b..aa1b6a777f22 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10531,6 +10531,15 @@ F: drivers/mailbox/ F: include/linux/mailbox_client.h F: include/linux/mailbox_controller.h +MAILBOX ARM MHUv2 +M: Viresh Kumar +M: Tushar Khandelwal +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/arm_mhuv2.c +F: include/linux/mailbox/arm_mhuv2_message.h +F: Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk L: linux-man@vger.kernel.org diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2ed79b09439a..59cfe71d0b3a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3925,8 +3925,12 @@ static int find_watcher(struct rbd_device *rbd_dev, sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); for (i = 0; i < num_watchers; i++) { - if (!memcmp(&watchers[i].addr, &locker->info.addr, - sizeof(locker->info.addr)) && + /* + * Ignore addr->type while comparing. This mimics + * entity_addr_t::get_legacy_str() + strcmp(). 
+ */ + if (ceph_addr_equal_no_type(&watchers[i].addr, + &locker->info.addr) && watchers[i].cookie == cookie) { struct rbd_client_id cid = { .gid = le64_to_cpu(watchers[i].name.num), diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 90284ffda58a..d242c7632621 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -296,6 +296,16 @@ config INTEL_IDXD If unsure, say N. +# Config symbol that collects all the dependencies that's necessary to +# support shared virtual memory for the devices supported by idxd. +config INTEL_IDXD_SVM + bool "Accelerator Shared Virtual Memory Support" + depends on INTEL_IDXD + depends on INTEL_IOMMU_SVM + depends on PCI_PRI + depends on PCI_PASID + depends on PCI_IOV + config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86_64 diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 3b53115db268..fe45ad5d06c4 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -30,7 +30,24 @@ #define AT_XDMAC_FIFO_SZ(i) (((i) >> 5) & 0x7FF) /* Number of Bytes */ #define AT_XDMAC_NB_REQ(i) ((((i) >> 16) & 0x3F) + 1) /* Number of Peripheral Requests Minus One */ #define AT_XDMAC_GCFG 0x04 /* Global Configuration Register */ +#define AT_XDMAC_WRHP(i) (((i) & 0xF) << 4) +#define AT_XDMAC_WRMP(i) (((i) & 0xF) << 8) +#define AT_XDMAC_WRLP(i) (((i) & 0xF) << 12) +#define AT_XDMAC_RDHP(i) (((i) & 0xF) << 16) +#define AT_XDMAC_RDMP(i) (((i) & 0xF) << 20) +#define AT_XDMAC_RDLP(i) (((i) & 0xF) << 24) +#define AT_XDMAC_RDSG(i) (((i) & 0xF) << 28) +#define AT_XDMAC_GCFG_M2M (AT_XDMAC_RDLP(0xF) | AT_XDMAC_WRLP(0xF)) +#define AT_XDMAC_GCFG_P2M (AT_XDMAC_RDSG(0x1) | AT_XDMAC_RDHP(0x3) | \ + AT_XDMAC_WRHP(0x5)) #define AT_XDMAC_GWAC 0x08 /* Global Weighted Arbiter Configuration Register */ +#define AT_XDMAC_PW0(i) (((i) & 0xF) << 0) +#define AT_XDMAC_PW1(i) (((i) & 0xF) << 4) +#define AT_XDMAC_PW2(i) (((i) & 0xF) << 8) +#define AT_XDMAC_PW3(i) (((i) & 0xF) << 12) +#define AT_XDMAC_GWAC_M2M 0 +#define 
AT_XDMAC_GWAC_P2M (AT_XDMAC_PW0(0xF) | AT_XDMAC_PW2(0xF)) + #define AT_XDMAC_GIE 0x0C /* Global Interrupt Enable Register */ #define AT_XDMAC_GID 0x10 /* Global Interrupt Disable Register */ #define AT_XDMAC_GIM 0x14 /* Global Interrupt Mask Register */ @@ -38,13 +55,6 @@ #define AT_XDMAC_GE 0x1C /* Global Channel Enable Register */ #define AT_XDMAC_GD 0x20 /* Global Channel Disable Register */ #define AT_XDMAC_GS 0x24 /* Global Channel Status Register */ -#define AT_XDMAC_GRS 0x28 /* Global Channel Read Suspend Register */ -#define AT_XDMAC_GWS 0x2C /* Global Write Suspend Register */ -#define AT_XDMAC_GRWS 0x30 /* Global Channel Read Write Suspend Register */ -#define AT_XDMAC_GRWR 0x34 /* Global Channel Read Write Resume Register */ -#define AT_XDMAC_GSWR 0x38 /* Global Channel Software Request Register */ -#define AT_XDMAC_GSWS 0x3C /* Global channel Software Request Status Register */ -#define AT_XDMAC_GSWF 0x40 /* Global Channel Software Flush Request Register */ #define AT_XDMAC_VERSION 0xFFC /* XDMAC Version Register */ /* Channel relative registers offsets */ @@ -150,8 +160,6 @@ #define AT_XDMAC_CSUS 0x30 /* Channel Source Microblock Stride */ #define AT_XDMAC_CDUS 0x34 /* Channel Destination Microblock Stride */ -#define AT_XDMAC_CHAN_REG_BASE 0x50 /* Channel registers base address */ - /* Microblock control members */ #define AT_XDMAC_MBR_UBC_UBLEN_MAX 0xFFFFFFUL /* Maximum Microblock Length */ #define AT_XDMAC_MBR_UBC_NDE (0x1 << 24) /* Next Descriptor Enable */ @@ -179,6 +187,29 @@ enum atc_status { AT_XDMAC_CHAN_IS_PAUSED, }; +struct at_xdmac_layout { + /* Global Channel Read Suspend Register */ + u8 grs; + /* Global Write Suspend Register */ + u8 gws; + /* Global Channel Read Write Suspend Register */ + u8 grws; + /* Global Channel Read Write Resume Register */ + u8 grwr; + /* Global Channel Software Request Register */ + u8 gswr; + /* Global channel Software Request Status Register */ + u8 gsws; + /* Global Channel Software Flush Request Register */ 
+ u8 gswf; + /* Channel reg base */ + u8 chan_cc_reg_base; + /* Source/Destination Interface must be specified or not */ + bool sdif; + /* AXI queue priority configuration supported */ + bool axi_config; +}; + /* ----- Channels ----- */ struct at_xdmac_chan { struct dma_chan chan; @@ -212,6 +243,7 @@ struct at_xdmac { struct clk *clk; u32 save_gim; struct dma_pool *at_xdmac_desc_pool; + const struct at_xdmac_layout *layout; struct at_xdmac_chan chan[]; }; @@ -244,9 +276,35 @@ struct at_xdmac_desc { struct list_head xfer_node; } __aligned(sizeof(u64)); +static const struct at_xdmac_layout at_xdmac_sama5d4_layout = { + .grs = 0x28, + .gws = 0x2C, + .grws = 0x30, + .grwr = 0x34, + .gswr = 0x38, + .gsws = 0x3C, + .gswf = 0x40, + .chan_cc_reg_base = 0x50, + .sdif = true, + .axi_config = false, +}; + +static const struct at_xdmac_layout at_xdmac_sama7g5_layout = { + .grs = 0x30, + .gws = 0x38, + .grws = 0x40, + .grwr = 0x44, + .gswr = 0x48, + .gsws = 0x4C, + .gswf = 0x50, + .chan_cc_reg_base = 0x60, + .sdif = false, + .axi_config = true, +}; + static inline void __iomem *at_xdmac_chan_reg_base(struct at_xdmac *atxdmac, unsigned int chan_nb) { - return atxdmac->regs + (AT_XDMAC_CHAN_REG_BASE + chan_nb * 0x40); + return atxdmac->regs + (atxdmac->layout->chan_cc_reg_base + chan_nb * 0x40); } #define at_xdmac_read(atxdmac, reg) readl_relaxed((atxdmac)->regs + (reg)) @@ -345,8 +403,10 @@ static void at_xdmac_start_xfer(struct at_xdmac_chan *atchan, first->active_xfer = true; /* Tell xdmac where to get the first descriptor. 
*/ - reg = AT_XDMAC_CNDA_NDA(first->tx_dma_desc.phys) - | AT_XDMAC_CNDA_NDAIF(atchan->memif); + reg = AT_XDMAC_CNDA_NDA(first->tx_dma_desc.phys); + if (atxdmac->layout->sdif) + reg |= AT_XDMAC_CNDA_NDAIF(atchan->memif); + at_xdmac_chan_write(atchan, AT_XDMAC_CNDA, reg); /* @@ -541,6 +601,7 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan, enum dma_transfer_direction direction) { struct at_xdmac_chan *atchan = to_at_xdmac_chan(chan); + struct at_xdmac *atxdmac = to_at_xdmac(atchan->chan.device); int csize, dwidth; if (direction == DMA_DEV_TO_MEM) { @@ -548,12 +609,14 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan, AT91_XDMAC_DT_PERID(atchan->perid) | AT_XDMAC_CC_DAM_INCREMENTED_AM | AT_XDMAC_CC_SAM_FIXED_AM - | AT_XDMAC_CC_DIF(atchan->memif) - | AT_XDMAC_CC_SIF(atchan->perif) | AT_XDMAC_CC_SWREQ_HWR_CONNECTED | AT_XDMAC_CC_DSYNC_PER2MEM | AT_XDMAC_CC_MBSIZE_SIXTEEN | AT_XDMAC_CC_TYPE_PER_TRAN; + if (atxdmac->layout->sdif) + atchan->cfg |= AT_XDMAC_CC_DIF(atchan->memif) | + AT_XDMAC_CC_SIF(atchan->perif); + csize = ffs(atchan->sconfig.src_maxburst) - 1; if (csize < 0) { dev_err(chan2dev(chan), "invalid src maxburst value\n"); @@ -571,12 +634,14 @@ static int at_xdmac_compute_chan_conf(struct dma_chan *chan, AT91_XDMAC_DT_PERID(atchan->perid) | AT_XDMAC_CC_DAM_FIXED_AM | AT_XDMAC_CC_SAM_INCREMENTED_AM - | AT_XDMAC_CC_DIF(atchan->perif) - | AT_XDMAC_CC_SIF(atchan->memif) | AT_XDMAC_CC_SWREQ_HWR_CONNECTED | AT_XDMAC_CC_DSYNC_MEM2PER | AT_XDMAC_CC_MBSIZE_SIXTEEN | AT_XDMAC_CC_TYPE_PER_TRAN; + if (atxdmac->layout->sdif) + atchan->cfg |= AT_XDMAC_CC_DIF(atchan->perif) | + AT_XDMAC_CC_SIF(atchan->memif); + csize = ffs(atchan->sconfig.dst_maxburst) - 1; if (csize < 0) { dev_err(chan2dev(chan), "invalid src maxburst value\n"); @@ -866,10 +931,12 @@ at_xdmac_interleaved_queue_desc(struct dma_chan *chan, * ERRATA: Even if useless for memory transfers, the PERID has to not * match the one of another channel. 
If not, it could lead to spurious * flag status. + * For SAMA7G5x case, the SIF and DIF fields are no longer used. + * Thus, no need to have the SIF/DIF interfaces here. + * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as + * zero. */ - u32 chan_cc = AT_XDMAC_CC_PERID(0x3f) - | AT_XDMAC_CC_DIF(0) - | AT_XDMAC_CC_SIF(0) + u32 chan_cc = AT_XDMAC_CC_PERID(0x7f) | AT_XDMAC_CC_MBSIZE_SIXTEEN | AT_XDMAC_CC_TYPE_MEM_TRAN; @@ -1048,12 +1115,14 @@ at_xdmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, * ERRATA: Even if useless for memory transfers, the PERID has to not * match the one of another channel. If not, it could lead to spurious * flag status. + * For SAMA7G5x case, the SIF and DIF fields are no longer used. + * Thus, no need to have the SIF/DIF interfaces here. + * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as + * zero. */ - u32 chan_cc = AT_XDMAC_CC_PERID(0x3f) + u32 chan_cc = AT_XDMAC_CC_PERID(0x7f) | AT_XDMAC_CC_DAM_INCREMENTED_AM | AT_XDMAC_CC_SAM_INCREMENTED_AM - | AT_XDMAC_CC_DIF(0) - | AT_XDMAC_CC_SIF(0) | AT_XDMAC_CC_MBSIZE_SIXTEEN | AT_XDMAC_CC_TYPE_MEM_TRAN; unsigned long irqflags; @@ -1154,12 +1223,14 @@ static struct at_xdmac_desc *at_xdmac_memset_create_desc(struct dma_chan *chan, * ERRATA: Even if useless for memory transfers, the PERID has to not * match the one of another channel. If not, it could lead to spurious * flag status. + * For SAMA7G5x case, the SIF and DIF fields are no longer used. + * Thus, no need to have the SIF/DIF interfaces here. + * For SAMA5D4x and SAMA5D2x the SIF and DIF are already configured as + * zero. 
*/ - u32 chan_cc = AT_XDMAC_CC_PERID(0x3f) + u32 chan_cc = AT_XDMAC_CC_PERID(0x7f) | AT_XDMAC_CC_DAM_UBS_AM | AT_XDMAC_CC_SAM_INCREMENTED_AM - | AT_XDMAC_CC_DIF(0) - | AT_XDMAC_CC_SIF(0) | AT_XDMAC_CC_MBSIZE_SIXTEEN | AT_XDMAC_CC_MEMSET_HW_MODE | AT_XDMAC_CC_TYPE_MEM_TRAN; @@ -1438,7 +1509,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, mask = AT_XDMAC_CC_TYPE | AT_XDMAC_CC_DSYNC; value = AT_XDMAC_CC_TYPE_PER_TRAN | AT_XDMAC_CC_DSYNC_PER2MEM; if ((desc->lld.mbr_cfg & mask) == value) { - at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask); + at_xdmac_write(atxdmac, atxdmac->layout->gswf, atchan->mask); while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS)) cpu_relax(); } @@ -1496,7 +1567,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, * FIFO flush ensures that data are really written. */ if ((desc->lld.mbr_cfg & mask) == value) { - at_xdmac_write(atxdmac, AT_XDMAC_GSWF, atchan->mask); + at_xdmac_write(atxdmac, atxdmac->layout->gswf, atchan->mask); while (!(at_xdmac_chan_read(atchan, AT_XDMAC_CIS) & AT_XDMAC_CIS_FIS)) cpu_relax(); } @@ -1761,7 +1832,7 @@ static int at_xdmac_device_pause(struct dma_chan *chan) return 0; spin_lock_irqsave(&atchan->lock, flags); - at_xdmac_write(atxdmac, AT_XDMAC_GRWS, atchan->mask); + at_xdmac_write(atxdmac, atxdmac->layout->grws, atchan->mask); while (at_xdmac_chan_read(atchan, AT_XDMAC_CC) & (AT_XDMAC_CC_WRIP | AT_XDMAC_CC_RDIP)) cpu_relax(); @@ -1784,7 +1855,7 @@ static int at_xdmac_device_resume(struct dma_chan *chan) return 0; } - at_xdmac_write(atxdmac, AT_XDMAC_GRWR, atchan->mask); + at_xdmac_write(atxdmac, atxdmac->layout->grwr, atchan->mask); clear_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status); spin_unlock_irqrestore(&atchan->lock, flags); @@ -1947,6 +2018,30 @@ static int atmel_xdmac_resume(struct device *dev) } #endif /* CONFIG_PM_SLEEP */ +static void at_xdmac_axi_config(struct platform_device *pdev) +{ + struct at_xdmac *atxdmac = (struct at_xdmac 
*)platform_get_drvdata(pdev); + bool dev_m2m = false; + u32 dma_requests; + + if (!atxdmac->layout->axi_config) + return; /* Not supported */ + + if (!of_property_read_u32(pdev->dev.of_node, "dma-requests", + &dma_requests)) { + dev_info(&pdev->dev, "controller in mem2mem mode.\n"); + dev_m2m = true; + } + + if (dev_m2m) { + at_xdmac_write(atxdmac, AT_XDMAC_GCFG, AT_XDMAC_GCFG_M2M); + at_xdmac_write(atxdmac, AT_XDMAC_GWAC, AT_XDMAC_GWAC_M2M); + } else { + at_xdmac_write(atxdmac, AT_XDMAC_GCFG, AT_XDMAC_GCFG_P2M); + at_xdmac_write(atxdmac, AT_XDMAC_GWAC, AT_XDMAC_GWAC_P2M); + } +} + static int at_xdmac_probe(struct platform_device *pdev) { struct at_xdmac *atxdmac; @@ -1986,6 +2081,10 @@ static int at_xdmac_probe(struct platform_device *pdev) atxdmac->regs = base; atxdmac->irq = irq; + atxdmac->layout = of_device_get_match_data(&pdev->dev); + if (!atxdmac->layout) + return -ENODEV; + atxdmac->clk = devm_clk_get(&pdev->dev, "dma_clk"); if (IS_ERR(atxdmac->clk)) { dev_err(&pdev->dev, "can't get dma_clk\n"); @@ -2087,6 +2186,8 @@ static int at_xdmac_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%d channels, mapped at 0x%p\n", nr_channels, atxdmac->regs); + at_xdmac_axi_config(pdev); + return 0; err_dma_unregister: @@ -2128,6 +2229,10 @@ static const struct dev_pm_ops atmel_xdmac_dev_pm_ops = { static const struct of_device_id atmel_xdmac_dt_ids[] = { { .compatible = "atmel,sama5d4-dma", + .data = &at_xdmac_sama5d4_layout, + }, { + .compatible = "microchip,sama7g5-dma", + .data = &at_xdmac_sama7g5_layout, }, { /* sentinel */ } diff --git a/drivers/dma/dma-jz4780.c b/drivers/dma/dma-jz4780.c index a608efaa435f..612d353648cf 100644 --- a/drivers/dma/dma-jz4780.c +++ b/drivers/dma/dma-jz4780.c @@ -1044,7 +1044,7 @@ static struct platform_driver jz4780_dma_driver = { .remove = jz4780_dma_remove, .driver = { .name = "jz4780-dma", - .of_match_table = of_match_ptr(jz4780_dma_dt_match), + .of_match_table = jz4780_dma_dt_match, }, }; diff --git 
a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index a3a172173e34..f696246f57fd 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -573,6 +573,7 @@ static int dmatest_func(void *data) struct dmatest_params *params; struct dma_chan *chan; struct dma_device *dev; + struct device *dma_dev; unsigned int error_count; unsigned int failed_tests = 0; unsigned int total_tests = 0; @@ -606,6 +607,8 @@ static int dmatest_func(void *data) params = &info->params; chan = thread->chan; dev = chan->device; + dma_dev = dmaengine_get_dma_device(chan); + src = &thread->src; dst = &thread->dst; if (thread->type == DMA_MEMCPY) { @@ -730,7 +733,7 @@ static int dmatest_func(void *data) filltime = ktime_add(filltime, diff); } - um = dmaengine_get_unmap_data(dev->dev, src->cnt + dst->cnt, + um = dmaengine_get_unmap_data(dma_dev, src->cnt + dst->cnt, GFP_KERNEL); if (!um) { failed_tests++; @@ -745,10 +748,10 @@ static int dmatest_func(void *data) struct page *pg = virt_to_page(buf); unsigned long pg_off = offset_in_page(buf); - um->addr[i] = dma_map_page(dev->dev, pg, pg_off, + um->addr[i] = dma_map_page(dma_dev, pg, pg_off, um->len, DMA_TO_DEVICE); srcs[i] = um->addr[i] + src->off; - ret = dma_mapping_error(dev->dev, um->addr[i]); + ret = dma_mapping_error(dma_dev, um->addr[i]); if (ret) { result("src mapping error", total_tests, src->off, dst->off, len, ret); @@ -763,9 +766,9 @@ static int dmatest_func(void *data) struct page *pg = virt_to_page(buf); unsigned long pg_off = offset_in_page(buf); - dsts[i] = dma_map_page(dev->dev, pg, pg_off, um->len, + dsts[i] = dma_map_page(dma_dev, pg, pg_off, um->len, DMA_BIDIRECTIONAL); - ret = dma_mapping_error(dev->dev, dsts[i]); + ret = dma_mapping_error(dma_dev, dsts[i]); if (ret) { result("dst mapping error", total_tests, src->off, dst->off, len, ret); diff --git a/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c b/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c index 14c1ac26f866..e164f3295f5d 100644 --- 
a/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c +++ b/drivers/dma/dw-axi-dmac/dw-axi-dmac-platform.c @@ -992,7 +992,7 @@ static struct platform_driver dw_driver = { .remove = dw_remove, .driver = { .name = KBUILD_MODNAME, - .of_match_table = of_match_ptr(dw_dma_of_id_table), + .of_match_table = dw_dma_of_id_table, .pm = &dw_axi_dma_pm_ops, }, }; diff --git a/drivers/dma/dw/core.c b/drivers/dma/dw/core.c index 7ab83fe601ed..19a23767533a 100644 --- a/drivers/dma/dw/core.c +++ b/drivers/dma/dw/core.c @@ -982,8 +982,11 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) dev_vdbg(chan2dev(chan), "%s\n", __func__); + pm_runtime_get_sync(dw->dma.dev); + /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { + pm_runtime_put_sync_suspend(dw->dma.dev); dev_dbg(chan2dev(chan), "DMA channel not idle?\n"); return -EIO; } @@ -1000,6 +1003,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan) * We need controller-specific data to set up slave transfers. */ if (chan->private && !dw_dma_filter(chan, chan->private)) { + pm_runtime_put_sync_suspend(dw->dma.dev); dev_warn(chan2dev(chan), "Wrong controller-specific data\n"); return -EINVAL; } @@ -1043,6 +1047,8 @@ static void dwc_free_chan_resources(struct dma_chan *chan) if (!dw->in_use) do_dw_dma_off(dw); + pm_runtime_put_sync_suspend(dw->dma.dev); + dev_vdbg(chan2dev(chan), "%s: done\n", __func__); } diff --git a/drivers/dma/hisi_dma.c b/drivers/dma/hisi_dma.c index e1a958ae7925..a259ee010e9b 100644 --- a/drivers/dma/hisi_dma.c +++ b/drivers/dma/hisi_dma.c @@ -431,9 +431,8 @@ static irqreturn_t hisi_dma_irq(int irq, void *data) struct hisi_dma_dev *hdma_dev = chan->hdma_dev; struct hisi_dma_desc *desc; struct hisi_dma_cqe *cqe; - unsigned long flags; - spin_lock_irqsave(&chan->vc.lock, flags); + spin_lock(&chan->vc.lock); desc = chan->desc; cqe = chan->cq + chan->cq_head; @@ -452,7 +451,7 @@ static irqreturn_t hisi_dma_irq(int irq, void *data) chan->desc = NULL; } - 
spin_unlock_irqrestore(&chan->vc.lock, flags); + spin_unlock(&chan->vc.lock); return IRQ_HANDLED; } diff --git a/drivers/dma/idma64.c b/drivers/dma/idma64.c index f5a84c846394..f4c07ad3be15 100644 --- a/drivers/dma/idma64.c +++ b/drivers/dma/idma64.c @@ -667,9 +667,7 @@ static int idma64_platform_remove(struct platform_device *pdev) return idma64_remove(chip); } -#ifdef CONFIG_PM_SLEEP - -static int idma64_pm_suspend(struct device *dev) +static int __maybe_unused idma64_pm_suspend(struct device *dev) { struct idma64_chip *chip = dev_get_drvdata(dev); @@ -677,7 +675,7 @@ static int idma64_pm_suspend(struct device *dev) return 0; } -static int idma64_pm_resume(struct device *dev) +static int __maybe_unused idma64_pm_resume(struct device *dev) { struct idma64_chip *chip = dev_get_drvdata(dev); @@ -685,8 +683,6 @@ static int idma64_pm_resume(struct device *dev) return 0; } -#endif /* CONFIG_PM_SLEEP */ - static const struct dev_pm_ops idma64_dev_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(idma64_pm_suspend, idma64_pm_resume) }; diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index c3976156db2f..0db9b82ed8cf 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "registers.h" #include "idxd.h" @@ -27,12 +28,15 @@ struct idxd_cdev_context { */ static struct idxd_cdev_context ictx[IDXD_TYPE_MAX] = { { .name = "dsa" }, + { .name = "iax" } }; struct idxd_user_context { struct idxd_wq *wq; struct task_struct *task; + unsigned int pasid; unsigned int flags; + struct iommu_sva *sva; }; enum idxd_cdev_cleanup { @@ -75,6 +79,8 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) struct idxd_wq *wq; struct device *dev; int rc = 0; + struct iommu_sva *sva; + unsigned int pasid; wq = inode_wq(inode); idxd = wq->idxd; @@ -95,6 +101,35 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) ctx->wq = wq; filp->private_data = ctx; + + if (device_pasid_enabled(idxd)) 
{ + sva = iommu_sva_bind_device(dev, current->mm, NULL); + if (IS_ERR(sva)) { + rc = PTR_ERR(sva); + dev_err(dev, "pasid allocation failed: %d\n", rc); + goto failed; + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(sva); + rc = -EINVAL; + goto failed; + } + + ctx->sva = sva; + ctx->pasid = pasid; + + if (wq_dedicated(wq)) { + rc = idxd_wq_set_pasid(wq, pasid); + if (rc < 0) { + iommu_sva_unbind_device(sva); + dev_err(dev, "wq set pasid failed: %d\n", rc); + goto failed; + } + } + } + idxd_wq_get(wq); mutex_unlock(&wq->wq_lock); return 0; @@ -111,13 +146,27 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; + int rc; dev_dbg(dev, "%s called\n", __func__); filep->private_data = NULL; /* Wait for in-flight operations to complete. */ - idxd_wq_drain(wq); + if (wq_shared(wq)) { + idxd_device_drain_pasid(idxd, ctx->pasid); + } else { + if (device_pasid_enabled(idxd)) { + /* The wq disable in the disable pasid function will drain the wq */ + rc = idxd_wq_disable_pasid(wq); + if (rc < 0) + dev_err(dev, "wq disable pasid failed.\n"); + } else { + idxd_wq_drain(wq); + } + } + if (ctx->sva) + iommu_sva_unbind_device(ctx->sva); kfree(ctx); mutex_lock(&wq->wq_lock); idxd_wq_put(wq); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 663344987e3f..95f94a3ed6be 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -131,6 +131,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_device *idxd = wq->idxd; struct device *dev = &idxd->pdev->dev; int rc, num_descs, i; + int align; + u64 tmp; if (wq->type != IDXD_WQT_KERNEL) return 0; @@ -142,14 +144,27 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) return rc; - wq->compls_size = num_descs * sizeof(struct dsa_completion_record); - wq->compls = dma_alloc_coherent(dev, wq->compls_size, - 
&wq->compls_addr, GFP_KERNEL); - if (!wq->compls) { + if (idxd->type == IDXD_TYPE_DSA) + align = 32; + else if (idxd->type == IDXD_TYPE_IAX) + align = 64; + else + return -ENODEV; + + wq->compls_size = num_descs * idxd->compl_size + align; + wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size, + &wq->compls_addr_raw, GFP_KERNEL); + if (!wq->compls_raw) { rc = -ENOMEM; goto fail_alloc_compls; } + /* Adjust alignment */ + wq->compls_addr = (wq->compls_addr_raw + (align - 1)) & ~(align - 1); + tmp = (u64)wq->compls_raw; + tmp = (tmp + (align - 1)) & ~(align - 1); + wq->compls = (struct dsa_completion_record *)tmp; + rc = alloc_descs(wq, num_descs); if (rc < 0) goto fail_alloc_descs; @@ -163,9 +178,11 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_desc *desc = wq->descs[i]; desc->hw = wq->hw_descs[i]; - desc->completion = &wq->compls[i]; - desc->compl_dma = wq->compls_addr + - sizeof(struct dsa_completion_record) * i; + if (idxd->type == IDXD_TYPE_DSA) + desc->completion = &wq->compls[i]; + else if (idxd->type == IDXD_TYPE_IAX) + desc->iax_completion = &wq->iax_compls[i]; + desc->compl_dma = wq->compls_addr + idxd->compl_size * i; desc->id = i; desc->wq = wq; desc->cpu = -1; @@ -178,7 +195,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) fail_sbitmap_init: free_descs(wq); fail_alloc_descs: - dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + dma_free_coherent(dev, wq->compls_size, wq->compls_raw, + wq->compls_addr_raw); fail_alloc_compls: free_hw_descs(wq); return rc; @@ -193,7 +211,8 @@ void idxd_wq_free_resources(struct idxd_wq *wq) free_hw_descs(wq); free_descs(wq); - dma_free_coherent(dev, wq->compls_size, wq->compls, wq->compls_addr); + dma_free_coherent(dev, wq->compls_size, wq->compls_raw, + wq->compls_addr_raw); sbitmap_queue_free(&wq->sbq); } @@ -273,10 +292,9 @@ int idxd_wq_map_portal(struct idxd_wq *wq) start = pci_resource_start(pdev, IDXD_WQ_BAR); start += idxd_get_wq_portal_full_offset(wq->id, 
IDXD_PORTAL_LIMITED); - wq->dportal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); - if (!wq->dportal) + wq->portal = devm_ioremap(dev, start, IDXD_PORTAL_SIZE); + if (!wq->portal) return -ENOMEM; - dev_dbg(dev, "wq %d portal mapped at %p\n", wq->id, wq->dportal); return 0; } @@ -285,7 +303,61 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) { struct device *dev = &wq->idxd->pdev->dev; - devm_iounmap(dev, wq->dportal); + devm_iounmap(dev, wq->portal); +} + +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + unsigned long flags; + + rc = idxd_wq_disable(wq); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock_irqsave(&idxd->dev_lock, flags); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 1; + wqcfg.pasid = pasid; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; +} + +int idxd_wq_disable_pasid(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + int rc; + union wqcfg wqcfg; + unsigned int offset; + unsigned long flags; + + rc = idxd_wq_disable(wq); + if (rc < 0) + return rc; + + offset = WQCFG_OFFSET(idxd, wq->id, WQCFG_PASID_IDX); + spin_lock_irqsave(&idxd->dev_lock, flags); + wqcfg.bits[WQCFG_PASID_IDX] = ioread32(idxd->reg_base + offset); + wqcfg.pasid_en = 0; + wqcfg.pasid = 0; + iowrite32(wqcfg.bits[WQCFG_PASID_IDX], idxd->reg_base + offset); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + rc = idxd_wq_enable(wq); + if (rc < 0) + return rc; + + return 0; } void idxd_wq_disable_cleanup(struct idxd_wq *wq) @@ -301,6 +373,7 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) wq->group = NULL; wq->threshold = 0; wq->priority = 0; + wq->ats_dis = 0; clear_bit(WQ_FLAG_DEDICATED, &wq->flags); memset(wq->name, 0, WQ_NAME_SIZE); @@ -468,6 
+541,17 @@ void idxd_device_reset(struct idxd_device *idxd) spin_unlock_irqrestore(&idxd->dev_lock, flags); } +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand; + + operand = pasid; + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_DRAIN_PASID, operand); + idxd_cmd_exec(idxd, IDXD_CMD_DRAIN_PASID, operand, NULL); + dev_dbg(dev, "pasid %d drained\n", pasid); +} + /* Device configuration bits */ static void idxd_group_config_write(struct idxd_group *group) { @@ -479,24 +563,22 @@ static void idxd_group_config_write(struct idxd_group *group) dev_dbg(dev, "Writing group %d cfg registers\n", group->id); /* setup GRPWQCFG */ - for (i = 0; i < 4; i++) { - grpcfg_offset = idxd->grpcfg_offset + - group->id * 64 + i * sizeof(u64); - iowrite64(group->grpcfg.wqs[i], - idxd->reg_base + grpcfg_offset); + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + iowrite64(group->grpcfg.wqs[i], idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", group->id, i, grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); } /* setup GRPENGCFG */ - grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 32; + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); iowrite64(group->grpcfg.engines, idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, grpcfg_offset, ioread64(idxd->reg_base + grpcfg_offset)); /* setup GRPFLAGS */ - grpcfg_offset = idxd->grpcfg_offset + group->id * 64 + 40; + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); iowrite32(group->grpcfg.flags.bits, idxd->reg_base + grpcfg_offset); dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", group->id, grpcfg_offset, @@ -554,9 +636,24 @@ static int idxd_wq_config_write(struct idxd_wq *wq) /* byte 8-11 */ wq->wqcfg->priv = !!(wq->type == IDXD_WQT_KERNEL); - wq->wqcfg->mode = 1; + if (wq_dedicated(wq)) + wq->wqcfg->mode = 1; + + if 
(device_pasid_enabled(idxd)) { + wq->wqcfg->pasid_en = 1; + if (wq->type == IDXD_WQT_KERNEL && wq_dedicated(wq)) + wq->wqcfg->pasid = idxd->pasid; + } + wq->wqcfg->priority = wq->priority; + if (idxd->hw.gen_cap.block_on_fault && + test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)) + wq->wqcfg->bof = 1; + + if (idxd->hw.wq_cap.wq_ats_support) + wq->wqcfg->wq_ats_disable = wq->ats_dis; + /* bytes 12-15 */ wq->wqcfg->max_xfer_shift = ilog2(wq->max_xfer_bytes); wq->wqcfg->max_batch_shift = ilog2(wq->max_batch_size); @@ -664,8 +761,8 @@ static int idxd_wqs_setup(struct idxd_device *idxd) if (!wq->size) continue; - if (!wq_dedicated(wq)) { - dev_warn(dev, "No shared workqueue support.\n"); + if (wq_shared(wq) && !device_swq_supported(idxd)) { + dev_warn(dev, "No shared wq support but configured.\n"); return -EINVAL; } diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index 0c892cbd72e0..8ed2773d8285 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -61,8 +61,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, u64 addr_f1, u64 addr_f2, u64 len, u64 compl, u32 flags) { - struct idxd_device *idxd = wq->idxd; - hw->flags = flags; hw->opcode = opcode; hw->src_addr = addr_f1; @@ -70,13 +68,6 @@ static inline void idxd_prep_desc_common(struct idxd_wq *wq, hw->xfer_size = len; hw->priv = !!(wq->type == IDXD_WQT_KERNEL); hw->completion_addr = compl; - - /* - * Descriptor completion vectors are 1-8 for MSIX. We will round - * robin through the 8 vectors. 
- */ - wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; - hw->int_handle = wq->vec_ptr; } static struct dma_async_tx_descriptor * diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d48f193daacc..5a50e91c71bf 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -20,7 +20,8 @@ extern struct kmem_cache *idxd_desc_pool; enum idxd_type { IDXD_TYPE_UNKNOWN = -1, IDXD_TYPE_DSA = 0, - IDXD_TYPE_MAX + IDXD_TYPE_IAX, + IDXD_TYPE_MAX, }; #define IDXD_NAME_SIZE 128 @@ -34,6 +35,11 @@ struct idxd_irq_entry { int id; struct llist_head pending_llist; struct list_head work_list; + /* + * Lock to protect access between irq thread process descriptor + * and irq thread processing error descriptor. + */ + spinlock_t list_lock; }; struct idxd_group { @@ -59,6 +65,7 @@ enum idxd_wq_state { enum idxd_wq_flag { WQ_FLAG_DEDICATED = 0, + WQ_FLAG_BLOCK_ON_FAULT, }; enum idxd_wq_type { @@ -86,10 +93,11 @@ enum idxd_op_type { enum idxd_complete_type { IDXD_COMPLETE_NORMAL = 0, IDXD_COMPLETE_ABORT, + IDXD_COMPLETE_DEV_FAIL, }; struct idxd_wq { - void __iomem *dportal; + void __iomem *portal; struct device conf_dev; struct idxd_cdev idxd_cdev; struct idxd_device *idxd; @@ -107,8 +115,13 @@ struct idxd_wq { u32 vec_ptr; /* interrupt steering */ struct dsa_hw_desc **hw_descs; int num_descs; - struct dsa_completion_record *compls; + union { + struct dsa_completion_record *compls; + struct iax_completion_record *iax_compls; + }; + void *compls_raw; dma_addr_t compls_addr; + dma_addr_t compls_addr_raw; int compls_size; struct idxd_desc **descs; struct sbitmap_queue sbq; @@ -116,6 +129,7 @@ struct idxd_wq { char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; u32 max_batch_size; + bool ats_dis; }; struct idxd_engine { @@ -145,6 +159,7 @@ enum idxd_device_state { enum idxd_device_flag { IDXD_FLAG_CONFIGURABLE = 0, IDXD_FLAG_CMD_RUNNING, + IDXD_FLAG_PASID_ENABLED, }; struct idxd_device { @@ -167,6 +182,9 @@ struct idxd_device { struct idxd_wq *wqs; struct idxd_engine 
*engines; + struct iommu_sva *sva; + unsigned int pasid; + int num_groups; u32 msix_perm_offset; @@ -184,6 +202,7 @@ struct idxd_device { int token_limit; int nr_tokens; /* non-reserved tokens */ unsigned int wqcfg_size; + int compl_size; union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; @@ -198,9 +217,15 @@ struct idxd_device { /* IDXD software descriptor */ struct idxd_desc { - struct dsa_hw_desc *hw; + union { + struct dsa_hw_desc *hw; + struct iax_hw_desc *iax_hw; + }; dma_addr_t desc_dma; - struct dsa_completion_record *completion; + union { + struct dsa_completion_record *completion; + struct iax_completion_record *iax_completion; + }; dma_addr_t compl_dma; struct dma_async_tx_descriptor txd; struct llist_node llnode; @@ -214,12 +239,30 @@ struct idxd_desc { #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) extern struct bus_type dsa_bus_type; +extern struct bus_type iax_bus_type; + +extern bool support_enqcmd; static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); } +static inline bool wq_shared(struct idxd_wq *wq) +{ + return !test_bit(WQ_FLAG_DEDICATED, &wq->flags); +} + +static inline bool device_pasid_enabled(struct idxd_device *idxd) +{ + return test_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); +} + +static inline bool device_swq_supported(struct idxd_device *idxd) +{ + return (support_enqcmd && device_pasid_enabled(idxd)); +} + enum idxd_portal_prot { IDXD_PORTAL_UNLIMITED = 0, IDXD_PORTAL_LIMITED, @@ -242,6 +285,8 @@ static inline void idxd_set_type(struct idxd_device *idxd) if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) idxd->type = IDXD_TYPE_DSA; + else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) + idxd->type = IDXD_TYPE_IAX; else idxd->type = IDXD_TYPE_UNKNOWN; } @@ -288,6 +333,7 @@ void idxd_device_reset(struct idxd_device *idxd); void idxd_device_cleanup(struct idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); void 
idxd_device_wqs_clear_state(struct idxd_device *idxd); +void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); @@ -298,6 +344,8 @@ void idxd_wq_drain(struct idxd_wq *wq); int idxd_wq_map_portal(struct idxd_wq *wq); void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); +int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); +int idxd_wq_disable_pasid(struct idxd_wq *wq); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 0a4432b063b5..2c051e07c34c 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include "../dmaengine.h" @@ -26,18 +28,24 @@ MODULE_AUTHOR("Intel Corporation"); #define DRV_NAME "idxd" +bool support_enqcmd; + static struct idr idxd_idrs[IDXD_TYPE_MAX]; static struct mutex idxd_idr_lock; static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) }, + + /* IAX ver 1.0 platforms */ + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) }, { 0, } }; MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); static char *idxd_name[] = { "dsa", + "iax" }; const char *idxd_get_dev_name(struct idxd_device *idxd) @@ -53,6 +61,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) struct idxd_irq_entry *irq_entry; int i, msixcnt; int rc = 0; + union msix_perm mperm; msixcnt = pci_msix_vec_count(pdev); if (msixcnt < 0) { @@ -92,6 +101,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) for (i = 0; i < msixcnt; i++) { idxd->irq_entries[i].id = i; idxd->irq_entries[i].idxd = idxd; + spin_lock_init(&idxd->irq_entries[i].list_lock); } msix = &idxd->msix_entries[0]; @@ -131,6 +141,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) 
idxd_unmask_error_interrupts(idxd); + /* Setup MSIX permission table */ + mperm.bits = 0; + mperm.pasid = idxd->pasid; + mperm.pasid_en = device_pasid_enabled(idxd); + for (i = 1; i < msixcnt; i++) + iowrite32(mperm.bits, idxd->reg_base + idxd->msix_perm_offset + i * 8); + return 0; err_no_irq: @@ -201,17 +218,14 @@ static void idxd_read_table_offsets(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; offsets.bits[0] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET); - offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET - + sizeof(u64)); - idxd->grpcfg_offset = offsets.grpcfg * 0x100; + offsets.bits[1] = ioread64(idxd->reg_base + IDXD_TABLE_OFFSET + sizeof(u64)); + idxd->grpcfg_offset = offsets.grpcfg * IDXD_TABLE_MULT; dev_dbg(dev, "IDXD Group Config Offset: %#x\n", idxd->grpcfg_offset); - idxd->wqcfg_offset = offsets.wqcfg * 0x100; - dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", - idxd->wqcfg_offset); - idxd->msix_perm_offset = offsets.msix_perm * 0x100; - dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", - idxd->msix_perm_offset); - idxd->perfmon_offset = offsets.perfmon * 0x100; + idxd->wqcfg_offset = offsets.wqcfg * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD Work Queue Config Offset: %#x\n", idxd->wqcfg_offset); + idxd->msix_perm_offset = offsets.msix_perm * IDXD_TABLE_MULT; + dev_dbg(dev, "IDXD MSIX Permission Offset: %#x\n", idxd->msix_perm_offset); + idxd->perfmon_offset = offsets.perfmon * IDXD_TABLE_MULT; dev_dbg(dev, "IDXD Perfmon Offset: %#x\n", idxd->perfmon_offset); } @@ -265,8 +279,7 @@ static void idxd_read_caps(struct idxd_device *idxd) } } -static struct idxd_device *idxd_alloc(struct pci_dev *pdev, - void __iomem * const *iomap) +static struct idxd_device *idxd_alloc(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct idxd_device *idxd; @@ -276,12 +289,45 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, return NULL; idxd->pdev = pdev; - idxd->reg_base = iomap[IDXD_MMIO_BAR]; 
spin_lock_init(&idxd->dev_lock); return idxd; } +static int idxd_enable_system_pasid(struct idxd_device *idxd) +{ + int flags; + unsigned int pasid; + struct iommu_sva *sva; + + flags = SVM_FLAG_SUPERVISOR_MODE; + + sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); + if (IS_ERR(sva)) { + dev_warn(&idxd->pdev->dev, + "iommu sva bind failed: %ld\n", PTR_ERR(sva)); + return PTR_ERR(sva); + } + + pasid = iommu_sva_get_pasid(sva); + if (pasid == IOMMU_PASID_INVALID) { + iommu_sva_unbind_device(sva); + return -ENODEV; + } + + idxd->sva = sva; + idxd->pasid = pasid; + dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); + return 0; +} + +static void idxd_disable_system_pasid(struct idxd_device *idxd) +{ + + iommu_sva_unbind_device(idxd->sva); + idxd->sva = NULL; +} + static int idxd_probe(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; @@ -292,6 +338,14 @@ static int idxd_probe(struct idxd_device *idxd) idxd_device_init_reset(idxd); dev_dbg(dev, "IDXD reset complete\n"); + if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM)) { + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) + dev_warn(dev, "Failed to enable PASID. 
No SVA support: %d\n", rc); + else + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } + idxd_read_caps(idxd); idxd_read_table_offsets(idxd); @@ -322,29 +376,37 @@ static int idxd_probe(struct idxd_device *idxd) idxd_mask_error_interrupts(idxd); idxd_mask_msix_vectors(idxd); err_setup: + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); return rc; } +static void idxd_type_init(struct idxd_device *idxd) +{ + if (idxd->type == IDXD_TYPE_DSA) + idxd->compl_size = sizeof(struct dsa_completion_record); + else if (idxd->type == IDXD_TYPE_IAX) + idxd->compl_size = sizeof(struct iax_completion_record); +} + static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - void __iomem * const *iomap; struct device *dev = &pdev->dev; struct idxd_device *idxd; int rc; - unsigned int mask; rc = pcim_enable_device(pdev); if (rc) return rc; - dev_dbg(dev, "Mapping BARs\n"); - mask = (1 << IDXD_MMIO_BAR); - rc = pcim_iomap_regions(pdev, mask, DRV_NAME); - if (rc) - return rc; + dev_dbg(dev, "Alloc IDXD context\n"); + idxd = idxd_alloc(pdev); + if (!idxd) + return -ENOMEM; - iomap = pcim_iomap_table(pdev); - if (!iomap) + dev_dbg(dev, "Mapping BARs\n"); + idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0); + if (!idxd->reg_base) return -ENOMEM; dev_dbg(dev, "Set DMA masks\n"); @@ -360,13 +422,10 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - dev_dbg(dev, "Alloc IDXD context\n"); - idxd = idxd_alloc(pdev, iomap); - if (!idxd) - return -ENOMEM; - idxd_set_type(idxd); + idxd_type_init(idxd); + dev_dbg(dev, "Set PCI master\n"); pci_set_master(pdev); pci_set_drvdata(pdev, idxd); @@ -452,6 +511,8 @@ static void idxd_remove(struct pci_dev *pdev) dev_dbg(&pdev->dev, "%s called\n", __func__); idxd_cleanup_sysfs(idxd); idxd_shutdown(pdev); + if (device_pasid_enabled(idxd)) + idxd_disable_system_pasid(idxd); mutex_lock(&idxd_idr_lock); idr_remove(&idxd_idrs[idxd->type], idxd->id); 
mutex_unlock(&idxd_idr_lock); @@ -470,7 +531,7 @@ static int __init idxd_init_module(void) int err, i; /* - * If the CPU does not support write512, there's no point in + * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in * enumerating the device. We can not utilize it. */ if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) { @@ -478,8 +539,10 @@ static int __init idxd_init_module(void) return -ENODEV; } - pr_info("%s: Intel(R) Accelerator Devices Driver %s\n", - DRV_NAME, IDXD_DRIVER_VERSION); + if (!boot_cpu_has(X86_FEATURE_ENQCMD)) + pr_warn("Platform does not have ENQCMD(S) support.\n"); + else + support_enqcmd = true; mutex_init(&idxd_idr_lock); for (i = 0; i < IDXD_TYPE_MAX; i++) diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 17a65a13fb64..593a2f6ed16c 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -11,6 +11,24 @@ #include "idxd.h" #include "registers.h" +enum irq_work_type { + IRQ_WORK_NORMAL = 0, + IRQ_WORK_PROCESS_FAULT, +}; + +struct idxd_fault { + struct work_struct work; + u64 addr; + struct idxd_device *idxd; +}; + +static int irq_process_work_list(struct idxd_irq_entry *irq_entry, + enum irq_work_type wtype, + int *processed, u64 data); +static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, + enum irq_work_type wtype, + int *processed, u64 data); + static void idxd_device_reinit(struct work_struct *work) { struct idxd_device *idxd = container_of(work, struct idxd_device, work); @@ -44,6 +62,46 @@ static void idxd_device_reinit(struct work_struct *work) idxd_device_wqs_clear_state(idxd); } +static void idxd_device_fault_work(struct work_struct *work) +{ + struct idxd_fault *fault = container_of(work, struct idxd_fault, work); + struct idxd_irq_entry *ie; + int i; + int processed; + int irqcnt = fault->idxd->num_wq_irqs + 1; + + for (i = 1; i < irqcnt; i++) { + ie = &fault->idxd->irq_entries[i]; + irq_process_work_list(ie, IRQ_WORK_PROCESS_FAULT, + &processed, fault->addr); + if 
(processed) + break; + + irq_process_pending_llist(ie, IRQ_WORK_PROCESS_FAULT, + &processed, fault->addr); + if (processed) + break; + } + + kfree(fault); +} + +static int idxd_device_schedule_fault_process(struct idxd_device *idxd, + u64 fault_addr) +{ + struct idxd_fault *fault; + + fault = kmalloc(sizeof(*fault), GFP_ATOMIC); + if (!fault) + return -ENOMEM; + + fault->addr = fault_addr; + fault->idxd = idxd; + INIT_WORK(&fault->work, idxd_device_fault_work); + queue_work(idxd->wq, &fault->work); + return 0; +} + irqreturn_t idxd_irq_handler(int vec, void *data) { struct idxd_irq_entry *irq_entry = data; @@ -125,6 +183,15 @@ irqreturn_t idxd_misc_thread(int vec, void *data) if (!err) goto out; + /* + * This case should rarely happen and typically is due to software + * programming error by the driver. + */ + if (idxd->sw_err.valid && + idxd->sw_err.desc_valid && + idxd->sw_err.fault_addr) + idxd_device_schedule_fault_process(idxd, idxd->sw_err.fault_addr); + gensts.bits = ioread32(idxd->reg_base + IDXD_GENSTATS_OFFSET); if (gensts.state == IDXD_DEVICE_STATE_HALT) { idxd->state = IDXD_DEV_HALTED; @@ -152,57 +219,110 @@ irqreturn_t idxd_misc_thread(int vec, void *data) return IRQ_HANDLED; } +static bool process_fault(struct idxd_desc *desc, u64 fault_addr) +{ + /* + * Completion address can be bad as well. Check fault address match for descriptor + * and completion address. 
+ */ + if ((u64)desc->hw == fault_addr || + (u64)desc->completion == fault_addr) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_DEV_FAIL); + return true; + } + + return false; +} + +static bool complete_desc(struct idxd_desc *desc) +{ + if (desc->completion->status) { + idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + return true; + } + + return false; +} + static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, - int *processed) + enum irq_work_type wtype, + int *processed, u64 data) { struct idxd_desc *desc, *t; struct llist_node *head; int queued = 0; + bool completed = false; + unsigned long flags; *processed = 0; head = llist_del_all(&irq_entry->pending_llist); if (!head) - return 0; + goto out; llist_for_each_entry_safe(desc, t, head, llnode) { - if (desc->completion->status) { - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + if (wtype == IRQ_WORK_NORMAL) + completed = complete_desc(desc); + else if (wtype == IRQ_WORK_PROCESS_FAULT) + completed = process_fault(desc, data); + + if (completed) { idxd_free_desc(desc->wq, desc); (*processed)++; + if (wtype == IRQ_WORK_PROCESS_FAULT) + break; } else { - list_add_tail(&desc->list, &irq_entry->work_list); + spin_lock_irqsave(&irq_entry->list_lock, flags); + list_add_tail(&desc->list, + &irq_entry->work_list); + spin_unlock_irqrestore(&irq_entry->list_lock, flags); queued++; } } + out: return queued; } static int irq_process_work_list(struct idxd_irq_entry *irq_entry, - int *processed) + enum irq_work_type wtype, + int *processed, u64 data) { struct list_head *node, *next; int queued = 0; + bool completed = false; + unsigned long flags; *processed = 0; + spin_lock_irqsave(&irq_entry->list_lock, flags); if (list_empty(&irq_entry->work_list)) - return 0; + goto out; list_for_each_safe(node, next, &irq_entry->work_list) { struct idxd_desc *desc = container_of(node, struct idxd_desc, list); - if (desc->completion->status) { + spin_unlock_irqrestore(&irq_entry->list_lock, flags); + if (wtype == 
IRQ_WORK_NORMAL) + completed = complete_desc(desc); + else if (wtype == IRQ_WORK_PROCESS_FAULT) + completed = process_fault(desc, data); + + if (completed) { + spin_lock_irqsave(&irq_entry->list_lock, flags); list_del(&desc->list); - /* process and callback */ - idxd_dma_complete_txd(desc, IDXD_COMPLETE_NORMAL); + spin_unlock_irqrestore(&irq_entry->list_lock, flags); idxd_free_desc(desc->wq, desc); (*processed)++; + if (wtype == IRQ_WORK_PROCESS_FAULT) + return queued; } else { queued++; } + spin_lock_irqsave(&irq_entry->list_lock, flags); } + out: + spin_unlock_irqrestore(&irq_entry->list_lock, flags); return queued; } @@ -230,12 +350,14 @@ static int idxd_desc_process(struct idxd_irq_entry *irq_entry) * 5. Repeat until no more descriptors. */ do { - rc = irq_process_work_list(irq_entry, &processed); + rc = irq_process_work_list(irq_entry, IRQ_WORK_NORMAL, + &processed, 0); total += processed; if (rc != 0) continue; - rc = irq_process_pending_llist(irq_entry, &processed); + rc = irq_process_pending_llist(irq_entry, IRQ_WORK_NORMAL, + &processed, 0); total += processed; } while (rc != 0); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 54390334c243..751ecb4f9f81 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -5,6 +5,7 @@ /* PCI Config */ #define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25 +#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe #define IDXD_MMIO_BAR 0 #define IDXD_WQ_BAR 2 @@ -47,7 +48,7 @@ union wq_cap_reg { u64 rsvd:20; u64 shared_mode:1; u64 dedicated_mode:1; - u64 rsvd2:1; + u64 wq_ats_support:1; u64 priority:1; u64 occupancy:1; u64 occupancy_int:1; @@ -102,6 +103,8 @@ union offsets_reg { u64 bits[2]; } __packed; +#define IDXD_TABLE_MULT 0x100 + #define IDXD_GENCFG_OFFSET 0x80 union gencfg_reg { struct { @@ -301,7 +304,8 @@ union wqcfg { /* bytes 8-11 */ u32 mode:1; /* shared or dedicated */ u32 bof:1; /* block on fault */ - u32 rsvd2:2; + u32 wq_ats_disable:1; + u32 rsvd2:1; u32 priority:4; u32 
pasid:20; u32 pasid_en:1; @@ -336,6 +340,8 @@ union wqcfg { u32 bits[8]; } __packed; +#define WQCFG_PASID_IDX 2 + /* * This macro calculates the offset into the WQCFG register * idxd - struct idxd * @@ -354,4 +360,22 @@ union wqcfg { #define WQCFG_STRIDES(_idxd_dev) ((_idxd_dev)->wqcfg_size / sizeof(u32)) +#define GRPCFG_SIZE 64 +#define GRPWQCFG_STRIDES 4 + +/* + * This macro calculates the offset into the GRPCFG register + * idxd - struct idxd * + * n - wq id + * ofs - the index of the 32b dword for the config register + * + * The WQCFG register block is divided into groups per each wq. The n index + * allows us to move to the register group that's for that particular wq. + * Each register is 32bits. The ofs gives us the number of register to access. + */ +#define GRPWQCFG_OFFSET(idxd_dev, n, ofs) ((idxd_dev)->grpcfg_offset +\ + (n) * GRPCFG_SIZE + sizeof(u64) * (ofs)) +#define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32) +#define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40) + #endif diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 417048e3c42a..a7a61bcc17d5 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -11,11 +11,22 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) { struct idxd_desc *desc; + struct idxd_device *idxd = wq->idxd; desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); - memset(desc->completion, 0, sizeof(struct dsa_completion_record)); + memset(desc->completion, 0, idxd->compl_size); desc->cpu = cpu; + + if (device_pasid_enabled(idxd)) + desc->hw->pasid = idxd->pasid; + + /* + * Descriptor completion vectors are 1-8 for MSIX. We will round + * robin through the 8 vectors. 
+ */ + wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; + desc->hw->int_handle = wq->vec_ptr; return desc; } @@ -70,18 +81,32 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) struct idxd_device *idxd = wq->idxd; int vec = desc->hw->int_handle; void __iomem *portal; + int rc; if (idxd->state != IDXD_DEV_ENABLED) return -EIO; - portal = wq->dportal; + portal = wq->portal; + /* - * The wmb() flushes writes to coherent DMA data before possibly - * triggering a DMA read. The wmb() is necessary even on UP because - * the recipient is a device. + * The wmb() flushes writes to coherent DMA data before + * possibly triggering a DMA read. The wmb() is necessary + * even on UP because the recipient is a device. */ wmb(); - iosubmit_cmds512(portal, desc->hw, 1); + if (wq_dedicated(wq)) { + iosubmit_cmds512(portal, desc->hw, 1); + } else { + /* + * It's not likely that we would receive queue full rejection + * since the descriptor allocation gates at wq size. If we + * receive a -EAGAIN, that means something went wrong such as the + * device is not accepting descriptor at all. + */ + rc = enqcmds(portal, desc->hw); + if (rc < 0) + return rc; + } /* * Pending the descriptor to the lockless list for the irq_entry diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 07a5db06a29a..266423a2cabc 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -41,14 +41,24 @@ static struct device_type dsa_device_type = { .release = idxd_conf_device_release, }; +static struct device_type iax_device_type = { + .name = "iax", + .release = idxd_conf_device_release, +}; + static inline bool is_dsa_dev(struct device *dev) { return dev ? dev->type == &dsa_device_type : false; } +static inline bool is_iax_dev(struct device *dev) +{ + return dev ? 
dev->type == &iax_device_type : false; +} + static inline bool is_idxd_dev(struct device *dev) { - return is_dsa_dev(dev); + return is_dsa_dev(dev) || is_iax_dev(dev); } static inline bool is_idxd_wq_dev(struct device *dev) @@ -175,6 +185,30 @@ static int idxd_config_bus_probe(struct device *dev) return -EINVAL; } + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + dev_warn(dev, + "PASID not enabled and shared WQ.\n"); + mutex_unlock(&wq->wq_lock); + return -ENXIO; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. + */ + if (wq->threshold == 0) { + dev_warn(dev, + "Shared WQ and threshold 0.\n"); + mutex_unlock(&wq->wq_lock); + return -EINVAL; + } + } + rc = idxd_wq_alloc_resources(wq); if (rc < 0) { mutex_unlock(&wq->wq_lock); @@ -335,8 +369,17 @@ struct bus_type dsa_bus_type = { .shutdown = idxd_config_bus_shutdown, }; +struct bus_type iax_bus_type = { + .name = "iax", + .match = idxd_config_bus_match, + .probe = idxd_config_bus_probe, + .remove = idxd_config_bus_remove, + .shutdown = idxd_config_bus_shutdown, +}; + static struct bus_type *idxd_bus_types[] = { - &dsa_bus_type + &dsa_bus_type, + &iax_bus_type }; static struct idxd_device_driver dsa_drv = { @@ -348,8 +391,18 @@ static struct idxd_device_driver dsa_drv = { }, }; +static struct idxd_device_driver iax_drv = { + .drv = { + .name = "iax", + .bus = &iax_bus_type, + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, +}; + static struct idxd_device_driver *idxd_drvs[] = { - &dsa_drv + &dsa_drv, + &iax_drv }; struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) @@ -361,6 +414,8 @@ static struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) return 
&dsa_device_type; + else if (idxd->type == IDXD_TYPE_IAX) + return &iax_device_type; else return NULL; } @@ -501,6 +556,9 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; @@ -546,6 +604,9 @@ static ssize_t group_tokens_allowed_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; @@ -588,6 +649,9 @@ static ssize_t group_use_token_limit_store(struct device *dev, if (rc < 0) return -EINVAL; + if (idxd->type == IDXD_TYPE_IAX) + return -EOPNOTSUPP; + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) return -EPERM; @@ -875,6 +939,8 @@ static ssize_t wq_mode_store(struct device *dev, if (sysfs_streq(buf, "dedicated")) { set_bit(WQ_FLAG_DEDICATED, &wq->flags); wq->threshold = 0; + } else if (sysfs_streq(buf, "shared") && device_swq_supported(idxd)) { + clear_bit(WQ_FLAG_DEDICATED, &wq->flags); } else { return -EINVAL; } @@ -973,6 +1039,87 @@ static ssize_t wq_priority_store(struct device *dev, static struct device_attribute dev_attr_wq_priority = __ATTR(priority, 0644, wq_priority_show, wq_priority_store); +static ssize_t wq_block_on_fault_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", + test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); +} + +static ssize_t wq_block_on_fault_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + bool bof; + int rc; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + rc = kstrtobool(buf, &bof); + 
if (rc < 0) + return rc; + + if (bof) + set_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + else + clear_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags); + + return count; +} + +static struct device_attribute dev_attr_wq_block_on_fault = + __ATTR(block_on_fault, 0644, wq_block_on_fault_show, + wq_block_on_fault_store); + +static ssize_t wq_threshold_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->threshold); +} + +static ssize_t wq_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc < 0) + return -EINVAL; + + if (val > wq->size || val <= 0) + return -EINVAL; + + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + return -EPERM; + + if (wq->state != IDXD_WQ_DISABLED) + return -ENXIO; + + if (test_bit(WQ_FLAG_DEDICATED, &wq->flags)) + return -EINVAL; + + wq->threshold = val; + + return count; +} + +static struct device_attribute dev_attr_wq_threshold = + __ATTR(threshold, 0644, wq_threshold_show, wq_threshold_store); + static ssize_t wq_type_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1044,6 +1191,13 @@ static ssize_t wq_name_store(struct device *dev, if (strlen(buf) > WQ_NAME_SIZE || strlen(buf) == 0) return -EINVAL; + /* + * This is temporarily placed here until we have SVM support for + * dmaengine. 
+ */ + if (wq->type == IDXD_WQT_KERNEL && device_pasid_enabled(wq->idxd)) + return -EOPNOTSUPP; + memset(wq->name, 0, WQ_NAME_SIZE + 1); strncpy(wq->name, buf, WQ_NAME_SIZE); strreplace(wq->name, '\n', '\0'); @@ -1147,6 +1301,39 @@ static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribu static struct device_attribute dev_attr_wq_max_batch_size = __ATTR(max_batch_size, 0644, wq_max_batch_size_show, wq_max_batch_size_store); +static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + return sprintf(buf, "%u\n", wq->ats_dis); +} + +static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + struct idxd_device *idxd = wq->idxd; + bool ats_dis; + int rc; + + if (wq->state != IDXD_WQ_DISABLED) + return -EPERM; + + if (!idxd->hw.wq_cap.wq_ats_support) + return -EOPNOTSUPP; + + rc = kstrtobool(buf, &ats_dis); + if (rc < 0) + return rc; + + wq->ats_dis = ats_dis; + + return count; +} + +static struct device_attribute dev_attr_wq_ats_disable = + __ATTR(ats_disable, 0644, wq_ats_disable_show, wq_ats_disable_store); + static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_clients.attr, &dev_attr_wq_state.attr, @@ -1154,11 +1341,14 @@ static struct attribute *idxd_wq_attributes[] = { &dev_attr_wq_mode.attr, &dev_attr_wq_size.attr, &dev_attr_wq_priority.attr, + &dev_attr_wq_block_on_fault.attr, + &dev_attr_wq_threshold.attr, &dev_attr_wq_type.attr, &dev_attr_wq_name.attr, &dev_attr_wq_cdev_minor.attr, &dev_attr_wq_max_transfer_size.attr, &dev_attr_wq_max_batch_size.attr, + &dev_attr_wq_ats_disable.attr, NULL, }; @@ -1305,6 +1495,16 @@ static ssize_t clients_show(struct device *dev, } static DEVICE_ATTR_RO(clients); +static ssize_t pasid_enabled_show(struct device *dev, + struct device_attribute *attr, 
char *buf) +{ + struct idxd_device *idxd = + container_of(dev, struct idxd_device, conf_dev); + + return sprintf(buf, "%u\n", device_pasid_enabled(idxd)); +} +static DEVICE_ATTR_RO(pasid_enabled); + static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1424,6 +1624,7 @@ static struct attribute *idxd_device_attributes[] = { &dev_attr_gen_cap.attr, &dev_attr_configurable.attr, &dev_attr_clients.attr, + &dev_attr_pasid_enabled.attr, &dev_attr_state.attr, &dev_attr_errors.attr, &dev_attr_max_tokens.attr, diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index 670db04b0757..7f116bbcfad2 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -191,32 +191,13 @@ struct imxdma_filter_data { int request; }; -static const struct platform_device_id imx_dma_devtype[] = { - { - .name = "imx1-dma", - .driver_data = IMX1_DMA, - }, { - .name = "imx21-dma", - .driver_data = IMX21_DMA, - }, { - .name = "imx27-dma", - .driver_data = IMX27_DMA, - }, { - /* sentinel */ - } -}; -MODULE_DEVICE_TABLE(platform, imx_dma_devtype); - static const struct of_device_id imx_dma_of_dev_id[] = { { - .compatible = "fsl,imx1-dma", - .data = &imx_dma_devtype[IMX1_DMA], + .compatible = "fsl,imx1-dma", .data = (const void *)IMX1_DMA, }, { - .compatible = "fsl,imx21-dma", - .data = &imx_dma_devtype[IMX21_DMA], + .compatible = "fsl,imx21-dma", .data = (const void *)IMX21_DMA, }, { - .compatible = "fsl,imx27-dma", - .data = &imx_dma_devtype[IMX27_DMA], + .compatible = "fsl,imx27-dma", .data = (const void *)IMX27_DMA, }, { /* sentinel */ } @@ -1056,20 +1037,15 @@ static int __init imxdma_probe(struct platform_device *pdev) { struct imxdma_engine *imxdma; struct resource *res; - const struct of_device_id *of_id; int ret, i; int irq, irq_err; - of_id = of_match_device(imx_dma_of_dev_id, &pdev->dev); - if (of_id) - pdev->id_entry = of_id->data; - imxdma = devm_kzalloc(&pdev->dev, sizeof(*imxdma), GFP_KERNEL); if (!imxdma) return -ENOMEM; imxdma->dev = 
&pdev->dev; - imxdma->devtype = pdev->id_entry->driver_data; + imxdma->devtype = (enum imx_dma_type)of_device_get_match_data(&pdev->dev); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); imxdma->base = devm_ioremap_resource(&pdev->dev, res); @@ -1263,7 +1239,6 @@ static struct platform_driver imxdma_driver = { .name = "imx-dma", .of_match_table = imx_dma_of_dev_id, }, - .id_table = imx_dma_devtype, .remove = imxdma_remove, }; diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 16b908c77db3..41ba21eea7c8 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -566,37 +566,6 @@ static struct sdma_driver_data sdma_imx8mq = { .check_ratio = 1, }; -static const struct platform_device_id sdma_devtypes[] = { - { - .name = "imx25-sdma", - .driver_data = (unsigned long)&sdma_imx25, - }, { - .name = "imx31-sdma", - .driver_data = (unsigned long)&sdma_imx31, - }, { - .name = "imx35-sdma", - .driver_data = (unsigned long)&sdma_imx35, - }, { - .name = "imx51-sdma", - .driver_data = (unsigned long)&sdma_imx51, - }, { - .name = "imx53-sdma", - .driver_data = (unsigned long)&sdma_imx53, - }, { - .name = "imx6q-sdma", - .driver_data = (unsigned long)&sdma_imx6q, - }, { - .name = "imx7d-sdma", - .driver_data = (unsigned long)&sdma_imx7d, - }, { - .name = "imx8mq-sdma", - .driver_data = (unsigned long)&sdma_imx8mq, - }, { - /* sentinel */ - } -}; -MODULE_DEVICE_TABLE(platform, sdma_devtypes); - static const struct of_device_id sdma_dt_ids[] = { { .compatible = "fsl,imx6q-sdma", .data = &sdma_imx6q, }, { .compatible = "fsl,imx53-sdma", .data = &sdma_imx53, }, @@ -1998,11 +1967,7 @@ static int sdma_probe(struct platform_device *pdev) s32 *saddr_arr; const struct sdma_driver_data *drvdata = NULL; - if (of_id) - drvdata = of_id->data; - else if (pdev->id_entry) - drvdata = (void *)pdev->id_entry->driver_data; - + drvdata = of_id->data; if (!drvdata) { dev_err(&pdev->dev, "unable to find driver data\n"); return -EINVAL; @@ -2211,7 +2176,6 @@ static struct 
platform_driver sdma_driver = { .name = "imx-sdma", .of_match_table = sdma_dt_ids, }, - .id_table = sdma_devtypes, .remove = sdma_remove, .probe = sdma_probe, }; diff --git a/drivers/dma/ipu/ipu_idmac.c b/drivers/dma/ipu/ipu_idmac.c index 38036db284cb..104ad420abbe 100644 --- a/drivers/dma/ipu/ipu_idmac.c +++ b/drivers/dma/ipu/ipu_idmac.c @@ -1160,14 +1160,13 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) struct idmac_tx_desc *desc, *descnew; bool done = false; u32 ready0, ready1, curbuf, err; - unsigned long flags; struct dmaengine_desc_callback cb; /* IDMAC has cleared the respective BUFx_RDY bit, we manage the buffer */ dev_dbg(dev, "IDMAC irq %d, buf %d\n", irq, ichan->active_buffer); - spin_lock_irqsave(&ipu_data.lock, flags); + spin_lock(&ipu_data.lock); ready0 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF0_RDY); ready1 = idmac_read_ipureg(&ipu_data, IPU_CHA_BUF1_RDY); @@ -1176,7 +1175,7 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) if (err & (1 << chan_id)) { idmac_write_ipureg(&ipu_data, 1 << chan_id, IPU_INT_STAT_4); - spin_unlock_irqrestore(&ipu_data.lock, flags); + spin_unlock(&ipu_data.lock); /* * Doing this * ichan->sg[0] = ichan->sg[1] = NULL; @@ -1188,7 +1187,7 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) chan_id, ready0, ready1, curbuf); return IRQ_HANDLED; } - spin_unlock_irqrestore(&ipu_data.lock, flags); + spin_unlock(&ipu_data.lock); /* Other interrupts do not interfere with this channel */ spin_lock(&ichan->lock); @@ -1251,9 +1250,9 @@ static irqreturn_t idmac_interrupt(int irq, void *dev_id) if (unlikely(sgnew)) { ipu_submit_buffer(ichan, descnew, sgnew, !ichan->active_buffer); } else { - spin_lock_irqsave(&ipu_data.lock, flags); + spin_lock(&ipu_data.lock); ipu_ic_disable_task(&ipu_data, chan_id); - spin_unlock_irqrestore(&ipu_data.lock, flags); + spin_unlock(&ipu_data.lock); ichan->status = IPU_CHANNEL_READY; /* Continue to check for complete descriptor */ } diff --git a/drivers/dma/k3dma.c 
b/drivers/dma/k3dma.c index f609a84c493c..d0b2e601e3e5 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -223,24 +223,23 @@ static irqreturn_t k3_dma_int_handler(int irq, void *dev_id) i = __ffs(stat); stat &= ~BIT(i); if (likely(tc1 & BIT(i)) || (tc2 & BIT(i))) { - unsigned long flags; p = &d->phy[i]; c = p->vchan; if (c && (tc1 & BIT(i))) { - spin_lock_irqsave(&c->vc.lock, flags); + spin_lock(&c->vc.lock); if (p->ds_run != NULL) { vchan_cookie_complete(&p->ds_run->vd); p->ds_done = p->ds_run; p->ds_run = NULL; } - spin_unlock_irqrestore(&c->vc.lock, flags); + spin_unlock(&c->vc.lock); } if (c && (tc2 & BIT(i))) { - spin_lock_irqsave(&c->vc.lock, flags); + spin_lock(&c->vc.lock); if (p->ds_run != NULL) vchan_cyclic_callback(&p->ds_run->vd); - spin_unlock_irqrestore(&c->vc.lock, flags); + spin_unlock(&c->vc.lock); } irq_chan |= BIT(i); } diff --git a/drivers/dma/milbeaut-xdmac.c b/drivers/dma/milbeaut-xdmac.c index 85a597228fb0..584c931e807a 100644 --- a/drivers/dma/milbeaut-xdmac.c +++ b/drivers/dma/milbeaut-xdmac.c @@ -160,10 +160,9 @@ static irqreturn_t milbeaut_xdmac_interrupt(int irq, void *dev_id) { struct milbeaut_xdmac_chan *mc = dev_id; struct milbeaut_xdmac_desc *md; - unsigned long flags; u32 val; - spin_lock_irqsave(&mc->vc.lock, flags); + spin_lock(&mc->vc.lock); /* Ack and Stop */ val = FIELD_PREP(M10V_XDDSD_IS_MASK, 0x0); @@ -177,7 +176,7 @@ static irqreturn_t milbeaut_xdmac_interrupt(int irq, void *dev_id) milbeaut_xdmac_start(mc); out: - spin_unlock_irqrestore(&mc->vc.lock, flags); + spin_unlock(&mc->vc.lock); return IRQ_HANDLED; } diff --git a/drivers/dma/moxart-dma.c b/drivers/dma/moxart-dma.c index 347146a6e1d0..74755093e14b 100644 --- a/drivers/dma/moxart-dma.c +++ b/drivers/dma/moxart-dma.c @@ -524,7 +524,6 @@ static irqreturn_t moxart_dma_interrupt(int irq, void *devid) struct moxart_dmadev *mc = devid; struct moxart_chan *ch = &mc->slave_chans[0]; unsigned int i; - unsigned long flags; u32 ctrl; dev_dbg(chan2dev(&ch->vc.chan), 
"%s\n", __func__); @@ -541,14 +540,14 @@ static irqreturn_t moxart_dma_interrupt(int irq, void *devid) if (ctrl & APB_DMA_FIN_INT_STS) { ctrl &= ~APB_DMA_FIN_INT_STS; if (ch->desc) { - spin_lock_irqsave(&ch->vc.lock, flags); + spin_lock(&ch->vc.lock); if (++ch->sgidx < ch->desc->sglen) { moxart_dma_start_sg(ch, ch->sgidx); } else { vchan_cookie_complete(&ch->desc->vd); moxart_dma_start_desc(&ch->vc.chan); } - spin_unlock_irqrestore(&ch->vc.lock, flags); + spin_unlock(&ch->vc.lock); } } diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 00cd1335eeba..23b232b57518 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1455,7 +1455,7 @@ static struct platform_driver mv_xor_driver = { .resume = mv_xor_resume, .driver = { .name = MV_XOR_NAME, - .of_match_table = of_match_ptr(mv_xor_dt_ids), + .of_match_table = mv_xor_dt_ids, }, }; diff --git a/drivers/dma/mv_xor_v2.c b/drivers/dma/mv_xor_v2.c index 2753a6b916f6..9b0d463f89bb 100644 --- a/drivers/dma/mv_xor_v2.c +++ b/drivers/dma/mv_xor_v2.c @@ -771,8 +771,10 @@ static int mv_xor_v2_probe(struct platform_device *pdev) goto disable_clk; msi_desc = first_msi_entry(&pdev->dev); - if (!msi_desc) + if (!msi_desc) { + ret = -ENODEV; goto free_msi_irqs; + } xor_dev->msi_desc = msi_desc; ret = devm_request_irq(&pdev->dev, msi_desc->irq, diff --git a/drivers/dma/mxs-dma.c b/drivers/dma/mxs-dma.c index 65f816b40c32..994fc4d2aca4 100644 --- a/drivers/dma/mxs-dma.c +++ b/drivers/dma/mxs-dma.c @@ -167,29 +167,11 @@ static struct mxs_dma_type mxs_dma_types[] = { } }; -static const struct platform_device_id mxs_dma_ids[] = { - { - .name = "imx23-dma-apbh", - .driver_data = (kernel_ulong_t) &mxs_dma_types[0], - }, { - .name = "imx23-dma-apbx", - .driver_data = (kernel_ulong_t) &mxs_dma_types[1], - }, { - .name = "imx28-dma-apbh", - .driver_data = (kernel_ulong_t) &mxs_dma_types[2], - }, { - .name = "imx28-dma-apbx", - .driver_data = (kernel_ulong_t) &mxs_dma_types[3], - }, { - /* end of list */ - } -}; - static 
const struct of_device_id mxs_dma_dt_ids[] = { - { .compatible = "fsl,imx23-dma-apbh", .data = &mxs_dma_ids[0], }, - { .compatible = "fsl,imx23-dma-apbx", .data = &mxs_dma_ids[1], }, - { .compatible = "fsl,imx28-dma-apbh", .data = &mxs_dma_ids[2], }, - { .compatible = "fsl,imx28-dma-apbx", .data = &mxs_dma_ids[3], }, + { .compatible = "fsl,imx23-dma-apbh", .data = &mxs_dma_types[0], }, + { .compatible = "fsl,imx23-dma-apbx", .data = &mxs_dma_types[1], }, + { .compatible = "fsl,imx28-dma-apbh", .data = &mxs_dma_types[2], }, + { .compatible = "fsl,imx28-dma-apbx", .data = &mxs_dma_types[3], }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, mxs_dma_dt_ids); @@ -762,8 +744,6 @@ static struct dma_chan *mxs_dma_xlate(struct of_phandle_args *dma_spec, static int __init mxs_dma_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - const struct platform_device_id *id_entry; - const struct of_device_id *of_id; const struct mxs_dma_type *dma_type; struct mxs_dma_engine *mxs_dma; struct resource *iores; @@ -779,13 +759,7 @@ static int __init mxs_dma_probe(struct platform_device *pdev) return ret; } - of_id = of_match_device(mxs_dma_dt_ids, &pdev->dev); - if (of_id) - id_entry = of_id->data; - else - id_entry = platform_get_device_id(pdev); - - dma_type = (struct mxs_dma_type *)id_entry->driver_data; + dma_type = (struct mxs_dma_type *)of_device_get_match_data(&pdev->dev); mxs_dma->type = dma_type->type; mxs_dma->dev_id = dma_type->id; @@ -865,7 +839,6 @@ static struct platform_driver mxs_dma_driver = { .name = "mxs-dma", .of_match_table = mxs_dma_dt_ids, }, - .id_table = mxs_dma_ids, }; static int __init mxs_dma_module_init(void) diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 8a4f608904b9..ec00b20ae8e4 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -75,8 +75,18 @@ static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec, ofdma->dma_router->route_free(ofdma->dma_router->dev, route_data); } else { 
+ int ret = 0; + chan->router = ofdma->dma_router; chan->route_data = route_data; + + if (chan->device->device_router_config) + ret = chan->device->device_router_config(chan); + + if (ret) { + dma_release_channel(chan); + chan = ERR_PTR(ret); + } } /* diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index 0f5c19370f6d..bc0f66af0f11 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -1527,8 +1527,6 @@ static int pl330_submit_req(struct pl330_thread *thrd, /* First dry run to check if req is acceptable */ ret = _setup_req(pl330, 1, thrd, idx, &xs); - if (ret < 0) - goto xfer_exit; if (ret > pl330->mcbufsz / 2) { dev_info(pl330->ddma.dev, "%s:%d Try increasing mcbufsz (%i/%i)\n", diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c index 71cdaaa8134c..df7704053d91 100644 --- a/drivers/dma/ppc4xx/adma.c +++ b/drivers/dma/ppc4xx/adma.c @@ -69,7 +69,7 @@ struct ppc_dma_chan_ref { }; /* The list of channels exported by ppc440spe ADMA */ -struct list_head +static struct list_head ppc440spe_adma_chan_list = LIST_HEAD_INIT(ppc440spe_adma_chan_list); /* This flag is set when want to refetch the xor chain in the interrupt @@ -559,7 +559,6 @@ static void ppc440spe_desc_set_src_mult(struct ppc440spe_adma_desc_slot *desc, int sg_index, unsigned char mult_value) { struct dma_cdb *dma_hw_desc; - struct xor_cb *xor_hw_desc; u32 *psgu; switch (chan->device->id) { @@ -590,7 +589,6 @@ static void ppc440spe_desc_set_src_mult(struct ppc440spe_adma_desc_slot *desc, *psgu |= cpu_to_le32(mult_value << mult_index); break; case PPC440SPE_XOR_ID: - xor_hw_desc = desc->hw_desc; break; default: BUG(); diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c index 349fb312c872..4a2a796e348c 100644 --- a/drivers/dma/pxa_dma.c +++ b/drivers/dma/pxa_dma.c @@ -606,7 +606,6 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id) struct pxad_chan *chan = phy->vchan; struct virt_dma_desc *vd, *tmp; unsigned int dcsr; - unsigned long flags; bool vd_completed; 
dma_cookie_t last_started = 0; @@ -616,7 +615,7 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id) if (dcsr & PXA_DCSR_RUN) return IRQ_NONE; - spin_lock_irqsave(&chan->vc.lock, flags); + spin_lock(&chan->vc.lock); list_for_each_entry_safe(vd, tmp, &chan->vc.desc_issued, node) { vd_completed = is_desc_completed(vd); dev_dbg(&chan->vc.chan.dev->device, @@ -658,7 +657,7 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id) pxad_launch_chan(chan, to_pxad_sw_desc(vd)); } } - spin_unlock_irqrestore(&chan->vc.lock, flags); + spin_unlock(&chan->vc.lock); wake_up(&chan->wq_state); return IRQ_HANDLED; diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig index 3bcb689162c6..365f94eb3b08 100644 --- a/drivers/dma/qcom/Kconfig +++ b/drivers/dma/qcom/Kconfig @@ -1,4 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only +config QCOM_ADM + tristate "Qualcomm ADM support" + depends on (ARCH_QCOM || COMPILE_TEST) && !PHYS_ADDR_T_64BIT + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + Enable support for the Qualcomm Application Data Mover (ADM) DMA + controller, as present on MSM8x60, APQ8064, and IPQ8064 devices. + This controller provides DMA capabilities for both general purpose + and on-chip peripheral devices. + config QCOM_BAM_DMA tristate "QCOM BAM DMA support" depends on ARCH_QCOM || (COMPILE_TEST && OF && ARM) @@ -8,6 +19,18 @@ config QCOM_BAM_DMA Enable support for the QCOM BAM DMA controller. This controller provides DMA capabilities for a variety of on-chip devices. +config QCOM_GPI_DMA + tristate "Qualcomm Technologies GPI DMA support" + depends on ARCH_QCOM + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + Enable support for the QCOM GPI DMA controller. This controller + provides DMA capabilities for a variety of peripheral buses such + as I2C, UART, and SPI. By using the GPI dmaengine driver, bus drivers + can use a standardized interface that is protocol independent to + transfer data between DDR and the peripheral. 
+ config QCOM_HIDMA_MGMT tristate "Qualcomm Technologies HIDMA Management support" select DMA_ENGINE diff --git a/drivers/dma/qcom/Makefile b/drivers/dma/qcom/Makefile index 1ae92da88b0c..50f1e7014693 100644 --- a/drivers/dma/qcom/Makefile +++ b/drivers/dma/qcom/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_QCOM_ADM) += qcom_adm.o obj-$(CONFIG_QCOM_BAM_DMA) += bam_dma.o +obj-$(CONFIG_QCOM_GPI_DMA) += gpi.o obj-$(CONFIG_QCOM_HIDMA_MGMT) += hdma_mgmt.o hdma_mgmt-objs := hidma_mgmt.o hidma_mgmt_sys.o obj-$(CONFIG_QCOM_HIDMA) += hdma.o diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c index 4eeb8bb27279..d5773d474d8f 100644 --- a/drivers/dma/qcom/bam_dma.c +++ b/drivers/dma/qcom/bam_dma.c @@ -875,7 +875,7 @@ static irqreturn_t bam_dma_irq(int irq, void *data) ret = bam_pm_runtime_get_sync(bdev->dev); if (ret < 0) - return ret; + return IRQ_NONE; if (srcs & BAM_IRQ) { clr_mask = readl_relaxed(bam_addr(bdev, 0, BAM_IRQ_STTS)); diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c new file mode 100644 index 000000000000..d2334f535de2 --- /dev/null +++ b/drivers/dma/qcom/gpi.c @@ -0,0 +1,2303 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2017-2020, The Linux Foundation. All rights reserved. 
+ * Copyright (c) 2020, Linaro Limited + */ + +#include <dt-bindings/dma/qcom-gpi.h> +#include <linux/bitfield.h> +#include <linux/dma-mapping.h> +#include <linux/dmaengine.h> +#include <linux/module.h> +#include <linux/of_dma.h> +#include <linux/platform_device.h> +#include <linux/dma/qcom-gpi-dma.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include "../dmaengine.h" +#include "../virt-dma.h" + +#define TRE_TYPE_DMA 0x10 +#define TRE_TYPE_GO 0x20 +#define TRE_TYPE_CONFIG0 0x22 + +/* TRE flags */ +#define TRE_FLAGS_CHAIN BIT(0) +#define TRE_FLAGS_IEOB BIT(8) +#define TRE_FLAGS_IEOT BIT(9) +#define TRE_FLAGS_BEI BIT(10) +#define TRE_FLAGS_LINK BIT(11) +#define TRE_FLAGS_TYPE GENMASK(23, 16) + +/* SPI CONFIG0 WD0 */ +#define TRE_SPI_C0_WORD_SZ GENMASK(4, 0) +#define TRE_SPI_C0_LOOPBACK BIT(8) +#define TRE_SPI_C0_CS BIT(11) +#define TRE_SPI_C0_CPHA BIT(12) +#define TRE_SPI_C0_CPOL BIT(13) +#define TRE_SPI_C0_TX_PACK BIT(24) +#define TRE_SPI_C0_RX_PACK BIT(25) + +/* CONFIG0 WD2 */ +#define TRE_C0_CLK_DIV GENMASK(11, 0) +#define TRE_C0_CLK_SRC GENMASK(19, 16) + +/* SPI GO WD0 */ +#define TRE_SPI_GO_CMD GENMASK(4, 0) +#define TRE_SPI_GO_CS GENMASK(10, 8) +#define TRE_SPI_GO_FRAG BIT(26) + +/* GO WD2 */ +#define TRE_RX_LEN GENMASK(23, 0) + +/* I2C Config0 WD0 */ +#define TRE_I2C_C0_TLOW GENMASK(7, 0) +#define TRE_I2C_C0_THIGH GENMASK(15, 8) +#define TRE_I2C_C0_TCYL GENMASK(23, 16) +#define TRE_I2C_C0_TX_PACK BIT(24) +#define TRE_I2C_C0_RX_PACK BIT(25) + +/* I2C GO WD0 */ +#define TRE_I2C_GO_CMD GENMASK(4, 0) +#define TRE_I2C_GO_ADDR GENMASK(14, 8) +#define TRE_I2C_GO_STRETCH BIT(26) + +/* DMA TRE */ +#define TRE_DMA_LEN GENMASK(23, 0) + +/* Register offsets from gpi-top */ +#define GPII_n_CH_k_CNTXT_0_OFFS(n, k) (0x20000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_CH_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_CH_k_CNTXT_0_ERIDX GENMASK(18, 14) +#define GPII_n_CH_k_CNTXT_0_DIR BIT(3) +#define GPII_n_CH_k_CNTXT_0_PROTO GENMASK(2, 0) + +#define GPII_n_CH_k_CNTXT_0(el_size, erindex, dir, chtype_proto) \ + (FIELD_PREP(GPII_n_CH_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_ERIDX, erindex) 
| \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_DIR, dir) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_PROTO, chtype_proto)) + +#define GPI_CHTYPE_DIR_IN (0) +#define GPI_CHTYPE_DIR_OUT (1) + +#define GPI_CHTYPE_PROTO_GPI (0x2) + +#define GPII_n_CH_k_DOORBELL_0_OFFS(n, k) (0x22000 + (0x4000 * (n)) + (0x8 * (k))) +#define GPII_n_CH_CMD_OFFS(n) (0x23008 + (0x4000 * (n))) +#define GPII_n_CH_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_CH_CMD_CHID GENMASK(7, 0) +#define GPII_n_CH_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_CH_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_CH_CMD_CHID, chid)) + +#define GPII_n_CH_CMD_ALLOCATE (0) +#define GPII_n_CH_CMD_START (1) +#define GPII_n_CH_CMD_STOP (2) +#define GPII_n_CH_CMD_RESET (9) +#define GPII_n_CH_CMD_DE_ALLOC (10) +#define GPII_n_CH_CMD_UART_SW_STALE (32) +#define GPII_n_CH_CMD_UART_RFR_READY (33) +#define GPII_n_CH_CMD_UART_RFR_NOT_READY (34) + +/* EV Context Array */ +#define GPII_n_EV_CH_k_CNTXT_0_OFFS(n, k) (0x21000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_EV_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_EV_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_EV_k_CNTXT_0_INTYPE BIT(16) +#define GPII_n_EV_k_CNTXT_0_CHTYPE GENMASK(3, 0) + +#define GPII_n_EV_k_CNTXT_0(el_size, inttype, chtype) \ + (FIELD_PREP(GPII_n_EV_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_INTYPE, inttype) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_CHTYPE, chtype)) + +#define GPI_INTTYPE_IRQ (1) +#define GPI_CHTYPE_GPI_EV (0x2) + +enum CNTXT_OFFS { + CNTXT_0_CONFIG = 0x0, + CNTXT_1_R_LENGTH = 0x4, + CNTXT_2_RING_BASE_LSB = 0x8, + CNTXT_3_RING_BASE_MSB = 0xC, + CNTXT_4_RING_RP_LSB = 0x10, + CNTXT_5_RING_RP_MSB = 0x14, + CNTXT_6_RING_WP_LSB = 0x18, + CNTXT_7_RING_WP_MSB = 0x1C, + CNTXT_8_RING_INT_MOD = 0x20, + CNTXT_9_RING_INTVEC = 0x24, + CNTXT_10_RING_MSI_LSB = 0x28, + CNTXT_11_RING_MSI_MSB = 0x2C, + CNTXT_12_RING_RP_UPDATE_LSB = 0x30, + CNTXT_13_RING_RP_UPDATE_MSB = 0x34, +}; + +#define GPII_n_EV_CH_k_DOORBELL_0_OFFS(n, k) (0x22100 + (0x4000 * (n)) 
+ (0x8 * (k))) +#define GPII_n_EV_CH_CMD_OFFS(n) (0x23010 + (0x4000 * (n))) +#define GPII_n_EV_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_EV_CMD_CHID GENMASK(7, 0) +#define GPII_n_EV_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_EV_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_EV_CMD_CHID, chid)) + +#define GPII_n_EV_CH_CMD_ALLOCATE (0x00) +#define GPII_n_EV_CH_CMD_RESET (0x09) +#define GPII_n_EV_CH_CMD_DE_ALLOC (0x0A) + +#define GPII_n_CNTXT_TYPE_IRQ_OFFS(n) (0x23080 + (0x4000 * (n))) + +/* mask type register */ +#define GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(n) (0x23088 + (0x4000 * (n))) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK GENMASK(6, 0) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL BIT(6) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB BIT(3) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB BIT(2) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL BIT(1) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL BIT(0) + +#define GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(n) (0x23090 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(n) (0x23094 + (0x4000 * (n))) + +/* Mask channel control interrupt register */ +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(n) (0x23098 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK GENMASK(1, 0) + +/* Mask event control interrupt register */ +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(n) (0x2309C + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(n) (0x230A0 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS(n) (0x230A4 + (0x4000 * (n))) + +/* Mask event interrupt register */ +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(n) (0x230B8 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(n) (0x230C0 + (0x4000 * (n))) +#define GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(n) (0x23100 + (0x4000 * (n))) +#define GPI_GLOB_IRQ_ERROR_INT_MSK BIT(0) + +/* GPII specific Global - Enable bit register */ +#define GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(n) (0x23108 + (0x4000 * 
(n))) +#define GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(n) (0x23110 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(n) (0x23118 + (0x4000 * (n))) + +/* GPII general interrupt - Enable bit register */ +#define GPII_n_CNTXT_GPII_IRQ_EN_OFFS(n) (0x23120 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_EN_BMSK GENMASK(3, 0) + +#define GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(n) (0x23128 + (0x4000 * (n))) + +/* GPII Interrupt Type register */ +#define GPII_n_CNTXT_INTSET_OFFS(n) (0x23180 + (0x4000 * (n))) +#define GPII_n_CNTXT_INTSET_BMSK BIT(0) + +#define GPII_n_CNTXT_MSI_BASE_LSB_OFFS(n) (0x23188 + (0x4000 * (n))) +#define GPII_n_CNTXT_MSI_BASE_MSB_OFFS(n) (0x2318C + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_0_OFFS(n) (0x23400 + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_1_OFFS(n) (0x23404 + (0x4000 * (n))) + +#define GPII_n_ERROR_LOG_OFFS(n) (0x23200 + (0x4000 * (n))) + +/* QOS Registers */ +#define GPII_n_CH_k_QOS_OFFS(n, k) (0x2005C + (0x4000 * (n)) + (0x80 * (k))) + +/* Scratch registers */ +#define GPII_n_CH_k_SCRATCH_0_OFFS(n, k) (0x20060 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_0_SEID GENMASK(2, 0) +#define GPII_n_CH_k_SCRATCH_0_PROTO GENMASK(7, 4) +#define GPII_n_CH_k_SCRATCH_0_PAIR GENMASK(20, 16) +#define GPII_n_CH_k_SCRATCH_0(pair, proto, seid) \ + (FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PAIR, pair) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PROTO, proto) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_SEID, seid)) +#define GPII_n_CH_k_SCRATCH_1_OFFS(n, k) (0x20064 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_2_OFFS(n, k) (0x20068 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_3_OFFS(n, k) (0x2006C + (0x4000 * (n)) + (0x80 * (k))) + +struct __packed gpi_tre { + u32 dword[4]; +}; + +enum msm_gpi_tce_code { + MSM_GPI_TCE_SUCCESS = 1, + MSM_GPI_TCE_EOT = 2, + MSM_GPI_TCE_EOB = 4, + MSM_GPI_TCE_UNEXP_ERR = 16, +}; + +#define CMD_TIMEOUT_MS (250) + +#define MAX_CHANNELS_PER_GPII (2) +#define GPI_TX_CHAN (0) +#define GPI_RX_CHAN 
(1) +#define STATE_IGNORE (U32_MAX) +#define EV_FACTOR (2) +#define REQ_OF_DMA_ARGS (5) /* # of arguments required from client */ +#define CHAN_TRES 64 + +struct __packed xfer_compl_event { + u64 ptr; + u32 length:24; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed immediate_data_event { + u8 data_bytes[8]; + u8 length:4; + u8 resvd:4; + u16 tre_index; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed qup_notif_event { + u32 status; + u32 time; + u32 count:24; + u8 resvd; + u16 resvd1; + u8 type; + u8 chid; +}; + +struct __packed gpi_ere { + u32 dword[4]; +}; + +enum GPI_EV_TYPE { + XFER_COMPLETE_EV_TYPE = 0x22, + IMMEDIATE_DATA_EV_TYPE = 0x30, + QUP_NOTIF_EV_TYPE = 0x31, + STALE_EV_TYPE = 0xFF, +}; + +union __packed gpi_event { + struct __packed xfer_compl_event xfer_compl_event; + struct __packed immediate_data_event immediate_data_event; + struct __packed qup_notif_event qup_notif_event; + struct __packed gpi_ere gpi_ere; +}; + +enum gpii_irq_settings { + DEFAULT_IRQ_SETTINGS, + MASK_IEOB_SETTINGS, +}; + +enum gpi_ev_state { + DEFAULT_EV_CH_STATE = 0, + EV_STATE_NOT_ALLOCATED = DEFAULT_EV_CH_STATE, + EV_STATE_ALLOCATED, + MAX_EV_STATES +}; + +static const char *const gpi_ev_state_str[MAX_EV_STATES] = { + [EV_STATE_NOT_ALLOCATED] = "NOT ALLOCATED", + [EV_STATE_ALLOCATED] = "ALLOCATED", +}; + +#define TO_GPI_EV_STATE_STR(_state) (((_state) >= MAX_EV_STATES) ? 
\ + "INVALID" : gpi_ev_state_str[(_state)]) + +enum gpi_ch_state { + DEFAULT_CH_STATE = 0x0, + CH_STATE_NOT_ALLOCATED = DEFAULT_CH_STATE, + CH_STATE_ALLOCATED = 0x1, + CH_STATE_STARTED = 0x2, + CH_STATE_STOPPED = 0x3, + CH_STATE_STOP_IN_PROC = 0x4, + CH_STATE_ERROR = 0xf, + MAX_CH_STATES +}; + +enum gpi_cmd { + GPI_CH_CMD_BEGIN, + GPI_CH_CMD_ALLOCATE = GPI_CH_CMD_BEGIN, + GPI_CH_CMD_START, + GPI_CH_CMD_STOP, + GPI_CH_CMD_RESET, + GPI_CH_CMD_DE_ALLOC, + GPI_CH_CMD_UART_SW_STALE, + GPI_CH_CMD_UART_RFR_READY, + GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_CH_CMD_END = GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_EV_CMD_BEGIN, + GPI_EV_CMD_ALLOCATE = GPI_EV_CMD_BEGIN, + GPI_EV_CMD_RESET, + GPI_EV_CMD_DEALLOC, + GPI_EV_CMD_END = GPI_EV_CMD_DEALLOC, + GPI_MAX_CMD, +}; + +#define IS_CHAN_CMD(_cmd) ((_cmd) <= GPI_CH_CMD_END) + +static const char *const gpi_cmd_str[GPI_MAX_CMD] = { + [GPI_CH_CMD_ALLOCATE] = "CH ALLOCATE", + [GPI_CH_CMD_START] = "CH START", + [GPI_CH_CMD_STOP] = "CH STOP", + [GPI_CH_CMD_RESET] = "CH_RESET", + [GPI_CH_CMD_DE_ALLOC] = "DE ALLOC", + [GPI_CH_CMD_UART_SW_STALE] = "UART SW STALE", + [GPI_CH_CMD_UART_RFR_READY] = "UART RFR READY", + [GPI_CH_CMD_UART_RFR_NOT_READY] = "UART RFR NOT READY", + [GPI_EV_CMD_ALLOCATE] = "EV ALLOCATE", + [GPI_EV_CMD_RESET] = "EV RESET", + [GPI_EV_CMD_DEALLOC] = "EV DEALLOC", +}; + +#define TO_GPI_CMD_STR(_cmd) (((_cmd) >= GPI_MAX_CMD) ? 
"INVALID" : \ + gpi_cmd_str[(_cmd)]) + +/* + * @DISABLE_STATE: no register access allowed + * @CONFIG_STATE: client has configured the channel + * @PREPARE_HARDWARE: register access is allowed, + * however, events are not processed + * @ACTIVE_STATE: channels are fully operational + * @PREPARE_TERMINATE: graceful termination of channels, + * register access is allowed + * @PAUSE_STATE: channels are active, but not processing any events + */ +enum gpi_pm_state { + DISABLE_STATE, + CONFIG_STATE, + PREPARE_HARDWARE, + ACTIVE_STATE, + PREPARE_TERMINATE, + PAUSE_STATE, + MAX_PM_STATE +}; + +#define REG_ACCESS_VALID(_pm_state) ((_pm_state) >= PREPARE_HARDWARE) + +static const char *const gpi_pm_state_str[MAX_PM_STATE] = { + [DISABLE_STATE] = "DISABLE", + [CONFIG_STATE] = "CONFIG", + [PREPARE_HARDWARE] = "PREPARE HARDWARE", + [ACTIVE_STATE] = "ACTIVE", + [PREPARE_TERMINATE] = "PREPARE TERMINATE", + [PAUSE_STATE] = "PAUSE", +}; + +#define TO_GPI_PM_STR(_state) (((_state) >= MAX_PM_STATE) ? \ + "INVALID" : gpi_pm_state_str[(_state)]) + +static const struct { + enum gpi_cmd gpi_cmd; + u32 opcode; + u32 state; +} gpi_cmd_info[GPI_MAX_CMD] = { + { + GPI_CH_CMD_ALLOCATE, + GPII_n_CH_CMD_ALLOCATE, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_START, + GPII_n_CH_CMD_START, + CH_STATE_STARTED, + }, + { + GPI_CH_CMD_STOP, + GPII_n_CH_CMD_STOP, + CH_STATE_STOPPED, + }, + { + GPI_CH_CMD_RESET, + GPII_n_CH_CMD_RESET, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_DE_ALLOC, + GPII_n_CH_CMD_DE_ALLOC, + CH_STATE_NOT_ALLOCATED, + }, + { + GPI_CH_CMD_UART_SW_STALE, + GPII_n_CH_CMD_UART_SW_STALE, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_READY, + GPII_n_CH_CMD_UART_RFR_READY, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_NOT_READY, + GPII_n_CH_CMD_UART_RFR_NOT_READY, + STATE_IGNORE, + }, + { + GPI_EV_CMD_ALLOCATE, + GPII_n_EV_CH_CMD_ALLOCATE, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_RESET, + GPII_n_EV_CH_CMD_RESET, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_DEALLOC, + GPII_n_EV_CH_CMD_DE_ALLOC, 
+ EV_STATE_NOT_ALLOCATED, + }, +}; + +struct gpi_ring { + void *pre_aligned; + size_t alloc_size; + phys_addr_t phys_addr; + dma_addr_t dma_handle; + void *base; + void *wp; + void *rp; + u32 len; + u32 el_size; + u32 elements; + bool configured; +}; + +struct gpi_dev { + struct dma_device dma_device; + struct device *dev; + struct resource *res; + void __iomem *regs; + void __iomem *ee_base; /*ee register base address*/ + u32 max_gpii; /* maximum # of gpii instances available per gpi block */ + u32 gpii_mask; /* gpii instances available for apps */ + u32 ev_factor; /* ev ring length factor */ + struct gpii *gpiis; +}; + +struct reg_info { + char *name; + u32 offset; + u32 val; +}; + +struct gchan { + struct virt_dma_chan vc; + u32 chid; + u32 seid; + u32 protocol; + struct gpii *gpii; + enum gpi_ch_state ch_state; + enum gpi_pm_state pm_state; + void __iomem *ch_cntxt_base_reg; + void __iomem *ch_cntxt_db_reg; + void __iomem *ch_cmd_reg; + u32 dir; + struct gpi_ring ch_ring; + void *config; +}; + +struct gpii { + u32 gpii_id; + struct gchan gchan[MAX_CHANNELS_PER_GPII]; + struct gpi_dev *gpi_dev; + int irq; + void __iomem *regs; /* points to gpi top */ + void __iomem *ev_cntxt_base_reg; + void __iomem *ev_cntxt_db_reg; + void __iomem *ev_ring_rp_lsb_reg; + void __iomem *ev_cmd_reg; + void __iomem *ieob_clr_reg; + struct mutex ctrl_lock; + enum gpi_ev_state ev_state; + bool configured_irq; + enum gpi_pm_state pm_state; + rwlock_t pm_lock; + struct gpi_ring ev_ring; + struct tasklet_struct ev_task; /* event processing tasklet */ + struct completion cmd_completion; + enum gpi_cmd gpi_cmd; + u32 cntxt_type_irq_msk; + bool ieob_set; +}; + +#define MAX_TRE 3 + +struct gpi_desc { + struct virt_dma_desc vd; + size_t len; + void *db; /* DB register to program */ + struct gchan *gchan; + struct gpi_tre tre[MAX_TRE]; + u32 num_tre; +}; + +static const u32 GPII_CHAN_DIR[MAX_CHANNELS_PER_GPII] = { + GPI_CHTYPE_DIR_OUT, GPI_CHTYPE_DIR_IN +}; + +static irqreturn_t 
gpi_handle_irq(int irq, void *data); +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring); +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp); +static void gpi_process_events(struct gpii *gpii); + +static inline struct gchan *to_gchan(struct dma_chan *dma_chan) +{ + return container_of(dma_chan, struct gchan, vc.chan); +} + +static inline struct gpi_desc *to_gpi_desc(struct virt_dma_desc *vd) +{ + return container_of(vd, struct gpi_desc, vd); +} + +static inline phys_addr_t to_physical(const struct gpi_ring *const ring, + void *addr) +{ + return ring->phys_addr + (addr - ring->base); +} + +static inline void *to_virtual(const struct gpi_ring *const ring, phys_addr_t addr) +{ + return ring->base + (addr - ring->phys_addr); +} + +static inline u32 gpi_read_reg(struct gpii *gpii, void __iomem *addr) +{ + return readl_relaxed(addr); +} + +static inline void gpi_write_reg(struct gpii *gpii, void __iomem *addr, u32 val) +{ + writel_relaxed(val, addr); +} + +/* gpi_write_reg_field - write to specific bit field */ +static inline void gpi_write_reg_field(struct gpii *gpii, void __iomem *addr, + u32 mask, u32 shift, u32 val) +{ + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + val = tmp | ((val << shift) & mask); + gpi_write_reg(gpii, addr, val); +} + +static inline void +gpi_update_reg(struct gpii *gpii, u32 offset, u32 mask, u32 val) +{ + void __iomem *addr = gpii->regs + offset; + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + tmp |= u32_encode_bits(val, mask); + + gpi_write_reg(gpii, addr, tmp); +} + +static void gpi_disable_interrupts(struct gpii *gpii) +{ + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, 
GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 0); + + gpii->cntxt_type_irq_msk = 0; + devm_free_irq(gpii->gpi_dev->dev, gpii->irq, gpii); + gpii->configured_irq = false; +} + +/* configure and enable interrupts */ +static int gpi_config_interrupts(struct gpii *gpii, enum gpii_irq_settings settings, bool mask) +{ + const u32 enable = (GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL | + GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL | + GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + int ret; + + if (!gpii->configured_irq) { + ret = devm_request_irq(gpii->gpi_dev->dev, gpii->irq, + gpi_handle_irq, IRQF_TRIGGER_HIGH, + "gpi-dma", gpii); + if (ret < 0) { + dev_err(gpii->gpi_dev->dev, "error request irq:%d ret:%d\n", + gpii->irq, ret); + return ret; + } + } + + if (settings == MASK_IEOB_SETTINGS) { + /* + * GPII only uses one EV ring per gpii so we can globally + * enable/disable IEOB interrupt + */ + if (mask) + gpii->cntxt_type_irq_msk |= GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + else + gpii->cntxt_type_irq_msk &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB); + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, gpii->cntxt_type_irq_msk); + } else { + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, enable); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK); + 
gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, + GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_LSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_MSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_0_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_1_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 1); + gpi_update_reg(gpii, GPII_n_ERROR_LOG_OFFS(gpii->gpii_id), U32_MAX, 0); + + gpii->cntxt_type_irq_msk = enable; + } + + gpii->configured_irq = true; + return 0; +} + +/* Sends gpii event or channel command */ +static int gpi_send_cmd(struct gpii *gpii, struct gchan *gchan, + enum gpi_cmd gpi_cmd) +{ + u32 chid = MAX_CHANNELS_PER_GPII; + unsigned long timeout; + void __iomem *cmd_reg; + u32 cmd; + + if (gpi_cmd >= GPI_MAX_CMD) + return -EINVAL; + if (IS_CHAN_CMD(gpi_cmd)) + chid = gchan->chid; + + dev_dbg(gpii->gpi_dev->dev, + "sending cmd: %s:%u\n", TO_GPI_CMD_STR(gpi_cmd), chid); + + /* send opcode and wait for completion */ + reinit_completion(&gpii->cmd_completion); + gpii->gpi_cmd = gpi_cmd; + + cmd_reg = IS_CHAN_CMD(gpi_cmd) ? gchan->ch_cmd_reg : gpii->ev_cmd_reg; + cmd = IS_CHAN_CMD(gpi_cmd) ? 
GPII_n_CH_CMD(gpi_cmd_info[gpi_cmd].opcode, chid) : + GPII_n_EV_CMD(gpi_cmd_info[gpi_cmd].opcode, 0); + gpi_write_reg(gpii, cmd_reg, cmd); + timeout = wait_for_completion_timeout(&gpii->cmd_completion, + msecs_to_jiffies(CMD_TIMEOUT_MS)); + if (!timeout) { + dev_err(gpii->gpi_dev->dev, "cmd: %s completion timeout:%u\n", + TO_GPI_CMD_STR(gpi_cmd), chid); + return -EIO; + } + + /* if the cmd is a state change cmd, confirm the new ch/ev state is the expected one */ + if (gpi_cmd_info[gpi_cmd].state == STATE_IGNORE) + return 0; + + if (IS_CHAN_CMD(gpi_cmd) && gchan->ch_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + if (!IS_CHAN_CMD(gpi_cmd) && gpii->ev_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + return -EIO; +} + +/* program transfer ring DB register with the ring address of wp */ +static inline void gpi_write_ch_db(struct gchan *gchan, + struct gpi_ring *ring, void *wp) +{ + struct gpii *gpii = gchan->gpii; + phys_addr_t p_wp; + + p_wp = to_physical(ring, wp); + gpi_write_reg(gpii, gchan->ch_cntxt_db_reg, p_wp); +} + +/* program event ring DB register with the ring address of wp */ +static inline void gpi_write_ev_db(struct gpii *gpii, + struct gpi_ring *ring, void *wp) +{ + phys_addr_t p_wp; + + p_wp = to_physical(ring, wp); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg, p_wp); +} + +/* process transfer completion interrupt: ack IEOB, disable it and schedule the event tasklet */ +static void gpi_process_ieob(struct gpii *gpii) +{ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 0); + tasklet_hi_schedule(&gpii->ev_task); +} + +/* process channel control interrupt */ +static void gpi_process_ch_ctrl_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(gpii_id); + u32 ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + struct gchan *gchan; + u32 chid, state; + + /* clear the status */ + offset = GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, (u32)ch_irq); + + for (chid = 0; chid < MAX_CHANNELS_PER_GPII; chid++) { + if 
(!(BIT(chid) & ch_irq)) + continue; + + gchan = &gpii->gchan[chid]; + state = gpi_read_reg(gpii, gchan->ch_cntxt_base_reg + + CNTXT_0_CONFIG); + state = FIELD_GET(GPII_n_CH_k_CNTXT_0_CHSTATE, state); + + /* + * The channel DE_ALLOC cmd is always successful, but it does + * not change the hardware status, so overwrite the software + * state with the default state. + */ + if (gpii->gpi_cmd == GPI_CH_CMD_DE_ALLOC) + state = DEFAULT_CH_STATE; + gchan->ch_state = state; + + /* + * Complete the pending cmd unless ch_state is stop in process. + * Stop in process is a transition state and we will wait for + * the stop interrupt before notifying. + */ + if (gchan->ch_state != CH_STATE_STOP_IN_PROC) + complete_all(&gpii->cmd_completion); + } +} + +/* process gpi general error interrupts: log the status, then clear it */ +static void gpi_process_gen_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + /* log the status */ + dev_dbg(gpii->gpi_dev->dev, "irq_stts:0x%x\n", irq_stts); + + /* write the status back to the IRQ clear register */ + offset = GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); +} + +/* process gpi level (global) error interrupts */ +static void gpi_process_glob_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); + + /* only the error interrupt bit should be set */ + if (irq_stts & ~GPI_GLOB_IRQ_ERROR_INT_MSK) { + dev_err(gpii->gpi_dev->dev, "invalid error status:0x%x\n", irq_stts); + return; + } + + offset = GPII_n_ERROR_LOG_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, 0); +} + +/* gpii interrupt handler */ +static irqreturn_t gpi_handle_irq(int irq, void *data) +{ + struct gpii *gpii = data; + u32 gpii_id = gpii->gpii_id; + u32 
type, offset; + unsigned long flags; + + read_lock_irqsave(&gpii->pm_lock, flags); + + /* + * States are out of sync to receive interrupt + * while software state is in DISABLE state, bailing out. + */ + if (!REG_ACCESS_VALID(gpii->pm_state)) { + dev_err(gpii->gpi_dev->dev, "receive interrupt while in %s state\n", + TO_GPI_PM_STR(gpii->pm_state)); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + + do { + /* global gpii error */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB) { + gpi_process_glob_err_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB); + } + + /* transfer complete interrupt */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB) { + gpi_process_ieob(gpii); + type &= ~GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + } + + /* event control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL) { + u32 ev_state; + u32 ev_ch_irq; + + dev_dbg(gpii->gpi_dev->dev, + "processing EV CTRL interrupt\n"); + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(gpii_id); + ev_ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS + (gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, ev_ch_irq); + ev_state = gpi_read_reg(gpii, gpii->ev_cntxt_base_reg + + CNTXT_0_CONFIG); + ev_state = FIELD_GET(GPII_n_EV_k_CNTXT_0_CHSTATE, ev_state); + + /* + * CMD EV_CMD_DEALLOC is always successful. However + * cmd does not change hardware status. So overwriting + * software state to default state. 
+ */ + if (gpii->gpi_cmd == GPI_EV_CMD_DEALLOC) + ev_state = DEFAULT_EV_CH_STATE; + + gpii->ev_state = ev_state; + dev_dbg(gpii->gpi_dev->dev, "setting EV state to %s\n", + TO_GPI_EV_STATE_STR(gpii->ev_state)); + complete_all(&gpii->cmd_completion); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL); + } + + /* channel control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL) { + dev_dbg(gpii->gpi_dev->dev, "process CH CTRL interrupts\n"); + gpi_process_ch_ctrl_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + } + + if (type) { + dev_err(gpii->gpi_dev->dev, "Unhandled interrupt status:0x%x\n", type); + gpi_process_gen_err_irq(gpii); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + } while (type); + +exit_irq: + read_unlock_irqrestore(&gpii->pm_lock, flags); + + return IRQ_HANDLED; +} + +/* process DMA Immediate completion data events */ +static void gpi_process_imed_data_event(struct gchan *gchan, + struct immediate_data_event *imed_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre = ch_ring->base + (ch_ring->el_size * imed_event->tre_index); + struct dmaengine_result result; + struct gpi_desc *gpi_desc; + struct virt_dma_desc *vd; + unsigned long flags; + u32 chid; + + /* + * If channel not active don't process event + */ + if (gchan->pm_state != ACTIVE_STATE) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + struct gpi_tre *gpi_tre; + + spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_dbg(gpii->gpi_dev->dev, "event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)imed_event; + dev_dbg(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + 
gpi_ere->dword[2], gpi_ere->dword[3]); + gpi_tre = tre; + dev_dbg(gpii->gpi_dev->dev, + "Pending TRE: %08x %08x %08x %08x\n", + gpi_tre->dword[0], gpi_tre->dword[1], + gpi_tre->dword[2], gpi_tre->dword[3]); + return; + } + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to tre + 1 + */ + tre += ch_ring->el_size; + if (tre >= (ch_ring->base + ch_ring->len)) + tre = ch_ring->base; + ch_ring->rp = tre; + + /* make sure rp updates are immediately visible to all cores */ + smp_wmb(); + + chid = imed_event->chid; + if (imed_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (imed_event->code == MSM_GPI_TCE_UNEXP_ERR) + result.result = DMA_TRANS_ABORTED; + else + result.result = DMA_TRANS_NOERROR; + result.residue = gpi_desc->len - imed_event->length; + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* processing transfer completion events */ +static void gpi_process_xfer_compl_event(struct gchan *gchan, + struct xfer_compl_event *compl_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *ev_rp = to_virtual(ch_ring, compl_event->ptr); + struct virt_dma_desc *vd; + struct gpi_desc *gpi_desc; + struct dmaengine_result result; + unsigned long flags; + u32 chid; + + /* only process events on active channel */ + if (unlikely(gchan->pm_state != ACTIVE_STATE)) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + + 
spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_err(gpii->gpi_dev->dev, "Event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)compl_event; + dev_err(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + gpi_ere->dword[2], gpi_ere->dword[3]); + return; + } + + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to ev_rp + 1 + */ + ev_rp += ch_ring->el_size; + if (ev_rp >= (ch_ring->base + ch_ring->len)) + ev_rp = ch_ring->base; + ch_ring->rp = ev_rp; + + /* update must be visible to other cores */ + smp_wmb(); + + chid = compl_event->chid; + if (compl_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (compl_event->code == MSM_GPI_TCE_UNEXP_ERR) { + dev_err(gpii->gpi_dev->dev, "Error in Transaction\n"); + result.result = DMA_TRANS_ABORTED; + } else { + dev_dbg(gpii->gpi_dev->dev, "Transaction Success\n"); + result.result = DMA_TRANS_NOERROR; + } + result.residue = gpi_desc->len - compl_event->length; + dev_dbg(gpii->gpi_dev->dev, "Residue %d\n", result.residue); + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* process all events */ +static void gpi_process_events(struct gpii *gpii) +{ + struct gpi_ring *ev_ring = &gpii->ev_ring; + phys_addr_t cntxt_rp; + void *rp; + union gpi_event *gpi_event; + struct gchan *gchan; + u32 chid, type; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + do { + while (rp != ev_ring->rp) { + gpi_event = ev_ring->rp; + chid = gpi_event->xfer_compl_event.chid; + type = gpi_event->xfer_compl_event.type; + + 
dev_dbg(gpii->gpi_dev->dev, + "Event: CHID:%u, type:%x %08x %08x %08x %08x\n", + chid, type, gpi_event->gpi_ere.dword[0], + gpi_event->gpi_ere.dword[1], gpi_event->gpi_ere.dword[2], + gpi_event->gpi_ere.dword[3]); + + switch (type) { + case XFER_COMPLETE_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_xfer_compl_event(gchan, + &gpi_event->xfer_compl_event); + break; + case STALE_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "stale event, not processing\n"); + break; + case IMMEDIATE_DATA_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_imed_data_event(gchan, + &gpi_event->immediate_data_event); + break; + case QUP_NOTIF_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "QUP_NOTIF_EV_TYPE\n"); + break; + default: + dev_dbg(gpii->gpi_dev->dev, + "not supported event type:0x%x\n", type); + } + gpi_ring_recycle_ev_element(ev_ring); + } + gpi_write_ev_db(gpii, ev_ring, ev_ring->wp); + + /* clear pending IEOB events */ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + } while (rp != ev_ring->rp); +} + +/* processing events using tasklet */ +static void gpi_ev_tasklet(unsigned long data) +{ + struct gpii *gpii = (struct gpii *)data; + + read_lock_bh(&gpii->pm_lock); + if (!REG_ACCESS_VALID(gpii->pm_state)) { + read_unlock_bh(&gpii->pm_lock); + dev_err(gpii->gpi_dev->dev, "not processing any events, pm_state:%s\n", + TO_GPI_PM_STR(gpii->pm_state)); + return; + } + + /* process the events */ + gpi_process_events(gpii); + + /* enable IEOB, switching back to interrupts */ + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 1); + read_unlock_bh(&gpii->pm_lock); +} + +/* marks all pending events for the channel as stale */ +static void gpi_mark_stale_events(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ev_ring = &gpii->ev_ring; + u32 cntxt_rp, local_rp; + void *ev_rp; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + + ev_rp = ev_ring->rp; + 
local_rp = (u32)to_physical(ev_ring, ev_rp); + while (local_rp != cntxt_rp) { + union gpi_event *gpi_event = ev_rp; + u32 chid = gpi_event->xfer_compl_event.chid; + + if (chid == gchan->chid) + gpi_event->xfer_compl_event.type = STALE_EV_TYPE; + ev_rp += ev_ring->el_size; + if (ev_rp >= (ev_ring->base + ev_ring->len)) + ev_rp = ev_ring->base; + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + local_rp = (u32)to_physical(ev_ring, ev_rp); + } +} + +/* reset sw state and issue channel reset or de-alloc */ +static int gpi_reset_chan(struct gchan *gchan, enum gpi_cmd gpi_cmd) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + unsigned long flags; + LIST_HEAD(list); + int ret; + + ret = gpi_send_cmd(gpii, gchan, gpi_cmd); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(gpi_cmd), ret); + return ret; + } + + /* initialize the local ring ptrs */ + ch_ring->rp = ch_ring->base; + ch_ring->wp = ch_ring->base; + + /* visible to other cores */ + smp_wmb(); + + /* check event ring for any stale events */ + write_lock_irq(&gpii->pm_lock); + gpi_mark_stale_events(gchan); + + /* remove all async descriptors */ + spin_lock_irqsave(&gchan->vc.lock, flags); + vchan_get_all_descriptors(&gchan->vc, &list); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + write_unlock_irq(&gpii->pm_lock); + vchan_dma_desc_free_list(&gchan->vc, &list); + + return 0; +} + +static int gpi_start_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_START), ret); + return ret; + } + + /* gpii CH is active now */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + + return 0; +} + +static int gpi_stop_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = 
gpi_send_cmd(gpii, gchan, GPI_CH_CMD_STOP); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_STOP), ret); + return ret; + } + + return 0; +} + +/* allocate and configure the transfer channel */ +static int gpi_alloc_chan(struct gchan *chan, bool send_alloc_cmd) +{ + struct gpii *gpii = chan->gpii; + struct gpi_ring *ring = &chan->ch_ring; + int ret; + u32 id = gpii->gpii_id; + u32 chid = chan->chid; + u32 pair_chid = !chid; + + if (send_alloc_cmd) { + ret = gpi_send_cmd(gpii, chan, GPI_CH_CMD_ALLOCATE); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_ALLOCATE), ret); + return ret; + } + } + + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_0_CONFIG, + GPII_n_CH_k_CNTXT_0(ring->el_size, 0, chan->dir, GPI_CHTYPE_PROTO_GPI)); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_2_RING_BASE_LSB, ring->phys_addr); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_3_RING_BASE_MSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, chan->ch_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_0_OFFS(id, chid), + GPII_n_CH_k_SCRATCH_0(pair_chid, chan->protocol, chan->seid)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_1_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_2_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_3_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_QOS_OFFS(id, chid), 1); + + /* flush all the writes */ + wmb(); + return 0; +} + +/* allocate and configure event ring */ +static int gpi_alloc_ev_chan(struct gpii *gpii) +{ + struct gpi_ring *ring = &gpii->ev_ring; + void __iomem *base = gpii->ev_cntxt_base_reg; + int ret; + + ret = gpi_send_cmd(gpii, NULL, GPI_EV_CMD_ALLOCATE); + if (ret) { 
+ dev_err(gpii->gpi_dev->dev, "error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_EV_CMD_ALLOCATE), ret); + return ret; + } + + /* program event context */ + gpi_write_reg(gpii, base + CNTXT_0_CONFIG, + GPII_n_EV_k_CNTXT_0(ring->el_size, GPI_INTTYPE_IRQ, GPI_CHTYPE_GPI_EV)); + gpi_write_reg(gpii, base + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, base + CNTXT_2_RING_BASE_LSB, lower_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_3_RING_BASE_MSB, upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_10_RING_MSI_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_11_RING_MSI_MSB, 0); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_12_RING_RP_UPDATE_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_13_RING_RP_UPDATE_MSB, 0); + + /* add events to ring */ + ring->wp = (ring->base + ring->len - ring->el_size); + + /* flush all the writes */ + wmb(); + + /* gpii is active now */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + gpi_write_ev_db(gpii, ring, ring->wp); + + return 0; +} + +/* calculate # of ERE/TRE available to queue */ +static int gpi_ring_num_elements_avail(const struct gpi_ring * const ring) +{ + int elements = 0; + + if (ring->wp < ring->rp) { + elements = ((ring->rp - ring->wp) / ring->el_size) - 1; + } else { + elements = (ring->rp - ring->base) / ring->el_size; + elements += ((ring->base + ring->len - ring->wp) / ring->el_size) - 1; + } + + return elements; +} + +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp) +{ + if (gpi_ring_num_elements_avail(ring) <= 0) + return -ENOMEM; + + *wp = ring->wp; + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* visible to other cores */ + smp_wmb(); + + 
return 0; +} + +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring) +{ + /* Update the WP */ + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* Update the RP */ + ring->rp += ring->el_size; + if (ring->rp >= (ring->base + ring->len)) + ring->rp = ring->base; + + /* visible to other cores */ + smp_wmb(); +} + +static void gpi_free_ring(struct gpi_ring *ring, + struct gpii *gpii) +{ + dma_free_coherent(gpii->gpi_dev->dev, ring->alloc_size, + ring->pre_aligned, ring->dma_handle); + memset(ring, 0, sizeof(*ring)); +} + +/* allocate memory for transfer and event rings */ +static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, + u32 el_size, struct gpii *gpii) +{ + u64 len = elements * el_size; + int bit; + + /* ring len must be power of 2 */ + bit = find_last_bit((unsigned long *)&len, 32); + if (((1 << bit) - 1) & len) + bit++; + len = 1 << bit; + ring->alloc_size = (len + (len - 1)); + dev_dbg(gpii->gpi_dev->dev, + "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n", + elements, el_size, (elements * el_size), len, + ring->alloc_size); + + ring->pre_aligned = dma_alloc_coherent(gpii->gpi_dev->dev, + ring->alloc_size, + &ring->dma_handle, GFP_KERNEL); + if (!ring->pre_aligned) { + dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n", + ring->alloc_size); + return -ENOMEM; + } + + /* align the physical mem */ + ring->phys_addr = (ring->dma_handle + (len - 1)) & ~(len - 1); + ring->base = ring->pre_aligned + (ring->phys_addr - ring->dma_handle); + ring->rp = ring->base; + ring->wp = ring->base; + ring->len = len; + ring->el_size = el_size; + ring->elements = ring->len / ring->el_size; + memset(ring->base, 0, ring->len); + ring->configured = true; + + /* update to other cores */ + smp_wmb(); + + dev_dbg(gpii->gpi_dev->dev, + "phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n", + ring->dma_handle, ring->phys_addr, ring->len, + ring->el_size, ring->elements); + + 
return 0; +} + +/* copy tre into transfer ring */ +static void gpi_queue_xfer(struct gpii *gpii, struct gchan *gchan, + struct gpi_tre *gpi_tre, void **wp) +{ + struct gpi_tre *ch_tre; + int ret; + + /* get next tre location we can copy */ + ret = gpi_ring_add_element(&gchan->ch_ring, (void **)&ch_tre); + if (unlikely(ret)) { + dev_err(gpii->gpi_dev->dev, "Error adding ring element to xfer ring\n"); + return; + } + + /* copy the tre info */ + memcpy(ch_tre, gpi_tre, sizeof(*ch_tre)); + *wp = ch_tre; +} + +/* reset and restart transfer channel */ +static int gpi_terminate_all(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int schid, echid, i; + int ret = 0; + + mutex_lock(&gpii->ctrl_lock); + + /* + * treat both channels as a group if its protocol is not UART + * STOP, RESET, or START needs to be in lockstep + */ + schid = (gchan->protocol == QCOM_GPI_UART) ? gchan->chid : 0; + echid = (gchan->protocol == QCOM_GPI_UART) ? schid + 1 : MAX_CHANNELS_PER_GPII; + + /* stop the channel */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + /* disable ch state so no more TRE processing */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* send command to Stop the channel */ + ret = gpi_stop_chan(gchan); + } + + /* reset the channels (clears any pending tre) */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_reset_chan(gchan, GPI_CH_CMD_RESET); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error resetting channel ret:%d\n", ret); + goto terminate_exit; + } + + /* reprogram channel CNTXT */ + ret = gpi_alloc_chan(gchan, false); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error alloc_channel ret:%d\n", ret); + goto terminate_exit; + } + } + + /* restart the channels */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_start_chan(gchan); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error 
Starting Channel ret:%d\n", ret); + goto terminate_exit; + } + } + +terminate_exit: + mutex_unlock(&gpii->ctrl_lock); + return ret; +} + +/* pause dma transfer for all channels */ +static int gpi_pause(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + + /* + * pause/resume are per gpii not per channel, so + * client needs to call pause only once + */ + if (gpii->pm_state == PAUSE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already paused\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + /* send stop command to stop the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_stop_chan(&gpii->gchan[i]); + if (ret) { + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + disable_irq(gpii->irq); + + /* Wait for threads to complete out */ + tasklet_kill(&gpii->ev_task); + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PAUSE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +/* resume dma transfer */ +static int gpi_resume(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + if (gpii->pm_state == ACTIVE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already active\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + enable_irq(gpii->irq); + + /* send start command to start the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_send_cmd(gpii, &gpii->gchan[i], GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error starting chan, ret:%d\n", ret); + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +static void gpi_desc_free(struct virt_dma_desc *vd) +{ + struct gpi_desc *gpi_desc = 
to_gpi_desc(vd); + + kfree(gpi_desc); + gpi_desc = NULL; +} + +static int +gpi_peripheral_config(struct dma_chan *chan, struct dma_slave_config *config) +{ + struct gchan *gchan = to_gchan(chan); + + if (!config->peripheral_config) + return -EINVAL; + + gchan->config = krealloc(gchan->config, config->peripheral_size, GFP_NOWAIT); + if (!gchan->config) + return -ENOMEM; + + memcpy(gchan->config, config->peripheral_config, config->peripheral_size); + + return 0; +} + +static int gpi_create_i2c_tre(struct gchan *chan, struct gpi_desc *desc, + struct scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_i2c_config *i2c = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (i2c->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(i2c->low_count, TRE_I2C_C0_TLOW); + tre->dword[0] |= u32_encode_bits(i2c->high_count, TRE_I2C_C0_THIGH); + tre->dword[0] |= u32_encode_bits(i2c->cycle_count, TRE_I2C_C0_TCYL); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(i2c->clk_div, TRE_C0_CLK_DIV); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (i2c->op == I2C_WRITE) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + if (i2c->multi_msg) + tre->dword[0] = u32_encode_bits(I2C_READ, TRE_I2C_GO_CMD); + else + tre->dword[0] = u32_encode_bits(i2c->op, TRE_I2C_GO_CMD); + + tre->dword[0] |= u32_encode_bits(i2c->addr, TRE_I2C_GO_ADDR); + tre->dword[0] |= u32_encode_bits(i2c->stretch, TRE_I2C_GO_STRETCH); + + tre->dword[1] = 0; + tre->dword[2] = u32_encode_bits(i2c->rx_len, TRE_RX_LEN); + + tre->dword[3] = 
u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + + if (i2c->multi_msg) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_LINK); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + if (i2c->op == I2C_READ || i2c->multi_msg == false) { + /* create the DMA TRE */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + }; + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc, + struct scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_spi_config *spi = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (direction == DMA_MEM_TO_DEV && spi->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->word_len, TRE_SPI_C0_WORD_SZ); + tre->dword[0] |= u32_encode_bits(spi->loopback_en, TRE_SPI_C0_LOOPBACK); + tre->dword[0] |= u32_encode_bits(spi->clock_pol_high, TRE_SPI_C0_CPOL); + tre->dword[0] |= u32_encode_bits(spi->data_pol_high, TRE_SPI_C0_CPHA); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->clk_div, TRE_C0_CLK_DIV); + tre->dword[2] |= u32_encode_bits(spi->clk_src, TRE_C0_CLK_SRC); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= 
u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (direction == DMA_MEM_TO_DEV) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->fragmentation, TRE_SPI_GO_FRAG); + tre->dword[0] |= u32_encode_bits(spi->cs, TRE_SPI_GO_CS); + tre->dword[0] |= u32_encode_bits(spi->cmd, TRE_SPI_GO_CMD); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->rx_len, TRE_RX_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + if (spi->cmd == SPI_RX) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOB); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the dma tre */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + if (direction == DMA_MEM_TO_DEV) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +/* copy tre into transfer ring */ +static struct dma_async_tx_descriptor * +gpi_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + struct device *dev = gpii->gpi_dev->dev; + struct gpi_ring *ch_ring = &gchan->ch_ring; + struct gpi_desc *gpi_desc; + u32 nr, nr_tre = 0; + u8 set_config; + int i; + + gpii->ieob_set = false; + if (!is_slave_direction(direction)) { + dev_err(gpii->gpi_dev->dev, "invalid dma direction: %d\n", direction); + return NULL; + } + + if (sg_len > 1) { + dev_err(dev, "Multi sg sent, we support only one atm: %d\n", 
sg_len); + return NULL; + } + + nr_tre = 3; + set_config = *(u32 *)gchan->config; + if (!set_config) + nr_tre = 2; + if (direction == DMA_DEV_TO_MEM) /* rx */ + nr_tre = 1; + + /* calculate # of elements required & available */ + nr = gpi_ring_num_elements_avail(ch_ring); + if (nr < nr_tre) { + dev_err(dev, "not enough space in ring, avail:%u required:%u\n", nr, nr_tre); + return NULL; + } + + gpi_desc = kzalloc(sizeof(*gpi_desc), GFP_NOWAIT); + if (!gpi_desc) + return NULL; + + /* create TREs for xfer */ + if (gchan->protocol == QCOM_GPI_SPI) { + i = gpi_create_spi_tre(gchan, gpi_desc, sgl, direction); + } else if (gchan->protocol == QCOM_GPI_I2C) { + i = gpi_create_i2c_tre(gchan, gpi_desc, sgl, direction); + } else { + dev_err(dev, "invalid peripheral: %d\n", gchan->protocol); + kfree(gpi_desc); + return NULL; + } + + /* set up the descriptor */ + gpi_desc->gchan = gchan; + gpi_desc->len = sg_dma_len(sgl); + gpi_desc->num_tre = i; + + return vchan_tx_prep(&gchan->vc, &gpi_desc->vd, flags); +} + +/* rings transfer ring db to being transfer */ +static void gpi_issue_pending(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + unsigned long flags, pm_lock_flags; + struct virt_dma_desc *vd = NULL; + struct gpi_desc *gpi_desc; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre, *wp = NULL; + int i; + + read_lock_irqsave(&gpii->pm_lock, pm_lock_flags); + + /* move all submitted discriptors to issued list */ + spin_lock_irqsave(&gchan->vc.lock, flags); + if (vchan_issue_pending(&gchan->vc)) + vd = list_last_entry(&gchan->vc.desc_issued, + struct virt_dma_desc, node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* nothing to do list is empty */ + if (!vd) { + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); + return; + } + + gpi_desc = to_gpi_desc(vd); + for (i = 0; i < gpi_desc->num_tre; i++) { + tre = &gpi_desc->tre[i]; + gpi_queue_xfer(gpii, gchan, tre, &wp); + } + + gpi_desc->db = ch_ring->wp; + 
gpi_write_ch_db(gchan, &gchan->ch_ring, gpi_desc->db); + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); +} + +static int gpi_ch_init(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + const int ev_factor = gpii->gpi_dev->ev_factor; + u32 elements; + int i = 0, ret = 0; + + gchan->pm_state = CONFIG_STATE; + + /* check if both channels are configured before continue */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].pm_state != CONFIG_STATE) + goto exit_gpi_init; + + /* protocol must be same for both channels */ + if (gpii->gchan[0].protocol != gpii->gchan[1].protocol) { + dev_err(gpii->gpi_dev->dev, "protocol did not match protocol %u != %u\n", + gpii->gchan[0].protocol, gpii->gchan[1].protocol); + ret = -EINVAL; + goto exit_gpi_init; + } + + /* allocate memory for event ring */ + elements = CHAN_TRES << ev_factor; + ret = gpi_alloc_ring(&gpii->ev_ring, elements, + sizeof(union gpi_event), gpii); + if (ret) + goto exit_gpi_init; + + /* configure interrupts */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_HARDWARE; + write_unlock_irq(&gpii->pm_lock); + ret = gpi_config_interrupts(gpii, DEFAULT_IRQ_SETTINGS, 0); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error config. 
interrupts, ret:%d\n", ret); + goto error_config_int; + } + + /* allocate event rings */ + ret = gpi_alloc_ev_chan(gpii); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error alloc_ev_chan:%d\n", ret); + goto error_alloc_ev_ring; + } + + /* Allocate all channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_alloc_chan(&gpii->gchan[i], true); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error allocating chan:%d\n", ret); + goto error_alloc_chan; + } + } + + /* start channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_start_chan(&gpii->gchan[i]); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error start chan:%d\n", ret); + goto error_start_chan; + } + } + return ret; + +error_start_chan: + for (i = i - 1; i >= 0; i++) { + gpi_stop_chan(&gpii->gchan[i]); + gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); + } + i = 2; +error_alloc_chan: + for (i = i - 1; i >= 0; i--) + gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC); +error_alloc_ev_ring: + gpi_disable_interrupts(gpii); +error_config_int: + gpi_free_ring(&gpii->ev_ring, gpii); +exit_gpi_init: + mutex_unlock(&gpii->ctrl_lock); + return ret; +} + +/* release all channel resources */ +static void gpi_free_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + enum gpi_pm_state cur_state; + int ret, i; + + mutex_lock(&gpii->ctrl_lock); + + cur_state = gchan->pm_state; + + /* disable ch state so no more TRE processing for this channel */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* attempt to do graceful hardware shutdown */ + if (cur_state == ACTIVE_STATE) { + gpi_stop_chan(gchan); + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); + if (ret) + dev_err(gpii->gpi_dev->dev, "error resetting channel:%d\n", ret); + + gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC); + } + + /* free all allocated memory */ + gpi_free_ring(&gchan->ch_ring, gpii); + 
vchan_free_chan_resources(&gchan->vc); + kfree(gchan->config); + + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + + /* if other rings are still active exit */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].ch_ring.configured) + goto exit_free; + + /* deallocate EV Ring */ + cur_state = gpii->pm_state; + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* wait for threads to complete out */ + tasklet_kill(&gpii->ev_task); + + /* send command to de allocate event ring */ + if (cur_state == ACTIVE_STATE) + gpi_send_cmd(gpii, NULL, GPI_EV_CMD_DEALLOC); + + gpi_free_ring(&gpii->ev_ring, gpii); + + /* disable interrupts */ + if (cur_state == ACTIVE_STATE) + gpi_disable_interrupts(gpii); + + /* set final state to disable */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + +exit_free: + mutex_unlock(&gpii->ctrl_lock); +} + +/* allocate channel resources */ +static int gpi_alloc_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int ret; + + mutex_lock(&gpii->ctrl_lock); + + /* allocate memory for transfer ring */ + ret = gpi_alloc_ring(&gchan->ch_ring, CHAN_TRES, + sizeof(struct gpi_tre), gpii); + if (ret) + goto xfer_alloc_err; + + ret = gpi_ch_init(gchan); + + mutex_unlock(&gpii->ctrl_lock); + + return ret; +xfer_alloc_err: + mutex_unlock(&gpii->ctrl_lock); + + return ret; +} + +static int gpi_find_avail_gpii(struct gpi_dev *gpi_dev, u32 seid) +{ + struct gchan *tx_chan, *rx_chan; + unsigned int gpii; + + /* check if same seid is already configured for another chid */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + if 
(rx_chan->vc.chan.client_count && rx_chan->seid == seid) + return gpii; + if (tx_chan->vc.chan.client_count && tx_chan->seid == seid) + return gpii; + } + + /* no channels configured with same seid, return next avail gpii */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + /* check if gpii is configured */ + if (tx_chan->vc.chan.client_count || + rx_chan->vc.chan.client_count) + continue; + + /* found a free gpii */ + return gpii; + } + + /* no gpii instance available to use */ + return -EIO; +} + +/* gpi_of_dma_xlate: open client requested channel; cells are chid, seid, protocol */ +static struct dma_chan *gpi_of_dma_xlate(struct of_phandle_args *args, + struct of_dma *of_dma) +{ + struct gpi_dev *gpi_dev = (struct gpi_dev *)of_dma->of_dma_data; + u32 seid, chid; + int gpii; + struct gchan *gchan; + + if (args->args_count < 3) { + dev_err(gpi_dev->dev, "gpii require minimum 3 args, client passed:%d args\n", + args->args_count); + return NULL; + } + + chid = args->args[0]; + if (chid >= MAX_CHANNELS_PER_GPII) { + dev_err(gpi_dev->dev, "gpii channel:%u not valid\n", chid); + return NULL; + } + + seid = args->args[1]; + + /* find next available gpii to use */ + gpii = gpi_find_avail_gpii(gpi_dev, seid); + if (gpii < 0) { + dev_err(gpi_dev->dev, "no available gpii instances\n"); + return NULL; + } + + gchan = &gpi_dev->gpiis[gpii].gchan[chid]; + if (gchan->vc.chan.client_count) { + dev_err(gpi_dev->dev, "gpii:%d chid:%u seid:%d already configured\n", + gpii, chid, gchan->seid); + return NULL; + } + + gchan->seid = seid; + gchan->protocol = args->args[2]; + + return dma_get_slave_channel(&gchan->vc.chan); +} + +static int gpi_probe(struct platform_device *pdev) +{ + struct gpi_dev *gpi_dev; + unsigned int i; + int ret; + + gpi_dev = devm_kzalloc(&pdev->dev, sizeof(*gpi_dev), GFP_KERNEL); + if (!gpi_dev) + return -ENOMEM; + + 
gpi_dev->dev = &pdev->dev; + gpi_dev->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + gpi_dev->regs = devm_ioremap_resource(gpi_dev->dev, gpi_dev->res); + if (IS_ERR(gpi_dev->regs)) + return PTR_ERR(gpi_dev->regs); + gpi_dev->ee_base = gpi_dev->regs; + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channels", + &gpi_dev->max_gpii); + if (ret) { + dev_err(gpi_dev->dev, "missing 'dma-channels' DT node\n"); + return ret; + } + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channel-mask", + &gpi_dev->gpii_mask); + if (ret) { + dev_err(gpi_dev->dev, "missing 'dma-channel-mask' DT node\n"); + return ret; + } + + gpi_dev->ev_factor = EV_FACTOR; + + ret = dma_set_mask(gpi_dev->dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(gpi_dev->dev, "Error setting dma_mask to 64, ret:%d\n", ret); + return ret; + } + + gpi_dev->gpiis = devm_kcalloc(gpi_dev->dev, gpi_dev->max_gpii, + sizeof(*gpi_dev->gpiis), GFP_KERNEL); + if (!gpi_dev->gpiis) + return -ENOMEM; + + /* setup all the supported gpii */ + INIT_LIST_HEAD(&gpi_dev->dma_device.channels); + for (i = 0; i < gpi_dev->max_gpii; i++) { + struct gpii *gpii = &gpi_dev->gpiis[i]; + int chan; + + if (!((1 << i) & gpi_dev->gpii_mask)) + continue; + + /* set up ev cntxt register map */ + gpii->ev_cntxt_base_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_CNTXT_0_OFFS(i, 0); + gpii->ev_cntxt_db_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_DOORBELL_0_OFFS(i, 0); + gpii->ev_ring_rp_lsb_reg = gpii->ev_cntxt_base_reg + CNTXT_4_RING_RP_LSB; + gpii->ev_cmd_reg = gpi_dev->ee_base + GPII_n_EV_CH_CMD_OFFS(i); + gpii->ieob_clr_reg = gpi_dev->ee_base + GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(i); + + /* set up irq */ + ret = platform_get_irq(pdev, i); + if (ret < 0) { + dev_err(gpi_dev->dev, "platform_get_irq failed for %u:%d\n", i, ret); + return ret; + } + gpii->irq = ret; + + /* set up channel specific register info */ + for (chan = 0; chan < MAX_CHANNELS_PER_GPII; chan++) { + struct gchan *gchan = &gpii->gchan[chan]; + + /* set up ch cntxt 
register map */ + gchan->ch_cntxt_base_reg = gpi_dev->ee_base + + GPII_n_CH_k_CNTXT_0_OFFS(i, chan); + gchan->ch_cntxt_db_reg = gpi_dev->ee_base + + GPII_n_CH_k_DOORBELL_0_OFFS(i, chan); + gchan->ch_cmd_reg = gpi_dev->ee_base + GPII_n_CH_CMD_OFFS(i); + + /* vchan setup */ + vchan_init(&gchan->vc, &gpi_dev->dma_device); + gchan->vc.desc_free = gpi_desc_free; + gchan->chid = chan; + gchan->gpii = gpii; + gchan->dir = GPII_CHAN_DIR[chan]; + } + mutex_init(&gpii->ctrl_lock); + rwlock_init(&gpii->pm_lock); + tasklet_init(&gpii->ev_task, gpi_ev_tasklet, + (unsigned long)gpii); + init_completion(&gpii->cmd_completion); + gpii->gpii_id = i; + gpii->regs = gpi_dev->ee_base; + gpii->gpi_dev = gpi_dev; + } + + platform_set_drvdata(pdev, gpi_dev); + + /* clear and Set capabilities */ + dma_cap_zero(gpi_dev->dma_device.cap_mask); + dma_cap_set(DMA_SLAVE, gpi_dev->dma_device.cap_mask); + + /* configure dmaengine apis */ + gpi_dev->dma_device.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + gpi_dev->dma_device.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + gpi_dev->dma_device.src_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.dst_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.device_alloc_chan_resources = gpi_alloc_chan_resources; + gpi_dev->dma_device.device_free_chan_resources = gpi_free_chan_resources; + gpi_dev->dma_device.device_tx_status = dma_cookie_status; + gpi_dev->dma_device.device_issue_pending = gpi_issue_pending; + gpi_dev->dma_device.device_prep_slave_sg = gpi_prep_slave_sg; + gpi_dev->dma_device.device_config = gpi_peripheral_config; + gpi_dev->dma_device.device_terminate_all = gpi_terminate_all; + gpi_dev->dma_device.dev = gpi_dev->dev; + gpi_dev->dma_device.device_pause = gpi_pause; + gpi_dev->dma_device.device_resume = gpi_resume; + + /* register with dmaengine framework */ + ret = dma_async_device_register(&gpi_dev->dma_device); + if (ret) { + dev_err(gpi_dev->dev, "async_device_register failed ret:%d", 
ret); + return ret; + } + + ret = of_dma_controller_register(gpi_dev->dev->of_node, + gpi_of_dma_xlate, gpi_dev); + if (ret) { + dev_err(gpi_dev->dev, "of_dma_controller_reg failed ret:%d", ret); + return ret; + } + + return ret; +} + +static const struct of_device_id gpi_of_match[] = { + { .compatible = "qcom,sdm845-gpi-dma" }, + { }, +}; +MODULE_DEVICE_TABLE(of, gpi_of_match); + +static struct platform_driver gpi_driver = { + .probe = gpi_probe, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = gpi_of_match, + }, +}; + +static int __init gpi_init(void) +{ + return platform_driver_register(&gpi_driver); +} +subsys_initcall(gpi_init) + +MODULE_DESCRIPTION("QCOM GPI DMA engine driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/qcom/qcom_adm.c b/drivers/dma/qcom/qcom_adm.c new file mode 100644 index 000000000000..ee78bed8d60d --- /dev/null +++ b/drivers/dma/qcom/qcom_adm.c @@ -0,0 +1,905 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2013-2015, The Linux Foundation. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../dmaengine.h" +#include "../virt-dma.h" + +/* ADM registers - calculated from channel number and security domain */ +#define ADM_CHAN_MULTI 0x4 +#define ADM_CI_MULTI 0x4 +#define ADM_CRCI_MULTI 0x4 +#define ADM_EE_MULTI 0x800 +#define ADM_CHAN_OFFS(chan) (ADM_CHAN_MULTI * (chan)) +#define ADM_EE_OFFS(ee) (ADM_EE_MULTI * (ee)) +#define ADM_CHAN_EE_OFFS(chan, ee) (ADM_CHAN_OFFS(chan) + ADM_EE_OFFS(ee)) +#define ADM_CHAN_OFFS(chan) (ADM_CHAN_MULTI * (chan)) +#define ADM_CI_OFFS(ci) (ADM_CHAN_OFF(ci)) +#define ADM_CH_CMD_PTR(chan, ee) (ADM_CHAN_EE_OFFS(chan, ee)) +#define ADM_CH_RSLT(chan, ee) (0x40 + ADM_CHAN_EE_OFFS(chan, ee)) +#define ADM_CH_FLUSH_STATE0(chan, ee) (0x80 + ADM_CHAN_EE_OFFS(chan, ee)) +#define ADM_CH_STATUS_SD(chan, ee) (0x200 + ADM_CHAN_EE_OFFS(chan, ee)) +#define ADM_CH_CONF(chan) (0x240 + ADM_CHAN_OFFS(chan)) +#define ADM_CH_RSLT_CONF(chan, ee) (0x300 + ADM_CHAN_EE_OFFS(chan, ee)) +#define ADM_SEC_DOMAIN_IRQ_STATUS(ee) (0x380 + ADM_EE_OFFS(ee)) +#define ADM_CI_CONF(ci) (0x390 + (ci) * ADM_CI_MULTI) +#define ADM_GP_CTL 0x3d8 +#define ADM_CRCI_CTL(crci, ee) (0x400 + (crci) * ADM_CRCI_MULTI + \ + ADM_EE_OFFS(ee)) + +/* channel status */ +#define ADM_CH_STATUS_VALID BIT(1) + +/* channel result */ +#define ADM_CH_RSLT_VALID BIT(31) +#define ADM_CH_RSLT_ERR BIT(3) +#define ADM_CH_RSLT_FLUSH BIT(2) +#define ADM_CH_RSLT_TPD BIT(1) + +/* channel conf */ +#define ADM_CH_CONF_SHADOW_EN BIT(12) +#define ADM_CH_CONF_MPU_DISABLE BIT(11) +#define ADM_CH_CONF_PERM_MPU_CONF BIT(9) +#define ADM_CH_CONF_FORCE_RSLT_EN BIT(7) +#define ADM_CH_CONF_SEC_DOMAIN(ee) ((((ee) & 0x3) << 4) | (((ee) & 0x4) << 11)) + +/* channel result conf */ +#define ADM_CH_RSLT_CONF_FLUSH_EN BIT(1) +#define ADM_CH_RSLT_CONF_IRQ_EN BIT(0) + +/* CRCI CTL */ +#define ADM_CRCI_CTL_MUX_SEL BIT(18) 
+#define ADM_CRCI_CTL_RST BIT(17) + +/* CI configuration */ +#define ADM_CI_RANGE_END(x) ((x) << 24) +#define ADM_CI_RANGE_START(x) ((x) << 16) +#define ADM_CI_BURST_4_WORDS BIT(2) +#define ADM_CI_BURST_8_WORDS BIT(3) + +/* GP CTL */ +#define ADM_GP_CTL_LP_EN BIT(12) +#define ADM_GP_CTL_LP_CNT(x) ((x) << 8) + +/* Command pointer list entry */ +#define ADM_CPLE_LP BIT(31) +#define ADM_CPLE_CMD_PTR_LIST BIT(29) + +/* Command list entry */ +#define ADM_CMD_LC BIT(31) +#define ADM_CMD_DST_CRCI(n) (((n) & 0xf) << 7) +#define ADM_CMD_SRC_CRCI(n) (((n) & 0xf) << 3) + +#define ADM_CMD_TYPE_SINGLE 0x0 +#define ADM_CMD_TYPE_BOX 0x3 + +#define ADM_CRCI_MUX_SEL BIT(4) +#define ADM_DESC_ALIGN 8 +#define ADM_MAX_XFER (SZ_64K - 1) +#define ADM_MAX_ROWS (SZ_64K - 1) +#define ADM_MAX_CHANNELS 16 + +struct adm_desc_hw_box { + u32 cmd; + u32 src_addr; + u32 dst_addr; + u32 row_len; + u32 num_rows; + u32 row_offset; +}; + +struct adm_desc_hw_single { + u32 cmd; + u32 src_addr; + u32 dst_addr; + u32 len; +}; + +struct adm_async_desc { + struct virt_dma_desc vd; + struct adm_device *adev; + + size_t length; + enum dma_transfer_direction dir; + dma_addr_t dma_addr; + size_t dma_len; + + void *cpl; + dma_addr_t cp_addr; + u32 crci; + u32 mux; + u32 blk_size; +}; + +struct adm_chan { + struct virt_dma_chan vc; + struct adm_device *adev; + + /* parsed from DT */ + u32 id; /* channel id */ + + struct adm_async_desc *curr_txd; + struct dma_slave_config slave; + struct list_head node; + + int error; + int initialized; +}; + +static inline struct adm_chan *to_adm_chan(struct dma_chan *common) +{ + return container_of(common, struct adm_chan, vc.chan); +} + +struct adm_device { + void __iomem *regs; + struct device *dev; + struct dma_device common; + struct device_dma_parameters dma_parms; + struct adm_chan *channels; + + u32 ee; + + struct clk *core_clk; + struct clk *iface_clk; + + struct reset_control *clk_reset; + struct reset_control *c0_reset; + struct reset_control *c1_reset; + struct 
reset_control *c2_reset; + int irq; +}; + +/** + * adm_free_chan - Frees dma resources associated with the specific channel + * + * @chan: dma channel + * + * Free all allocated descriptors associated with this channel + */ +static void adm_free_chan(struct dma_chan *chan) +{ + /* free all queued descriptors */ + vchan_free_chan_resources(to_virt_chan(chan)); +} + +/** + * adm_get_blksize - Get block size from burst value + * + * @burst: Burst size of transaction + */ +static int adm_get_blksize(unsigned int burst) +{ + int ret; + + switch (burst) { + case 16: + case 32: + case 64: + case 128: + ret = ffs(burst >> 4) - 1; + break; + case 192: + ret = 4; + break; + case 256: + ret = 5; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +/** + * adm_process_fc_descriptors - Process descriptors for flow controlled xfers + * + * @achan: ADM channel + * @desc: Descriptor memory pointer + * @sg: Scatterlist entry + * @crci: CRCI value + * @burst: Burst size of transaction + * @direction: DMA transfer direction + */ +static void *adm_process_fc_descriptors(struct adm_chan *achan, void *desc, + struct scatterlist *sg, u32 crci, + u32 burst, + enum dma_transfer_direction direction) +{ + struct adm_desc_hw_box *box_desc = NULL; + struct adm_desc_hw_single *single_desc; + u32 remainder = sg_dma_len(sg); + u32 rows, row_offset, crci_cmd; + u32 mem_addr = sg_dma_address(sg); + u32 *incr_addr = &mem_addr; + u32 *src, *dst; + + if (direction == DMA_DEV_TO_MEM) { + crci_cmd = ADM_CMD_SRC_CRCI(crci); + row_offset = burst; + src = &achan->slave.src_addr; + dst = &mem_addr; + } else { + crci_cmd = ADM_CMD_DST_CRCI(crci); + row_offset = burst << 16; + src = &mem_addr; + dst = &achan->slave.dst_addr; + } + + while (remainder >= burst) { + box_desc = desc; + box_desc->cmd = ADM_CMD_TYPE_BOX | crci_cmd; + box_desc->row_offset = row_offset; + box_desc->src_addr = *src; + box_desc->dst_addr = *dst; + + rows = remainder / burst; + rows = min_t(u32, rows, ADM_MAX_ROWS); + 
box_desc->num_rows = rows << 16 | rows; + box_desc->row_len = burst << 16 | burst; + + *incr_addr += burst * rows; + remainder -= burst * rows; + desc += sizeof(*box_desc); + } + + /* if leftover bytes, do one single descriptor */ + if (remainder) { + single_desc = desc; + single_desc->cmd = ADM_CMD_TYPE_SINGLE | crci_cmd; + single_desc->len = remainder; + single_desc->src_addr = *src; + single_desc->dst_addr = *dst; + desc += sizeof(*single_desc); + + if (sg_is_last(sg)) + single_desc->cmd |= ADM_CMD_LC; + } else { + if (box_desc && sg_is_last(sg)) + box_desc->cmd |= ADM_CMD_LC; + } + + return desc; +} + +/** + * adm_process_non_fc_descriptors - Process descriptors for non-fc xfers + * + * @achan: ADM channel + * @desc: Descriptor memory pointer + * @sg: Scatterlist entry + * @direction: DMA transfer direction + */ +static void *adm_process_non_fc_descriptors(struct adm_chan *achan, void *desc, + struct scatterlist *sg, + enum dma_transfer_direction direction) +{ + struct adm_desc_hw_single *single_desc; + u32 remainder = sg_dma_len(sg); + u32 mem_addr = sg_dma_address(sg); + u32 *incr_addr = &mem_addr; + u32 *src, *dst; + + if (direction == DMA_DEV_TO_MEM) { + src = &achan->slave.src_addr; + dst = &mem_addr; + } else { + src = &mem_addr; + dst = &achan->slave.dst_addr; + } + + do { + single_desc = desc; + single_desc->cmd = ADM_CMD_TYPE_SINGLE; + single_desc->src_addr = *src; + single_desc->dst_addr = *dst; + single_desc->len = (remainder > ADM_MAX_XFER) ? 
+ ADM_MAX_XFER : remainder; + + remainder -= single_desc->len; + *incr_addr += single_desc->len; + desc += sizeof(*single_desc); + } while (remainder); + + /* set last command if this is the end of the whole transaction */ + if (sg_is_last(sg)) + single_desc->cmd |= ADM_CMD_LC; + + return desc; +} + +/** + * adm_prep_slave_sg - Prep slave sg transaction + * + * @chan: dma channel + * @sgl: scatter gather list + * @sg_len: length of sg + * @direction: DMA transfer direction + * @flags: DMA flags + * @context: transfer context (unused) + */ +static struct dma_async_tx_descriptor *adm_prep_slave_sg(struct dma_chan *chan, + struct scatterlist *sgl, + unsigned int sg_len, + enum dma_transfer_direction direction, + unsigned long flags, + void *context) +{ + struct adm_chan *achan = to_adm_chan(chan); + struct adm_device *adev = achan->adev; + struct adm_async_desc *async_desc; + struct scatterlist *sg; + dma_addr_t cple_addr; + u32 i, burst; + u32 single_count = 0, box_count = 0, crci = 0; + void *desc; + u32 *cple; + int blk_size = 0; + + if (!is_slave_direction(direction)) { + dev_err(adev->dev, "invalid dma direction\n"); + return NULL; + } + + /* + * get burst value from slave configuration + */ + burst = (direction == DMA_MEM_TO_DEV) ? 
achan->slave.dst_maxburst : + achan->slave.src_maxburst; + + /* if using flow control, validate burst and crci values */ + if (achan->slave.device_fc) { + blk_size = adm_get_blksize(burst); + if (blk_size < 0) { + dev_err(adev->dev, "invalid burst value: %u\n", burst); + return NULL; /* prep callbacks signal failure with NULL, not ERR_PTR */ + } + + crci = achan->slave.slave_id & 0xf; + if (!crci || achan->slave.slave_id > 0x1f) { + dev_err(adev->dev, "invalid crci value\n"); + return NULL; + } + } + + /* iterate through sgs and compute allocation size of structures */ + for_each_sg(sgl, sg, sg_len, i) { + if (achan->slave.device_fc) { + box_count += DIV_ROUND_UP(sg_dma_len(sg) / burst, + ADM_MAX_ROWS); + if (sg_dma_len(sg) % burst) + single_count++; + } else { + single_count += DIV_ROUND_UP(sg_dma_len(sg), + ADM_MAX_XFER); + } + } + + async_desc = kzalloc(sizeof(*async_desc), GFP_NOWAIT); + if (!async_desc) + return NULL; + + if (crci) + async_desc->mux = achan->slave.slave_id & ADM_CRCI_MUX_SEL ? + ADM_CRCI_CTL_MUX_SEL : 0; + async_desc->crci = crci; + async_desc->blk_size = blk_size; + async_desc->dma_len = single_count * sizeof(struct adm_desc_hw_single) + + box_count * sizeof(struct adm_desc_hw_box) + + sizeof(*cple) + 2 * ADM_DESC_ALIGN; + + async_desc->cpl = kzalloc(async_desc->dma_len, GFP_NOWAIT); + if (!async_desc->cpl) + goto free; + + async_desc->adev = adev; + + /* both command list entry and descriptors must be 8 byte aligned */ + cple = PTR_ALIGN(async_desc->cpl, ADM_DESC_ALIGN); + desc = PTR_ALIGN(cple + 1, ADM_DESC_ALIGN); + + for_each_sg(sgl, sg, sg_len, i) { + async_desc->length += sg_dma_len(sg); + + if (achan->slave.device_fc) + desc = adm_process_fc_descriptors(achan, desc, sg, crci, + burst, direction); + else + desc = adm_process_non_fc_descriptors(achan, desc, sg, + direction); + } + + async_desc->dma_addr = dma_map_single(adev->dev, async_desc->cpl, + async_desc->dma_len, + DMA_TO_DEVICE); + if (dma_mapping_error(adev->dev, async_desc->dma_addr)) + goto 
free; + + cple_addr = async_desc->dma_addr + ((void *)cple - async_desc->cpl); + + /* init cmd list */ + dma_sync_single_for_cpu(adev->dev, cple_addr, sizeof(*cple), + DMA_TO_DEVICE); + *cple = ADM_CPLE_LP; + *cple |= (async_desc->dma_addr + ADM_DESC_ALIGN) >> 3; + dma_sync_single_for_device(adev->dev, cple_addr, sizeof(*cple), + DMA_TO_DEVICE); + + return vchan_tx_prep(&achan->vc, &async_desc->vd, flags); + +free: + kfree(async_desc->cpl); + kfree(async_desc); + return NULL; +} + +/** + * adm_terminate_all - terminate all transactions on a channel + * @chan: dma channel + * + * Dequeues and frees all transactions, aborts current transaction + * No callbacks are done + * + */ +static int adm_terminate_all(struct dma_chan *chan) +{ + struct adm_chan *achan = to_adm_chan(chan); + struct adm_device *adev = achan->adev; + unsigned long flags; + LIST_HEAD(head); + + spin_lock_irqsave(&achan->vc.lock, flags); + vchan_get_all_descriptors(&achan->vc, &head); + + /* send flush command to terminate current transaction */ + writel_relaxed(0x0, + adev->regs + ADM_CH_FLUSH_STATE0(achan->id, adev->ee)); + + spin_unlock_irqrestore(&achan->vc.lock, flags); + + vchan_dma_desc_free_list(&achan->vc, &head); + + return 0; +} + +static int adm_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg) +{ + struct adm_chan *achan = to_adm_chan(chan); + unsigned long flag; + + spin_lock_irqsave(&achan->vc.lock, flag); + memcpy(&achan->slave, cfg, sizeof(struct dma_slave_config)); + spin_unlock_irqrestore(&achan->vc.lock, flag); + + return 0; +} + +/** + * adm_start_dma - start next transaction + * @achan: ADM dma channel + */ +static void adm_start_dma(struct adm_chan *achan) +{ + struct virt_dma_desc *vd = vchan_next_desc(&achan->vc); + struct adm_device *adev = achan->adev; + struct adm_async_desc *async_desc; + + lockdep_assert_held(&achan->vc.lock); + + if (!vd) + return; + + list_del(&vd->node); + + /* write next command list out to the CMD FIFO */ + async_desc = container_of(vd, struct 
adm_async_desc, vd); + achan->curr_txd = async_desc; + + /* reset channel error */ + achan->error = 0; + + if (!achan->initialized) { + /* enable interrupts */ + writel(ADM_CH_CONF_SHADOW_EN | + ADM_CH_CONF_PERM_MPU_CONF | + ADM_CH_CONF_MPU_DISABLE | + ADM_CH_CONF_SEC_DOMAIN(adev->ee), + adev->regs + ADM_CH_CONF(achan->id)); + + writel(ADM_CH_RSLT_CONF_IRQ_EN | ADM_CH_RSLT_CONF_FLUSH_EN, + adev->regs + ADM_CH_RSLT_CONF(achan->id, adev->ee)); + + achan->initialized = 1; + } + + /* set the crci block size if this transaction requires CRCI */ + if (async_desc->crci) { + writel(async_desc->mux | async_desc->blk_size, + adev->regs + ADM_CRCI_CTL(async_desc->crci, adev->ee)); + } + + /* make sure IRQ enable doesn't get reordered */ + wmb(); + + /* write next command list out to the CMD FIFO */ + writel(ALIGN(async_desc->dma_addr, ADM_DESC_ALIGN) >> 3, + adev->regs + ADM_CH_CMD_PTR(achan->id, adev->ee)); +} + +/** + * adm_dma_irq - irq handler for ADM controller + * @irq: IRQ of interrupt + * @data: callback data + * + * IRQ handler for the bam controller + */ +static irqreturn_t adm_dma_irq(int irq, void *data) +{ + struct adm_device *adev = data; + u32 srcs, i; + struct adm_async_desc *async_desc; + unsigned long flags; + + srcs = readl_relaxed(adev->regs + + ADM_SEC_DOMAIN_IRQ_STATUS(adev->ee)); + + for (i = 0; i < ADM_MAX_CHANNELS; i++) { + struct adm_chan *achan = &adev->channels[i]; + u32 status, result; + + if (srcs & BIT(i)) { + status = readl_relaxed(adev->regs + + ADM_CH_STATUS_SD(i, adev->ee)); + + /* if no result present, skip */ + if (!(status & ADM_CH_STATUS_VALID)) + continue; + + result = readl_relaxed(adev->regs + + ADM_CH_RSLT(i, adev->ee)); + + /* no valid results, skip */ + if (!(result & ADM_CH_RSLT_VALID)) + continue; + + /* flag error if transaction was flushed or failed */ + if (result & (ADM_CH_RSLT_ERR | ADM_CH_RSLT_FLUSH)) + achan->error = 1; + + spin_lock_irqsave(&achan->vc.lock, flags); + async_desc = achan->curr_txd; + + achan->curr_txd = 
NULL; + + if (async_desc) { + vchan_cookie_complete(&async_desc->vd); + + /* kick off next DMA */ + adm_start_dma(achan); + } + + spin_unlock_irqrestore(&achan->vc.lock, flags); + } + } + + return IRQ_HANDLED; +} + +/** + * adm_tx_status - returns status of transaction + * @chan: dma channel + * @cookie: transaction cookie + * @txstate: DMA transaction state + * + * Return status of dma transaction + */ +static enum dma_status adm_tx_status(struct dma_chan *chan, dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct adm_chan *achan = to_adm_chan(chan); + struct virt_dma_desc *vd; + enum dma_status ret; + unsigned long flags; + size_t residue = 0; + + ret = dma_cookie_status(chan, cookie, txstate); + if (ret == DMA_COMPLETE || !txstate) + return ret; + + spin_lock_irqsave(&achan->vc.lock, flags); + + vd = vchan_find_desc(&achan->vc, cookie); + if (vd) + residue = container_of(vd, struct adm_async_desc, vd)->length; + + spin_unlock_irqrestore(&achan->vc.lock, flags); + + /* + * residue is either the full length if it is in the issued list, or 0 + * if it is in progress. 
We have no reliable way of determining + * anything inbetween + */ + dma_set_residue(txstate, residue); + + if (achan->error) + return DMA_ERROR; + + return ret; +} + +/** + * adm_issue_pending - starts pending transactions + * @chan: dma channel + * + * Issues all pending transactions and starts DMA + */ +static void adm_issue_pending(struct dma_chan *chan) +{ + struct adm_chan *achan = to_adm_chan(chan); + unsigned long flags; + + spin_lock_irqsave(&achan->vc.lock, flags); + + if (vchan_issue_pending(&achan->vc) && !achan->curr_txd) + adm_start_dma(achan); + spin_unlock_irqrestore(&achan->vc.lock, flags); +} + +/** + * adm_dma_free_desc - free descriptor memory + * @vd: virtual descriptor + * + */ +static void adm_dma_free_desc(struct virt_dma_desc *vd) +{ + struct adm_async_desc *async_desc = container_of(vd, + struct adm_async_desc, vd); + + dma_unmap_single(async_desc->adev->dev, async_desc->dma_addr, + async_desc->dma_len, DMA_TO_DEVICE); + kfree(async_desc->cpl); + kfree(async_desc); +} + +static void adm_channel_init(struct adm_device *adev, struct adm_chan *achan, + u32 index) +{ + achan->id = index; + achan->adev = adev; + + vchan_init(&achan->vc, &adev->common); + achan->vc.desc_free = adm_dma_free_desc; +} + +static int adm_dma_probe(struct platform_device *pdev) +{ + struct adm_device *adev; + int ret; + u32 i; + + adev = devm_kzalloc(&pdev->dev, sizeof(*adev), GFP_KERNEL); + if (!adev) + return -ENOMEM; + + adev->dev = &pdev->dev; + + adev->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(adev->regs)) + return PTR_ERR(adev->regs); + + adev->irq = platform_get_irq(pdev, 0); + if (adev->irq < 0) + return adev->irq; + + ret = of_property_read_u32(pdev->dev.of_node, "qcom,ee", &adev->ee); + if (ret) { + dev_err(adev->dev, "Execution environment unspecified\n"); + return ret; + } + + adev->core_clk = devm_clk_get(adev->dev, "core"); + if (IS_ERR(adev->core_clk)) + return PTR_ERR(adev->core_clk); + + adev->iface_clk = devm_clk_get(adev->dev, 
"iface"); + if (IS_ERR(adev->iface_clk)) + return PTR_ERR(adev->iface_clk); + + adev->clk_reset = devm_reset_control_get_exclusive(&pdev->dev, "clk"); + if (IS_ERR(adev->clk_reset)) { + dev_err(adev->dev, "failed to get ADM0 reset\n"); + return PTR_ERR(adev->clk_reset); + } + + adev->c0_reset = devm_reset_control_get_exclusive(&pdev->dev, "c0"); + if (IS_ERR(adev->c0_reset)) { + dev_err(adev->dev, "failed to get ADM0 C0 reset\n"); + return PTR_ERR(adev->c0_reset); + } + + adev->c1_reset = devm_reset_control_get_exclusive(&pdev->dev, "c1"); + if (IS_ERR(adev->c1_reset)) { + dev_err(adev->dev, "failed to get ADM0 C1 reset\n"); + return PTR_ERR(adev->c1_reset); + } + + adev->c2_reset = devm_reset_control_get_exclusive(&pdev->dev, "c2"); + if (IS_ERR(adev->c2_reset)) { + dev_err(adev->dev, "failed to get ADM0 C2 reset\n"); + return PTR_ERR(adev->c2_reset); + } + + ret = clk_prepare_enable(adev->core_clk); + if (ret) { + dev_err(adev->dev, "failed to prepare/enable core clock\n"); + return ret; + } + + ret = clk_prepare_enable(adev->iface_clk); + if (ret) { + dev_err(adev->dev, "failed to prepare/enable iface clock\n"); + goto err_disable_core_clk; + } + + reset_control_assert(adev->clk_reset); + reset_control_assert(adev->c0_reset); + reset_control_assert(adev->c1_reset); + reset_control_assert(adev->c2_reset); + + udelay(2); + + reset_control_deassert(adev->clk_reset); + reset_control_deassert(adev->c0_reset); + reset_control_deassert(adev->c1_reset); + reset_control_deassert(adev->c2_reset); + + adev->channels = devm_kcalloc(adev->dev, ADM_MAX_CHANNELS, + sizeof(*adev->channels), GFP_KERNEL); + + if (!adev->channels) { + ret = -ENOMEM; + goto err_disable_clks; + } + + /* allocate and initialize channels */ + INIT_LIST_HEAD(&adev->common.channels); + + for (i = 0; i < ADM_MAX_CHANNELS; i++) + adm_channel_init(adev, &adev->channels[i], i); + + /* reset CRCIs */ + for (i = 0; i < 16; i++) + writel(ADM_CRCI_CTL_RST, adev->regs + + ADM_CRCI_CTL(i, adev->ee)); + + /* 
configure client interfaces */ + writel(ADM_CI_RANGE_START(0x40) | ADM_CI_RANGE_END(0xb0) | + ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(0)); + writel(ADM_CI_RANGE_START(0x2a) | ADM_CI_RANGE_END(0x2c) | + ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(1)); + writel(ADM_CI_RANGE_START(0x12) | ADM_CI_RANGE_END(0x28) | + ADM_CI_BURST_8_WORDS, adev->regs + ADM_CI_CONF(2)); + writel(ADM_GP_CTL_LP_EN | ADM_GP_CTL_LP_CNT(0xf), + adev->regs + ADM_GP_CTL); + + ret = devm_request_irq(adev->dev, adev->irq, adm_dma_irq, + 0, "adm_dma", adev); + if (ret) + goto err_disable_clks; + + platform_set_drvdata(pdev, adev); + + adev->common.dev = adev->dev; + adev->common.dev->dma_parms = &adev->dma_parms; + + /* set capabilities */ + dma_cap_zero(adev->common.cap_mask); + dma_cap_set(DMA_SLAVE, adev->common.cap_mask); + dma_cap_set(DMA_PRIVATE, adev->common.cap_mask); + + /* initialize dmaengine apis; directions and address widths are bit masks */ + adev->common.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + adev->common.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + adev->common.src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + adev->common.dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + adev->common.device_free_chan_resources = adm_free_chan; + adev->common.device_prep_slave_sg = adm_prep_slave_sg; + adev->common.device_issue_pending = adm_issue_pending; + adev->common.device_tx_status = adm_tx_status; + adev->common.device_terminate_all = adm_terminate_all; + adev->common.device_config = adm_slave_config; + + ret = dma_async_device_register(&adev->common); + if (ret) { + dev_err(adev->dev, "failed to register dma async device\n"); + goto err_disable_clks; + } + + ret = of_dma_controller_register(pdev->dev.of_node, + of_dma_xlate_by_chan_id, + &adev->common); + if (ret) + goto err_unregister_dma; + + return 0; + +err_unregister_dma: + dma_async_device_unregister(&adev->common); +err_disable_clks: + clk_disable_unprepare(adev->iface_clk); +err_disable_core_clk: + clk_disable_unprepare(adev->core_clk); + + 
return ret; +} + +static int adm_dma_remove(struct platform_device *pdev) +{ + struct adm_device *adev = platform_get_drvdata(pdev); + struct adm_chan *achan; + u32 i; + + of_dma_controller_free(pdev->dev.of_node); + dma_async_device_unregister(&adev->common); + + for (i = 0; i < ADM_MAX_CHANNELS; i++) { + achan = &adev->channels[i]; + + /* mask IRQs for this channel/EE pair */ + writel(0, adev->regs + ADM_CH_RSLT_CONF(achan->id, adev->ee)); + + tasklet_kill(&adev->channels[i].vc.task); + adm_terminate_all(&adev->channels[i].vc.chan); + } + + devm_free_irq(adev->dev, adev->irq, adev); + + clk_disable_unprepare(adev->core_clk); + clk_disable_unprepare(adev->iface_clk); + + return 0; +} + +static const struct of_device_id adm_of_match[] = { + { .compatible = "qcom,adm", }, + {} +}; +MODULE_DEVICE_TABLE(of, adm_of_match); + +static struct platform_driver adm_dma_driver = { + .probe = adm_dma_probe, + .remove = adm_dma_remove, + .driver = { + .name = "adm-dma-engine", + .of_match_table = adm_of_match, + }, +}; + +module_platform_driver(adm_dma_driver); + +MODULE_AUTHOR("Andy Gross "); +MODULE_DESCRIPTION("QCOM ADM DMA engine driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/dma/sf-pdma/sf-pdma.c b/drivers/dma/sf-pdma/sf-pdma.c index 528deb5d9f31..c4c4e8575764 100644 --- a/drivers/dma/sf-pdma/sf-pdma.c +++ b/drivers/dma/sf-pdma/sf-pdma.c @@ -326,10 +326,9 @@ static irqreturn_t sf_pdma_done_isr(int irq, void *dev_id) { struct sf_pdma_chan *chan = dev_id; struct pdma_regs *regs = &chan->regs; - unsigned long flags; u64 residue; - spin_lock_irqsave(&chan->vchan.lock, flags); + spin_lock(&chan->vchan.lock); writel((readl(regs->ctrl)) & ~PDMA_DONE_STATUS_MASK, regs->ctrl); residue = readq(regs->residue); @@ -346,7 +345,7 @@ static irqreturn_t sf_pdma_done_isr(int irq, void *dev_id) sf_pdma_xfer_desc(chan); } - spin_unlock_irqrestore(&chan->vchan.lock, flags); + spin_unlock(&chan->vchan.lock); return IRQ_HANDLED; } @@ -355,11 +354,10 @@ static irqreturn_t 
sf_pdma_err_isr(int irq, void *dev_id) { struct sf_pdma_chan *chan = dev_id; struct pdma_regs *regs = &chan->regs; - unsigned long flags; - spin_lock_irqsave(&chan->lock, flags); + spin_lock(&chan->lock); writel((readl(regs->ctrl)) & ~PDMA_ERR_STATUS_MASK, regs->ctrl); - spin_unlock_irqrestore(&chan->lock, flags); + spin_unlock(&chan->lock); tasklet_schedule(&chan->err_tasklet); @@ -584,7 +582,7 @@ static struct platform_driver sf_pdma_driver = { .remove = sf_pdma_remove, .driver = { .name = "sf-pdma", - .of_match_table = of_match_ptr(sf_pdma_dt_ids), + .of_match_table = sf_pdma_dt_ids, }, }; diff --git a/drivers/dma/ste_dma40.c b/drivers/dma/ste_dma40.c index 77ab1f4730be..4256e55bbf25 100644 --- a/drivers/dma/ste_dma40.c +++ b/drivers/dma/ste_dma40.c @@ -1643,13 +1643,12 @@ static irqreturn_t d40_handle_interrupt(int irq, void *data) u32 row; long chan = -1; struct d40_chan *d40c; - unsigned long flags; struct d40_base *base = data; u32 *regs = base->regs_interrupt; struct d40_interrupt_lookup *il = base->gen_dmac.il; u32 il_size = base->gen_dmac.il_size; - spin_lock_irqsave(&base->interrupt_lock, flags); + spin_lock(&base->interrupt_lock); /* Read interrupt status of both logical and physical channels */ for (i = 0; i < il_size; i++) @@ -1694,7 +1693,7 @@ static irqreturn_t d40_handle_interrupt(int irq, void *data) spin_unlock(&d40c->lock); } - spin_unlock_irqrestore(&base->interrupt_lock, flags); + spin_unlock(&base->interrupt_lock); return IRQ_HANDLED; } diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c index d0055d2f0b9a..f54ecb123a52 100644 --- a/drivers/dma/stm32-dma.c +++ b/drivers/dma/stm32-dma.c @@ -264,9 +264,11 @@ static int stm32_dma_get_width(struct stm32_dma_chan *chan, } static enum dma_slave_buswidth stm32_dma_get_max_width(u32 buf_len, + dma_addr_t buf_addr, u32 threshold) { enum dma_slave_buswidth max_width; + u64 addr = buf_addr; if (threshold == STM32_DMA_FIFO_THRESHOLD_FULL) max_width = DMA_SLAVE_BUSWIDTH_4_BYTES; @@ -277,6 
+279,9 @@ static enum dma_slave_buswidth stm32_dma_get_max_width(u32 buf_len, max_width > DMA_SLAVE_BUSWIDTH_1_BYTE) max_width = max_width >> 1; + if (do_div(addr, max_width)) + max_width = DMA_SLAVE_BUSWIDTH_1_BYTE; + return max_width; } @@ -648,21 +653,12 @@ static irqreturn_t stm32_dma_chan_irq(int irq, void *devid) scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id)); sfcr = stm32_dma_read(dmadev, STM32_DMA_SFCR(chan->id)); - if (status & STM32_DMA_TCI) { - stm32_dma_irq_clear(chan, STM32_DMA_TCI); - if (scr & STM32_DMA_SCR_TCIE) - stm32_dma_handle_chan_done(chan); - status &= ~STM32_DMA_TCI; - } - if (status & STM32_DMA_HTI) { - stm32_dma_irq_clear(chan, STM32_DMA_HTI); - status &= ~STM32_DMA_HTI; - } if (status & STM32_DMA_FEI) { stm32_dma_irq_clear(chan, STM32_DMA_FEI); status &= ~STM32_DMA_FEI; if (sfcr & STM32_DMA_SFCR_FEIE) { - if (!(scr & STM32_DMA_SCR_EN)) + if (!(scr & STM32_DMA_SCR_EN) && + !(status & STM32_DMA_TCI)) dev_err(chan2dev(chan), "FIFO Error\n"); else dev_dbg(chan2dev(chan), "FIFO over/underrun\n"); @@ -674,6 +670,19 @@ static irqreturn_t stm32_dma_chan_irq(int irq, void *devid) if (sfcr & STM32_DMA_SCR_DMEIE) dev_dbg(chan2dev(chan), "Direct mode overrun\n"); } + + if (status & STM32_DMA_TCI) { + stm32_dma_irq_clear(chan, STM32_DMA_TCI); + if (scr & STM32_DMA_SCR_TCIE) + stm32_dma_handle_chan_done(chan); + status &= ~STM32_DMA_TCI; + } + + if (status & STM32_DMA_HTI) { + stm32_dma_irq_clear(chan, STM32_DMA_HTI); + status &= ~STM32_DMA_HTI; + } + if (status) { stm32_dma_irq_clear(chan, status); dev_err(chan2dev(chan), "DMA error: status=0x%08x\n", status); @@ -703,7 +712,7 @@ static void stm32_dma_issue_pending(struct dma_chan *c) static int stm32_dma_set_xfer_param(struct stm32_dma_chan *chan, enum dma_transfer_direction direction, enum dma_slave_buswidth *buswidth, - u32 buf_len) + u32 buf_len, dma_addr_t buf_addr) { enum dma_slave_buswidth src_addr_width, dst_addr_width; int src_bus_width, dst_bus_width; @@ -735,7 +744,8 @@ static int 
stm32_dma_set_xfer_param(struct stm32_dma_chan *chan, return dst_burst_size; /* Set memory data size */ - src_addr_width = stm32_dma_get_max_width(buf_len, fifoth); + src_addr_width = stm32_dma_get_max_width(buf_len, buf_addr, + fifoth); chan->mem_width = src_addr_width; src_bus_width = stm32_dma_get_width(chan, src_addr_width); if (src_bus_width < 0) @@ -784,7 +794,8 @@ static int stm32_dma_set_xfer_param(struct stm32_dma_chan *chan, return src_burst_size; /* Set memory data size */ - dst_addr_width = stm32_dma_get_max_width(buf_len, fifoth); + dst_addr_width = stm32_dma_get_max_width(buf_len, buf_addr, + fifoth); chan->mem_width = dst_addr_width; dst_bus_width = stm32_dma_get_width(chan, dst_addr_width); if (dst_bus_width < 0) @@ -872,7 +883,8 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_slave_sg( for_each_sg(sgl, sg, sg_len, i) { ret = stm32_dma_set_xfer_param(chan, direction, &buswidth, - sg_dma_len(sg)); + sg_dma_len(sg), + sg_dma_address(sg)); if (ret < 0) goto err; @@ -940,7 +952,8 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_dma_cyclic( return NULL; } - ret = stm32_dma_set_xfer_param(chan, direction, &buswidth, period_len); + ret = stm32_dma_set_xfer_param(chan, direction, &buswidth, period_len, + buf_addr); if (ret < 0) return NULL; @@ -1216,6 +1229,8 @@ static void stm32_dma_free_chan_resources(struct dma_chan *c) pm_runtime_put(dmadev->ddev.dev); vchan_free_chan_resources(to_virt_chan(c)); + stm32_dma_clear_reg(&chan->chan_reg); + chan->threshold = 0; } static void stm32_dma_desc_free(struct virt_dma_desc *vdesc) diff --git a/drivers/dma/stm32-dmamux.c b/drivers/dma/stm32-dmamux.c index a10ccd964376..ef0d0555103d 100644 --- a/drivers/dma/stm32-dmamux.c +++ b/drivers/dma/stm32-dmamux.c @@ -168,7 +168,7 @@ error_chan_id: return ERR_PTR(ret); } -static const struct of_device_id stm32_stm32dma_master_match[] = { +static const struct of_device_id stm32_stm32dma_master_match[] __maybe_unused = { { .compatible = "st,stm32-dma", }, {}, }; 
diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c index 08cfbfab837b..e4637ec786d3 100644 --- a/drivers/dma/stm32-mdma.c +++ b/drivers/dma/stm32-mdma.c @@ -339,7 +339,7 @@ static struct stm32_mdma_desc *stm32_mdma_alloc_desc( struct stm32_mdma_desc *desc; int i; - desc = kzalloc(offsetof(typeof(*desc), node[count]), GFP_NOWAIT); + desc = kzalloc(struct_size(desc, node, count), GFP_NOWAIT); if (!desc) return NULL; @@ -1346,7 +1346,7 @@ static irqreturn_t stm32_mdma_irq_handler(int irq, void *devid) { struct stm32_mdma_device *dmadev = devid; struct stm32_mdma_chan *chan = devid; - u32 reg, id, ien, status, flag; + u32 reg, id, ccr, ien, status; /* Find out which channel generates the interrupt */ status = readl_relaxed(dmadev->base + STM32_MDMA_GISR0); @@ -1368,67 +1368,71 @@ static irqreturn_t stm32_mdma_irq_handler(int irq, void *devid) chan = &dmadev->chan[id]; if (!chan) { - dev_dbg(mdma2dev(dmadev), "MDMA channel not initialized\n"); - goto exit; + dev_warn(mdma2dev(dmadev), "MDMA channel not initialized\n"); + return IRQ_NONE; } /* Handle interrupt for the channel */ spin_lock(&chan->vchan.lock); - status = stm32_mdma_read(dmadev, STM32_MDMA_CISR(chan->id)); - ien = stm32_mdma_read(dmadev, STM32_MDMA_CCR(chan->id)); - ien &= STM32_MDMA_CCR_IRQ_MASK; - ien >>= 1; + status = stm32_mdma_read(dmadev, STM32_MDMA_CISR(id)); + /* Mask Channel ReQuest Active bit which can be set in case of MEM2MEM */ + status &= ~STM32_MDMA_CISR_CRQA; + ccr = stm32_mdma_read(dmadev, STM32_MDMA_CCR(id)); + ien = (ccr & STM32_MDMA_CCR_IRQ_MASK) >> 1; if (!(status & ien)) { spin_unlock(&chan->vchan.lock); - dev_dbg(chan2dev(chan), - "spurious it (status=0x%04x, ien=0x%04x)\n", - status, ien); + dev_warn(chan2dev(chan), + "spurious it (status=0x%04x, ien=0x%04x)\n", + status, ien); return IRQ_NONE; } - flag = __ffs(status & ien); - reg = STM32_MDMA_CIFCR(chan->id); + reg = STM32_MDMA_CIFCR(id); - switch (1 << flag) { - case STM32_MDMA_CISR_TEIF: - id = chan->id; - status = 
readl_relaxed(dmadev->base + STM32_MDMA_CESR(id)); - dev_err(chan2dev(chan), "Transfer Err: stat=0x%08x\n", status); + if (status & STM32_MDMA_CISR_TEIF) { + dev_err(chan2dev(chan), "Transfer Err: stat=0x%08x\n", + readl_relaxed(dmadev->base + STM32_MDMA_CESR(id))); stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CTEIF); - break; + status &= ~STM32_MDMA_CISR_TEIF; + } - case STM32_MDMA_CISR_CTCIF: + if (status & STM32_MDMA_CISR_CTCIF) { stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CCTCIF); + status &= ~STM32_MDMA_CISR_CTCIF; stm32_mdma_xfer_end(chan); - break; + } - case STM32_MDMA_CISR_BRTIF: + if (status & STM32_MDMA_CISR_BRTIF) { stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CBRTIF); - break; + status &= ~STM32_MDMA_CISR_BRTIF; + } - case STM32_MDMA_CISR_BTIF: + if (status & STM32_MDMA_CISR_BTIF) { stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CBTIF); + status &= ~STM32_MDMA_CISR_BTIF; chan->curr_hwdesc++; if (chan->desc && chan->desc->cyclic) { if (chan->curr_hwdesc == chan->desc->count) chan->curr_hwdesc = 0; vchan_cyclic_callback(&chan->desc->vdesc); } - break; + } - case STM32_MDMA_CISR_TCIF: + if (status & STM32_MDMA_CISR_TCIF) { stm32_mdma_set_bits(dmadev, reg, STM32_MDMA_CIFCR_CLTCIF); - break; + status &= ~STM32_MDMA_CISR_TCIF; + } - default: - dev_err(chan2dev(chan), "it %d unhandled (status=0x%04x)\n", - 1 << flag, status); + if (status) { + stm32_mdma_set_bits(dmadev, reg, status); + dev_err(chan2dev(chan), "DMA error: status=0x%08x\n", status); + if (!(ccr & STM32_MDMA_CCR_EN)) + dev_err(chan2dev(chan), "chan disabled by HW\n"); } spin_unlock(&chan->vchan.lock); -exit: return IRQ_HANDLED; } diff --git a/drivers/dma/sun6i-dma.c b/drivers/dma/sun6i-dma.c index f5f9c86c50bc..5cadd4d2b824 100644 --- a/drivers/dma/sun6i-dma.c +++ b/drivers/dma/sun6i-dma.c @@ -1173,6 +1173,30 @@ static struct sun6i_dma_config sun50i_a64_dma_cfg = { BIT(DMA_SLAVE_BUSWIDTH_8_BYTES), }; +/* + * TODO: Add support for more than 4g physical addressing. 
+ * + * The A100 binding uses the number of dma channels from the + * device tree node. + */ +static struct sun6i_dma_config sun50i_a100_dma_cfg = { + .clock_autogate_enable = sun6i_enable_clock_autogate_h3, + .set_burst_length = sun6i_set_burst_length_h3, + .set_drq = sun6i_set_drq_h6, + .set_mode = sun6i_set_mode_h6, + .src_burst_lengths = BIT(1) | BIT(4) | BIT(8) | BIT(16), + .dst_burst_lengths = BIT(1) | BIT(4) | BIT(8) | BIT(16), + .src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | + BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | + BIT(DMA_SLAVE_BUSWIDTH_8_BYTES), + .dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_1_BYTE) | + BIT(DMA_SLAVE_BUSWIDTH_2_BYTES) | + BIT(DMA_SLAVE_BUSWIDTH_4_BYTES) | + BIT(DMA_SLAVE_BUSWIDTH_8_BYTES), + .has_mbus_clk = true, +}; + /* * The H6 binding uses the number of dma channels from the * device tree node. @@ -1225,6 +1249,7 @@ static const struct of_device_id sun6i_dma_match[] = { { .compatible = "allwinner,sun8i-h3-dma", .data = &sun8i_h3_dma_cfg }, { .compatible = "allwinner,sun8i-v3s-dma", .data = &sun8i_v3s_dma_cfg }, { .compatible = "allwinner,sun50i-a64-dma", .data = &sun50i_a64_dma_cfg }, + { .compatible = "allwinner,sun50i-a100-dma", .data = &sun50i_a100_dma_cfg }, { .compatible = "allwinner,sun50i-h6-dma", .data = &sun50i_h6_dma_cfg }, { /* sentinel */ } }; diff --git a/drivers/dma/tegra210-adma.c b/drivers/dma/tegra210-adma.c index c5fa2ef74abc..4735742e826d 100644 --- a/drivers/dma/tegra210-adma.c +++ b/drivers/dma/tegra210-adma.c @@ -408,19 +408,18 @@ static irqreturn_t tegra_adma_isr(int irq, void *dev_id) { struct tegra_adma_chan *tdc = dev_id; unsigned long status; - unsigned long flags; - spin_lock_irqsave(&tdc->vc.lock, flags); + spin_lock(&tdc->vc.lock); status = tegra_adma_irq_clear(tdc); if (status == 0 || !tdc->desc) { - spin_unlock_irqrestore(&tdc->vc.lock, flags); + spin_unlock(&tdc->vc.lock); return IRQ_NONE; } vchan_cyclic_callback(&tdc->desc->vd); - 
spin_unlock_irqrestore(&tdc->vc.lock, flags); + spin_unlock(&tdc->vc.lock); return IRQ_HANDLED; } diff --git a/drivers/dma/ti/Makefile b/drivers/dma/ti/Makefile index 0c67254caee6..bd496efadff7 100644 --- a/drivers/dma/ti/Makefile +++ b/drivers/dma/ti/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_TI_K3_UDMA_GLUE_LAYER) += k3-udma-glue.o obj-$(CONFIG_TI_K3_PSIL) += k3-psil.o \ k3-psil-am654.o \ k3-psil-j721e.o \ - k3-psil-j7200.o + k3-psil-j7200.o \ + k3-psil-am64.o obj-$(CONFIG_TI_DMA_CROSSBAR) += dma-crossbar.o diff --git a/drivers/dma/ti/dma-crossbar.c b/drivers/dma/ti/dma-crossbar.c index 4ba8fa5d9c36..71d24fc07c00 100644 --- a/drivers/dma/ti/dma-crossbar.c +++ b/drivers/dma/ti/dma-crossbar.c @@ -122,7 +122,7 @@ static void *ti_am335x_xbar_route_allocate(struct of_phandle_args *dma_spec, return map; } -static const struct of_device_id ti_am335x_master_match[] = { +static const struct of_device_id ti_am335x_master_match[] __maybe_unused = { { .compatible = "ti,edma3-tpcc", }, {}, }; @@ -292,7 +292,7 @@ static const u32 ti_dma_offset[] = { [TI_XBAR_SDMA_OFFSET] = 1, }; -static const struct of_device_id ti_dra7_master_match[] = { +static const struct of_device_id ti_dra7_master_match[] __maybe_unused = { { .compatible = "ti,omap4430-sdma", .data = &ti_dma_offset[TI_XBAR_SDMA_OFFSET], @@ -460,7 +460,7 @@ static int ti_dma_xbar_probe(struct platform_device *pdev) static struct platform_driver ti_dma_xbar_driver = { .driver = { .name = "ti-dma-crossbar", - .of_match_table = of_match_ptr(ti_dma_xbar_match), + .of_match_table = ti_dma_xbar_match, }, .probe = ti_dma_xbar_probe, }; diff --git a/drivers/dma/ti/k3-psil-am64.c b/drivers/dma/ti/k3-psil-am64.c new file mode 100644 index 000000000000..9fdeaa11a4fc --- /dev/null +++ b/drivers/dma/ti/k3-psil-am64.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com + * Author: Peter Ujfalusi + */ + +#include + +#include "k3-psil-priv.h" + +#define 
PSIL_PDMA_XY_TR(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_PDMA_XY, \ + .mapped_channel_id = -1, \ + .default_flow_id = -1, \ + }, \ + } + +#define PSIL_PDMA_XY_PKT(x) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_PDMA_XY, \ + .mapped_channel_id = -1, \ + .default_flow_id = -1, \ + .pkt_mode = 1, \ + }, \ + } + +#define PSIL_ETHERNET(x, ch, flow_base, flow_cnt) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_NATIVE, \ + .pkt_mode = 1, \ + .needs_epib = 1, \ + .psd_size = 16, \ + .mapped_channel_id = ch, \ + .flow_start = flow_base, \ + .flow_num = flow_cnt, \ + .default_flow_id = flow_base, \ + }, \ + } + +#define PSIL_SAUL(x, ch, flow_base, flow_cnt, default_flow, tx) \ + { \ + .thread_id = x, \ + .ep_config = { \ + .ep_type = PSIL_EP_NATIVE, \ + .pkt_mode = 1, \ + .needs_epib = 1, \ + .psd_size = 64, \ + .mapped_channel_id = ch, \ + .flow_start = flow_base, \ + .flow_num = flow_cnt, \ + .default_flow_id = default_flow, \ + .notdpkt = tx, \ + }, \ + } + +/* PSI-L source thread IDs, used for RX (DMA_DEV_TO_MEM) */ +static struct psil_ep am64_src_ep_map[] = { + /* SAUL */ + PSIL_SAUL(0x4000, 17, 32, 8, 32, 0), + PSIL_SAUL(0x4001, 18, 32, 8, 33, 0), + PSIL_SAUL(0x4002, 19, 40, 8, 40, 0), + PSIL_SAUL(0x4003, 20, 40, 8, 41, 0), + /* ICSS_G0 */ + PSIL_ETHERNET(0x4100, 21, 48, 16), + PSIL_ETHERNET(0x4101, 22, 64, 16), + PSIL_ETHERNET(0x4102, 23, 80, 16), + PSIL_ETHERNET(0x4103, 24, 96, 16), + /* ICSS_G1 */ + PSIL_ETHERNET(0x4200, 25, 112, 16), + PSIL_ETHERNET(0x4201, 26, 128, 16), + PSIL_ETHERNET(0x4202, 27, 144, 16), + PSIL_ETHERNET(0x4203, 28, 160, 16), + /* PDMA_MAIN0 - SPI0-3 */ + PSIL_PDMA_XY_PKT(0x4300), + PSIL_PDMA_XY_PKT(0x4301), + PSIL_PDMA_XY_PKT(0x4302), + PSIL_PDMA_XY_PKT(0x4303), + PSIL_PDMA_XY_PKT(0x4304), + PSIL_PDMA_XY_PKT(0x4305), + PSIL_PDMA_XY_PKT(0x4306), + PSIL_PDMA_XY_PKT(0x4307), + PSIL_PDMA_XY_PKT(0x4308), + PSIL_PDMA_XY_PKT(0x4309), + PSIL_PDMA_XY_PKT(0x430a), + 
PSIL_PDMA_XY_PKT(0x430b), + PSIL_PDMA_XY_PKT(0x430c), + PSIL_PDMA_XY_PKT(0x430d), + PSIL_PDMA_XY_PKT(0x430e), + PSIL_PDMA_XY_PKT(0x430f), + /* PDMA_MAIN0 - USART0-1 */ + PSIL_PDMA_XY_PKT(0x4310), + PSIL_PDMA_XY_PKT(0x4311), + /* PDMA_MAIN1 - SPI4 */ + PSIL_PDMA_XY_PKT(0x4400), + PSIL_PDMA_XY_PKT(0x4401), + PSIL_PDMA_XY_PKT(0x4402), + PSIL_PDMA_XY_PKT(0x4403), + /* PDMA_MAIN1 - USART2-6 */ + PSIL_PDMA_XY_PKT(0x4404), + PSIL_PDMA_XY_PKT(0x4405), + PSIL_PDMA_XY_PKT(0x4406), + PSIL_PDMA_XY_PKT(0x4407), + PSIL_PDMA_XY_PKT(0x4408), + /* PDMA_MAIN1 - ADCs */ + PSIL_PDMA_XY_TR(0x440f), + PSIL_PDMA_XY_TR(0x4410), + /* CPSW2 */ + PSIL_ETHERNET(0x4500, 16, 16, 16), +}; + +/* PSI-L destination thread IDs, used for TX (DMA_MEM_TO_DEV) */ +static struct psil_ep am64_dst_ep_map[] = { + /* SAUL */ + PSIL_SAUL(0xc000, 24, 80, 8, 80, 1), + PSIL_SAUL(0xc001, 25, 88, 8, 88, 1), + /* ICSS_G0 */ + PSIL_ETHERNET(0xc100, 26, 96, 1), + PSIL_ETHERNET(0xc101, 27, 97, 1), + PSIL_ETHERNET(0xc102, 28, 98, 1), + PSIL_ETHERNET(0xc103, 29, 99, 1), + PSIL_ETHERNET(0xc104, 30, 100, 1), + PSIL_ETHERNET(0xc105, 31, 101, 1), + PSIL_ETHERNET(0xc106, 32, 102, 1), + PSIL_ETHERNET(0xc107, 33, 103, 1), + /* ICSS_G1 */ + PSIL_ETHERNET(0xc200, 34, 104, 1), + PSIL_ETHERNET(0xc201, 35, 105, 1), + PSIL_ETHERNET(0xc202, 36, 106, 1), + PSIL_ETHERNET(0xc203, 37, 107, 1), + PSIL_ETHERNET(0xc204, 38, 108, 1), + PSIL_ETHERNET(0xc205, 39, 109, 1), + PSIL_ETHERNET(0xc206, 40, 110, 1), + PSIL_ETHERNET(0xc207, 41, 111, 1), + /* CPSW2 */ + PSIL_ETHERNET(0xc500, 16, 16, 8), + PSIL_ETHERNET(0xc501, 17, 24, 8), + PSIL_ETHERNET(0xc502, 18, 32, 8), + PSIL_ETHERNET(0xc503, 19, 40, 8), + PSIL_ETHERNET(0xc504, 20, 48, 8), + PSIL_ETHERNET(0xc505, 21, 56, 8), + PSIL_ETHERNET(0xc506, 22, 64, 8), + PSIL_ETHERNET(0xc507, 23, 72, 8), +}; + +struct psil_ep_map am64_ep_map = { + .name = "am64", + .src = am64_src_ep_map, + .src_count = ARRAY_SIZE(am64_src_ep_map), + .dst = am64_dst_ep_map, + .dst_count = ARRAY_SIZE(am64_dst_ep_map), +}; 
diff --git a/drivers/dma/ti/k3-psil-priv.h b/drivers/dma/ti/k3-psil-priv.h index b4b0fb359eff..b74e192e3c2d 100644 --- a/drivers/dma/ti/k3-psil-priv.h +++ b/drivers/dma/ti/k3-psil-priv.h @@ -40,5 +40,6 @@ struct psil_endpoint_config *psil_get_ep_config(u32 thread_id); extern struct psil_ep_map am654_ep_map; extern struct psil_ep_map j721e_ep_map; extern struct psil_ep_map j7200_ep_map; +extern struct psil_ep_map am64_ep_map; #endif /* K3_PSIL_PRIV_H_ */ diff --git a/drivers/dma/ti/k3-psil.c b/drivers/dma/ti/k3-psil.c index 837853aab95a..13ce7367d870 100644 --- a/drivers/dma/ti/k3-psil.c +++ b/drivers/dma/ti/k3-psil.c @@ -20,6 +20,7 @@ static const struct soc_device_attribute k3_soc_devices[] = { { .family = "AM65X", .data = &am654_ep_map }, { .family = "J721E", .data = &j721e_ep_map }, { .family = "J7200", .data = &j7200_ep_map }, + { .family = "AM64X", .data = &am64_ep_map }, { /* sentinel */ } }; diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index a367584f0d7b..4fdd9f06b723 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -22,6 +22,7 @@ struct k3_udma_glue_common { struct device *dev; + struct device chan_dev; struct udma_dev *udmax; const struct udma_tisci_rm *tisci_rm; struct k3_ringacc *ringacc; @@ -32,7 +33,8 @@ struct k3_udma_glue_common { bool epib; u32 psdata_size; u32 swdata_size; - u32 atype; + u32 atype_asel; + struct psil_endpoint_config *ep_config; }; struct k3_udma_glue_tx_channel { @@ -53,6 +55,8 @@ struct k3_udma_glue_tx_channel { bool tx_filt_einfo; bool tx_filt_pswords; bool tx_supr_tdpkt; + + int udma_tflow_id; }; struct k3_udma_glue_rx_flow { @@ -81,20 +85,26 @@ struct k3_udma_glue_rx_channel { u32 flows_ready; }; +static void k3_udma_chan_dev_release(struct device *dev) +{ + /* The struct containing the device is devm managed */ +} + +static struct class k3_udma_glue_devclass = { + .name = "k3_udma_glue_chan", + .dev_release = k3_udma_chan_dev_release, +}; + #define 
K3_UDMAX_TDOWN_TIMEOUT_US 1000 static int of_k3_udma_glue_parse(struct device_node *udmax_np, struct k3_udma_glue_common *common) { - common->ringacc = of_k3_ringacc_get_by_phandle(udmax_np, - "ti,ringacc"); - if (IS_ERR(common->ringacc)) - return PTR_ERR(common->ringacc); - common->udmax = of_xudma_dev_get(udmax_np, NULL); if (IS_ERR(common->udmax)) return PTR_ERR(common->udmax); + common->ringacc = xudma_get_ringacc(common->udmax); common->tisci_rm = xudma_dev_get_tisci_rm(common->udmax); return 0; @@ -104,7 +114,6 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, const char *name, struct k3_udma_glue_common *common, bool tx_chn) { - struct psil_endpoint_config *ep_config; struct of_phandle_args dma_spec; u32 thread_id; int ret = 0; @@ -121,15 +130,26 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, &dma_spec)) return -ENOENT; + ret = of_k3_udma_glue_parse(dma_spec.np, common); + if (ret) + goto out_put_spec; + thread_id = dma_spec.args[0]; if (dma_spec.args_count == 2) { - if (dma_spec.args[1] > 2) { + if (dma_spec.args[1] > 2 && !xudma_is_pktdma(common->udmax)) { dev_err(common->dev, "Invalid channel atype: %u\n", dma_spec.args[1]); ret = -EINVAL; goto out_put_spec; } - common->atype = dma_spec.args[1]; + if (dma_spec.args[1] > 15 && xudma_is_pktdma(common->udmax)) { + dev_err(common->dev, "Invalid channel asel: %u\n", + dma_spec.args[1]); + ret = -EINVAL; + goto out_put_spec; + } + + common->atype_asel = dma_spec.args[1]; } if (tx_chn && !(thread_id & K3_PSIL_DST_THREAD_ID_OFFSET)) { @@ -143,25 +163,23 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, } /* get psil endpoint config */ - ep_config = psil_get_ep_config(thread_id); - if (IS_ERR(ep_config)) { + common->ep_config = psil_get_ep_config(thread_id); + if (IS_ERR(common->ep_config)) { dev_err(common->dev, "No configuration for psi-l thread 0x%04x\n", thread_id); - ret = PTR_ERR(ep_config); + ret = PTR_ERR(common->ep_config); goto out_put_spec; } - 
common->epib = ep_config->needs_epib; - common->psdata_size = ep_config->psd_size; + common->epib = common->ep_config->needs_epib; + common->psdata_size = common->ep_config->psd_size; if (tx_chn) common->dst_thread = thread_id; else common->src_thread = thread_id; - ret = of_k3_udma_glue_parse(dma_spec.np, common); - out_put_spec: of_node_put(dma_spec.np); return ret; @@ -227,7 +245,7 @@ static int k3_udma_glue_cfg_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) req.tx_supr_tdpkt = 1; req.tx_fetch_size = tx_chn->common.hdesc_size >> 2; req.txcq_qnum = k3_ringacc_get_ring_id(tx_chn->ringtxcq); - req.tx_atype = tx_chn->common.atype; + req.tx_atype = tx_chn->common.atype_asel; return tisci_rm->tisci_udmap_ops->tx_ch_cfg(tisci_rm->tisci, &req); } @@ -259,8 +277,14 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, tx_chn->common.psdata_size, tx_chn->common.swdata_size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tchan_id = tx_chn->common.ep_config->mapped_channel_id; + else + tx_chn->udma_tchan_id = -1; + /* request and cfg UDMAP TX channel */ - tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, -1); + tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, + tx_chn->udma_tchan_id); if (IS_ERR(tx_chn->udma_tchanx)) { ret = PTR_ERR(tx_chn->udma_tchanx); dev_err(dev, "UDMAX tchanx get err %d\n", ret); @@ -268,11 +292,34 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, } tx_chn->udma_tchan_id = xudma_tchan_get_id(tx_chn->udma_tchanx); + tx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + tx_chn->common.chan_dev.parent = xudma_get_device(tx_chn->common.udmax); + dev_set_name(&tx_chn->common.chan_dev, "tchan%d-0x%04x", + tx_chn->udma_tchan_id, tx_chn->common.dst_thread); + ret = device_register(&tx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + tx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if 
(xudma_is_pktdma(tx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + tx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&tx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + atomic_set(&tx_chn->free_pkts, cfg->txcq_cfg.size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tflow_id = tx_chn->common.ep_config->default_flow_id; + else + tx_chn->udma_tflow_id = tx_chn->udma_tchan_id; + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(tx_chn->common.ringacc, - tx_chn->udma_tchan_id, -1, + tx_chn->udma_tflow_id, -1, &tx_chn->ringtx, &tx_chn->ringtxcq); if (ret) { @@ -280,6 +327,16 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, goto err; } + /* Set the dma_dev for the rings to be configured */ + cfg->tx_cfg.dma_dev = k3_udma_glue_tx_get_dma_device(tx_chn); + cfg->txcq_cfg.dma_dev = cfg->tx_cfg.dma_dev; + + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(tx_chn->common.udmax)) { + cfg->tx_cfg.asel = tx_chn->common.atype_asel; + cfg->txcq_cfg.asel = tx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(tx_chn->ringtx, &cfg->tx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringtx %d\n", ret); @@ -303,19 +360,6 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, goto err; } - ret = xudma_navss_psil_pair(tx_chn->common.udmax, - tx_chn->common.src_thread, - tx_chn->common.dst_thread); - if (ret) { - dev_err(dev, "PSI-L request err %d\n", ret); - goto err; - } - - tx_chn->psil_paired = true; - - /* reset TX RT registers */ - k3_udma_glue_disable_tx_chn(tx_chn); - k3_udma_glue_dump_tx_chn(tx_chn); return tx_chn; @@ -344,6 +388,11 @@ void k3_udma_glue_release_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) if (tx_chn->ringtx) k3_ringacc_ring_free(tx_chn->ringtx); + + if (tx_chn->common.chan_dev.parent) { + device_unregister(&tx_chn->common.chan_dev); + tx_chn->common.chan_dev.parent = NULL; + } } 
EXPORT_SYMBOL_GPL(k3_udma_glue_release_tx_chn); @@ -378,6 +427,18 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_pop_tx_chn); int k3_udma_glue_enable_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) { + int ret; + + ret = xudma_navss_psil_pair(tx_chn->common.udmax, + tx_chn->common.src_thread, + tx_chn->common.dst_thread); + if (ret) { + dev_err(tx_chn->common.dev, "PSI-L request err %d\n", ret); + return ret; + } + + tx_chn->psil_paired = true; + xudma_tchanrt_write(tx_chn->udma_tchanx, UDMA_CHAN_RT_PEER_RT_EN_REG, UDMA_PEER_RT_EN_ENABLE); @@ -398,6 +459,13 @@ void k3_udma_glue_disable_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) xudma_tchanrt_write(tx_chn->udma_tchanx, UDMA_CHAN_RT_PEER_RT_EN_REG, 0); k3_udma_glue_dump_tx_rt_chn(tx_chn, "txchn dis2"); + + if (tx_chn->psil_paired) { + xudma_navss_psil_unpair(tx_chn->common.udmax, + tx_chn->common.src_thread, + tx_chn->common.dst_thread); + tx_chn->psil_paired = false; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_disable_tx_chn); @@ -437,13 +505,10 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, void *data, void (*cleanup)(void *data, dma_addr_t desc_dma)) { + struct device *dev = tx_chn->common.dev; dma_addr_t desc_dma; int occ_tx, i, ret; - /* reset TXCQ as it is not input for udma - expected to be empty */ - if (tx_chn->ringtxcq) - k3_ringacc_ring_reset(tx_chn->ringtxcq); - /* * TXQ reset need to be special way as it is input for udma and its * state cached by udma, so: @@ -452,17 +517,20 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, * 3) reset TXQ in a special way */ occ_tx = k3_ringacc_ring_get_occ(tx_chn->ringtx); - dev_dbg(tx_chn->common.dev, "TX reset occ_tx %u\n", occ_tx); + dev_dbg(dev, "TX reset occ_tx %u\n", occ_tx); for (i = 0; i < occ_tx; i++) { ret = k3_ringacc_ring_pop(tx_chn->ringtx, &desc_dma); if (ret) { - dev_err(tx_chn->common.dev, "TX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "TX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } + /* 
reset TXCQ as it is not input for udma - expected to be empty */ + k3_ringacc_ring_reset(tx_chn->ringtxcq); k3_ringacc_ring_reset_dma(tx_chn->ringtx, occ_tx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_tx_chn); @@ -481,12 +549,50 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_txcq_id); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn) { - tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + if (xudma_is_pktdma(tx_chn->common.udmax)) { + tx_chn->virq = xudma_pktdma_tflow_get_irq(tx_chn->common.udmax, + tx_chn->udma_tflow_id); + } else { + tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + } return tx_chn->virq; } EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn) +{ + if (xudma_is_pktdma(tx_chn->common.udmax) && + (tx_chn->common.atype_asel == 14 || tx_chn->common.atype_asel == 15)) + return &tx_chn->common.chan_dev; + + return xudma_get_device(tx_chn->common.udmax); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device); + +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr |= (u64)tx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_dma_to_cppi5_addr); + +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_cppi5_to_dma_addr); + static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { const struct udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm; @@ -498,8 +604,6 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) req.valid_params = TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID | 
TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID; req.nav_id = tisci_rm->tisci_dev_id; @@ -511,13 +615,16 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) * req.rxcq_qnum = k3_ringacc_get_ring_id(rx_chn->flows[0].ringrx); */ req.rxcq_qnum = 0xFFFF; - if (rx_chn->flow_num && rx_chn->flow_id_base != rx_chn->udma_rchan_id) { + if (!xudma_is_pktdma(rx_chn->common.udmax) && rx_chn->flow_num && + rx_chn->flow_id_base != rx_chn->udma_rchan_id) { /* Default flow + extra ones */ + req.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | + TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID; req.flowid_start = rx_chn->flow_id_base; req.flowid_cnt = rx_chn->flow_num; } req.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR; - req.rx_atype = rx_chn->common.atype; + req.rx_atype = rx_chn->common.atype_asel; ret = tisci_rm->tisci_udmap_ops->rx_ch_cfg(tisci_rm->tisci, &req); if (ret) @@ -571,10 +678,18 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, goto err_rflow_put; } + if (xudma_is_pktdma(rx_chn->common.udmax)) { + rx_ringfdq_id = flow->udma_rflow_id + + xudma_get_rflow_ring_offset(rx_chn->common.udmax); + rx_ring_id = 0; + } else { + rx_ring_id = flow_cfg->ring_rxq_id; + rx_ringfdq_id = flow_cfg->ring_rxfdq0_id; + } + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(rx_chn->common.ringacc, - flow_cfg->ring_rxfdq0_id, - flow_cfg->ring_rxq_id, + rx_ringfdq_id, rx_ring_id, &flow->ringrxfdq, &flow->ringrx); if (ret) { @@ -582,6 +697,16 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, goto err_rflow_put; } + /* Set the dma_dev for the rings to be configured */ + flow_cfg->rx_cfg.dma_dev = k3_udma_glue_rx_get_dma_device(rx_chn); + flow_cfg->rxfdq_cfg.dma_dev = 
flow_cfg->rx_cfg.dma_dev; + + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow_cfg->rx_cfg.asel = rx_chn->common.atype_asel; + flow_cfg->rxfdq_cfg.asel = rx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(flow->ringrx, &flow_cfg->rx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringrx %d\n", ret); @@ -740,6 +865,7 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, struct k3_udma_glue_rx_channel_cfg *cfg) { struct k3_udma_glue_rx_channel *rx_chn; + struct psil_endpoint_config *ep_cfg; int ret, i; if (cfg->flow_id_num <= 0) @@ -767,8 +893,16 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, rx_chn->common.psdata_size, rx_chn->common.swdata_size); + ep_cfg = rx_chn->common.ep_config; + + if (xudma_is_pktdma(rx_chn->common.udmax)) + rx_chn->udma_rchan_id = ep_cfg->mapped_channel_id; + else + rx_chn->udma_rchan_id = -1; + /* request and cfg UDMAP RX channel */ - rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, -1); + rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, + rx_chn->udma_rchan_id); if (IS_ERR(rx_chn->udma_rchanx)) { ret = PTR_ERR(rx_chn->udma_rchanx); dev_err(dev, "UDMAX rchanx get err %d\n", ret); @@ -776,12 +910,48 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, } rx_chn->udma_rchan_id = xudma_rchan_get_id(rx_chn->udma_rchanx); - rx_chn->flow_num = cfg->flow_id_num; - rx_chn->flow_id_base = cfg->flow_id_base; + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan%d-0x%04x", + rx_chn->udma_rchan_id, rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + rx_chn->common.chan_dev.parent = NULL; + goto err; + } - /* Use RX channel id as flow id: target dev can't generate flow_id */ - if 
(cfg->flow_id_use_rxchan_id) - rx_chn->flow_id_base = rx_chn->udma_rchan_id; + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + int flow_start = cfg->flow_id_base; + int flow_end; + + if (flow_start == -1) + flow_start = ep_cfg->flow_start; + + flow_end = flow_start + cfg->flow_id_num - 1; + if (flow_start < ep_cfg->flow_start || + flow_end > (ep_cfg->flow_start + ep_cfg->flow_num - 1)) { + dev_err(dev, "Invalid flow range requested\n"); + ret = -EINVAL; + goto err; + } + rx_chn->flow_id_base = flow_start; + } else { + rx_chn->flow_id_base = cfg->flow_id_base; + + /* Use RX channel id as flow id: target dev can't generate flow_id */ + if (cfg->flow_id_use_rxchan_id) + rx_chn->flow_id_base = rx_chn->udma_rchan_id; + } + + rx_chn->flow_num = cfg->flow_id_num; rx_chn->flows = devm_kcalloc(dev, rx_chn->flow_num, sizeof(*rx_chn->flows), GFP_KERNEL); @@ -815,19 +985,6 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, goto err; } - ret = xudma_navss_psil_pair(rx_chn->common.udmax, - rx_chn->common.src_thread, - rx_chn->common.dst_thread); - if (ret) { - dev_err(dev, "PSI-L request err %d\n", ret); - goto err; - } - - rx_chn->psil_paired = true; - - /* reset RX RT registers */ - k3_udma_glue_disable_rx_chn(rx_chn); - k3_udma_glue_dump_rx_chn(rx_chn); return rx_chn; @@ -884,6 +1041,24 @@ k3_udma_glue_request_remote_rx_chn(struct device *dev, const char *name, goto err; } + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan_remote-0x%04x", + rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + 
rx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + ret = k3_udma_glue_allocate_rx_flows(rx_chn, cfg); if (ret) goto err; @@ -936,6 +1111,11 @@ void k3_udma_glue_release_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) if (!IS_ERR_OR_NULL(rx_chn->udma_rchanx)) xudma_rchan_put(rx_chn->common.udmax, rx_chn->udma_rchanx); + + if (rx_chn->common.chan_dev.parent) { + device_unregister(&rx_chn->common.chan_dev); + rx_chn->common.chan_dev.parent = NULL; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_release_rx_chn); @@ -1052,12 +1232,24 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_rx_flow_disable); int k3_udma_glue_enable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { + int ret; + if (rx_chn->remote) return -EINVAL; if (rx_chn->flows_ready < rx_chn->flow_num) return -EINVAL; + ret = xudma_navss_psil_pair(rx_chn->common.udmax, + rx_chn->common.src_thread, + rx_chn->common.dst_thread); + if (ret) { + dev_err(rx_chn->common.dev, "PSI-L request err %d\n", ret); + return ret; + } + + rx_chn->psil_paired = true; + xudma_rchanrt_write(rx_chn->udma_rchanx, UDMA_CHAN_RT_CTL_REG, UDMA_CHAN_RT_CTL_EN); @@ -1078,6 +1270,13 @@ void k3_udma_glue_disable_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) xudma_rchanrt_write(rx_chn->udma_rchanx, UDMA_CHAN_RT_CTL_REG, 0); k3_udma_glue_dump_rx_rt_chn(rx_chn, "rxrt dis2"); + + if (rx_chn->psil_paired) { + xudma_navss_psil_unpair(rx_chn->common.udmax, + rx_chn->common.src_thread, + rx_chn->common.dst_thread); + rx_chn->psil_paired = false; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_disable_rx_chn); @@ -1128,12 +1327,10 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, /* reset RXCQ as it is not input for udma - expected to be empty */ occ_rx = k3_ringacc_ring_get_occ(flow->ringrx); dev_dbg(dev, "RX reset flow %u occ_rx 
%u\n", flow_num, occ_rx); - if (flow->ringrx) - k3_ringacc_ring_reset(flow->ringrx); /* Skip RX FDQ in case one FDQ is used for the set of flows */ if (skip_fdq) - return; + goto do_reset; /* * RX FDQ reset need to be special way as it is input for udma and its @@ -1148,13 +1345,17 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, for (i = 0; i < occ_rx; i++) { ret = k3_ringacc_ring_pop(flow->ringrxfdq, &desc_dma); if (ret) { - dev_err(dev, "RX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "RX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } k3_ringacc_ring_reset_dma(flow->ringrxfdq, occ_rx); + +do_reset: + k3_ringacc_ring_reset(flow->ringrx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_rx_chn); @@ -1184,8 +1385,52 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, flow = &rx_chn->flows[flow_num]; - flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow->virq = xudma_pktdma_rflow_get_irq(rx_chn->common.udmax, + flow->udma_rflow_id); + } else { + flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + } return flow->virq; } EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq); + +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn) +{ + if (xudma_is_pktdma(rx_chn->common.udmax) && + (rx_chn->common.atype_asel == 14 || rx_chn->common.atype_asel == 15)) + return &rx_chn->common.chan_dev; + + return xudma_get_device(rx_chn->common.udmax); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device); + +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr |= (u64)rx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_dma_to_cppi5_addr); + +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if 
(!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_cppi5_to_dma_addr); + +static int __init k3_udma_glue_class_init(void) +{ + return class_register(&k3_udma_glue_devclass); +} +arch_initcall(k3_udma_glue_class_init); diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c index 8563a392f30b..aada84f40723 100644 --- a/drivers/dma/ti/k3-udma-private.c +++ b/drivers/dma/ti/k3-udma-private.c @@ -50,6 +50,18 @@ struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property) } EXPORT_SYMBOL(of_xudma_dev_get); +struct device *xudma_get_device(struct udma_dev *ud) +{ + return ud->dev; +} +EXPORT_SYMBOL(xudma_get_device); + +struct k3_ringacc *xudma_get_ringacc(struct udma_dev *ud) +{ + return ud->ringacc; +} +EXPORT_SYMBOL(xudma_get_ringacc); + u32 xudma_dev_get_psil_base(struct udma_dev *ud) { return ud->psil_base; @@ -76,6 +88,9 @@ EXPORT_SYMBOL(xudma_free_gp_rflow_range); bool xudma_rflow_is_gp(struct udma_dev *ud, int id) { + if (!ud->rflow_gp_map) + return false; + return !test_bit(id, ud->rflow_gp_map); } EXPORT_SYMBOL(xudma_rflow_is_gp); @@ -107,6 +122,12 @@ void xudma_rflow_put(struct udma_dev *ud, struct udma_rflow *p) } EXPORT_SYMBOL(xudma_rflow_put); +int xudma_get_rflow_ring_offset(struct udma_dev *ud) +{ + return ud->tflow_cnt; +} +EXPORT_SYMBOL(xudma_get_rflow_ring_offset); + #define XUDMA_GET_RESOURCE_ID(res) \ int xudma_##res##_get_id(struct udma_##res *p) \ { \ @@ -136,3 +157,27 @@ void xudma_##res##rt_write(struct udma_##res *p, int reg, u32 val) \ EXPORT_SYMBOL(xudma_##res##rt_write) XUDMA_RT_IO_FUNCTIONS(tchan); XUDMA_RT_IO_FUNCTIONS(rchan); + +int xudma_is_pktdma(struct udma_dev *ud) +{ + return ud->match_data->type == DMA_TYPE_PKTDMA; +} +EXPORT_SYMBOL(xudma_is_pktdma); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id) +{ + const struct udma_oes_offsets *oes = 
&ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_tflow_id + + oes->pktdma_tchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_tflow_get_irq); + +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id) +{ + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_rflow_id + + oes->pktdma_rchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_rflow_get_irq); diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 82cf6c77f5c9..87157cbae1b8 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "../virt-dma.h" @@ -55,14 +56,26 @@ struct udma_static_tr { struct udma_chan; +enum k3_dma_type { + DMA_TYPE_UDMA = 0, + DMA_TYPE_BCDMA, + DMA_TYPE_PKTDMA, +}; + enum udma_mmr { MMR_GCFG = 0, + MMR_BCHANRT, MMR_RCHANRT, MMR_TCHANRT, MMR_LAST, }; -static const char * const mmr_names[] = { "gcfg", "rchanrt", "tchanrt" }; +static const char * const mmr_names[] = { + [MMR_GCFG] = "gcfg", + [MMR_BCHANRT] = "bchanrt", + [MMR_RCHANRT] = "rchanrt", + [MMR_TCHANRT] = "tchanrt", +}; struct udma_tchan { void __iomem *reg_rt; @@ -70,8 +83,12 @@ struct udma_tchan { int id; struct k3_ring *t_ring; /* Transmit ring */ struct k3_ring *tc_ring; /* Transmit Completion ring */ + int tflow_id; /* applicable only for PKTDMA */ + }; +#define udma_bchan udma_tchan + struct udma_rflow { int id; struct k3_ring *fd_ring; /* Free Descriptor ring */ @@ -84,10 +101,29 @@ struct udma_rchan { int id; }; +struct udma_oes_offsets { + /* K3 UDMA Output Event Offset */ + u32 udma_rchan; + + /* BCDMA Output Event Offsets */ + u32 bcdma_bchan_data; + u32 bcdma_bchan_ring; + u32 bcdma_tchan_data; + u32 bcdma_tchan_ring; + u32 bcdma_rchan_data; + u32 bcdma_rchan_ring; + + /* PKTDMA Output Event Offsets */ + u32 pktdma_tchan_flow; + u32 pktdma_rchan_flow; +}; + #define UDMA_FLAG_PDMA_ACC32 BIT(0) #define UDMA_FLAG_PDMA_BURST BIT(1) +#define 
UDMA_FLAG_TDTYPE BIT(2) struct udma_match_data { + enum k3_dma_type type; u32 psil_base; bool enable_memcpy_support; u32 flags; @@ -95,7 +131,8 @@ struct udma_match_data { }; struct udma_soc_data { - u32 rchan_oes_offset; + struct udma_oes_offsets oes; + u32 bcdma_trigger_event_offset; }; struct udma_hwdesc { @@ -116,6 +153,11 @@ struct udma_rx_flush { dma_addr_t buffer_paddr; }; +struct udma_tpl { + u8 levels; + u32 start_idx[3]; +}; + struct udma_dev { struct dma_device ddev; struct device *dev; @@ -123,8 +165,9 @@ struct udma_dev { const struct udma_match_data *match_data; const struct udma_soc_data *soc_data; - u8 tpl_levels; - u32 tpl_start_idx[3]; + struct udma_tpl bchan_tpl; + struct udma_tpl tchan_tpl; + struct udma_tpl rchan_tpl; size_t desc_align; /* alignment to use for descriptors */ @@ -138,16 +181,21 @@ struct udma_dev { struct udma_rx_flush rx_flush; + int bchan_cnt; int tchan_cnt; int echan_cnt; int rchan_cnt; int rflow_cnt; + int tflow_cnt; + unsigned long *bchan_map; unsigned long *tchan_map; unsigned long *rchan_map; unsigned long *rflow_gp_map; unsigned long *rflow_gp_map_allocated; unsigned long *rflow_in_use; + unsigned long *tflow_map; + struct udma_bchan *bchans; struct udma_tchan *tchans; struct udma_rchan *rchans; struct udma_rflow *rflows; @@ -155,6 +203,7 @@ struct udma_dev { struct udma_chan *channels; u32 psil_base; u32 atype; + u32 asel; }; struct udma_desc { @@ -199,6 +248,7 @@ struct udma_chan_config { bool notdpkt; /* Suppress sending TDC packet */ int remote_thread_id; u32 atype; + u32 asel; u32 src_thread; u32 dst_thread; enum psil_endpoint_type ep_type; @@ -206,6 +256,13 @@ struct udma_chan_config { bool enable_burst; enum udma_tp_level channel_tpl; /* Channel Throughput Level */ + u32 tr_trigger_type; + + /* PKTDMA mapped channel */ + int mapped_channel_id; + /* PKTDMA default tflow or rflow for mapped channel */ + int default_flow_id; + enum dma_transfer_direction dir; }; @@ -213,11 +270,13 @@ struct udma_chan { struct
virt_dma_chan vc; struct dma_slave_config cfg; struct udma_dev *ud; + struct device *dma_dev; struct udma_desc *desc; struct udma_desc *terminated_desc; struct udma_static_tr static_tr; char *name; + struct udma_bchan *bchan; struct udma_tchan *tchan; struct udma_rchan *rchan; struct udma_rflow *rflow; @@ -353,10 +412,36 @@ static int navss_psil_unpair(struct udma_dev *ud, u32 src_thread, src_thread, dst_thread); } +static void k3_configure_chan_coherency(struct dma_chan *chan, u32 asel) +{ + struct device *chan_dev = &chan->dev->device; + + if (asel == 0) { + /* No special handling for the channel */ + chan->dev->chan_dma_dev = false; + + chan_dev->dma_coherent = false; + chan_dev->dma_parms = NULL; + } else if (asel == 14 || asel == 15) { + chan->dev->chan_dma_dev = true; + + chan_dev->dma_coherent = true; + dma_coerce_mask_and_coherent(chan_dev, DMA_BIT_MASK(48)); + chan_dev->dma_parms = chan_dev->parent->dma_parms; + } else { + dev_warn(chan->device->dev, "Invalid ASEL value: %u\n", asel); + + chan_dev->dma_coherent = false; + chan_dev->dma_parms = NULL; + } +} + static void udma_reset_uchan(struct udma_chan *uc) { memset(&uc->config, 0, sizeof(uc->config)); uc->config.remote_thread_id = -1; + uc->config.mapped_channel_id = -1; + uc->config.default_flow_id = -1; uc->state = UDMA_CHAN_IS_IDLE; } @@ -439,9 +524,7 @@ static void udma_free_hwdesc(struct udma_chan *uc, struct udma_desc *d) d->hwdesc[i].cppi5_desc_vaddr = NULL; } } else if (d->hwdesc[0].cppi5_desc_vaddr) { - struct udma_dev *ud = uc->ud; - - dma_free_coherent(ud->dev, d->hwdesc[0].cppi5_desc_size, + dma_free_coherent(uc->dma_dev, d->hwdesc[0].cppi5_desc_size, d->hwdesc[0].cppi5_desc_vaddr, d->hwdesc[0].cppi5_desc_paddr); @@ -670,8 +753,10 @@ static void udma_reset_counters(struct udma_chan *uc) val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PCNT_REG); udma_tchanrt_write(uc, UDMA_CHAN_RT_PCNT_REG, val); - val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PEER_BCNT_REG); - udma_tchanrt_write(uc, 
UDMA_CHAN_RT_PEER_BCNT_REG, val); + if (!uc->bchan) { + val = udma_tchanrt_read(uc, UDMA_CHAN_RT_PEER_BCNT_REG); + udma_tchanrt_write(uc, UDMA_CHAN_RT_PEER_BCNT_REG, val); + } } if (uc->rchan) { @@ -746,10 +831,16 @@ static void udma_start_desc(struct udma_chan *uc) { struct udma_chan_config *ucc = &uc->config; - if (ucc->pkt_mode && (uc->cyclic || ucc->dir == DMA_DEV_TO_MEM)) { + if (uc->ud->match_data->type == DMA_TYPE_UDMA && ucc->pkt_mode && + (uc->cyclic || ucc->dir == DMA_DEV_TO_MEM)) { int i; - /* Push all descriptors to ring for packet mode cyclic or RX */ + /* + * UDMA only: Push all descriptors to ring for packet mode + * cyclic or RX + * PKTDMA supports pre-linked descriptor and cyclic is not + * supported + */ for (i = 0; i < uc->desc->sglen; i++) udma_push_to_ring(uc, i); } else { @@ -1020,13 +1111,12 @@ static irqreturn_t udma_ring_irq_handler(int irq, void *data) { struct udma_chan *uc = data; struct udma_desc *d; - unsigned long flags; dma_addr_t paddr = 0; if (udma_pop_from_ring(uc, &paddr) || !paddr) return IRQ_HANDLED; - spin_lock_irqsave(&uc->vc.lock, flags); + spin_lock(&uc->vc.lock); /* Teardown completion message */ if (cppi5_desc_is_tdcm(paddr)) { @@ -1077,7 +1167,7 @@ static irqreturn_t udma_ring_irq_handler(int irq, void *data) } } out: - spin_unlock_irqrestore(&uc->vc.lock, flags); + spin_unlock(&uc->vc.lock); return IRQ_HANDLED; } @@ -1086,9 +1176,8 @@ static irqreturn_t udma_udma_irq_handler(int irq, void *data) { struct udma_chan *uc = data; struct udma_desc *d; - unsigned long flags; - spin_lock_irqsave(&uc->vc.lock, flags); + spin_lock(&uc->vc.lock); d = uc->desc; if (d) { d->tr_idx = (d->tr_idx + 1) % d->sglen; @@ -1103,7 +1192,7 @@ static irqreturn_t udma_udma_irq_handler(int irq, void *data) } } - spin_unlock_irqrestore(&uc->vc.lock, flags); + spin_unlock(&uc->vc.lock); return IRQ_HANDLED; } @@ -1181,10 +1270,12 @@ static struct udma_rflow *__udma_get_rflow(struct udma_dev *ud, int id) if (test_bit(id, ud->rflow_in_use)) return 
ERR_PTR(-ENOENT); - /* GP rflow has to be allocated first */ - if (!test_bit(id, ud->rflow_gp_map) && - !test_bit(id, ud->rflow_gp_map_allocated)) - return ERR_PTR(-EINVAL); + if (ud->rflow_gp_map) { + /* GP rflow has to be allocated first */ + if (!test_bit(id, ud->rflow_gp_map) && + !test_bit(id, ud->rflow_gp_map_allocated)) + return ERR_PTR(-EINVAL); + } dev_dbg(ud->dev, "get rflow%d\n", id); set_bit(id, ud->rflow_in_use); @@ -1215,10 +1306,10 @@ static struct udma_##res *__udma_reserve_##res(struct udma_dev *ud, \ } else { \ int start; \ \ - if (tpl >= ud->tpl_levels) \ - tpl = ud->tpl_levels - 1; \ + if (tpl >= ud->res##_tpl.levels) \ + tpl = ud->res##_tpl.levels - 1; \ \ - start = ud->tpl_start_idx[tpl]; \ + start = ud->res##_tpl.start_idx[tpl]; \ \ id = find_next_zero_bit(ud->res##_map, ud->res##_cnt, \ start); \ @@ -1231,9 +1322,39 @@ static struct udma_##res *__udma_reserve_##res(struct udma_dev *ud, \ return &ud->res##s[id]; \ } +UDMA_RESERVE_RESOURCE(bchan); UDMA_RESERVE_RESOURCE(tchan); UDMA_RESERVE_RESOURCE(rchan); +static int bcdma_get_bchan(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + enum udma_tp_level tpl; + + if (uc->bchan) { + dev_dbg(ud->dev, "chan%d: already have bchan%d allocated\n", + uc->id, uc->bchan->id); + return 0; + } + + /* + * Use normal channels for peripherals, and highest TPL channel for + * mem2mem + */ + if (uc->config.tr_trigger_type) + tpl = 0; + else + tpl = ud->bchan_tpl.levels - 1; + + uc->bchan = __udma_reserve_bchan(ud, tpl, -1); + if (IS_ERR(uc->bchan)) + return PTR_ERR(uc->bchan); + + uc->tchan = uc->bchan; + + return 0; +} + static int udma_get_tchan(struct udma_chan *uc) { struct udma_dev *ud = uc->ud; @@ -1244,9 +1365,39 @@ static int udma_get_tchan(struct udma_chan *uc) return 0; } - uc->tchan = __udma_reserve_tchan(ud, uc->config.channel_tpl, -1); + /* + * mapped_channel_id is -1 for UDMA, BCDMA and PKTDMA unmapped channels. 
+ * For PKTDMA mapped channels it is configured to a channel which must + * be used to service the peripheral. + */ + uc->tchan = __udma_reserve_tchan(ud, uc->config.channel_tpl, + uc->config.mapped_channel_id); + if (IS_ERR(uc->tchan)) + return PTR_ERR(uc->tchan); - return PTR_ERR_OR_ZERO(uc->tchan); + if (ud->tflow_cnt) { + int tflow_id; + + /* Only PKTDMA has support for tx flows */ + if (uc->config.default_flow_id >= 0) + tflow_id = uc->config.default_flow_id; + else + tflow_id = uc->tchan->id; + + if (test_bit(tflow_id, ud->tflow_map)) { + dev_err(ud->dev, "tflow%d is in use\n", tflow_id); + clear_bit(uc->tchan->id, ud->tchan_map); + uc->tchan = NULL; + return -ENOENT; + } + + uc->tchan->tflow_id = tflow_id; + set_bit(tflow_id, ud->tflow_map); + } else { + uc->tchan->tflow_id = -1; + } + + return 0; } static int udma_get_rchan(struct udma_chan *uc) @@ -1259,7 +1410,13 @@ static int udma_get_rchan(struct udma_chan *uc) return 0; } - uc->rchan = __udma_reserve_rchan(ud, uc->config.channel_tpl, -1); + /* + * mapped_channel_id is -1 for UDMA, BCDMA and PKTDMA unmapped channels. + * For PKTDMA mapped channels it is configured to a channel which must + * be used to service the peripheral.
+ */ + uc->rchan = __udma_reserve_rchan(ud, uc->config.channel_tpl, + uc->config.mapped_channel_id); return PTR_ERR_OR_ZERO(uc->rchan); } @@ -1287,8 +1444,11 @@ static int udma_get_chan_pair(struct udma_chan *uc) /* Can be optimized, but let's have it like this for now */ end = min(ud->tchan_cnt, ud->rchan_cnt); - /* Try to use the highest TPL channel pair for MEM_TO_MEM channels */ - chan_id = ud->tpl_start_idx[ud->tpl_levels - 1]; + /* + * Try to use the highest TPL channel pair for MEM_TO_MEM channels + * Note: in UDMAP the channel TPL is symmetric between tchan and rchan + */ + chan_id = ud->tchan_tpl.start_idx[ud->tchan_tpl.levels - 1]; for (; chan_id < end; chan_id++) { if (!test_bit(chan_id, ud->tchan_map) && !test_bit(chan_id, ud->rchan_map)) @@ -1303,6 +1463,9 @@ static int udma_get_chan_pair(struct udma_chan *uc) uc->tchan = &ud->tchans[chan_id]; uc->rchan = &ud->rchans[chan_id]; + /* UDMA does not use tx flows */ + uc->tchan->tflow_id = -1; + return 0; } @@ -1326,6 +1489,19 @@ static int udma_get_rflow(struct udma_chan *uc, int flow_id) return PTR_ERR_OR_ZERO(uc->rflow); } +static void bcdma_put_bchan(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + + if (uc->bchan) { + dev_dbg(ud->dev, "chan%d: put bchan%d\n", uc->id, + uc->bchan->id); + clear_bit(uc->bchan->id, ud->bchan_map); + uc->bchan = NULL; + uc->tchan = NULL; + } +} + static void udma_put_rchan(struct udma_chan *uc) { struct udma_dev *ud = uc->ud; @@ -1346,6 +1522,10 @@ static void udma_put_tchan(struct udma_chan *uc) dev_dbg(ud->dev, "chan%d: put tchan%d\n", uc->id, uc->tchan->id); clear_bit(uc->tchan->id, ud->tchan_map); + + if (uc->tchan->tflow_id >= 0) + clear_bit(uc->tchan->tflow_id, ud->tflow_map); + uc->tchan = NULL; } } @@ -1362,6 +1542,65 @@ static void udma_put_rflow(struct udma_chan *uc) } } +static void bcdma_free_bchan_resources(struct udma_chan *uc) +{ + if (!uc->bchan) + return; + + k3_ringacc_ring_free(uc->bchan->tc_ring); + k3_ringacc_ring_free(uc->bchan->t_ring); + 
uc->bchan->tc_ring = NULL; + uc->bchan->t_ring = NULL; + k3_configure_chan_coherency(&uc->vc.chan, 0); + + bcdma_put_bchan(uc); +} + +static int bcdma_alloc_bchan_resources(struct udma_chan *uc) +{ + struct k3_ring_cfg ring_cfg; + struct udma_dev *ud = uc->ud; + int ret; + + ret = bcdma_get_bchan(uc); + if (ret) + return ret; + + ret = k3_ringacc_request_rings_pair(ud->ringacc, uc->bchan->id, -1, + &uc->bchan->t_ring, + &uc->bchan->tc_ring); + if (ret) { + ret = -EBUSY; + goto err_ring; + } + + memset(&ring_cfg, 0, sizeof(ring_cfg)); + ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; + ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8; + ring_cfg.mode = K3_RINGACC_RING_MODE_RING; + + k3_configure_chan_coherency(&uc->vc.chan, ud->asel); + ring_cfg.asel = ud->asel; + ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan); + + ret = k3_ringacc_ring_cfg(uc->bchan->t_ring, &ring_cfg); + if (ret) + goto err_ringcfg; + + return 0; + +err_ringcfg: + k3_ringacc_ring_free(uc->bchan->tc_ring); + uc->bchan->tc_ring = NULL; + k3_ringacc_ring_free(uc->bchan->t_ring); + uc->bchan->t_ring = NULL; + k3_configure_chan_coherency(&uc->vc.chan, 0); +err_ring: + bcdma_put_bchan(uc); + + return ret; +} + static void udma_free_tx_resources(struct udma_chan *uc) { if (!uc->tchan) @@ -1379,15 +1618,22 @@ static int udma_alloc_tx_resources(struct udma_chan *uc) { struct k3_ring_cfg ring_cfg; struct udma_dev *ud = uc->ud; - int ret; + struct udma_tchan *tchan; + int ring_idx, ret; ret = udma_get_tchan(uc); if (ret) return ret; - ret = k3_ringacc_request_rings_pair(ud->ringacc, uc->tchan->id, -1, - &uc->tchan->t_ring, - &uc->tchan->tc_ring); + tchan = uc->tchan; + if (tchan->tflow_id >= 0) + ring_idx = tchan->tflow_id; + else + ring_idx = ud->bchan_cnt + tchan->id; + + ret = k3_ringacc_request_rings_pair(ud->ringacc, ring_idx, -1, + &tchan->t_ring, + &tchan->tc_ring); if (ret) { ret = -EBUSY; goto err_ring; @@ -1396,10 +1642,18 @@ static int udma_alloc_tx_resources(struct udma_chan *uc) 
memset(&ring_cfg, 0, sizeof(ring_cfg)); ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8; - ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE; + if (ud->match_data->type == DMA_TYPE_UDMA) { + ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE; + } else { + ring_cfg.mode = K3_RINGACC_RING_MODE_RING; - ret = k3_ringacc_ring_cfg(uc->tchan->t_ring, &ring_cfg); - ret |= k3_ringacc_ring_cfg(uc->tchan->tc_ring, &ring_cfg); + k3_configure_chan_coherency(&uc->vc.chan, uc->config.asel); + ring_cfg.asel = uc->config.asel; + ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan); + } + + ret = k3_ringacc_ring_cfg(tchan->t_ring, &ring_cfg); + ret |= k3_ringacc_ring_cfg(tchan->tc_ring, &ring_cfg); if (ret) goto err_ringcfg; @@ -1452,14 +1706,23 @@ static int udma_alloc_rx_resources(struct udma_chan *uc) if (uc->config.dir == DMA_MEM_TO_MEM) return 0; - ret = udma_get_rflow(uc, uc->rchan->id); + if (uc->config.default_flow_id >= 0) + ret = udma_get_rflow(uc, uc->config.default_flow_id); + else + ret = udma_get_rflow(uc, uc->rchan->id); + if (ret) { ret = -EBUSY; goto err_rflow; } rflow = uc->rflow; - fd_ring_id = ud->tchan_cnt + ud->echan_cnt + uc->rchan->id; + if (ud->tflow_cnt) + fd_ring_id = ud->tflow_cnt + rflow->id; + else + fd_ring_id = ud->bchan_cnt + ud->tchan_cnt + ud->echan_cnt + + uc->rchan->id; + ret = k3_ringacc_request_rings_pair(ud->ringacc, fd_ring_id, -1, &rflow->fd_ring, &rflow->r_ring); if (ret) { @@ -1469,15 +1732,25 @@ static int udma_alloc_rx_resources(struct udma_chan *uc) memset(&ring_cfg, 0, sizeof(ring_cfg)); - if (uc->config.pkt_mode) - ring_cfg.size = SG_MAX_SEGMENTS; - else - ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; - ring_cfg.elm_size = K3_RINGACC_RING_ELSIZE_8; - ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE; + if (ud->match_data->type == DMA_TYPE_UDMA) { + if (uc->config.pkt_mode) + ring_cfg.size = SG_MAX_SEGMENTS; + else + ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; + + ring_cfg.mode = K3_RINGACC_RING_MODE_MESSAGE; + 
} else { + ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; + ring_cfg.mode = K3_RINGACC_RING_MODE_RING; + + k3_configure_chan_coherency(&uc->vc.chan, uc->config.asel); + ring_cfg.asel = uc->config.asel; + ring_cfg.dma_dev = dmaengine_get_dma_device(&uc->vc.chan); + } ret = k3_ringacc_ring_cfg(rflow->fd_ring, &ring_cfg); + ring_cfg.size = K3_UDMA_DEFAULT_RING_SIZE; ret |= k3_ringacc_ring_cfg(rflow->r_ring, &ring_cfg); @@ -1499,7 +1772,18 @@ err_rflow: return ret; } -#define TISCI_TCHAN_VALID_PARAMS ( \ +#define TISCI_BCDMA_BCHAN_VALID_PARAMS ( \ + TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID | \ + TI_SCI_MSG_VALUE_RM_UDMAP_CH_EXTENDED_CH_TYPE_VALID) + +#define TISCI_BCDMA_TCHAN_VALID_PARAMS ( \ + TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID | \ + TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID) + +#define TISCI_BCDMA_RCHAN_VALID_PARAMS ( \ + TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID) + +#define TISCI_UDMA_TCHAN_VALID_PARAMS ( \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID | \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID | \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID | \ @@ -1509,7 +1793,7 @@ err_rflow: TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID | \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID) -#define TISCI_RCHAN_VALID_PARAMS ( \ +#define TISCI_UDMA_RCHAN_VALID_PARAMS ( \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID | \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID | \ TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID | \ @@ -1534,7 +1818,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; - req_tx.valid_params = TISCI_TCHAN_VALID_PARAMS; + req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS; req_tx.nav_id = tisci_rm->tisci_dev_id; req_tx.index = tchan->id; req_tx.tx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR; @@ -1548,7 +1832,7 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) return ret; } - 
req_rx.valid_params = TISCI_RCHAN_VALID_PARAMS; + req_rx.valid_params = TISCI_UDMA_RCHAN_VALID_PARAMS; req_rx.nav_id = tisci_rm->tisci_dev_id; req_rx.index = rchan->id; req_rx.rx_fetch_size = sizeof(struct cppi5_desc_hdr_t) >> 2; @@ -1563,6 +1847,27 @@ static int udma_tisci_m2m_channel_config(struct udma_chan *uc) return ret; } +static int bcdma_tisci_m2m_channel_config(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; + const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; + struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; + struct udma_bchan *bchan = uc->bchan; + int ret = 0; + + req_tx.valid_params = TISCI_BCDMA_BCHAN_VALID_PARAMS; + req_tx.nav_id = tisci_rm->tisci_dev_id; + req_tx.extended_ch_type = TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN; + req_tx.index = bchan->id; + + ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); + if (ret) + dev_err(ud->dev, "bchan%d cfg failed %d\n", bchan->id, ret); + + return ret; +} + static int udma_tisci_tx_channel_config(struct udma_chan *uc) { struct udma_dev *ud = uc->ud; @@ -1583,7 +1888,7 @@ static int udma_tisci_tx_channel_config(struct udma_chan *uc) fetch_size = sizeof(struct cppi5_desc_hdr_t); } - req_tx.valid_params = TISCI_TCHAN_VALID_PARAMS; + req_tx.valid_params = TISCI_UDMA_TCHAN_VALID_PARAMS; req_tx.nav_id = tisci_rm->tisci_dev_id; req_tx.index = tchan->id; req_tx.tx_chan_type = mode; @@ -1591,6 +1896,13 @@ static int udma_tisci_tx_channel_config(struct udma_chan *uc) req_tx.tx_fetch_size = fetch_size >> 2; req_tx.txcq_qnum = tc_ring; req_tx.tx_atype = uc->config.atype; + if (uc->config.ep_type == PSIL_EP_PDMA_XY && + ud->match_data->flags & UDMA_FLAG_TDTYPE) { + /* wait for peer to complete the teardown for PDMAs */ + req_tx.valid_params |= + TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID; + req_tx.tx_tdtype = 1; + } ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); if (ret) @@ -1599,6 +1911,35 @@ static int 
udma_tisci_tx_channel_config(struct udma_chan *uc) return ret; } +static int bcdma_tisci_tx_channel_config(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; + const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; + struct udma_tchan *tchan = uc->tchan; + struct ti_sci_msg_rm_udmap_tx_ch_cfg req_tx = { 0 }; + int ret = 0; + + req_tx.valid_params = TISCI_BCDMA_TCHAN_VALID_PARAMS; + req_tx.nav_id = tisci_rm->tisci_dev_id; + req_tx.index = tchan->id; + req_tx.tx_supr_tdpkt = uc->config.notdpkt; + if (ud->match_data->flags & UDMA_FLAG_TDTYPE) { + /* wait for peer to complete the teardown for PDMAs */ + req_tx.valid_params |= + TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID; + req_tx.tx_tdtype = 1; + } + + ret = tisci_ops->tx_ch_cfg(tisci_rm->tisci, &req_tx); + if (ret) + dev_err(ud->dev, "tchan%d cfg failed %d\n", tchan->id, ret); + + return ret; +} + +#define pktdma_tisci_tx_channel_config bcdma_tisci_tx_channel_config + static int udma_tisci_rx_channel_config(struct udma_chan *uc) { struct udma_dev *ud = uc->ud; @@ -1621,7 +1962,7 @@ static int udma_tisci_rx_channel_config(struct udma_chan *uc) fetch_size = sizeof(struct cppi5_desc_hdr_t); } - req_rx.valid_params = TISCI_RCHAN_VALID_PARAMS; + req_rx.valid_params = TISCI_UDMA_RCHAN_VALID_PARAMS; req_rx.nav_id = tisci_rm->tisci_dev_id; req_rx.index = rchan->id; req_rx.rx_fetch_size = fetch_size >> 2; @@ -1680,6 +2021,72 @@ static int udma_tisci_rx_channel_config(struct udma_chan *uc) return 0; } +static int bcdma_tisci_rx_channel_config(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; + const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; + struct udma_rchan *rchan = uc->rchan; + struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; + int ret = 0; + + req_rx.valid_params = TISCI_BCDMA_RCHAN_VALID_PARAMS; + req_rx.nav_id = tisci_rm->tisci_dev_id; + req_rx.index = 
rchan->id; + + ret = tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx); + if (ret) + dev_err(ud->dev, "rchan%d cfg failed %d\n", rchan->id, ret); + + return ret; +} + +static int pktdma_tisci_rx_channel_config(struct udma_chan *uc) +{ + struct udma_dev *ud = uc->ud; + struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; + const struct ti_sci_rm_udmap_ops *tisci_ops = tisci_rm->tisci_udmap_ops; + struct ti_sci_msg_rm_udmap_rx_ch_cfg req_rx = { 0 }; + struct ti_sci_msg_rm_udmap_flow_cfg flow_req = { 0 }; + int ret = 0; + + req_rx.valid_params = TISCI_BCDMA_RCHAN_VALID_PARAMS; + req_rx.nav_id = tisci_rm->tisci_dev_id; + req_rx.index = uc->rchan->id; + + ret = tisci_ops->rx_ch_cfg(tisci_rm->tisci, &req_rx); + if (ret) { + dev_err(ud->dev, "rchan%d cfg failed %d\n", uc->rchan->id, ret); + return ret; + } + + flow_req.valid_params = + TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID | + TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID | + TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID; + + flow_req.nav_id = tisci_rm->tisci_dev_id; + flow_req.flow_index = uc->rflow->id; + + if (uc->config.needs_epib) + flow_req.rx_einfo_present = 1; + else + flow_req.rx_einfo_present = 0; + if (uc->config.psd_size) + flow_req.rx_psinfo_present = 1; + else + flow_req.rx_psinfo_present = 0; + flow_req.rx_error_handling = 1; + + ret = tisci_ops->rx_flow_cfg(tisci_rm->tisci, &flow_req); + + if (ret) + dev_err(ud->dev, "flow%d config failed: %d\n", uc->rflow->id, + ret); + + return ret; +} + static int udma_alloc_chan_resources(struct dma_chan *chan) { struct udma_chan *uc = to_udma_chan(chan); @@ -1689,6 +2096,8 @@ static int udma_alloc_chan_resources(struct dma_chan *chan) u32 irq_udma_idx; int ret; + uc->dma_dev = ud->dev; + if (uc->config.pkt_mode || uc->config.dir == DMA_MEM_TO_MEM) { uc->use_dma_pool = true; /* in case of MEM_TO_MEM we have maximum of two TRs */ @@ -1784,7 +2193,7 @@ static int udma_alloc_chan_resources(struct dma_chan *chan) K3_PSIL_DST_THREAD_ID_OFFSET; irq_ring = 
uc->rflow->r_ring; - irq_udma_idx = soc_data->rchan_oes_offset + uc->rchan->id; + irq_udma_idx = soc_data->oes.udma_rchan + uc->rchan->id; ret = udma_tisci_rx_channel_config(uc); break; @@ -1884,6 +2293,369 @@ err_cleanup: return ret; } +static int bcdma_alloc_chan_resources(struct dma_chan *chan) +{ + struct udma_chan *uc = to_udma_chan(chan); + struct udma_dev *ud = to_udma_dev(chan->device); + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + u32 irq_udma_idx, irq_ring_idx; + int ret; + + /* Only TR mode is supported */ + uc->config.pkt_mode = false; + + /* + * Make sure that the completion is in a known state: + * No teardown, the channel is idle + */ + reinit_completion(&uc->teardown_completed); + complete_all(&uc->teardown_completed); + uc->state = UDMA_CHAN_IS_IDLE; + + switch (uc->config.dir) { + case DMA_MEM_TO_MEM: + /* Non synchronized - mem to mem type of transfer */ + dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-MEM\n", __func__, + uc->id); + + ret = bcdma_alloc_bchan_resources(uc); + if (ret) + return ret; + + irq_ring_idx = uc->bchan->id + oes->bcdma_bchan_ring; + irq_udma_idx = uc->bchan->id + oes->bcdma_bchan_data; + + ret = bcdma_tisci_m2m_channel_config(uc); + break; + case DMA_MEM_TO_DEV: + /* Slave transfer synchronized - mem to dev (TX) transfer */ + dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-DEV\n", __func__, + uc->id); + + ret = udma_alloc_tx_resources(uc); + if (ret) { + uc->config.remote_thread_id = -1; + return ret; + } + + uc->config.src_thread = ud->psil_base + uc->tchan->id; + uc->config.dst_thread = uc->config.remote_thread_id; + uc->config.dst_thread |= K3_PSIL_DST_THREAD_ID_OFFSET; + + irq_ring_idx = uc->tchan->id + oes->bcdma_tchan_ring; + irq_udma_idx = uc->tchan->id + oes->bcdma_tchan_data; + + ret = bcdma_tisci_tx_channel_config(uc); + break; + case DMA_DEV_TO_MEM: + /* Slave transfer synchronized - dev to mem (RX) transfer */ + dev_dbg(uc->ud->dev, "%s: chan%d as DEV-to-MEM\n", __func__, + uc->id); + + ret =
udma_alloc_rx_resources(uc); + if (ret) { + uc->config.remote_thread_id = -1; + return ret; + } + + uc->config.src_thread = uc->config.remote_thread_id; + uc->config.dst_thread = (ud->psil_base + uc->rchan->id) | + K3_PSIL_DST_THREAD_ID_OFFSET; + + irq_ring_idx = uc->rchan->id + oes->bcdma_rchan_ring; + irq_udma_idx = uc->rchan->id + oes->bcdma_rchan_data; + + ret = bcdma_tisci_rx_channel_config(uc); + break; + default: + /* Can not happen */ + dev_err(uc->ud->dev, "%s: chan%d invalid direction (%u)\n", + __func__, uc->id, uc->config.dir); + return -EINVAL; + } + + /* check if the channel configuration was successful */ + if (ret) + goto err_res_free; + + if (udma_is_chan_running(uc)) { + dev_warn(ud->dev, "chan%d: is running!\n", uc->id); + udma_reset_chan(uc, false); + if (udma_is_chan_running(uc)) { + dev_err(ud->dev, "chan%d: won't stop!\n", uc->id); + ret = -EBUSY; + goto err_res_free; + } + } + + uc->dma_dev = dmaengine_get_dma_device(chan); + if (uc->config.dir == DMA_MEM_TO_MEM && !uc->config.tr_trigger_type) { + uc->config.hdesc_size = cppi5_trdesc_calc_size( + sizeof(struct cppi5_tr_type15_t), 2); + + uc->hdesc_pool = dma_pool_create(uc->name, ud->ddev.dev, + uc->config.hdesc_size, + ud->desc_align, + 0); + if (!uc->hdesc_pool) { + dev_err(ud->ddev.dev, + "Descriptor pool allocation failed\n"); + uc->use_dma_pool = false; + return -ENOMEM; + } + + uc->use_dma_pool = true; + } else if (uc->config.dir != DMA_MEM_TO_MEM) { + /* PSI-L pairing */ + ret = navss_psil_pair(ud, uc->config.src_thread, + uc->config.dst_thread); + if (ret) { + dev_err(ud->dev, + "PSI-L pairing failed: 0x%04x -> 0x%04x\n", + uc->config.src_thread, uc->config.dst_thread); + goto err_res_free; + } + + uc->psil_paired = true; + } + + uc->irq_num_ring = ti_sci_inta_msi_get_virq(ud->dev, irq_ring_idx); + if (uc->irq_num_ring <= 0) { + dev_err(ud->dev, "Failed to get ring irq (index: %u)\n", + irq_ring_idx); + ret = -EINVAL; + goto err_psi_free; + } + + ret = request_irq(uc->irq_num_ring, 
udma_ring_irq_handler, + IRQF_TRIGGER_HIGH, uc->name, uc); + if (ret) { + dev_err(ud->dev, "chan%d: ring irq request failed\n", uc->id); + goto err_irq_free; + } + + /* Event from BCDMA (TR events) only needed for slave channels */ + if (is_slave_direction(uc->config.dir)) { + uc->irq_num_udma = ti_sci_inta_msi_get_virq(ud->dev, + irq_udma_idx); + if (uc->irq_num_udma <= 0) { + dev_err(ud->dev, "Failed to get bcdma irq (index: %u)\n", + irq_udma_idx); + free_irq(uc->irq_num_ring, uc); + ret = -EINVAL; + goto err_irq_free; + } + + ret = request_irq(uc->irq_num_udma, udma_udma_irq_handler, 0, + uc->name, uc); + if (ret) { + dev_err(ud->dev, "chan%d: BCDMA irq request failed\n", + uc->id); + free_irq(uc->irq_num_ring, uc); + goto err_irq_free; + } + } else { + uc->irq_num_udma = 0; + } + + udma_reset_rings(uc); + + INIT_DELAYED_WORK_ONSTACK(&uc->tx_drain.work, + udma_check_tx_completion); + return 0; + +err_irq_free: + uc->irq_num_ring = 0; + uc->irq_num_udma = 0; +err_psi_free: + if (uc->psil_paired) + navss_psil_unpair(ud, uc->config.src_thread, + uc->config.dst_thread); + uc->psil_paired = false; +err_res_free: + bcdma_free_bchan_resources(uc); + udma_free_tx_resources(uc); + udma_free_rx_resources(uc); + + udma_reset_uchan(uc); + + if (uc->use_dma_pool) { + dma_pool_destroy(uc->hdesc_pool); + uc->use_dma_pool = false; + } + + return ret; +} + +static int bcdma_router_config(struct dma_chan *chan) +{ + struct k3_event_route_data *router_data = chan->route_data; + struct udma_chan *uc = to_udma_chan(chan); + u32 trigger_event; + + if (!uc->bchan) + return -EINVAL; + + if (uc->config.tr_trigger_type != 1 && uc->config.tr_trigger_type != 2) + return -EINVAL; + + trigger_event = uc->ud->soc_data->bcdma_trigger_event_offset; + trigger_event += (uc->bchan->id * 2) + uc->config.tr_trigger_type - 1; + + return router_data->set_event(router_data->priv, trigger_event); +} + +static int pktdma_alloc_chan_resources(struct dma_chan *chan) +{ + struct udma_chan *uc = 
to_udma_chan(chan); + struct udma_dev *ud = to_udma_dev(chan->device); + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + u32 irq_ring_idx; + int ret; + + /* + * Make sure that the completion is in a known state: + * No teardown, the channel is idle + */ + reinit_completion(&uc->teardown_completed); + complete_all(&uc->teardown_completed); + uc->state = UDMA_CHAN_IS_IDLE; + + switch (uc->config.dir) { + case DMA_MEM_TO_DEV: + /* Slave transfer synchronized - mem to dev (TX) trasnfer */ + dev_dbg(uc->ud->dev, "%s: chan%d as MEM-to-DEV\n", __func__, + uc->id); + + ret = udma_alloc_tx_resources(uc); + if (ret) { + uc->config.remote_thread_id = -1; + return ret; + } + + uc->config.src_thread = ud->psil_base + uc->tchan->id; + uc->config.dst_thread = uc->config.remote_thread_id; + uc->config.dst_thread |= K3_PSIL_DST_THREAD_ID_OFFSET; + + irq_ring_idx = uc->tchan->tflow_id + oes->pktdma_tchan_flow; + + ret = pktdma_tisci_tx_channel_config(uc); + break; + case DMA_DEV_TO_MEM: + /* Slave transfer synchronized - dev to mem (RX) trasnfer */ + dev_dbg(uc->ud->dev, "%s: chan%d as DEV-to-MEM\n", __func__, + uc->id); + + ret = udma_alloc_rx_resources(uc); + if (ret) { + uc->config.remote_thread_id = -1; + return ret; + } + + uc->config.src_thread = uc->config.remote_thread_id; + uc->config.dst_thread = (ud->psil_base + uc->rchan->id) | + K3_PSIL_DST_THREAD_ID_OFFSET; + + irq_ring_idx = uc->rflow->id + oes->pktdma_rchan_flow; + + ret = pktdma_tisci_rx_channel_config(uc); + break; + default: + /* Can not happen */ + dev_err(uc->ud->dev, "%s: chan%d invalid direction (%u)\n", + __func__, uc->id, uc->config.dir); + return -EINVAL; + } + + /* check if the channel configuration was successful */ + if (ret) + goto err_res_free; + + if (udma_is_chan_running(uc)) { + dev_warn(ud->dev, "chan%d: is running!\n", uc->id); + udma_reset_chan(uc, false); + if (udma_is_chan_running(uc)) { + dev_err(ud->dev, "chan%d: won't stop!\n", uc->id); + ret = -EBUSY; + goto err_res_free; + } + } 
+ + uc->dma_dev = dmaengine_get_dma_device(chan); + uc->hdesc_pool = dma_pool_create(uc->name, uc->dma_dev, + uc->config.hdesc_size, ud->desc_align, + 0); + if (!uc->hdesc_pool) { + dev_err(ud->ddev.dev, + "Descriptor pool allocation failed\n"); + uc->use_dma_pool = false; + ret = -ENOMEM; + goto err_res_free; + } + + uc->use_dma_pool = true; + + /* PSI-L pairing */ + ret = navss_psil_pair(ud, uc->config.src_thread, uc->config.dst_thread); + if (ret) { + dev_err(ud->dev, "PSI-L pairing failed: 0x%04x -> 0x%04x\n", + uc->config.src_thread, uc->config.dst_thread); + goto err_res_free; + } + + uc->psil_paired = true; + + uc->irq_num_ring = ti_sci_inta_msi_get_virq(ud->dev, irq_ring_idx); + if (uc->irq_num_ring <= 0) { + dev_err(ud->dev, "Failed to get ring irq (index: %u)\n", + irq_ring_idx); + ret = -EINVAL; + goto err_psi_free; + } + + ret = request_irq(uc->irq_num_ring, udma_ring_irq_handler, + IRQF_TRIGGER_HIGH, uc->name, uc); + if (ret) { + dev_err(ud->dev, "chan%d: ring irq request failed\n", uc->id); + goto err_irq_free; + } + + uc->irq_num_udma = 0; + + udma_reset_rings(uc); + + INIT_DELAYED_WORK_ONSTACK(&uc->tx_drain.work, + udma_check_tx_completion); + + if (uc->tchan) + dev_dbg(ud->dev, + "chan%d: tchan%d, tflow%d, Remote thread: 0x%04x\n", + uc->id, uc->tchan->id, uc->tchan->tflow_id, + uc->config.remote_thread_id); + else if (uc->rchan) + dev_dbg(ud->dev, + "chan%d: rchan%d, rflow%d, Remote thread: 0x%04x\n", + uc->id, uc->rchan->id, uc->rflow->id, + uc->config.remote_thread_id); + return 0; + +err_irq_free: + uc->irq_num_ring = 0; +err_psi_free: + navss_psil_unpair(ud, uc->config.src_thread, uc->config.dst_thread); + uc->psil_paired = false; +err_res_free: + udma_free_tx_resources(uc); + udma_free_rx_resources(uc); + + udma_reset_uchan(uc); + + dma_pool_destroy(uc->hdesc_pool); + uc->use_dma_pool = false; + + return ret; +} + static int udma_slave_config(struct dma_chan *chan, struct dma_slave_config *cfg) { @@ -2028,6 +2800,7 @@ 
udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl, size_t tr_size; int num_tr = 0; int tr_idx = 0; + u64 asel; /* estimate the number of TRs we will need */ for_each_sg(sgl, sgent, sglen, i) { @@ -2045,6 +2818,11 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl, d->sglen = sglen; + if (uc->ud->match_data->type == DMA_TYPE_UDMA) + asel = 0; + else + asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT; + tr_req = d->hwdesc[0].tr_req_base; for_each_sg(sgl, sgent, sglen, i) { dma_addr_t sg_addr = sg_dma_address(sgent); @@ -2063,6 +2841,7 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl, false, CPPI5_TR_EVENT_SIZE_COMPLETION, 0); cppi5_tr_csf_set(&tr_req[tr_idx].flags, CPPI5_TR_CSF_SUPR_EVT); + sg_addr |= asel; tr_req[tr_idx].addr = sg_addr; tr_req[tr_idx].icnt0 = tr0_cnt0; tr_req[tr_idx].icnt1 = tr0_cnt1; @@ -2092,6 +2871,205 @@ udma_prep_slave_sg_tr(struct udma_chan *uc, struct scatterlist *sgl, return d; } +static struct udma_desc * +udma_prep_slave_sg_triggered_tr(struct udma_chan *uc, struct scatterlist *sgl, + unsigned int sglen, + enum dma_transfer_direction dir, + unsigned long tx_flags, void *context) +{ + struct scatterlist *sgent; + struct cppi5_tr_type15_t *tr_req = NULL; + enum dma_slave_buswidth dev_width; + u16 tr_cnt0, tr_cnt1; + dma_addr_t dev_addr; + struct udma_desc *d; + unsigned int i; + size_t tr_size, sg_len; + int num_tr = 0; + int tr_idx = 0; + u32 burst, trigger_size, port_window; + u64 asel; + + if (dir == DMA_DEV_TO_MEM) { + dev_addr = uc->cfg.src_addr; + dev_width = uc->cfg.src_addr_width; + burst = uc->cfg.src_maxburst; + port_window = uc->cfg.src_port_window_size; + } else if (dir == DMA_MEM_TO_DEV) { + dev_addr = uc->cfg.dst_addr; + dev_width = uc->cfg.dst_addr_width; + burst = uc->cfg.dst_maxburst; + port_window = uc->cfg.dst_port_window_size; + } else { + dev_err(uc->ud->dev, "%s: bad direction?\n", __func__); + return NULL; + } + + if (!burst) + burst = 1; + + if 
(port_window) { + if (port_window != burst) { + dev_err(uc->ud->dev, + "The burst must be equal to port_window\n"); + return NULL; + } + + tr_cnt0 = dev_width * port_window; + tr_cnt1 = 1; + } else { + tr_cnt0 = dev_width; + tr_cnt1 = burst; + } + trigger_size = tr_cnt0 * tr_cnt1; + + /* estimate the number of TRs we will need */ + for_each_sg(sgl, sgent, sglen, i) { + sg_len = sg_dma_len(sgent); + + if (sg_len % trigger_size) { + dev_err(uc->ud->dev, + "Not aligned SG entry (%zu for %u)\n", sg_len, + trigger_size); + return NULL; + } + + if (sg_len / trigger_size < SZ_64K) + num_tr++; + else + num_tr += 2; + } + + /* Now allocate and setup the descriptor. */ + tr_size = sizeof(struct cppi5_tr_type15_t); + d = udma_alloc_tr_desc(uc, tr_size, num_tr, dir); + if (!d) + return NULL; + + d->sglen = sglen; + + if (uc->ud->match_data->type == DMA_TYPE_UDMA) { + asel = 0; + } else { + asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT; + dev_addr |= asel; + } + + tr_req = d->hwdesc[0].tr_req_base; + for_each_sg(sgl, sgent, sglen, i) { + u16 tr0_cnt2, tr0_cnt3, tr1_cnt2; + dma_addr_t sg_addr = sg_dma_address(sgent); + + sg_len = sg_dma_len(sgent); + num_tr = udma_get_tr_counters(sg_len / trigger_size, 0, + &tr0_cnt2, &tr0_cnt3, &tr1_cnt2); + if (num_tr < 0) { + dev_err(uc->ud->dev, "size %zu is not supported\n", + sg_len); + udma_free_hwdesc(uc, d); + kfree(d); + return NULL; + } + + cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15, false, + true, CPPI5_TR_EVENT_SIZE_COMPLETION, 0); + cppi5_tr_csf_set(&tr_req[tr_idx].flags, CPPI5_TR_CSF_SUPR_EVT); + cppi5_tr_set_trigger(&tr_req[tr_idx].flags, + uc->config.tr_trigger_type, + CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, 0, 0); + + sg_addr |= asel; + if (dir == DMA_DEV_TO_MEM) { + tr_req[tr_idx].addr = dev_addr; + tr_req[tr_idx].icnt0 = tr_cnt0; + tr_req[tr_idx].icnt1 = tr_cnt1; + tr_req[tr_idx].icnt2 = tr0_cnt2; + tr_req[tr_idx].icnt3 = tr0_cnt3; + tr_req[tr_idx].dim1 = (-1) * tr_cnt0; + + tr_req[tr_idx].daddr = sg_addr; + 
tr_req[tr_idx].dicnt0 = tr_cnt0; + tr_req[tr_idx].dicnt1 = tr_cnt1; + tr_req[tr_idx].dicnt2 = tr0_cnt2; + tr_req[tr_idx].dicnt3 = tr0_cnt3; + tr_req[tr_idx].ddim1 = tr_cnt0; + tr_req[tr_idx].ddim2 = trigger_size; + tr_req[tr_idx].ddim3 = trigger_size * tr0_cnt2; + } else { + tr_req[tr_idx].addr = sg_addr; + tr_req[tr_idx].icnt0 = tr_cnt0; + tr_req[tr_idx].icnt1 = tr_cnt1; + tr_req[tr_idx].icnt2 = tr0_cnt2; + tr_req[tr_idx].icnt3 = tr0_cnt3; + tr_req[tr_idx].dim1 = tr_cnt0; + tr_req[tr_idx].dim2 = trigger_size; + tr_req[tr_idx].dim3 = trigger_size * tr0_cnt2; + + tr_req[tr_idx].daddr = dev_addr; + tr_req[tr_idx].dicnt0 = tr_cnt0; + tr_req[tr_idx].dicnt1 = tr_cnt1; + tr_req[tr_idx].dicnt2 = tr0_cnt2; + tr_req[tr_idx].dicnt3 = tr0_cnt3; + tr_req[tr_idx].ddim1 = (-1) * tr_cnt0; + } + + tr_idx++; + + if (num_tr == 2) { + cppi5_tr_init(&tr_req[tr_idx].flags, CPPI5_TR_TYPE15, + false, true, + CPPI5_TR_EVENT_SIZE_COMPLETION, 0); + cppi5_tr_csf_set(&tr_req[tr_idx].flags, + CPPI5_TR_CSF_SUPR_EVT); + cppi5_tr_set_trigger(&tr_req[tr_idx].flags, + uc->config.tr_trigger_type, + CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, + 0, 0); + + sg_addr += trigger_size * tr0_cnt2 * tr0_cnt3; + if (dir == DMA_DEV_TO_MEM) { + tr_req[tr_idx].addr = dev_addr; + tr_req[tr_idx].icnt0 = tr_cnt0; + tr_req[tr_idx].icnt1 = tr_cnt1; + tr_req[tr_idx].icnt2 = tr1_cnt2; + tr_req[tr_idx].icnt3 = 1; + tr_req[tr_idx].dim1 = (-1) * tr_cnt0; + + tr_req[tr_idx].daddr = sg_addr; + tr_req[tr_idx].dicnt0 = tr_cnt0; + tr_req[tr_idx].dicnt1 = tr_cnt1; + tr_req[tr_idx].dicnt2 = tr1_cnt2; + tr_req[tr_idx].dicnt3 = 1; + tr_req[tr_idx].ddim1 = tr_cnt0; + tr_req[tr_idx].ddim2 = trigger_size; + } else { + tr_req[tr_idx].addr = sg_addr; + tr_req[tr_idx].icnt0 = tr_cnt0; + tr_req[tr_idx].icnt1 = tr_cnt1; + tr_req[tr_idx].icnt2 = tr1_cnt2; + tr_req[tr_idx].icnt3 = 1; + tr_req[tr_idx].dim1 = tr_cnt0; + tr_req[tr_idx].dim2 = trigger_size; + + tr_req[tr_idx].daddr = dev_addr; + tr_req[tr_idx].dicnt0 = tr_cnt0; + tr_req[tr_idx].dicnt1 = 
tr_cnt1; + tr_req[tr_idx].dicnt2 = tr1_cnt2; + tr_req[tr_idx].dicnt3 = 1; + tr_req[tr_idx].ddim1 = (-1) * tr_cnt0; + } + tr_idx++; + } + + d->residue += sg_len; + } + + cppi5_tr_csf_set(&tr_req[tr_idx - 1].flags, + CPPI5_TR_CSF_SUPR_EVT | CPPI5_TR_CSF_EOP); + + return d; +} + static int udma_configure_statictr(struct udma_chan *uc, struct udma_desc *d, enum dma_slave_buswidth dev_width, u16 elcnt) @@ -2156,6 +3134,7 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl, struct udma_desc *d; u32 ring_id; unsigned int i; + u64 asel; d = kzalloc(struct_size(d, hwdesc, sglen), GFP_NOWAIT); if (!d) @@ -2169,6 +3148,11 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl, else ring_id = k3_ringacc_get_ring_id(uc->tchan->tc_ring); + if (uc->ud->match_data->type == DMA_TYPE_UDMA) + asel = 0; + else + asel = (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT; + for_each_sg(sgl, sgent, sglen, i) { struct udma_hwdesc *hwdesc = &d->hwdesc[i]; dma_addr_t sg_addr = sg_dma_address(sgent); @@ -2203,14 +3187,16 @@ udma_prep_slave_sg_pkt(struct udma_chan *uc, struct scatterlist *sgl, } /* attach the sg buffer to the descriptor */ + sg_addr |= asel; cppi5_hdesc_attach_buf(desc, sg_addr, sg_len, sg_addr, sg_len); /* Attach link as host buffer descriptor */ if (h_desc) cppi5_hdesc_link_hbdesc(h_desc, - hwdesc->cppi5_desc_paddr); + hwdesc->cppi5_desc_paddr | asel); - if (dir == DMA_MEM_TO_DEV) + if (uc->ud->match_data->type == DMA_TYPE_PKTDMA || + dir == DMA_MEM_TO_DEV) h_desc = desc; } @@ -2333,7 +3319,8 @@ udma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct udma_desc *d; u32 burst; - if (dir != uc->config.dir) { + if (dir != uc->config.dir && + (uc->config.dir == DMA_MEM_TO_MEM && !uc->config.tr_trigger_type)) { dev_err(chan->device->dev, "%s: chan%d is for %s, not supporting %s\n", __func__, uc->id, @@ -2359,9 +3346,12 @@ udma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, if (uc->config.pkt_mode) d = 
udma_prep_slave_sg_pkt(uc, sgl, sglen, dir, tx_flags, context); - else + else if (is_slave_direction(uc->config.dir)) d = udma_prep_slave_sg_tr(uc, sgl, sglen, dir, tx_flags, context); + else + d = udma_prep_slave_sg_triggered_tr(uc, sgl, sglen, dir, + tx_flags, context); if (!d) return NULL; @@ -2415,7 +3405,12 @@ udma_prep_dma_cyclic_tr(struct udma_chan *uc, dma_addr_t buf_addr, return NULL; tr_req = d->hwdesc[0].tr_req_base; - period_addr = buf_addr; + if (uc->ud->match_data->type == DMA_TYPE_UDMA) + period_addr = buf_addr; + else + period_addr = buf_addr | + ((u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT); + for (i = 0; i < periods; i++) { int tr_idx = i * num_tr; @@ -2480,6 +3475,9 @@ udma_prep_dma_cyclic_pkt(struct udma_chan *uc, dma_addr_t buf_addr, else ring_id = k3_ringacc_get_ring_id(uc->tchan->tc_ring); + if (uc->ud->match_data->type != DMA_TYPE_UDMA) + buf_addr |= (u64)uc->config.asel << K3_ADDRESS_ASEL_SHIFT; + for (i = 0; i < periods; i++) { struct udma_hwdesc *hwdesc = &d->hwdesc[i]; dma_addr_t period_addr = buf_addr + (period_len * i); @@ -2621,6 +3619,11 @@ udma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, d->tr_idx = 0; d->residue = len; + if (uc->ud->match_data->type != DMA_TYPE_UDMA) { + src |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT; + dest |= (u64)uc->ud->asel << K3_ADDRESS_ASEL_SHIFT; + } + tr_req = d->hwdesc[0].tr_req_base; cppi5_tr_init(&tr_req[0].flags, CPPI5_TR_TYPE15, false, true, @@ -2978,6 +3981,7 @@ static void udma_free_chan_resources(struct dma_chan *chan) vchan_free_chan_resources(&uc->vc); tasklet_kill(&uc->vc.task); + bcdma_free_bchan_resources(uc); udma_free_tx_resources(uc); udma_free_rx_resources(uc); udma_reset_uchan(uc); @@ -2989,10 +3993,14 @@ static void udma_free_chan_resources(struct dma_chan *chan) } static struct platform_driver udma_driver; +static struct platform_driver bcdma_driver; +static struct platform_driver pktdma_driver; struct udma_filter_param { int remote_thread_id; u32 atype; 
+ u32 asel; + u32 tr_trigger_type; }; static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) @@ -3003,7 +4011,9 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) struct udma_chan *uc; struct udma_dev *ud; - if (chan->device->dev->driver != &udma_driver.driver) + if (chan->device->dev->driver != &udma_driver.driver && + chan->device->dev->driver != &bcdma_driver.driver && + chan->device->dev->driver != &pktdma_driver.driver) return false; uc = to_udma_chan(chan); @@ -3017,13 +4027,25 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) return false; } + if (filter_param->asel > 15) { + dev_err(ud->dev, "Invalid channel asel: %u\n", + filter_param->asel); + return false; + } + ucc->remote_thread_id = filter_param->remote_thread_id; ucc->atype = filter_param->atype; + ucc->asel = filter_param->asel; + ucc->tr_trigger_type = filter_param->tr_trigger_type; - if (ucc->remote_thread_id & K3_PSIL_DST_THREAD_ID_OFFSET) + if (ucc->tr_trigger_type) { + ucc->dir = DMA_MEM_TO_MEM; + goto triggered_bchan; + } else if (ucc->remote_thread_id & K3_PSIL_DST_THREAD_ID_OFFSET) { ucc->dir = DMA_MEM_TO_DEV; - else + } else { ucc->dir = DMA_DEV_TO_MEM; + } ep_config = psil_get_ep_config(ucc->remote_thread_id); if (IS_ERR(ep_config)) { @@ -3032,6 +4054,19 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) ucc->dir = DMA_MEM_TO_MEM; ucc->remote_thread_id = -1; ucc->atype = 0; + ucc->asel = 0; + return false; + } + + if (ud->match_data->type == DMA_TYPE_BCDMA && + ep_config->pkt_mode) { + dev_err(ud->dev, + "Only TR mode is supported (psi-l thread 0x%04x)\n", + ucc->remote_thread_id); + ucc->dir = DMA_MEM_TO_MEM; + ucc->remote_thread_id = -1; + ucc->atype = 0; + ucc->asel = 0; return false; } @@ -3040,6 +4075,15 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) ucc->notdpkt = ep_config->notdpkt; ucc->ep_type = ep_config->ep_type; + if (ud->match_data->type == DMA_TYPE_PKTDMA && + 
ep_config->mapped_channel_id >= 0) { + ucc->mapped_channel_id = ep_config->mapped_channel_id; + ucc->default_flow_id = ep_config->default_flow_id; + } else { + ucc->mapped_channel_id = -1; + ucc->default_flow_id = -1; + } + if (ucc->ep_type != PSIL_EP_NATIVE) { const struct udma_match_data *match_data = ud->match_data; @@ -3063,6 +4107,13 @@ static bool udma_dma_filter_fn(struct dma_chan *chan, void *param) ucc->remote_thread_id, dmaengine_get_direction_text(ucc->dir)); return true; + +triggered_bchan: + dev_dbg(ud->dev, "chan%d: triggered channel (type: %u)\n", uc->id, + ucc->tr_trigger_type); + + return true; + } static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, @@ -3073,14 +4124,33 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, struct udma_filter_param filter_param; struct dma_chan *chan; - if (dma_spec->args_count != 1 && dma_spec->args_count != 2) - return NULL; + if (ud->match_data->type == DMA_TYPE_BCDMA) { + if (dma_spec->args_count != 3) + return NULL; - filter_param.remote_thread_id = dma_spec->args[0]; - if (dma_spec->args_count == 2) - filter_param.atype = dma_spec->args[1]; - else + filter_param.tr_trigger_type = dma_spec->args[0]; + filter_param.remote_thread_id = dma_spec->args[1]; + filter_param.asel = dma_spec->args[2]; filter_param.atype = 0; + } else { + if (dma_spec->args_count != 1 && dma_spec->args_count != 2) + return NULL; + + filter_param.remote_thread_id = dma_spec->args[0]; + filter_param.tr_trigger_type = 0; + if (dma_spec->args_count == 2) { + if (ud->match_data->type == DMA_TYPE_UDMA) { + filter_param.atype = dma_spec->args[1]; + filter_param.asel = 0; + } else { + filter_param.atype = 0; + filter_param.asel = dma_spec->args[1]; + } + } else { + filter_param.atype = 0; + filter_param.asel = 0; + } + } chan = __dma_request_channel(&mask, udma_dma_filter_fn, &filter_param, ofdma->of_node); @@ -3093,28 +4163,48 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, } 
static struct udma_match_data am654_main_data = { + .type = DMA_TYPE_UDMA, .psil_base = 0x1000, .enable_memcpy_support = true, .statictr_z_mask = GENMASK(11, 0), }; static struct udma_match_data am654_mcu_data = { + .type = DMA_TYPE_UDMA, .psil_base = 0x6000, .enable_memcpy_support = false, .statictr_z_mask = GENMASK(11, 0), }; static struct udma_match_data j721e_main_data = { + .type = DMA_TYPE_UDMA, .psil_base = 0x1000, .enable_memcpy_support = true, - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST, + .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, .statictr_z_mask = GENMASK(23, 0), }; static struct udma_match_data j721e_mcu_data = { + .type = DMA_TYPE_UDMA, .psil_base = 0x6000, .enable_memcpy_support = false, /* MEM_TO_MEM is slow via MCU UDMA */ - .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST, + .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .statictr_z_mask = GENMASK(23, 0), +}; + +static struct udma_match_data am64_bcdma_data = { + .type = DMA_TYPE_BCDMA, + .psil_base = 0x2000, /* for tchan and rchan, not applicable to bchan */ + .enable_memcpy_support = true, /* Supported via bchan */ + .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, + .statictr_z_mask = GENMASK(23, 0), +}; + +static struct udma_match_data am64_pktdma_data = { + .type = DMA_TYPE_PKTDMA, + .psil_base = 0x1000, + .enable_memcpy_support = false, /* PKTDMA does not support MEM_TO_MEM */ + .flags = UDMA_FLAG_PDMA_ACC32 | UDMA_FLAG_PDMA_BURST | UDMA_FLAG_TDTYPE, .statictr_z_mask = GENMASK(23, 0), }; @@ -3136,30 +4226,105 @@ static const struct of_device_id udma_of_match[] = { { /* Sentinel */ }, }; +static const struct of_device_id bcdma_of_match[] = { + { + .compatible = "ti,am64-dmss-bcdma", + .data = &am64_bcdma_data, + }, + { /* Sentinel */ }, +}; + +static const struct of_device_id pktdma_of_match[] = { + { + .compatible = "ti,am64-dmss-pktdma", + .data = &am64_pktdma_data, + }, + { /* Sentinel */ }, 
+}; + static struct udma_soc_data am654_soc_data = { - .rchan_oes_offset = 0x200, + .oes = { + .udma_rchan = 0x200, + }, }; static struct udma_soc_data j721e_soc_data = { - .rchan_oes_offset = 0x400, + .oes = { + .udma_rchan = 0x400, + }, }; static struct udma_soc_data j7200_soc_data = { - .rchan_oes_offset = 0x80, + .oes = { + .udma_rchan = 0x80, + }, +}; + +static struct udma_soc_data am64_soc_data = { + .oes = { + .bcdma_bchan_data = 0x2200, + .bcdma_bchan_ring = 0x2400, + .bcdma_tchan_data = 0x2800, + .bcdma_tchan_ring = 0x2a00, + .bcdma_rchan_data = 0x2e00, + .bcdma_rchan_ring = 0x3000, + .pktdma_tchan_flow = 0x1200, + .pktdma_rchan_flow = 0x1600, + }, + .bcdma_trigger_event_offset = 0xc400, }; static const struct soc_device_attribute k3_soc_devices[] = { { .family = "AM65X", .data = &am654_soc_data }, { .family = "J721E", .data = &j721e_soc_data }, { .family = "J7200", .data = &j7200_soc_data }, + { .family = "AM64X", .data = &am64_soc_data }, { /* sentinel */ } }; static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud) { + u32 cap2, cap3, cap4; int i; - for (i = 0; i < MMR_LAST; i++) { + ud->mmrs[MMR_GCFG] = devm_platform_ioremap_resource_byname(pdev, mmr_names[MMR_GCFG]); + if (IS_ERR(ud->mmrs[MMR_GCFG])) + return PTR_ERR(ud->mmrs[MMR_GCFG]); + + cap2 = udma_read(ud->mmrs[MMR_GCFG], 0x28); + cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c); + + switch (ud->match_data->type) { + case DMA_TYPE_UDMA: + ud->rflow_cnt = UDMA_CAP3_RFLOW_CNT(cap3); + ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2); + ud->echan_cnt = UDMA_CAP2_ECHAN_CNT(cap2); + ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2); + break; + case DMA_TYPE_BCDMA: + ud->bchan_cnt = BCDMA_CAP2_BCHAN_CNT(cap2); + ud->tchan_cnt = BCDMA_CAP2_TCHAN_CNT(cap2); + ud->rchan_cnt = BCDMA_CAP2_RCHAN_CNT(cap2); + break; + case DMA_TYPE_PKTDMA: + cap4 = udma_read(ud->mmrs[MMR_GCFG], 0x30); + ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2); + ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2); + ud->rflow_cnt = 
UDMA_CAP3_RFLOW_CNT(cap3); + ud->tflow_cnt = PKTDMA_CAP4_TFLOW_CNT(cap4); + break; + default: + return -EINVAL; + } + + for (i = 1; i < MMR_LAST; i++) { + if (i == MMR_BCHANRT && ud->bchan_cnt == 0) + continue; + if (i == MMR_TCHANRT && ud->tchan_cnt == 0) + continue; + if (i == MMR_RCHANRT && ud->rchan_cnt == 0) + continue; + ud->mmrs[i] = devm_platform_ioremap_resource_byname(pdev, mmr_names[i]); if (IS_ERR(ud->mmrs[i])) return PTR_ERR(ud->mmrs[i]); @@ -3168,48 +4333,58 @@ static int udma_get_mmrs(struct platform_device *pdev, struct udma_dev *ud) return 0; } +static void udma_mark_resource_ranges(struct udma_dev *ud, unsigned long *map, + struct ti_sci_resource_desc *rm_desc, + char *name) +{ + bitmap_clear(map, rm_desc->start, rm_desc->num); + bitmap_clear(map, rm_desc->start_sec, rm_desc->num_sec); + dev_dbg(ud->dev, "ti_sci resource range for %s: %d:%d | %d:%d\n", name, + rm_desc->start, rm_desc->num, rm_desc->start_sec, + rm_desc->num_sec); +} + +static const char * const range_names[] = { + [RM_RANGE_BCHAN] = "ti,sci-rm-range-bchan", + [RM_RANGE_TCHAN] = "ti,sci-rm-range-tchan", + [RM_RANGE_RCHAN] = "ti,sci-rm-range-rchan", + [RM_RANGE_RFLOW] = "ti,sci-rm-range-rflow", + [RM_RANGE_TFLOW] = "ti,sci-rm-range-tflow", +}; + static int udma_setup_resources(struct udma_dev *ud) { + int ret, i, j; struct device *dev = ud->dev; - int ch_count, ret, i, j; - u32 cap2, cap3; - struct ti_sci_resource_desc *rm_desc; struct ti_sci_resource *rm_res, irq_res; struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; - static const char * const range_names[] = { "ti,sci-rm-range-tchan", - "ti,sci-rm-range-rchan", - "ti,sci-rm-range-rflow" }; - - cap2 = udma_read(ud->mmrs[MMR_GCFG], UDMA_CAP_REG(2)); - cap3 = udma_read(ud->mmrs[MMR_GCFG], UDMA_CAP_REG(3)); - - ud->rflow_cnt = UDMA_CAP3_RFLOW_CNT(cap3); - ud->tchan_cnt = UDMA_CAP2_TCHAN_CNT(cap2); - ud->echan_cnt = UDMA_CAP2_ECHAN_CNT(cap2); - ud->rchan_cnt = UDMA_CAP2_RCHAN_CNT(cap2); - ch_count = ud->tchan_cnt + ud->rchan_cnt; + u32 
cap3; /* Set up the throughput level start indexes */ + cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c); if (of_device_is_compatible(dev->of_node, "ti,am654-navss-main-udmap")) { - ud->tpl_levels = 2; - ud->tpl_start_idx[0] = 8; + ud->tchan_tpl.levels = 2; + ud->tchan_tpl.start_idx[0] = 8; } else if (of_device_is_compatible(dev->of_node, "ti,am654-navss-mcu-udmap")) { - ud->tpl_levels = 2; - ud->tpl_start_idx[0] = 2; + ud->tchan_tpl.levels = 2; + ud->tchan_tpl.start_idx[0] = 2; } else if (UDMA_CAP3_UCHAN_CNT(cap3)) { - ud->tpl_levels = 3; - ud->tpl_start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3); - ud->tpl_start_idx[0] = ud->tpl_start_idx[1] + - UDMA_CAP3_HCHAN_CNT(cap3); + ud->tchan_tpl.levels = 3; + ud->tchan_tpl.start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3); + ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3); } else if (UDMA_CAP3_HCHAN_CNT(cap3)) { - ud->tpl_levels = 2; - ud->tpl_start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3); + ud->tchan_tpl.levels = 2; + ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3); } else { - ud->tpl_levels = 1; + ud->tchan_tpl.levels = 1; } + ud->rchan_tpl.levels = ud->tchan_tpl.levels; + ud->rchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0]; + ud->rchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1]; + ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt), sizeof(unsigned long), GFP_KERNEL); ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans), @@ -3247,11 +4422,15 @@ static int udma_setup_resources(struct udma_dev *ud) bitmap_set(ud->rflow_gp_map, 0, ud->rflow_cnt); /* Get resource ranges from tisci */ - for (i = 0; i < RM_RANGE_LAST; i++) + for (i = 0; i < RM_RANGE_LAST; i++) { + if (i == RM_RANGE_BCHAN || i == RM_RANGE_TFLOW) + continue; + tisci_rm->rm_ranges[i] = devm_ti_sci_get_of_resource(tisci_rm->tisci, dev, tisci_rm->tisci_dev_id, (char *)range_names[i]); + } /* tchan ranges */ rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN]; @@ -3259,13 +4438,9 @@ static int udma_setup_resources(struct udma_dev *ud) 
bitmap_zero(ud->tchan_map, ud->tchan_cnt); } else { bitmap_fill(ud->tchan_map, ud->tchan_cnt); - for (i = 0; i < rm_res->sets; i++) { - rm_desc = &rm_res->desc[i]; - bitmap_clear(ud->tchan_map, rm_desc->start, - rm_desc->num); - dev_dbg(dev, "ti-sci-res: tchan: %d:%d\n", - rm_desc->start, rm_desc->num); - } + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->tchan_map, + &rm_res->desc[i], "tchan"); } irq_res.sets = rm_res->sets; @@ -3275,13 +4450,9 @@ static int udma_setup_resources(struct udma_dev *ud) bitmap_zero(ud->rchan_map, ud->rchan_cnt); } else { bitmap_fill(ud->rchan_map, ud->rchan_cnt); - for (i = 0; i < rm_res->sets; i++) { - rm_desc = &rm_res->desc[i]; - bitmap_clear(ud->rchan_map, rm_desc->start, - rm_desc->num); - dev_dbg(dev, "ti-sci-res: rchan: %d:%d\n", - rm_desc->start, rm_desc->num); - } + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->rchan_map, + &rm_res->desc[i], "rchan"); } irq_res.sets += rm_res->sets; @@ -3290,12 +4461,21 @@ static int udma_setup_resources(struct udma_dev *ud) for (i = 0; i < rm_res->sets; i++) { irq_res.desc[i].start = rm_res->desc[i].start; irq_res.desc[i].num = rm_res->desc[i].num; + irq_res.desc[i].start_sec = rm_res->desc[i].start_sec; + irq_res.desc[i].num_sec = rm_res->desc[i].num_sec; } rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN]; for (j = 0; j < rm_res->sets; j++, i++) { - irq_res.desc[i].start = rm_res->desc[j].start + - ud->soc_data->rchan_oes_offset; - irq_res.desc[i].num = rm_res->desc[j].num; + if (rm_res->desc[j].num) { + irq_res.desc[i].start = rm_res->desc[j].start + + ud->soc_data->oes.udma_rchan; + irq_res.desc[i].num = rm_res->desc[j].num; + } + if (rm_res->desc[j].num_sec) { + irq_res.desc[i].start_sec = rm_res->desc[j].start_sec + + ud->soc_data->oes.udma_rchan; + irq_res.desc[i].num_sec = rm_res->desc[j].num_sec; + } } ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res); kfree(irq_res.desc); @@ -3311,15 +4491,344 @@ static int 
udma_setup_resources(struct udma_dev *ud) bitmap_clear(ud->rflow_gp_map, ud->rchan_cnt, ud->rflow_cnt - ud->rchan_cnt); } else { + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->rflow_gp_map, + &rm_res->desc[i], "gp-rflow"); + } + + return 0; +} + +static int bcdma_setup_resources(struct udma_dev *ud) +{ + int ret, i, j; + struct device *dev = ud->dev; + struct ti_sci_resource *rm_res, irq_res; + struct udma_tisci_rm *tisci_rm = &ud->tisci_rm; + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + u32 cap; + + /* Set up the throughput level start indexes */ + cap = udma_read(ud->mmrs[MMR_GCFG], 0x2c); + if (BCDMA_CAP3_UBCHAN_CNT(cap)) { + ud->bchan_tpl.levels = 3; + ud->bchan_tpl.start_idx[1] = BCDMA_CAP3_UBCHAN_CNT(cap); + ud->bchan_tpl.start_idx[0] = BCDMA_CAP3_HBCHAN_CNT(cap); + } else if (BCDMA_CAP3_HBCHAN_CNT(cap)) { + ud->bchan_tpl.levels = 2; + ud->bchan_tpl.start_idx[0] = BCDMA_CAP3_HBCHAN_CNT(cap); + } else { + ud->bchan_tpl.levels = 1; + } + + cap = udma_read(ud->mmrs[MMR_GCFG], 0x30); + if (BCDMA_CAP4_URCHAN_CNT(cap)) { + ud->rchan_tpl.levels = 3; + ud->rchan_tpl.start_idx[1] = BCDMA_CAP4_URCHAN_CNT(cap); + ud->rchan_tpl.start_idx[0] = BCDMA_CAP4_HRCHAN_CNT(cap); + } else if (BCDMA_CAP4_HRCHAN_CNT(cap)) { + ud->rchan_tpl.levels = 2; + ud->rchan_tpl.start_idx[0] = BCDMA_CAP4_HRCHAN_CNT(cap); + } else { + ud->rchan_tpl.levels = 1; + } + + if (BCDMA_CAP4_UTCHAN_CNT(cap)) { + ud->tchan_tpl.levels = 3; + ud->tchan_tpl.start_idx[1] = BCDMA_CAP4_UTCHAN_CNT(cap); + ud->tchan_tpl.start_idx[0] = BCDMA_CAP4_HTCHAN_CNT(cap); + } else if (BCDMA_CAP4_HTCHAN_CNT(cap)) { + ud->tchan_tpl.levels = 2; + ud->tchan_tpl.start_idx[0] = BCDMA_CAP4_HTCHAN_CNT(cap); + } else { + ud->tchan_tpl.levels = 1; + } + + ud->bchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->bchan_cnt), + sizeof(unsigned long), GFP_KERNEL); + ud->bchans = devm_kcalloc(dev, ud->bchan_cnt, sizeof(*ud->bchans), + GFP_KERNEL); + ud->tchan_map = devm_kmalloc_array(dev, 
BITS_TO_LONGS(ud->tchan_cnt), + sizeof(unsigned long), GFP_KERNEL); + ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans), + GFP_KERNEL); + ud->rchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->rchan_cnt), + sizeof(unsigned long), GFP_KERNEL); + ud->rchans = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rchans), + GFP_KERNEL); + /* BCDMA do not really have flows, but the driver expect it */ + ud->rflow_in_use = devm_kcalloc(dev, BITS_TO_LONGS(ud->rchan_cnt), + sizeof(unsigned long), + GFP_KERNEL); + ud->rflows = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rflows), + GFP_KERNEL); + + if (!ud->bchan_map || !ud->tchan_map || !ud->rchan_map || + !ud->rflow_in_use || !ud->bchans || !ud->tchans || !ud->rchans || + !ud->rflows) + return -ENOMEM; + + /* Get resource ranges from tisci */ + for (i = 0; i < RM_RANGE_LAST; i++) { + if (i == RM_RANGE_RFLOW || i == RM_RANGE_TFLOW) + continue; + if (i == RM_RANGE_BCHAN && ud->bchan_cnt == 0) + continue; + if (i == RM_RANGE_TCHAN && ud->tchan_cnt == 0) + continue; + if (i == RM_RANGE_RCHAN && ud->rchan_cnt == 0) + continue; + + tisci_rm->rm_ranges[i] = + devm_ti_sci_get_of_resource(tisci_rm->tisci, dev, + tisci_rm->tisci_dev_id, + (char *)range_names[i]); + } + + irq_res.sets = 0; + + /* bchan ranges */ + if (ud->bchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_BCHAN]; + if (IS_ERR(rm_res)) { + bitmap_zero(ud->bchan_map, ud->bchan_cnt); + } else { + bitmap_fill(ud->bchan_map, ud->bchan_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->bchan_map, + &rm_res->desc[i], + "bchan"); + } + irq_res.sets += rm_res->sets; + } + + /* tchan ranges */ + if (ud->tchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN]; + if (IS_ERR(rm_res)) { + bitmap_zero(ud->tchan_map, ud->tchan_cnt); + } else { + bitmap_fill(ud->tchan_map, ud->tchan_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->tchan_map, + &rm_res->desc[i], + "tchan"); + } + irq_res.sets += rm_res->sets * 
2; + } + + /* rchan ranges */ + if (ud->rchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN]; + if (IS_ERR(rm_res)) { + bitmap_zero(ud->rchan_map, ud->rchan_cnt); + } else { + bitmap_fill(ud->rchan_map, ud->rchan_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->rchan_map, + &rm_res->desc[i], + "rchan"); + } + irq_res.sets += rm_res->sets * 2; + } + + irq_res.desc = kcalloc(irq_res.sets, sizeof(*irq_res.desc), GFP_KERNEL); + if (ud->bchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_BCHAN]; for (i = 0; i < rm_res->sets; i++) { - rm_desc = &rm_res->desc[i]; - bitmap_clear(ud->rflow_gp_map, rm_desc->start, - rm_desc->num); - dev_dbg(dev, "ti-sci-res: rflow: %d:%d\n", - rm_desc->start, rm_desc->num); + irq_res.desc[i].start = rm_res->desc[i].start + + oes->bcdma_bchan_ring; + irq_res.desc[i].num = rm_res->desc[i].num; + } + } + if (ud->tchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN]; + for (j = 0; j < rm_res->sets; j++, i += 2) { + irq_res.desc[i].start = rm_res->desc[j].start + + oes->bcdma_tchan_data; + irq_res.desc[i].num = rm_res->desc[j].num; + + irq_res.desc[i + 1].start = rm_res->desc[j].start + + oes->bcdma_tchan_ring; + irq_res.desc[i + 1].num = rm_res->desc[j].num; + } + } + if (ud->rchan_cnt) { + rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN]; + for (j = 0; j < rm_res->sets; j++, i += 2) { + irq_res.desc[i].start = rm_res->desc[j].start + + oes->bcdma_rchan_data; + irq_res.desc[i].num = rm_res->desc[j].num; + + irq_res.desc[i + 1].start = rm_res->desc[j].start + + oes->bcdma_rchan_ring; + irq_res.desc[i + 1].num = rm_res->desc[j].num; } } + ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res); + kfree(irq_res.desc); + if (ret) { + dev_err(ud->dev, "Failed to allocate MSI interrupts\n"); + return ret; + } + + return 0; +} + +static int pktdma_setup_resources(struct udma_dev *ud) +{ + int ret, i, j; + struct device *dev = ud->dev; + struct ti_sci_resource *rm_res, irq_res; + struct udma_tisci_rm *tisci_rm 
= &ud->tisci_rm; + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + u32 cap3; + + /* Set up the throughput level start indexes */ + cap3 = udma_read(ud->mmrs[MMR_GCFG], 0x2c); + if (UDMA_CAP3_UCHAN_CNT(cap3)) { + ud->tchan_tpl.levels = 3; + ud->tchan_tpl.start_idx[1] = UDMA_CAP3_UCHAN_CNT(cap3); + ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3); + } else if (UDMA_CAP3_HCHAN_CNT(cap3)) { + ud->tchan_tpl.levels = 2; + ud->tchan_tpl.start_idx[0] = UDMA_CAP3_HCHAN_CNT(cap3); + } else { + ud->tchan_tpl.levels = 1; + } + /* rchan throughput levels are copied from the tchan ones */ + ud->rchan_tpl.levels = ud->tchan_tpl.levels; + ud->rchan_tpl.start_idx[0] = ud->tchan_tpl.start_idx[0]; + ud->rchan_tpl.start_idx[1] = ud->tchan_tpl.start_idx[1]; + + ud->tchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tchan_cnt), + sizeof(unsigned long), GFP_KERNEL); + ud->tchans = devm_kcalloc(dev, ud->tchan_cnt, sizeof(*ud->tchans), + GFP_KERNEL); + ud->rchan_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->rchan_cnt), + sizeof(unsigned long), GFP_KERNEL); + ud->rchans = devm_kcalloc(dev, ud->rchan_cnt, sizeof(*ud->rchans), + GFP_KERNEL); + ud->rflow_in_use = devm_kcalloc(dev, BITS_TO_LONGS(ud->rflow_cnt), + sizeof(unsigned long), + GFP_KERNEL); + ud->rflows = devm_kcalloc(dev, ud->rflow_cnt, sizeof(*ud->rflows), + GFP_KERNEL); + ud->tflow_map = devm_kmalloc_array(dev, BITS_TO_LONGS(ud->tflow_cnt), + sizeof(unsigned long), GFP_KERNEL); + + if (!ud->tchan_map || !ud->rchan_map || !ud->tflow_map || !ud->tchans || + !ud->rchans || !ud->rflows || !ud->rflow_in_use) + return -ENOMEM; + + /* Get resource ranges from tisci */ + for (i = 0; i < RM_RANGE_LAST; i++) { + if (i == RM_RANGE_BCHAN) + continue; + + tisci_rm->rm_ranges[i] = + devm_ti_sci_get_of_resource(tisci_rm->tisci, dev, + tisci_rm->tisci_dev_id, + (char *)range_names[i]); + } + + /* tchan ranges */ + rm_res = tisci_rm->rm_ranges[RM_RANGE_TCHAN]; + if (IS_ERR(rm_res)) { + bitmap_zero(ud->tchan_map, ud->tchan_cnt); + } else { + bitmap_fill(ud->tchan_map, ud->tchan_cnt); +
for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->tchan_map, + &rm_res->desc[i], "tchan"); + } + + /* rchan ranges */ + rm_res = tisci_rm->rm_ranges[RM_RANGE_RCHAN]; + if (IS_ERR(rm_res)) { + bitmap_zero(ud->rchan_map, ud->rchan_cnt); + } else { + bitmap_fill(ud->rchan_map, ud->rchan_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->rchan_map, + &rm_res->desc[i], "rchan"); + } + + /* rflow ranges */ + rm_res = tisci_rm->rm_ranges[RM_RANGE_RFLOW]; + if (IS_ERR(rm_res)) { + /* all rflows are assigned exclusively to Linux */ + bitmap_zero(ud->rflow_in_use, ud->rflow_cnt); + } else { + bitmap_fill(ud->rflow_in_use, ud->rflow_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->rflow_in_use, + &rm_res->desc[i], "rflow"); + } + irq_res.sets = rm_res->sets; + + /* tflow ranges */ + rm_res = tisci_rm->rm_ranges[RM_RANGE_TFLOW]; + if (IS_ERR(rm_res)) { + /* all tflows are assigned exclusively to Linux */ + bitmap_zero(ud->tflow_map, ud->tflow_cnt); + } else { + bitmap_fill(ud->tflow_map, ud->tflow_cnt); + for (i = 0; i < rm_res->sets; i++) + udma_mark_resource_ranges(ud, ud->tflow_map, + &rm_res->desc[i], "tflow"); + } + irq_res.sets += rm_res->sets; + + irq_res.desc = kcalloc(irq_res.sets, sizeof(*irq_res.desc), GFP_KERNEL); + rm_res = tisci_rm->rm_ranges[RM_RANGE_TFLOW]; + for (i = 0; i < rm_res->sets; i++) { + irq_res.desc[i].start = rm_res->desc[i].start + + oes->pktdma_tchan_flow; + irq_res.desc[i].num = rm_res->desc[i].num; + } + rm_res = tisci_rm->rm_ranges[RM_RANGE_RFLOW]; + for (j = 0; j < rm_res->sets; j++, i++) { + irq_res.desc[i].start = rm_res->desc[j].start + + oes->pktdma_rchan_flow; + irq_res.desc[i].num = rm_res->desc[j].num; + } + ret = ti_sci_inta_msi_domain_alloc_irqs(ud->dev, &irq_res); + kfree(irq_res.desc); + if (ret) { + dev_err(ud->dev, "Failed to allocate MSI interrupts\n"); + return ret; + } + + return 0; +} + +static int setup_resources(struct udma_dev *ud) +{ + struct 
device *dev = ud->dev; + int ch_count, ret; + + switch (ud->match_data->type) { + case DMA_TYPE_UDMA: + ret = udma_setup_resources(ud); + break; + case DMA_TYPE_BCDMA: + ret = bcdma_setup_resources(ud); + break; + case DMA_TYPE_PKTDMA: + ret = pktdma_setup_resources(ud); + break; + default: + return -EINVAL; + } + + if (ret) + return ret; + + ch_count = ud->bchan_cnt + ud->tchan_cnt + ud->rchan_cnt; + if (ud->bchan_cnt) + ch_count -= bitmap_weight(ud->bchan_map, ud->bchan_cnt); ch_count -= bitmap_weight(ud->tchan_map, ud->tchan_cnt); ch_count -= bitmap_weight(ud->rchan_map, ud->rchan_cnt); if (!ch_count) @@ -3330,12 +4839,40 @@ static int udma_setup_resources(struct udma_dev *ud) if (!ud->channels) return -ENOMEM; - dev_info(dev, "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n", - ch_count, - ud->tchan_cnt - bitmap_weight(ud->tchan_map, ud->tchan_cnt), - ud->rchan_cnt - bitmap_weight(ud->rchan_map, ud->rchan_cnt), - ud->rflow_cnt - bitmap_weight(ud->rflow_gp_map, - ud->rflow_cnt)); + switch (ud->match_data->type) { + case DMA_TYPE_UDMA: + dev_info(dev, + "Channels: %d (tchan: %u, rchan: %u, gp-rflow: %u)\n", + ch_count, + ud->tchan_cnt - bitmap_weight(ud->tchan_map, + ud->tchan_cnt), + ud->rchan_cnt - bitmap_weight(ud->rchan_map, + ud->rchan_cnt), + ud->rflow_cnt - bitmap_weight(ud->rflow_gp_map, + ud->rflow_cnt)); + break; + case DMA_TYPE_BCDMA: + dev_info(dev, + "Channels: %d (bchan: %u, tchan: %u, rchan: %u)\n", + ch_count, + ud->bchan_cnt - bitmap_weight(ud->bchan_map, + ud->bchan_cnt), + ud->tchan_cnt - bitmap_weight(ud->tchan_map, + ud->tchan_cnt), + ud->rchan_cnt - bitmap_weight(ud->rchan_map, + ud->rchan_cnt)); + break; + case DMA_TYPE_PKTDMA: + dev_info(dev, + "Channels: %d (tchan: %u, rchan: %u)\n", + ch_count, + ud->tchan_cnt - bitmap_weight(ud->tchan_map, + ud->tchan_cnt), + ud->rchan_cnt - bitmap_weight(ud->rchan_map, + ud->rchan_cnt)); + default: + break; + } return ch_count; } @@ -3444,20 +4981,33 @@ static void udma_dbg_summary_show_chan(struct 
seq_file *s, seq_printf(s, " %-13s| %s", dma_chan_name(chan), chan->dbg_client_name ?: "in-use"); - seq_printf(s, " (%s, ", dmaengine_get_direction_text(uc->config.dir)); + if (ucc->tr_trigger_type) + seq_puts(s, " (triggered, "); + else + seq_printf(s, " (%s, ", + dmaengine_get_direction_text(uc->config.dir)); switch (uc->config.dir) { case DMA_MEM_TO_MEM: + if (uc->ud->match_data->type == DMA_TYPE_BCDMA) { + seq_printf(s, "bchan%d)\n", uc->bchan->id); + return; + } + seq_printf(s, "chan%d pair [0x%04x -> 0x%04x], ", uc->tchan->id, ucc->src_thread, ucc->dst_thread); break; case DMA_DEV_TO_MEM: seq_printf(s, "rchan%d [0x%04x -> 0x%04x], ", uc->rchan->id, ucc->src_thread, ucc->dst_thread); + if (uc->ud->match_data->type == DMA_TYPE_PKTDMA) + seq_printf(s, "rflow%d, ", uc->rflow->id); break; case DMA_MEM_TO_DEV: seq_printf(s, "tchan%d [0x%04x -> 0x%04x], ", uc->tchan->id, ucc->src_thread, ucc->dst_thread); + if (uc->ud->match_data->type == DMA_TYPE_PKTDMA) + seq_printf(s, "tflow%d, ", uc->tchan->tflow_id); break; default: seq_printf(s, ")\n"); @@ -3519,6 +5069,25 @@ static int udma_probe(struct platform_device *pdev) if (!ud) return -ENOMEM; + match = of_match_node(udma_of_match, dev->of_node); + if (!match) + match = of_match_node(bcdma_of_match, dev->of_node); + if (!match) { + match = of_match_node(pktdma_of_match, dev->of_node); + if (!match) { + dev_err(dev, "No compatible match found\n"); + return -ENODEV; + } + } + ud->match_data = match->data; + + soc = soc_device_match(k3_soc_devices); + if (!soc) { + dev_err(dev, "No compatible SoC found\n"); + return -ENODEV; + } + ud->soc_data = soc->data; + ret = udma_get_mmrs(pdev, ud); if (ret) return ret; @@ -3542,16 +5111,44 @@ static int udma_probe(struct platform_device *pdev) return ret; } - ret = of_property_read_u32(dev->of_node, "ti,udma-atype", &ud->atype); - if (!ret && ud->atype > 2) { - dev_err(dev, "Invalid atype: %u\n", ud->atype); - return -EINVAL; + if (ud->match_data->type == DMA_TYPE_UDMA) { + ret = 
of_property_read_u32(dev->of_node, "ti,udma-atype", + &ud->atype); + if (!ret && ud->atype > 2) { + dev_err(dev, "Invalid atype: %u\n", ud->atype); + return -EINVAL; + } + } else { + ret = of_property_read_u32(dev->of_node, "ti,asel", + &ud->asel); + if (!ret && ud->asel > 15) { + dev_err(dev, "Invalid asel: %u\n", ud->asel); + return -EINVAL; + } } ud->tisci_rm.tisci_udmap_ops = &ud->tisci_rm.tisci->ops.rm_udmap_ops; ud->tisci_rm.tisci_psil_ops = &ud->tisci_rm.tisci->ops.rm_psil_ops; - ud->ringacc = of_k3_ringacc_get_by_phandle(dev->of_node, "ti,ringacc"); + if (ud->match_data->type == DMA_TYPE_UDMA) { + ud->ringacc = of_k3_ringacc_get_by_phandle(dev->of_node, "ti,ringacc"); + } else { + struct k3_ringacc_init_data ring_init_data; + + ring_init_data.tisci = ud->tisci_rm.tisci; + ring_init_data.tisci_dev_id = ud->tisci_rm.tisci_dev_id; + if (ud->match_data->type == DMA_TYPE_BCDMA) { + ring_init_data.num_rings = ud->bchan_cnt + + ud->tchan_cnt + + ud->rchan_cnt; + } else { + ring_init_data.num_rings = ud->rflow_cnt + + ud->tflow_cnt; + } + + ud->ringacc = k3_ringacc_dmarings_init(pdev, &ring_init_data); + } + if (IS_ERR(ud->ringacc)) return PTR_ERR(ud->ringacc); @@ -3562,27 +5159,15 @@ static int udma_probe(struct platform_device *pdev) return -EPROBE_DEFER; } - match = of_match_node(udma_of_match, dev->of_node); - if (!match) { - dev_err(dev, "No compatible match found\n"); - return -ENODEV; - } - ud->match_data = match->data; - - soc = soc_device_match(k3_soc_devices); - if (!soc) { - dev_err(dev, "No compatible SoC found\n"); - return -ENODEV; - } - ud->soc_data = soc->data; - dma_cap_set(DMA_SLAVE, ud->ddev.cap_mask); - dma_cap_set(DMA_CYCLIC, ud->ddev.cap_mask); + /* cyclic operation is not supported via PKTDMA */ + if (ud->match_data->type != DMA_TYPE_PKTDMA) { + dma_cap_set(DMA_CYCLIC, ud->ddev.cap_mask); + ud->ddev.device_prep_dma_cyclic = udma_prep_dma_cyclic; + } - ud->ddev.device_alloc_chan_resources = udma_alloc_chan_resources; ud->ddev.device_config = 
udma_slave_config; ud->ddev.device_prep_slave_sg = udma_prep_slave_sg; - ud->ddev.device_prep_dma_cyclic = udma_prep_dma_cyclic; ud->ddev.device_issue_pending = udma_issue_pending; ud->ddev.device_tx_status = udma_tx_status; ud->ddev.device_pause = udma_pause; @@ -3593,7 +5178,25 @@ static int udma_probe(struct platform_device *pdev) ud->ddev.dbg_summary_show = udma_dbg_summary_show; #endif + switch (ud->match_data->type) { + case DMA_TYPE_UDMA: + ud->ddev.device_alloc_chan_resources = + udma_alloc_chan_resources; + break; + case DMA_TYPE_BCDMA: + ud->ddev.device_alloc_chan_resources = + bcdma_alloc_chan_resources; + ud->ddev.device_router_config = bcdma_router_config; + break; + case DMA_TYPE_PKTDMA: + ud->ddev.device_alloc_chan_resources = + pktdma_alloc_chan_resources; + break; + default: + return -EINVAL; + } ud->ddev.device_free_chan_resources = udma_free_chan_resources; + ud->ddev.src_addr_widths = TI_UDMAC_BUSWIDTHS; ud->ddev.dst_addr_widths = TI_UDMAC_BUSWIDTHS; ud->ddev.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); @@ -3601,7 +5204,8 @@ static int udma_probe(struct platform_device *pdev) ud->ddev.copy_align = DMAENGINE_ALIGN_8_BYTES; ud->ddev.desc_metadata_modes = DESC_METADATA_CLIENT | DESC_METADATA_ENGINE; - if (ud->match_data->enable_memcpy_support) { + if (ud->match_data->enable_memcpy_support && + !(ud->match_data->type == DMA_TYPE_BCDMA && ud->bchan_cnt == 0)) { dma_cap_set(DMA_MEMCPY, ud->ddev.cap_mask); ud->ddev.device_prep_dma_memcpy = udma_prep_dma_memcpy; ud->ddev.directions |= BIT(DMA_MEM_TO_MEM); @@ -3614,7 +5218,7 @@ static int udma_probe(struct platform_device *pdev) INIT_LIST_HEAD(&ud->ddev.channels); INIT_LIST_HEAD(&ud->desc_to_purge); - ch_count = udma_setup_resources(ud); + ch_count = setup_resources(ud); if (ch_count <= 0) return ch_count; @@ -3629,6 +5233,13 @@ static int udma_probe(struct platform_device *pdev) if (ret) return ret; + for (i = 0; i < ud->bchan_cnt; i++) { + struct udma_bchan *bchan = &ud->bchans[i]; + + 
bchan->id = i; + bchan->reg_rt = ud->mmrs[MMR_BCHANRT] + i * 0x1000; + } + for (i = 0; i < ud->tchan_cnt; i++) { struct udma_tchan *tchan = &ud->tchans[i]; @@ -3655,9 +5266,12 @@ static int udma_probe(struct platform_device *pdev) uc->ud = ud; uc->vc.desc_free = udma_desc_free; uc->id = i; + uc->bchan = NULL; uc->tchan = NULL; uc->rchan = NULL; uc->config.remote_thread_id = -1; + uc->config.mapped_channel_id = -1; + uc->config.default_flow_id = -1; uc->config.dir = DMA_MEM_TO_MEM; uc->name = devm_kasprintf(dev, GFP_KERNEL, "%s chan%d", dev_name(dev), i); @@ -3696,5 +5310,25 @@ static struct platform_driver udma_driver = { }; builtin_platform_driver(udma_driver); +static struct platform_driver bcdma_driver = { + .driver = { + .name = "ti-bcdma", + .of_match_table = bcdma_of_match, + .suppress_bind_attrs = true, + }, + .probe = udma_probe, +}; +builtin_platform_driver(bcdma_driver); + +static struct platform_driver pktdma_driver = { + .driver = { + .name = "ti-pktdma", + .of_match_table = pktdma_of_match, + .suppress_bind_attrs = true, + }, + .probe = udma_probe, +}; +builtin_platform_driver(pktdma_driver); + /* Private interfaces to UDMA */ #include "k3-udma-private.c" diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h index 09c4529e013d..d349c6d482ae 100644 --- a/drivers/dma/ti/k3-udma.h +++ b/drivers/dma/ti/k3-udma.h @@ -18,7 +18,7 @@ #define UDMA_RX_FLOW_ID_FW_OES_REG 0x80 #define UDMA_RX_FLOW_ID_FW_STATUS_REG 0x88 -/* TCHANRT/RCHANRT registers */ +/* BCHANRT/TCHANRT/RCHANRT registers */ #define UDMA_CHAN_RT_CTL_REG 0x0 #define UDMA_CHAN_RT_SWTRIG_REG 0x8 #define UDMA_CHAN_RT_STDATA_REG 0x80 @@ -45,6 +45,18 @@ #define UDMA_CAP3_HCHAN_CNT(val) (((val) >> 14) & 0x1ff) #define UDMA_CAP3_UCHAN_CNT(val) (((val) >> 23) & 0x1ff) +#define BCDMA_CAP2_BCHAN_CNT(val) ((val) & 0x1ff) +#define BCDMA_CAP2_TCHAN_CNT(val) (((val) >> 9) & 0x1ff) +#define BCDMA_CAP2_RCHAN_CNT(val) (((val) >> 18) & 0x1ff) +#define BCDMA_CAP3_HBCHAN_CNT(val) (((val) >> 14) & 0x1ff) 
+#define BCDMA_CAP3_UBCHAN_CNT(val) (((val) >> 23) & 0x1ff) +#define BCDMA_CAP4_HRCHAN_CNT(val) ((val) & 0xff) +#define BCDMA_CAP4_URCHAN_CNT(val) (((val) >> 8) & 0xff) +#define BCDMA_CAP4_HTCHAN_CNT(val) (((val) >> 16) & 0xff) +#define BCDMA_CAP4_UTCHAN_CNT(val) (((val) >> 24) & 0xff) + +#define PKTDMA_CAP4_TFLOW_CNT(val) ((val) & 0x3fff) + /* UDMA_CHAN_RT_CTL_REG */ #define UDMA_CHAN_RT_CTL_EN BIT(31) #define UDMA_CHAN_RT_CTL_TDOWN BIT(30) @@ -82,15 +94,20 @@ */ #define PDMA_STATIC_TR_Z(x, mask) ((x) & (mask)) +/* Address Space Select */ +#define K3_ADDRESS_ASEL_SHIFT 48 + struct udma_dev; struct udma_tchan; struct udma_rchan; struct udma_rflow; enum udma_rm_range { - RM_RANGE_TCHAN = 0, + RM_RANGE_BCHAN = 0, + RM_RANGE_TCHAN, RM_RANGE_RCHAN, RM_RANGE_RFLOW, + RM_RANGE_TFLOW, RM_RANGE_LAST, }; @@ -112,6 +129,8 @@ int xudma_navss_psil_unpair(struct udma_dev *ud, u32 src_thread, u32 dst_thread); struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property); +struct device *xudma_get_device(struct udma_dev *ud); +struct k3_ringacc *xudma_get_ringacc(struct udma_dev *ud); void xudma_dev_put(struct udma_dev *ud); u32 xudma_dev_get_psil_base(struct udma_dev *ud); struct udma_tisci_rm *xudma_dev_get_tisci_rm(struct udma_dev *ud); @@ -136,5 +155,10 @@ void xudma_tchanrt_write(struct udma_tchan *tchan, int reg, u32 val); u32 xudma_rchanrt_read(struct udma_rchan *rchan, int reg); void xudma_rchanrt_write(struct udma_rchan *rchan, int reg, u32 val); bool xudma_rflow_is_gp(struct udma_dev *ud, int id); +int xudma_get_rflow_ring_offset(struct udma_dev *ud); +int xudma_is_pktdma(struct udma_dev *ud); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id); +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id); #endif /* K3_UDMA_H_ */ diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig index 05b1009e2820..f4abe3529acd 100644 --- a/drivers/mailbox/Kconfig +++ b/drivers/mailbox/Kconfig @@ -16,6 +16,13 @@ config 
ARM_MHU The controller has 3 mailbox channels, the last of which can be used in Secure mode only. +config ARM_MHU_V2 + tristate "ARM MHUv2 Mailbox" + depends on ARM_AMBA + help + Say Y here if you want to build the ARM MHUv2 controller driver, + which provides unidirectional mailboxes between processing elements. + config IMX_MBOX tristate "i.MX Mailbox" depends on ARCH_MXC || COMPILE_TEST @@ -201,7 +208,7 @@ config BCM_FLEXRM_MBOX config STM32_IPCC tristate "STM32 IPCC Mailbox" - depends on MACH_STM32MP157 + depends on MACH_STM32MP157 || COMPILE_TEST help Mailbox implementation for STMicroelectonics STM32 family chips with hardware for Inter-Processor Communication Controller (IPCC) diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 2e06e02b2e03..7194fa92c787 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -7,6 +7,8 @@ obj-$(CONFIG_MAILBOX_TEST) += mailbox-test.o obj-$(CONFIG_ARM_MHU) += arm_mhu.o arm_mhu_db.o +obj-$(CONFIG_ARM_MHU_V2) += arm_mhuv2.o + obj-$(CONFIG_IMX_MBOX) += imx-mailbox.o obj-$(CONFIG_ARMADA_37XX_RWTM_MBOX) += armada-37xx-rwtm-mailbox.o diff --git a/drivers/mailbox/arm_mhu_db.c b/drivers/mailbox/arm_mhu_db.c index 275efe4cca0c..8eb66c4ecf5b 100644 --- a/drivers/mailbox/arm_mhu_db.c +++ b/drivers/mailbox/arm_mhu_db.c @@ -180,7 +180,7 @@ static void mhu_db_shutdown(struct mbox_chan *chan) /* Reset channel */ mhu_db_mbox_clear_irq(chan); - kfree(chan->con_priv); + devm_kfree(mbox->dev, chan->con_priv); chan->con_priv = NULL; } diff --git a/drivers/mailbox/arm_mhuv2.c b/drivers/mailbox/arm_mhuv2.c new file mode 100644 index 000000000000..67fb10885bb4 --- /dev/null +++ b/drivers/mailbox/arm_mhuv2.c @@ -0,0 +1,1136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Message Handling Unit Version 2 (MHUv2) driver. + * + * Copyright (C) 2020 ARM Ltd. + * Copyright (C) 2020 Linaro Ltd. 
+ * + * An MHUv2 mailbox controller can provide up to 124 channel windows (each 32 + * bit long) and the driver allows any combination of both the transport + * protocol modes: data-transfer and doorbell, to be used on those channel + * windows. + * + * The transport protocols should be specified in the device tree entry for the + * device. The transport protocols determine how the underlying hardware + * resources of the device are utilized when transmitting data. Refer to the + * device tree bindings of the ARM MHUv2 controller for more details. + * + * The number of registered mailbox channels is dependent on both the underlying + * hardware - mainly the number of channel windows implemented by the platform, + * as well as the selected transport protocols. + * + * The MHUv2 controller can work both as a sender and receiver, but the driver + * and the DT bindings support unidirectional transfers for better allocation of + * the channels. That is, this driver will be probed for two separate devices + * for each mailbox controller, a sender device and a receiver device. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +/* ====== MHUv2 Registers ====== */ + +/* Maximum number of channel windows */ +#define MHUV2_CH_WN_MAX 124 +/* Number of combined interrupt status registers */ +#define MHUV2_CMB_INT_ST_REG_CNT 4 +#define MHUV2_STAT_BYTES (sizeof(u32)) +#define MHUV2_STAT_BITS (MHUV2_STAT_BYTES * __CHAR_BIT__) + +#define LSB_MASK(n) ((1 << ((n) * __CHAR_BIT__)) - 1) +#define MHUV2_PROTOCOL_PROP "arm,mhuv2-protocols" + +/* Register Message Handling Unit Configuration fields */ +struct mhu_cfg_t { + u32 num_ch : 7; + u32 pad : 25; +} __packed; + +/* Register Interrupt Status fields */ +struct int_st_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Clear fields */ +struct int_clr_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Enable fields */ +struct int_en_t { + u32 r2nr : 1; + u32 nr2r : 1; + u32 chcomb : 1; + u32 pad : 29; +} __packed; + +/* Register Implementer Identification fields */ +struct iidr_t { + u32 implementer : 12; + u32 revision : 4; + u32 variant : 4; + u32 product_id : 12; +} __packed; + +/* Register Architecture Identification Register fields */ +struct aidr_t { + u32 arch_minor_rev : 4; + u32 arch_major_rev : 4; + u32 pad : 24; +} __packed; + +/* Sender Channel Window fields */ +struct mhu2_send_ch_wn_reg { + u32 stat; + u8 pad1[0x0C - 0x04]; + u32 stat_set; + u32 int_st; + u32 int_clr; + u32 int_en; + u8 pad2[0x20 - 0x1C]; +} __packed; + +/* Sender frame register fields */ +struct mhu2_send_frame_reg { + struct mhu2_send_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u32 resp_cfg; + u32 access_request; + u32 access_ready; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 reserved0; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 pad[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + +/* Receiver Channel Window fields */
+struct mhu2_recv_ch_wn_reg { + u32 stat; + u32 stat_masked; + u32 stat_clear; + u8 reserved0[0x10 - 0x0C]; + u32 mask; + u32 mask_set; + u32 mask_clear; + u8 pad[0x20 - 0x1C]; +} __packed; + +/* Receiver frame register fields */ +struct mhu2_recv_frame_reg { + struct mhu2_recv_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u8 reserved0[0xF90 - 0xF84]; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 pad; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 reserved2[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + + +/* ====== MHUv2 data structures ====== */ + +enum mhuv2_transport_protocol { + DOORBELL = 0, + DATA_TRANSFER = 1 +}; + +enum mhuv2_frame { + RECEIVER_FRAME, + SENDER_FRAME +}; + +/** + * struct mhuv2 - MHUv2 mailbox controller data + * + * @mbox: Mailbox controller belonging to the MHU frame. + * @send/recv: Base address of the register mapping region. + * @frame: Frame type: RECEIVER_FRAME or SENDER_FRAME. + * @irq: Interrupt. + * @windows: Channel windows implemented by the platform. + * @minor: Minor version of the controller. + * @length: Length of the protocols array in bytes. + * @protocols: Raw protocol information, derived from device tree. + * @doorbell_pending_lock: spinlock required for correct operation of Tx + * interrupt for doorbells. + */ +struct mhuv2 { + struct mbox_controller mbox; + union { + struct mhu2_send_frame_reg __iomem *send; + struct mhu2_recv_frame_reg __iomem *recv; + }; + enum mhuv2_frame frame; + unsigned int irq; + unsigned int windows; + unsigned int minor; + unsigned int length; + u32 *protocols; + + spinlock_t doorbell_pending_lock; +}; + +#define mhu_from_mbox(_mbox) container_of(_mbox, struct mhuv2, mbox) + +/** + * struct mhuv2_protocol_ops - MHUv2 operations + * + * Each transport protocol must provide an implementation of the operations + * provided here. + * + * @rx_startup: Startup callback for receiver. 
+ * @rx_shutdown: Shutdown callback for receiver. + * @read_data: Reads and clears newly available data. + * @tx_startup: Startup callback for sender. + * @tx_shutdown: Shutdown callback for sender. + * @last_tx_done: Report back if the last tx is completed or not. + * @send_data: Send data to the receiver. + */ +struct mhuv2_protocol_ops { + int (*rx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan); + void (*rx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan); + void *(*read_data)(struct mhuv2 *mhu, struct mbox_chan *chan); + + void (*tx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan); + void (*tx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan); + int (*last_tx_done)(struct mhuv2 *mhu, struct mbox_chan *chan); + int (*send_data)(struct mhuv2 *mhu, struct mbox_chan *chan, void *arg); +}; + +/* + * MHUv2 mailbox channel's private information + * + * @ops: protocol specific ops for the channel. + * @ch_wn_idx: Channel window index allocated to the channel. + * @windows: Total number of windows consumed by the channel, only relevant + * in DATA_TRANSFER protocol. + * @doorbell: Doorbell bit number within the ch_wn_idx window, only relevant + * in DOORBELL protocol. + * @pending: Flag indicating pending doorbell interrupt, only relevant in + * DOORBELL protocol.
+ */ +struct mhuv2_mbox_chan_priv { + const struct mhuv2_protocol_ops *ops; + u32 ch_wn_idx; + union { + u32 windows; + struct { + u32 doorbell; + u32 pending; + }; + }; +}; + +/* Macro for reading a bitfield within a physically mapped packed struct */ +#define readl_relaxed_bitfield(_regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed((_regptr)); \ + (*(typeof((_regptr)))(&_regval))._field; \ + }) + +/* Macro for writing a bitfield within a physically mapped packed struct */ +#define writel_relaxed_bitfield(_value, _regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed(_regptr); \ + (*(typeof(_regptr))(&_regval))._field = _value; \ + writel_relaxed(_regval, _regptr); \ + }) + + +/* =================== Doorbell transport protocol operations =============== */ + +static int mhuv2_doorbell_rx_startup(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_clear); + return 0; +} + +static void mhuv2_doorbell_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_set); +} + +static void *mhuv2_doorbell_read_data(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].stat_clear); + return NULL; +} + +static int mhuv2_doorbell_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return !(readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat) & + BIT(priv->doorbell)); +} + +static int mhuv2_doorbell_send_data(struct mhuv2 *mhu, struct mbox_chan *chan, + void *arg) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + unsigned long flags; + + spin_lock_irqsave(&mhu->doorbell_pending_lock, 
flags); + + priv->pending = 1; + writel_relaxed(BIT(priv->doorbell), + &mhu->send->ch_wn[priv->ch_wn_idx].stat_set); + + spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags); + + return 0; +} + +static const struct mhuv2_protocol_ops mhuv2_doorbell_ops = { + .rx_startup = mhuv2_doorbell_rx_startup, + .rx_shutdown = mhuv2_doorbell_rx_shutdown, + .read_data = mhuv2_doorbell_read_data, + .last_tx_done = mhuv2_doorbell_last_tx_done, + .send_data = mhuv2_doorbell_send_data, +}; +#define IS_PROTOCOL_DOORBELL(_priv) (_priv->ops == &mhuv2_doorbell_ops) + +/* ============= Data transfer transport protocol operations ================ */ + +static int mhuv2_data_transfer_rx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* + * The protocol mandates that all but the last status register must be + * masked. + */ + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_clear); + return 0; +} + +static void mhuv2_data_transfer_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); +} + +static void *mhuv2_data_transfer_read_data(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + const int windows = priv->windows; + struct arm_mhuv2_mbox_msg *msg; + u32 *data; + int i, idx; + + msg = kzalloc(sizeof(*msg) + windows * MHUV2_STAT_BYTES, GFP_KERNEL); + if (!msg) + return ERR_PTR(-ENOMEM); + + data = msg->data = msg + 1; + msg->len = windows * MHUV2_STAT_BYTES; + + /* + * Messages are expected in order of most significant word to least + * significant word. Refer mhuv2_data_transfer_send_data() for more + * details. + * + * We also need to read the stat register instead of stat_masked, as we + * masked all but the last window. 
+ * + * Last channel window must be cleared as the final operation. Upon + * clearing the last channel window register, which is unmasked in + * data-transfer protocol, the interrupt is de-asserted. + */ + for (i = 0; i < windows; i++) { + idx = priv->ch_wn_idx + i; + data[windows - 1 - i] = readl_relaxed(&mhu->recv->ch_wn[idx].stat); + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[idx].stat_clear); + } + + return msg; +} + +static void mhuv2_data_transfer_tx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Enable interrupts only for the last window */ + if (mhu->minor) { + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_clr); + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_en); + } +} + +static void mhuv2_data_transfer_tx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + if (mhu->minor) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); +} + +static int mhuv2_data_transfer_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Just checking the last channel window should be enough */ + return !readl_relaxed(&mhu->send->ch_wn[i].stat); +} + +/* + * Message will be transmitted from most significant to least significant word. + * This is to allow for messages shorter than channel windows to still trigger + * the receiver interrupt which gets activated when the last stat register is + * written. As an example, a 6-word message is to be written on a 4-channel MHU + * connection: Registers marked with '*' are masked, and will not generate an + * interrupt on the receiver side once written. 
+ * + * u32 *data = [0x00000001], [0x00000002], [0x00000003], [0x00000004], + * [0x00000005], [0x00000006] + * + * ROUND 1: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000001] 4 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000002] 3 + * [ stat 1 ] <- [0x00000003] 2 + * [ stat 0 ] <- [0x00000004] 1 + * + * data += 4 // Increment data pointer by number of stat regs + * + * ROUND 2: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000005] 2 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000006] 1 + * [ stat 1 ] <- [0x00000000] + * [ stat 0 ] <- [0x00000000] + */ +static int mhuv2_data_transfer_send_data(struct mhuv2 *mhu, + struct mbox_chan *chan, void *arg) +{ + const struct arm_mhuv2_mbox_msg *msg = arg; + int bytes_left = msg->len, bytes_to_send, bytes_in_round, i; + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int windows = priv->windows; + u32 *data = msg->data, word; + + while (bytes_left) { + if (!data[0]) { + dev_err(mhu->mbox.dev, "Data aligned at first window can't be zero to guarantee interrupt generation at receiver"); + return -EINVAL; + } + + while(!mhuv2_data_transfer_last_tx_done(mhu, chan)) + continue; + + bytes_in_round = min(bytes_left, (int)(windows * MHUV2_STAT_BYTES)); + + for (i = windows - 1; i >= 0; i--) { + /* Data less than windows can transfer ? 
*/
+			if (unlikely(bytes_in_round <= i * MHUV2_STAT_BYTES))
+				continue;
+
+			word = data[i];
+			bytes_to_send = bytes_in_round & (MHUV2_STAT_BYTES - 1);
+			if (unlikely(bytes_to_send))
+				word &= LSB_MASK(bytes_to_send);
+			else
+				bytes_to_send = MHUV2_STAT_BYTES;
+
+			writel_relaxed(word, &mhu->send->ch_wn[priv->ch_wn_idx + windows - 1 - i].stat_set);
+			bytes_left -= bytes_to_send;
+			bytes_in_round -= bytes_to_send;
+		}
+
+		data += windows;
+	}
+
+	return 0;
+}
+
+static const struct mhuv2_protocol_ops mhuv2_data_transfer_ops = {
+	.rx_startup = mhuv2_data_transfer_rx_startup,
+	.rx_shutdown = mhuv2_data_transfer_rx_shutdown,
+	.read_data = mhuv2_data_transfer_read_data,
+	.tx_startup = mhuv2_data_transfer_tx_startup,
+	.tx_shutdown = mhuv2_data_transfer_tx_shutdown,
+	.last_tx_done = mhuv2_data_transfer_last_tx_done,
+	.send_data = mhuv2_data_transfer_send_data,
+};
+
+/* Interrupt handlers */
+
+/*
+ * get_irq_chan_comb() - map a combined interrupt status to a mailbox channel.
+ * @mhu: MHUv2 frame whose protocols[] table describes the channel layout.
+ * @reg: first of MHUV2_CMB_INT_ST_REG_CNT combined interrupt status registers.
+ *
+ * Each non-zero status register is decoded in turn: its lowest set bit,
+ * combined with the register index, gives the channel window that raised the
+ * interrupt. The protocols[] table is then walked as (protocol, windows) pairs
+ * to convert that window into an index in mbox.chans: a doorbell entry
+ * accounts for MHUV2_STAT_BITS channels per window, any other entry for a
+ * single channel.
+ *
+ * Return: the matching channel (the first channel of the window for doorbell
+ * entries), or ERR_PTR(-EIO) if no pending window maps to a channel.
+ */
+static struct mbox_chan *get_irq_chan_comb(struct mhuv2 *mhu, u32 *reg)
+{
+	struct mbox_chan *chans = mhu->mbox.chans;
+	int channel, i, j, offset, windows, protocol, ch_wn;
+	u32 stat;
+
+	for (i = 0; i < MHUV2_CMB_INT_ST_REG_CNT; i++) {
+		stat = readl_relaxed(reg + i);
+		if (!stat)
+			continue;
+
+		ch_wn = i * MHUV2_STAT_BITS + __builtin_ctz(stat);
+
+		/*
+		 * Walk the protocol table with its own index and start the
+		 * accounting from scratch, so that an unsuccessful walk does
+		 * not disturb the scan of the remaining status registers.
+		 */
+		channel = 0;
+		offset = 0;
+
+		for (j = 0; j < mhu->length; j += 2) {
+			protocol = mhu->protocols[j];
+			windows = mhu->protocols[j + 1];
+
+			if (ch_wn >= offset + windows) {
+				if (protocol == DOORBELL)
+					channel += MHUV2_STAT_BITS * windows;
+				else
+					channel++;
+
+				offset += windows;
+				continue;
+			}
+
+			/* Return first chan of the window in doorbell mode */
+			if (protocol == DOORBELL)
+				channel += MHUV2_STAT_BITS * (ch_wn - offset);
+
+			return &chans[channel];
+		}
+	}
+
+	return ERR_PTR(-EIO);
+}
+
+static irqreturn_t mhuv2_sender_interrupt(int irq, void *data)
+{
+	struct mhuv2 *mhu = data;
+	struct device *dev = mhu->mbox.dev;
+	struct mhuv2_mbox_chan_priv *priv;
+	struct mbox_chan *chan;
+	unsigned long flags;
+	int i, found = 0;
+	u32 stat;
+
+	chan
= get_irq_chan_comb(mhu, mhu->send->chcomb_int_st);
+	if (IS_ERR(chan)) {
+		dev_warn(dev, "Failed to find channel for the Tx interrupt\n");
+		return IRQ_NONE;
+	}
+	priv = chan->con_priv;
+
+	/* Data-transfer channel: one channel per interrupt, report it and leave */
+	if (!IS_PROTOCOL_DOORBELL(priv)) {
+		writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx + priv->windows - 1].int_clr);
+
+		if (chan->cl) {
+			mbox_chan_txdone(chan, 0);
+			return IRQ_HANDLED;
+		}
+
+		dev_warn(dev, "Tx interrupt Received on channel (%u) not currently attached to a mailbox client\n",
+			 priv->ch_wn_idx);
+		return IRQ_NONE;
+	}
+
+	/* Clear the interrupt first, so we don't miss any doorbell later */
+	writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx].int_clr);
+
+	/*
+	 * In Doorbell mode, make sure no new transitions happen while the
+	 * interrupt handler is trying to find the finished doorbell tx
+	 * operations, else we may think few of the transfers were complete
+	 * before they actually were.
+	 */
+	spin_lock_irqsave(&mhu->doorbell_pending_lock, flags);
+
+	/*
+	 * In case of doorbell mode, the first channel of the window is returned
+	 * by get_irq_chan_comb(). Find all the pending channels here.
+	 */
+	stat = readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat);
+
+	for (i = 0; i < MHUV2_STAT_BITS; i++) {
+		priv = chan[i].con_priv;
+
+		/* Find cases where pending was 1, but stat's bit is cleared */
+		if (priv->pending ^ ((stat >> i) & 0x1)) {
+			BUG_ON(!priv->pending);
+
+			/*
+			 * Check the client of this doorbell's own channel;
+			 * chan is only the first channel of the window.
+			 */
+			if (!chan[i].cl) {
+				dev_warn(dev, "Tx interrupt received on doorbell (%u : %u) channel not currently attached to a mailbox client\n",
+					 priv->ch_wn_idx, i);
+				continue;
+			}
+
+			mbox_chan_txdone(&chan[i], 0);
+			priv->pending = 0;
+			found++;
+		}
+	}
+
+	spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags);
+
+	if (!found) {
+		/*
+		 * We may have already processed the doorbell in the previous
+		 * iteration if the interrupt came right after we cleared it but
+		 * before we read the stat register.
+ */ + dev_dbg(dev, "Couldn't find the doorbell (%u) for the Tx interrupt interrupt\n", + priv->ch_wn_idx); + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +static struct mbox_chan *get_irq_chan_comb_rx(struct mhuv2 *mhu) +{ + struct mhuv2_mbox_chan_priv *priv; + struct mbox_chan *chan; + u32 stat; + + chan = get_irq_chan_comb(mhu, mhu->recv->chcomb_int_st); + if (IS_ERR(chan)) + return chan; + + priv = chan->con_priv; + if (!IS_PROTOCOL_DOORBELL(priv)) + return chan; + + /* + * In case of doorbell mode, the first channel of the window is returned + * by the routine. Find the exact channel here. + */ + stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked); + BUG_ON(!stat); + + return chan + __builtin_ctz(stat); +} + +static struct mbox_chan *get_irq_chan_stat_rx(struct mhuv2 *mhu) +{ + struct mbox_chan *chans = mhu->mbox.chans; + struct mhuv2_mbox_chan_priv *priv; + u32 stat; + int i = 0; + + while (i < mhu->mbox.num_chans) { + priv = chans[i].con_priv; + stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked); + + if (stat) { + if (IS_PROTOCOL_DOORBELL(priv)) + i += __builtin_ctz(stat); + return &chans[i]; + } + + i += IS_PROTOCOL_DOORBELL(priv) ? 
MHUV2_STAT_BITS : 1;
+	}
+
+	return ERR_PTR(-EIO);
+}
+
+static struct mbox_chan *get_irq_chan_rx(struct mhuv2 *mhu)
+{
+	if (!mhu->minor)
+		return get_irq_chan_stat_rx(mhu);
+
+	return get_irq_chan_comb_rx(mhu);
+}
+
+/*
+ * Receiver frame interrupt handler: finds the channel that raised the
+ * interrupt, reads (and thereby clears) its data through the protocol's
+ * read_data() callback and hands it to the attached mailbox client.
+ *
+ * Returns IRQ_HANDLED only when data was delivered to a client, IRQ_NONE
+ * otherwise.
+ */
+static irqreturn_t mhuv2_receiver_interrupt(int irq, void *arg)
+{
+	struct mhuv2 *mhu = arg;
+	struct mbox_chan *chan = get_irq_chan_rx(mhu);
+	struct device *dev = mhu->mbox.dev;
+	struct mhuv2_mbox_chan_priv *priv;
+	int ret = IRQ_NONE;
+	void *data;
+
+	if (IS_ERR(chan)) {
+		dev_warn(dev, "Failed to find channel for the rx interrupt\n");
+		return IRQ_NONE;
+	}
+	priv = chan->con_priv;
+
+	/* Read and clear the data first */
+	data = priv->ops->read_data(mhu, chan);
+
+	if (!chan->cl) {
+		dev_warn(dev, "Received data on channel (%u) not currently attached to a mailbox client\n",
+			 priv->ch_wn_idx);
+	} else if (IS_ERR(data)) {
+		/* PTR_ERR() yields a negative long, print it as signed */
+		dev_err(dev, "Failed to read data: %ld\n", PTR_ERR(data));
+	} else {
+		mbox_chan_received_data(chan, data);
+		ret = IRQ_HANDLED;
+	}
+
+	/*
+	 * read_data() hands back NULL, an allocated message or an ERR_PTR();
+	 * only the first two may be passed to kfree().
+	 */
+	if (!IS_ERR(data))
+		kfree(data);
+
+	return ret;
+}
+
+/* Sender and receiver ops */
+static bool mhuv2_sender_last_tx_done(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	return priv->ops->last_tx_done(mhu, chan);
+}
+
+static int mhuv2_sender_send_data(struct mbox_chan *chan, void *data)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if (!priv->ops->last_tx_done(mhu, chan))
+		return -EBUSY;
+
+	return priv->ops->send_data(mhu, chan, data);
+}
+
+static int mhuv2_sender_startup(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if (priv->ops->tx_startup)
+		priv->ops->tx_startup(mhu, chan);
+	return 0;
+}
+
+static void mhuv2_sender_shutdown(struct mbox_chan *chan)
+{
+	struct mhuv2 *mhu = mhu_from_mbox(chan->mbox);
+	struct mhuv2_mbox_chan_priv *priv = chan->con_priv;
+
+	if
(priv->ops->tx_shutdown) + priv->ops->tx_shutdown(mhu, chan); +} + +static const struct mbox_chan_ops mhuv2_sender_ops = { + .send_data = mhuv2_sender_send_data, + .startup = mhuv2_sender_startup, + .shutdown = mhuv2_sender_shutdown, + .last_tx_done = mhuv2_sender_last_tx_done, +}; + +static int mhuv2_receiver_startup(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return priv->ops->rx_startup(mhu, chan); +} + +static void mhuv2_receiver_shutdown(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + priv->ops->rx_shutdown(mhu, chan); +} + +static int mhuv2_receiver_send_data(struct mbox_chan *chan, void *data) +{ + dev_err(chan->mbox->dev, + "Trying to transmit on a receiver MHU frame\n"); + return -EIO; +} + +static bool mhuv2_receiver_last_tx_done(struct mbox_chan *chan) +{ + dev_err(chan->mbox->dev, "Trying to Tx poll on a receiver MHU frame\n"); + return true; +} + +static const struct mbox_chan_ops mhuv2_receiver_ops = { + .send_data = mhuv2_receiver_send_data, + .startup = mhuv2_receiver_startup, + .shutdown = mhuv2_receiver_shutdown, + .last_tx_done = mhuv2_receiver_last_tx_done, +}; + +static struct mbox_chan *mhuv2_mbox_of_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *pa) +{ + struct mhuv2 *mhu = mhu_from_mbox(mbox); + struct mbox_chan *chans = mbox->chans; + int channel = 0, i, offset, doorbell, protocol, windows; + + if (pa->args_count != 2) + return ERR_PTR(-EINVAL); + + offset = pa->args[0]; + doorbell = pa->args[1]; + if (doorbell >= MHUV2_STAT_BITS) + goto out; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DOORBELL) { + if (offset < windows) + return &chans[channel + MHUV2_STAT_BITS * offset + doorbell]; + + channel += MHUV2_STAT_BITS * windows; + offset -= windows; + } else { + if 
(offset == 0) { + if (doorbell) + goto out; + + return &chans[channel]; + } + + channel++; + offset--; + } + } + +out: + dev_err(mbox->dev, "Couldn't xlate to a valid channel (%d: %d)\n", + pa->args[0], doorbell); + return ERR_PTR(-ENODEV); +} + +static int mhuv2_verify_protocol(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + int protocol, windows, channels = 0, total_windows = 0, i; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (!windows) { + dev_err(dev, "Window size can't be zero (%d)\n", i); + return -EINVAL; + } + total_windows += windows; + + if (protocol == DOORBELL) { + channels += MHUV2_STAT_BITS * windows; + } else if (protocol == DATA_TRANSFER) { + channels++; + } else { + dev_err(dev, "Invalid protocol (%d) present in %s property at index %d\n", + protocol, MHUV2_PROTOCOL_PROP, i); + return -EINVAL; + } + } + + if (total_windows > mhu->windows) { + dev_err(dev, "Channel windows can't be more than what's implemented by the hardware ( %d: %d)\n", + total_windows, mhu->windows); + return -EINVAL; + } + + mhu->mbox.num_chans = channels; + return 0; +} + +static int mhuv2_allocate_channels(struct mhuv2 *mhu) +{ + struct mbox_controller *mbox = &mhu->mbox; + struct mhuv2_mbox_chan_priv *priv; + struct device *dev = mbox->dev; + struct mbox_chan *chans; + int protocol, windows = 0, next_window = 0, i, j, k; + + chans = devm_kcalloc(dev, mbox->num_chans, sizeof(*chans), GFP_KERNEL); + if (!chans) + return -ENOMEM; + + mbox->chans = chans; + + for (i = 0; i < mhu->length; i += 2) { + next_window += windows; + + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DATA_TRANSFER) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window; + priv->ops = &mhuv2_data_transfer_ops; + priv->windows = windows; + chans++->con_priv = priv; + continue; + } + + for (j = 0; j < windows; j++) { + 
for (k = 0; k < MHUV2_STAT_BITS; k++) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window + j; + priv->ops = &mhuv2_doorbell_ops; + priv->doorbell = k; + chans++->con_priv = priv; + } + + /* + * Permanently enable interrupt as we can't + * control it per doorbell. + */ + if (mhu->frame == SENDER_FRAME && mhu->minor) + writel_relaxed(0x1, &mhu->send->ch_wn[priv->ch_wn_idx].int_en); + } + } + + /* Make sure we have initialized all channels */ + BUG_ON(chans - mbox->chans != mbox->num_chans); + + return 0; +} + +static int mhuv2_parse_channels(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + const struct device_node *np = dev->of_node; + int ret, count; + u32 *protocols; + + count = of_property_count_u32_elems(np, MHUV2_PROTOCOL_PROP); + if (count <= 0 || count % 2) { + dev_err(dev, "Invalid %s property (%d)\n", MHUV2_PROTOCOL_PROP, + count); + return -EINVAL; + } + + protocols = devm_kmalloc_array(dev, count, sizeof(*protocols), GFP_KERNEL); + if (!protocols) + return -ENOMEM; + + ret = of_property_read_u32_array(np, MHUV2_PROTOCOL_PROP, protocols, count); + if (ret) { + dev_err(dev, "Failed to read %s property: %d\n", + MHUV2_PROTOCOL_PROP, ret); + return ret; + } + + mhu->protocols = protocols; + mhu->length = count; + + ret = mhuv2_verify_protocol(mhu); + if (ret) + return ret; + + return mhuv2_allocate_channels(mhu); +} + +static int mhuv2_tx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = SENDER_FRAME; + mhu->mbox.ops = &mhuv2_sender_ops; + mhu->send = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->send->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->send->aidr, arch_minor_rev); + + spin_lock_init(&mhu->doorbell_pending_lock); + + /* + * For minor version 1 and forward, tx interrupt is provided by + * the controller. 
+ */ + if (mhu->minor && adev->irq[0]) { + ret = devm_request_threaded_irq(dev, adev->irq[0], NULL, + mhuv2_sender_interrupt, + IRQF_ONESHOT, "mhuv2-tx", mhu); + if (ret) { + dev_err(dev, "Failed to request tx IRQ, fallback to polling mode: %d\n", + ret); + } else { + mhu->mbox.txdone_irq = true; + mhu->mbox.txdone_poll = false; + mhu->irq = adev->irq[0]; + + writel_relaxed_bitfield(1, &mhu->send->int_en, chcomb); + + /* Disable all channel interrupts */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); + + goto out; + } + } + + mhu->mbox.txdone_irq = false; + mhu->mbox.txdone_poll = true; + mhu->mbox.txpoll_period = 1; + +out: + /* Wait for receiver to be ready */ + writel_relaxed(0x1, &mhu->send->access_request); + while (!readl_relaxed(&mhu->send->access_ready)) + continue; + + return 0; +} + +static int mhuv2_rx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = RECEIVER_FRAME; + mhu->mbox.ops = &mhuv2_receiver_ops; + mhu->recv = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->recv->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->recv->aidr, arch_minor_rev); + + mhu->irq = adev->irq[0]; + if (!mhu->irq) { + dev_err(dev, "Missing receiver IRQ\n"); + return -EINVAL; + } + + ret = devm_request_threaded_irq(dev, mhu->irq, NULL, + mhuv2_receiver_interrupt, IRQF_ONESHOT, + "mhuv2-rx", mhu); + if (ret) { + dev_err(dev, "Failed to request rx IRQ\n"); + return ret; + } + + /* Mask all the channel windows */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); + + if (mhu->minor) + writel_relaxed_bitfield(1, &mhu->recv->int_en, chcomb); + + return 0; +} + +static int mhuv2_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct device *dev = &adev->dev; + const struct device_node *np = dev->of_node; + struct mhuv2 *mhu; + void __iomem *reg; + int ret = -EINVAL; + + 
reg = devm_of_iomap(dev, dev->of_node, 0, NULL);
+	/* devm_of_iomap() reports failure with an ERR_PTR(), never with NULL */
+	if (IS_ERR(reg))
+		return PTR_ERR(reg);
+
+	mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL);
+	if (!mhu)
+		return -ENOMEM;
+
+	mhu->mbox.dev = dev;
+	mhu->mbox.of_xlate = mhuv2_mbox_of_xlate;
+
+	/* ret keeps its initial -EINVAL when neither compatible matches */
+	if (of_device_is_compatible(np, "arm,mhuv2-tx"))
+		ret = mhuv2_tx_init(adev, mhu, reg);
+	else if (of_device_is_compatible(np, "arm,mhuv2-rx"))
+		ret = mhuv2_rx_init(adev, mhu, reg);
+	else
+		dev_err(dev, "Invalid compatible property\n");
+
+	if (ret)
+		return ret;
+
+	/* Channel windows can't be 0 */
+	BUG_ON(!mhu->windows);
+
+	ret = mhuv2_parse_channels(mhu);
+	if (ret)
+		return ret;
+
+	amba_set_drvdata(adev, mhu);
+
+	ret = devm_mbox_controller_register(dev, &mhu->mbox);
+	if (ret)
+		dev_err(dev, "failed to register ARM MHUv2 driver %d\n", ret);
+
+	return ret;
+}
+
+static int mhuv2_remove(struct amba_device *adev)
+{
+	struct mhuv2 *mhu = amba_get_drvdata(adev);
+
+	if (mhu->frame == SENDER_FRAME)
+		writel_relaxed(0x0, &mhu->send->access_request);
+
+	return 0;
+}
+
+static struct amba_id mhuv2_ids[] = {
+	{
+		/* 2.0 */
+		.id = 0xbb0d1,
+		.mask = 0xfffff,
+	},
+	{
+		/* 2.1 */
+		.id = 0xbb076,
+		.mask = 0xfffff,
+	},
+	{ 0, 0 },
+};
+MODULE_DEVICE_TABLE(amba, mhuv2_ids);
+
+static struct amba_driver mhuv2_driver = {
+	.drv = {
+		.name = "arm-mhuv2",
+	},
+	.id_table = mhuv2_ids,
+	.probe = mhuv2_probe,
+	.remove = mhuv2_remove,
+};
+module_amba_driver(mhuv2_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("ARM MHUv2 Driver");
+MODULE_AUTHOR("Viresh Kumar ");
+MODULE_AUTHOR("Tushar Khandelwal ");
diff --git a/drivers/mailbox/stm32-ipcc.c b/drivers/mailbox/stm32-ipcc.c
index ef966887aa15..b84e0587937c 100644
--- a/drivers/mailbox/stm32-ipcc.c
+++ b/drivers/mailbox/stm32-ipcc.c
@@ -144,11 +144,11 @@ static irqreturn_t stm32_ipcc_tx_irq(int irq, void *data)
 
 static int stm32_ipcc_send_data(struct mbox_chan *link, void *data)
 {
-	unsigned int chan = (unsigned int)link->con_priv;
+	unsigned long chan = (unsigned
long)link->con_priv; struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc, controller); - dev_dbg(ipcc->controller.dev, "%s: chan:%d\n", __func__, chan); + dev_dbg(ipcc->controller.dev, "%s: chan:%lu\n", __func__, chan); /* set channel n occupied */ stm32_ipcc_set_bits(&ipcc->lock, ipcc->reg_proc + IPCC_XSCR, @@ -163,7 +163,7 @@ static int stm32_ipcc_send_data(struct mbox_chan *link, void *data) static int stm32_ipcc_startup(struct mbox_chan *link) { - unsigned int chan = (unsigned int)link->con_priv; + unsigned long chan = (unsigned long)link->con_priv; struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc, controller); int ret; @@ -183,7 +183,7 @@ static int stm32_ipcc_startup(struct mbox_chan *link) static void stm32_ipcc_shutdown(struct mbox_chan *link) { - unsigned int chan = (unsigned int)link->con_priv; + unsigned long chan = (unsigned long)link->con_priv; struct stm32_ipcc *ipcc = container_of(link->mbox, struct stm32_ipcc, controller); @@ -206,7 +206,7 @@ static int stm32_ipcc_probe(struct platform_device *pdev) struct device_node *np = dev->of_node; struct stm32_ipcc *ipcc; struct resource *res; - unsigned int i; + unsigned long i; int ret; u32 ip_ver; static const char * const irq_name[] = {"rx", "tx"}; @@ -257,9 +257,6 @@ static int stm32_ipcc_probe(struct platform_device *pdev) for (i = 0; i < IPCC_IRQ_NUM; i++) { ipcc->irqs[i] = platform_get_irq_byname(pdev, irq_name[i]); if (ipcc->irqs[i] < 0) { - if (ipcc->irqs[i] != -EPROBE_DEFER) - dev_err(dev, "no IRQ specified %s\n", - irq_name[i]); ret = ipcc->irqs[i]; goto err_clk; } @@ -268,7 +265,7 @@ static int stm32_ipcc_probe(struct platform_device *pdev) irq_thread[i], IRQF_ONESHOT, dev_name(dev), ipcc); if (ret) { - dev_err(dev, "failed to request irq %d (%d)\n", i, ret); + dev_err(dev, "failed to request irq %lu (%d)\n", i, ret); goto err_clk; } } diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 119164abcb41..b495b0d5d0fa 100644 --- 
a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ static LIST_HEAD(k3_ringacc_list); static DEFINE_MUTEX(k3_ringacc_list_lock); #define K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK GENMASK(19, 0) +#define K3_DMARING_CFG_RING_SIZE_ELCNT_MASK GENMASK(15, 0) /** * struct k3_ring_rt_regs - The RA realtime Control/Status Registers region @@ -43,7 +45,13 @@ struct k3_ring_rt_regs { u32 hwindx; }; -#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_DMARING_RT_REGS_STEP 0x2000 +#define K3_DMARING_RT_REGS_REVERSE_OFS 0x1000 +#define K3_RINGACC_RT_OCC_MASK GENMASK(20, 0) +#define K3_DMARING_RT_OCC_TDOWN_COMPLETE BIT(31) +#define K3_DMARING_RT_DB_ENTRY_MASK GENMASK(7, 0) +#define K3_DMARING_RT_DB_TDOWN_ACK BIT(31) /** * struct k3_ring_fifo_regs - The Ring Accelerator Queues Registers region @@ -122,6 +130,7 @@ struct k3_ring_state { u32 occ; u32 windex; u32 rindex; + u32 tdown_complete:1; }; /** @@ -143,6 +152,7 @@ struct k3_ring_state { * @use_count: Use count for shared rings * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY) * @dma_dev: device to be used for DMA API (allocation, mapping) + * @asel: Address Space Select value for physical addresses */ struct k3_ring { struct k3_ring_rt_regs __iomem *rt; @@ -157,12 +167,15 @@ struct k3_ring { u32 flags; #define K3_RING_FLAG_BUSY BIT(1) #define K3_RING_FLAG_SHARED BIT(2) +#define K3_RING_FLAG_REVERSE BIT(3) struct k3_ring_state state; u32 ring_id; struct k3_ringacc *parent; u32 use_count; int proxy_id; struct device *dma_dev; + u32 asel; +#define K3_ADDRESS_ASEL_SHIFT 48 }; struct k3_ringacc_ops { @@ -188,6 +201,7 @@ struct k3_ringacc_ops { * @tisci_ring_ops: ti-sci rings ops * @tisci_dev_id: ti-sci device id * @ops: SoC specific ringacc operation + * @dma_rings: indicate DMA ring (dual ring within BCDMA/PKTDMA) */ struct k3_ringacc { struct device *dev; @@ -210,6 
+224,7 @@ struct k3_ringacc { u32 tisci_dev_id; const struct k3_ringacc_ops *ops; + bool dma_rings; }; /** @@ -221,6 +236,21 @@ struct k3_ringacc_soc_data { unsigned dma_ring_reset_quirk:1; }; +static int k3_ringacc_ring_read_occ(struct k3_ring *ring) +{ + return readl(&ring->rt->occ) & K3_RINGACC_RT_OCC_MASK; +} + +static void k3_ringacc_ring_update_occ(struct k3_ring *ring) +{ + u32 val; + + val = readl(&ring->rt->occ); + + ring->state.occ = val & K3_RINGACC_RT_OCC_MASK; + ring->state.tdown_complete = !!(val & K3_DMARING_RT_OCC_TDOWN_COMPLETE); +} + static long k3_ringacc_ring_get_fifo_pos(struct k3_ring *ring) { return K3_RINGACC_FIFO_WINDOW_SIZE_BYTES - @@ -234,12 +264,24 @@ static void *k3_ringacc_get_elm_addr(struct k3_ring *ring, u32 idx) static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_mem(struct k3_ring *ring, void *elem); +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem); +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem); static struct k3_ring_ops k3_ring_mode_ring_ops = { .push_tail = k3_ringacc_ring_push_mem, .pop_head = k3_ringacc_ring_pop_mem, }; +static struct k3_ring_ops k3_dmaring_fwd_ops = { + .push_tail = k3_ringacc_ring_push_mem, + .pop_head = k3_dmaring_fwd_pop, +}; + +static struct k3_ring_ops k3_dmaring_reverse_ops = { + /* Reverse side of the DMA ring can only be popped by SW */ + .pop_head = k3_dmaring_reverse_pop, +}; + static int k3_ringacc_ring_push_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_push_head_io(struct k3_ring *ring, void *elem); @@ -342,6 +384,40 @@ error: } EXPORT_SYMBOL_GPL(k3_ringacc_request_ring); +static int k3_dmaring_request_dual_ring(struct k3_ringacc *ringacc, int fwd_id, + struct k3_ring **fwd_ring, + struct k3_ring **compl_ring) +{ + int ret = 0; + + /* + * DMA rings must be requested by ID, completion ring is the reverse + * side of the 
forward ring + */ + if (fwd_id < 0) + return -EINVAL; + + mutex_lock(&ringacc->req_lock); + + if (test_bit(fwd_id, ringacc->rings_inuse)) { + ret = -EBUSY; + goto error; + } + + *fwd_ring = &ringacc->rings[fwd_id]; + *compl_ring = &ringacc->rings[fwd_id + ringacc->num_rings]; + set_bit(fwd_id, ringacc->rings_inuse); + ringacc->rings[fwd_id].use_count++; + dev_dbg(ringacc->dev, "Giving ring#%d\n", fwd_id); + + mutex_unlock(&ringacc->req_lock); + return 0; + +error: + mutex_unlock(&ringacc->req_lock); + return ret; +} + int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, int fwd_id, int compl_id, struct k3_ring **fwd_ring, @@ -352,6 +428,10 @@ int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, if (!fwd_ring || !compl_ring) return -EINVAL; + if (ringacc->dma_rings) + return k3_dmaring_request_dual_ring(ringacc, fwd_id, + fwd_ring, compl_ring); + *fwd_ring = k3_ringacc_request_ring(ringacc, fwd_id, 0); if (!(*fwd_ring)) return -ENODEV; @@ -421,7 +501,7 @@ void k3_ringacc_ring_reset_dma(struct k3_ring *ring, u32 occ) goto reset; if (!occ) - occ = readl(&ring->rt->occ); + occ = k3_ringacc_ring_read_occ(ring); if (occ) { u32 db_ring_cnt, db_ring_cnt_cur; @@ -496,6 +576,13 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ringacc = ring->parent; + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. 
+ */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + dev_dbg(ring->parent->dev, "flags: 0x%08x\n", ring->flags); if (!test_bit(ring->ring_id, ringacc->rings_inuse)) @@ -517,6 +604,8 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ring->flags = 0; ring->ops = NULL; ring->dma_dev = NULL; + ring->asel = 0; + if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) { clear_bit(ring->proxy_id, ringacc->proxy_inuse); ring->proxy = NULL; @@ -581,6 +670,7 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) ring_cfg.count = ring->size; ring_cfg.mode = ring->mode; ring_cfg.size = ring->elm_size; + ring_cfg.asel = ring->asel; ret = ringacc->tisci_ring_ops->set_cfg(ringacc->tisci, &ring_cfg); if (ret) @@ -590,6 +680,90 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) return ret; } +static int k3_dmaring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) +{ + struct k3_ringacc *ringacc; + struct k3_ring *reverse_ring; + int ret = 0; + + if (cfg->elm_size != K3_RINGACC_RING_ELSIZE_8 || + cfg->mode != K3_RINGACC_RING_MODE_RING || + cfg->size & ~K3_DMARING_CFG_RING_SIZE_ELCNT_MASK) + return -EINVAL; + + ringacc = ring->parent; + + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. 
+ */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + + if (!test_bit(ring->ring_id, ringacc->rings_inuse)) + return -EINVAL; + + ring->size = cfg->size; + ring->elm_size = cfg->elm_size; + ring->mode = cfg->mode; + ring->asel = cfg->asel; + ring->dma_dev = cfg->dma_dev; + if (!ring->dma_dev) { + dev_warn(ringacc->dev, "dma_dev is not provided for ring%d\n", + ring->ring_id); + ring->dma_dev = ringacc->dev; + } + + memset(&ring->state, 0, sizeof(ring->state)); + + ring->ops = &k3_dmaring_fwd_ops; + + ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + &ring->ring_mem_dma, GFP_KERNEL); + if (!ring->ring_mem_virt) { + dev_err(ringacc->dev, "Failed to alloc ring mem\n"); + ret = -ENOMEM; + goto err_free_ops; + } + + ret = k3_ringacc_ring_cfg_sci(ring); + if (ret) + goto err_free_mem; + + ring->flags |= K3_RING_FLAG_BUSY; + + k3_ringacc_ring_dump(ring); + + /* DMA rings: configure reverse ring */ + reverse_ring = &ringacc->rings[ring->ring_id + ringacc->num_rings]; + reverse_ring->size = cfg->size; + reverse_ring->elm_size = cfg->elm_size; + reverse_ring->mode = cfg->mode; + reverse_ring->asel = cfg->asel; + memset(&reverse_ring->state, 0, sizeof(reverse_ring->state)); + reverse_ring->ops = &k3_dmaring_reverse_ops; + + reverse_ring->ring_mem_virt = ring->ring_mem_virt; + reverse_ring->ring_mem_dma = ring->ring_mem_dma; + reverse_ring->flags |= K3_RING_FLAG_BUSY; + k3_ringacc_ring_dump(reverse_ring); + + return 0; + +err_free_mem: + dma_free_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + ring->ring_mem_virt, + ring->ring_mem_dma); +err_free_ops: + ring->ops = NULL; + ring->proxy = NULL; + ring->dma_dev = NULL; + ring->asel = 0; + return ret; +} + int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) { struct k3_ringacc *ringacc; @@ -597,8 +771,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) if (!ring || !cfg) return -EINVAL; + ringacc 
= ring->parent; + if (ringacc->dma_rings) + return k3_dmaring_cfg(ring, cfg); + if (cfg->elm_size > K3_RINGACC_RING_ELSIZE_256 || cfg->mode >= K3_RINGACC_RING_MODE_INVALID || cfg->size & ~K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK || @@ -705,7 +883,7 @@ u32 k3_ringacc_ring_get_free(struct k3_ring *ring) return -EINVAL; if (!ring->state.free) - ring->state.free = ring->size - readl(&ring->rt->occ); + ring->state.free = ring->size - k3_ringacc_ring_read_occ(ring); return ring->state.free; } @@ -716,7 +894,7 @@ u32 k3_ringacc_ring_get_occ(struct k3_ring *ring) if (!ring || !(ring->flags & K3_RING_FLAG_BUSY)) return -EINVAL; - return readl(&ring->rt->occ); + return k3_ringacc_ring_read_occ(ring); } EXPORT_SYMBOL_GPL(k3_ringacc_ring_get_occ); @@ -892,6 +1070,72 @@ static int k3_ringacc_ring_pop_tail_io(struct k3_ring *ring, void *elem) K3_RINGACC_ACCESS_MODE_POP_HEAD); } +/* + * The element is 48 bits of address + ASEL bits in the ring. + * ASEL is used by the DMAs and should be removed for the kernel as it is not + * part of the physical memory address. + */ +static void k3_dmaring_remove_asel_from_elem(u64 *elem) +{ + *elem &= GENMASK_ULL(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} + +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + u32 elem_idx; + + /* + * DMA rings: forward ring is always tied to a DMA channel and HW does + * not maintain any state data required for POP operation, and it is + * unknown how many elements were consumed by HW. So, to actually + * do POP, the read pointer has to be recalculated every time. 
+ */ + ring->state.occ = k3_ringacc_ring_read_occ(ring); + if (ring->state.windex >= ring->state.occ) + elem_idx = ring->state.windex - ring->state.occ; + else + elem_idx = ring->size - (ring->state.occ - ring->state.windex); + + elem_ptr = k3_ringacc_get_elm_addr(ring, elem_idx); + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.occ--; + writel(-1, &ring->rt->db); + + dev_dbg(ring->parent->dev, "%s: occ%d Windex%d Rindex%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.windex, elem_idx, + elem_ptr); + return 0; +} + +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + + elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.rindex); + + if (ring->state.occ) { + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.rindex = (ring->state.rindex + 1) % ring->size; + ring->state.occ--; + writel(-1 & K3_DMARING_RT_DB_ENTRY_MASK, &ring->rt->db); + } else if (ring->state.tdown_complete) { + dma_addr_t *value = elem; + + *value = CPPI5_TDCM_MARKER; + writel(K3_DMARING_RT_DB_TDOWN_ACK, &ring->rt->db); + ring->state.tdown_complete = false; + } + + dev_dbg(ring->parent->dev, "%s: occ%d index%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.rindex, elem_ptr); + return 0; +} + static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) { void *elem_ptr; @@ -899,6 +1143,11 @@ static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.windex); memcpy(elem_ptr, elem, (4 << ring->elm_size)); + if (ring->parent->dma_rings) { + u64 *addr = elem_ptr; + + *addr |= ((u64)ring->asel << K3_ADDRESS_ASEL_SHIFT); + } ring->state.windex = (ring->state.windex + 1) % ring->size; ring->state.free--; @@ -975,12 +1224,12 @@ int k3_ringacc_ring_pop(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = 
k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop: occ%d index%d\n", ring->state.occ, ring->state.rindex); - if (!ring->state.occ) + if (!ring->state.occ && !ring->state.tdown_complete) return -ENODATA; if (ring->ops && ring->ops->pop_head) @@ -998,7 +1247,7 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop_tail: occ%d index%d\n", ring->state.occ, ring->state.rindex); @@ -1203,6 +1452,68 @@ static const struct of_device_id k3_ringacc_of_match[] = { {}, }; +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data) +{ + struct device *dev = &pdev->dev; + struct k3_ringacc *ringacc; + void __iomem *base_rt; + struct resource *res; + int i; + + ringacc = devm_kzalloc(dev, sizeof(*ringacc), GFP_KERNEL); + if (!ringacc) + return ERR_PTR(-ENOMEM); + + ringacc->dev = dev; + ringacc->dma_rings = true; + ringacc->num_rings = data->num_rings; + ringacc->tisci = data->tisci; + ringacc->tisci_dev_id = data->tisci_dev_id; + + mutex_init(&ringacc->req_lock); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ringrt"); + base_rt = devm_ioremap_resource(dev, res); + if (IS_ERR(base_rt)) + return ERR_CAST(base_rt); + + ringacc->rings = devm_kcalloc(dev, + ringacc->num_rings * 2, /* forward + reverse rings */ + sizeof(*ringacc->rings), + GFP_KERNEL); + ringacc->rings_inuse = devm_kcalloc(dev, + BITS_TO_LONGS(ringacc->num_rings), + sizeof(unsigned long), GFP_KERNEL); + + if (!ringacc->rings || !ringacc->rings_inuse) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < ringacc->num_rings; i++) { + struct k3_ring *ring = &ringacc->rings[i]; + + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + + ring = &ringacc->rings[ringacc->num_rings 
+ i]; + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i + + K3_DMARING_RT_REGS_REVERSE_OFS; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + ring->flags = K3_RING_FLAG_REVERSE; + } + + ringacc->tisci_ring_ops = &ringacc->tisci->ops.rm_ring_ops; + + dev_info(dev, "Number of rings: %u\n", ringacc->num_rings); + + return ringacc; +} +EXPORT_SYMBOL_GPL(k3_ringacc_dmarings_init); + static int k3_ringacc_probe(struct platform_device *pdev) { const struct ringacc_match_data *match_data; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 35c83f65475b..950552944436 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -840,7 +840,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1264,7 +1264,7 @@ ceph_find_incompatible(struct page *page) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); return ERR_PTR(-EIO); } @@ -1321,7 +1321,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); for (;;) { - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { r = -ENOMEM; break; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ded4229c314a..255a512f1277 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1140,16 +1140,24 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct 
ceph_mds_session *session = cap->session; struct ceph_inode_info *ci = cap->ci; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; + struct ceph_mds_client *mdsc; int removed = 0; + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + dout("%s: cap inode is NULL\n", __func__); + return; + } + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); + mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; + /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); if (ci->i_auth_cap == cap) { - WARN_ON_ONCE(!list_empty(&ci->i_dirty_item)); + WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) && + !mdsc->fsc->blocklisted); ci->i_auth_cap = NULL; } @@ -2746,7 +2754,7 @@ again: goto out_unlock; } - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); ret = -EIO; goto out_unlock; @@ -4027,15 +4035,13 @@ void ceph_handle_caps(struct ceph_mds_session *session, } if (msg_version >= 8) { - u64 flush_tid; - u32 caller_uid, caller_gid; u32 pool_ns_len; /* version >= 6 */ - ceph_decode_64_safe(&p, end, flush_tid, bad); + ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ - ceph_decode_32_safe(&p, end, caller_uid, bad); - ceph_decode_32_safe(&p, end, caller_gid, bad); + ceph_decode_skip_32(&p, end, bad); // caller_uid + ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { @@ -4058,9 +4064,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, } if (msg_version >= 11) { - u32 flags; /* version >= 10 */ - ceph_decode_32_safe(&p, end, flags, bad); + ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 
7a8fbe3e4751..66989c880adb 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -304,11 +304,25 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) return 0; } +static int status_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_entity_inst *inst = &fsc->client->msgr.inst; + struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client); + + seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), + ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); + seq_printf(s, "blocklisted: %s\n", fsc->blocklisted ? "true" : "false"); + + return 0; +} + DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); DEFINE_SHOW_ATTRIBUTE(metric); +DEFINE_SHOW_ATTRIBUTE(status); /* @@ -394,6 +408,12 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc->client->debugfs_dir, fsc, &caps_fops); + + fsc->debugfs_status = debugfs_create_file("status", + 0400, + fsc->client->debugfs_dir, + fsc, + &status_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a4d48370b2b3..858ee7362ff5 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1202,12 +1202,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, op = CEPH_MDS_OP_RENAMESNAP; else return -EROFS; - } else if (old_dir != new_dir) { - err = ceph_quota_check_rename(mdsc, d_inode(old_dentry), - new_dir); - if (err) - return err; } + /* don't allow cross-quota renames */ + if ((old_dir != new_dir) && + (!ceph_quota_is_same_realm(old_dir, new_dir))) + return -EXDEV; dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 526faf4778ce..adc8fc3c5d85 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1315,15 +1315,10 @@ retry_lookup: } if (rinfo->head->is_target) { - tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); - tvino.snap = 
le64_to_cpu(rinfo->targeti.in->snapid); - - in = ceph_get_inode(sb, tvino); - if (IS_ERR(in)) { - err = PTR_ERR(in); - goto done; - } + /* Should be filled in by handle_reply */ + BUG_ON(!req->r_target_inode); + in = req->r_target_inode; err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && @@ -1333,11 +1328,13 @@ retry_lookup: if (err < 0) { pr_err("ceph_fill_inode badness %p %llx.%llx\n", in, ceph_vinop(in)); + req->r_target_inode = NULL; if (in->i_state & I_NEW) discard_new_inode(in); + else + iput(in); goto done; } - req->r_target_inode = in; if (in->i_state & I_NEW) unlock_new_inode(in); } @@ -1597,8 +1594,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct dentry *dn; struct inode *in; int err = 0, skipped = 0, ret, i; - struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 frag = le32_to_cpu(req->r_args.readdir.frag); u32 last_hash = 0; u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; @@ -1615,7 +1611,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } else if (rinfo->offset_hash) { /* mds understands offset_hash */ WARN_ON_ONCE(req->r_readdir_offset != 2); - last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + last_hash = le32_to_cpu(req->r_args.readdir.offset_hash); } } @@ -1888,7 +1884,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); @@ -2340,15 +2336,23 @@ int ceph_permission(struct inode *inode, int mask) } /* Craft a mask of needed caps given a set of requested statx attrs. 
*/ -static int statx_to_caps(u32 want) +static int statx_to_caps(u32 want, umode_t mode) { int mask = 0; if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME)) mask |= CEPH_CAP_AUTH_SHARED; - if (want & (STATX_NLINK|STATX_CTIME)) - mask |= CEPH_CAP_LINK_SHARED; + if (want & (STATX_NLINK|STATX_CTIME)) { + /* + * The link count for directories depends on inode->i_subdirs, + * and that is only updated when Fs caps are held. + */ + if (S_ISDIR(mode)) + mask |= CEPH_CAP_FILE_SHARED; + else + mask |= CEPH_CAP_LINK_SHARED; + } if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE| STATX_BLOCKS)) @@ -2374,8 +2378,9 @@ int ceph_getattr(const struct path *path, struct kstat *stat, /* Skip the getattr altogether if we're asked not to sync */ if (!(flags & AT_STATX_DONT_SYNC)) { - err = ceph_do_getattr(inode, statx_to_caps(request_mask), - flags & AT_STATX_FORCE_SYNC); + err = ceph_do_getattr(inode, + statx_to_caps(request_mask, inode->i_mode), + flags & AT_STATX_FORCE_SYNC); if (err) return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 048a435a29be..fa8a847743d0 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -57,7 +57,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = { .fl_release_private = ceph_fl_release_lock, }; -/** +/* * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, @@ -225,7 +225,7 @@ static int try_unlock_file(struct file *file, struct file_lock *fl) return 1; } -/** +/* * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. */ @@ -408,7 +408,7 @@ static int lock_to_ceph_filelock(struct file_lock *lock, return err; } -/** +/* * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. * If we encounter more of a specific lock type than expected, return -ENOSPC. 
@@ -458,7 +458,7 @@ fail: return err; } -/** +/* * Copy the encoded flock and fcntl locks into the pagelist. * Format is: #fcntl locks, sequential fcntl locks, #flock locks, * sequential flock locks. diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8f1d7500a7ec..98c15ff2e599 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -516,13 +516,9 @@ static int parse_reply_info_create(void **p, void *end, /* Malformed reply? */ info->has_create_ino = false; } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { - u8 struct_v, struct_compat; - u32 len; - info->has_create_ino = true; - ceph_decode_8_safe(p, end, struct_v, bad); - ceph_decode_8_safe(p, end, struct_compat, bad); - ceph_decode_32_safe(p, end, len, bad); + /* struct_v, struct_compat, and len */ + ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad); ceph_decode_64_safe(p, end, info->ino, bad); ret = ceph_parse_deleg_inos(p, end, s); if (ret) @@ -837,6 +833,7 @@ void ceph_mdsc_release_request(struct kref *kref) } kfree(req->r_path1); kfree(req->r_path2); + put_cred(req->r_cred); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); put_request_session(req); @@ -892,8 +889,7 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); insert_request(&mdsc->request_tree, req); - req->r_uid = current_fsuid(); - req->r_gid = current_fsgid(); + req->r_cred = get_current_cred(); if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) mdsc->oldest_tid = req->r_tid; @@ -1243,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 { struct ceph_msg *msg; struct ceph_mds_session_head *h; - int i = -1; + int i; int extra_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; @@ -1595,7 +1591,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if 
(READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) invalidate = true; if (ci->i_wrbuffer_ref > 0) @@ -2482,21 +2478,24 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, /* * called under mdsc->mutex */ -static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, +static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head *head; + struct ceph_mds_request_head_old *head; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; - int len; + int len, i; u16 releases; void *p, *end; int ret; + bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2518,14 +2517,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free1; } - len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + if (legacy) { + /* Old style */ + len = sizeof(*head); + } else { + /* New style: add gid_list and any later fields */ + len = sizeof(struct ceph_mds_request_head) + sizeof(u32) + + (sizeof(u64) * req->r_cred->group_info->ngroups); + } + + len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + sizeof(struct ceph_timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) @@ -2537,17 +2545,33 @@ static struct 
ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = cpu_to_le16(2); msg->hdr.tid = cpu_to_le64(req->r_tid); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + /* + * The old ceph_mds_request_header didn't contain a version field, and + * one was added when we moved the message version from 3->4. + */ + if (legacy) { + msg->hdr.version = cpu_to_le16(3); + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + } else { + struct ceph_mds_request_head *new_head = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; + p = msg->front.iov_base + sizeof(*new_head); + } + end = msg->front.iov_base + msg->front.iov_len; head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + req->r_cred->fsuid)); + head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + req->r_cred->fsgid)); head->ino = cpu_to_le64(req->r_deleg_ino); head->args = req->r_args; @@ -2592,6 +2616,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } + /* gid list */ + if (!legacy) { + ceph_encode_32(&p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(&p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); + } + if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); @@ -2635,14 +2667,28 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } +static struct ceph_mds_request_head_old * +find_old_request_head(void *p, u64 features) +{ + bool 
legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head *new_head; + + if (legacy) + return (struct ceph_mds_request_head_old *)p; + new_head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; +} + /* * called under mdsc->mutex */ -static int __prepare_send_request(struct ceph_mds_client *mdsc, +static int __prepare_send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { - struct ceph_mds_request_head *rhead; + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; int flags = 0; @@ -2661,6 +2707,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; + /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -2668,7 +2715,8 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, * d_move mangles the src name. 
*/ msg = req->r_request; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); flags = le32_to_cpu(rhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; @@ -2699,14 +2747,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds, drop_cap_releases); + msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; @@ -2725,15 +2774,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* * called under mdsc->mutex */ -static int __send_request(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, +static int __send_request(struct ceph_mds_session *session, struct ceph_mds_request *req, bool drop_cap_releases) { int err; - err = __prepare_send_request(mdsc, req, session->s_mds, - drop_cap_releases); + err = __prepare_send_request(session, req, drop_cap_releases); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2818,10 +2865,6 @@ static void __do_request(struct ceph_mds_client *mdsc, ceph_session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { - if (session->s_state == CEPH_MDS_SESSION_REJECTED) { - err = -EACCES; - goto out_session; - } /* * We cannot queue async requests since the caps and delegated * inodes are bound to the session. 
Just return -EJUKEBOX and @@ -2831,6 +2874,20 @@ static void __do_request(struct ceph_mds_client *mdsc, err = -EJUKEBOX; goto out_session; } + + /* + * If the session has been REJECTED, then return a hard error, + * unless it's a CLEANRECOVER mount, in which case we'll queue + * it to the mdsc queue. + */ + if (session->s_state == CEPH_MDS_SESSION_REJECTED) { + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) + list_add(&req->r_wait, &mdsc->waiting_for_map); + else + err = -EACCES; + goto out_session; + } + if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { err = __open_session(mdsc, session); @@ -2850,7 +2907,7 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __send_request(mdsc, session, req, false); + err = __send_request(session, req, false); out_session: ceph_put_mds_session(session); @@ -3173,6 +3230,23 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); + /* Must find target inode outside of mutexes to avoid deadlocks */ + if ((err >= 0) && rinfo->head->is_target) { + struct inode *in; + struct ceph_vino tvino = { + .ino = le64_to_cpu(rinfo->targeti.in->ino), + .snap = le64_to_cpu(rinfo->targeti.in->snapid) + }; + + in = ceph_get_inode(mdsc->fsc->sb, tvino); + if (IS_ERR(in)) { + err = PTR_ERR(in); + mutex_lock(&session->s_mutex); + goto out_err; + } + req->r_target_inode = in; + } + mutex_lock(&session->s_mutex); if (err < 0) { pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); @@ -3514,7 +3588,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) - __send_request(mdsc, session, req, true); + __send_request(session, req, true); /* * 
also re-send old requests when MDS enters reconnect stage. So that MDS @@ -3535,7 +3609,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_mdsc_release_dir_caps_no_check(req); - __send_request(mdsc, session, req, true); + __send_request(session, req, true); } mutex_unlock(&mdsc->mutex); } @@ -4374,12 +4448,7 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) if (!READ_ONCE(fsc->blocklisted)) return; - if (fsc->last_auto_reconnect && - time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) - return; - pr_info("auto reconnect after blocklisted\n"); - fsc->last_auto_reconnect = jiffies; ceph_force_reconnect(fsc->sb); } @@ -4678,7 +4747,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); @@ -4855,10 +4924,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) void *p = msg->front.iov_base; void *end = p + msg->front.iov_len; u32 epoch; - u32 map_len; u32 num_fs; u32 mount_fscid = (u32)-1; - u8 struct_v, struct_cv; int err = -EINVAL; ceph_decode_need(&p, end, sizeof(u32), bad); @@ -4866,24 +4933,17 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) dout("handle_fsmap epoch %u\n", epoch); - ceph_decode_need(&p, end, 2 + sizeof(u32), bad); - struct_v = ceph_decode_8(&p); - struct_cv = ceph_decode_8(&p); - map_len = ceph_decode_32(&p); + /* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad); - ceph_decode_need(&p, end, sizeof(u32) * 3, bad); - p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */ - - num_fs = ceph_decode_32(&p); + ceph_decode_32_safe(&p, end, num_fs, bad); while (num_fs-- > 0) { void *info_p, *info_end; u32 info_len; - u8 info_v, info_cv; u32 fscid, namelen; ceph_decode_need(&p, end, 2 + 
sizeof(u32), bad); - info_v = ceph_decode_8(&p); - info_cv = ceph_decode_8(&p); + p += 2; // info_v, info_cv info_len = ceph_decode_32(&p); ceph_decode_need(&p, end, info_len, bad); info_p = p; @@ -4954,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end); + newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -5081,23 +5141,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; struct ceph_auth_handshake *auth = &s->s_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } @@ -5118,8 +5167,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) @@ -5133,6 +5185,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return 
ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static int mds_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int mds_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, 
+ used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { @@ -5182,6 +5308,10 @@ static const struct ceph_connection_operations mds_con_ops = { .alloc_msg = mds_alloc_msg, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, + .get_auth_request = mds_get_auth_request, + .handle_auth_reply_more = mds_handle_auth_reply_more, + .handle_auth_done = mds_handle_auth_done, + .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f5adbebcb38e..eaa7c5422116 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -275,8 +275,7 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - kuid_t r_uid; - kgid_t r_gid; + const struct cred *r_cred; int r_request_release_offset; struct timespec64 r_stamp; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index e4aba6c6d3b5..abd9af7727ad 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -114,7 +114,7 @@ bad: * Ignore any fields we don't care about (there are quite a few of * them). 
*/ -struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) { struct ceph_mdsmap *m; const void *start = *p; @@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; - ceph_decode_need(p, end, - 4*sizeof(u32) + sizeof(u64) + - sizeof(addr) + sizeof(struct ceph_timespec), - bad); - mds = ceph_decode_32(p); - inc = ceph_decode_32(p); - state = ceph_decode_32(p); + ceph_decode_32_safe(p, end, mds, bad); + ceph_decode_32_safe(p, end, inc, bad); + ceph_decode_32_safe(p, end, state, bad); *p += sizeof(u64); /* state_seq */ - err = ceph_decode_entity_addr(p, end, &addr); + if (info_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; - ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + + ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), + bad); laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); @@ -243,8 +244,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) } if (state <= 0) { - pr_warn("mdsmap_decode got incorrect state(%s)\n", - ceph_mds_state_name(state)); + dout("mdsmap_decode got incorrect state(%s)\n", + ceph_mds_state_name(state)); continue; } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index fee4c4778313..5ec94bd4c1de 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -16,6 +16,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_metric_read_latency *read; struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; + struct ceph_metric_dlease *dlease; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); struct ceph_msg *msg; @@ -25,7 +26,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client 
*mdsc, s32 len; len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) - + sizeof(*meta); + + sizeof(*meta) + sizeof(*dlease); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -42,8 +43,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, cap->ver = 1; cap->compat = 1; cap->data_len = cpu_to_le32(sizeof(*cap) - 10); - cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit)); - cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis)); + cap->hit = cpu_to_le64(percpu_counter_sum(&m->i_caps_hit)); + cap->mis = cpu_to_le64(percpu_counter_sum(&m->i_caps_mis)); cap->total = cpu_to_le64(nr_caps); items++; @@ -83,6 +84,17 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, meta->nsec = cpu_to_le32(ts.tv_nsec); items++; + /* encode the dentry lease metric */ + dlease = (struct ceph_metric_dlease *)(meta + 1); + dlease->type = cpu_to_le32(CLIENT_METRIC_TYPE_DENTRY_LEASE); + dlease->ver = 1; + dlease->compat = 1; + dlease->data_len = cpu_to_le32(sizeof(*dlease) - 10); + dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); + dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); + dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 710f3f1dceab..af6038ff39d4 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -27,6 +27,7 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_READ_LATENCY, \ CLIENT_METRIC_TYPE_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -80,6 +81,19 @@ struct ceph_metric_metadata_latency { __le32 nsec; } __packed; +/* metric dentry lease header */ +struct ceph_metric_dlease { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(hit + 
mis + total) */ + __le64 hit; + __le64 mis; + __le64 total; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 9b785f11e95a..4e32c9600ecc 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -264,7 +264,7 @@ restart: return NULL; } -static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) +bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; @@ -516,59 +516,3 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) return is_updated; } -/* - * ceph_quota_check_rename - check if a rename can be executed - * @mdsc: MDS client instance - * @old: inode to be copied - * @new: destination inode (directory) - * - * This function verifies if a rename (e.g. moving a file or directory) can be - * executed. It forces an rstat update in the @new target directory (and in the - * source @old as well, if it's a directory). The actual check is done both for - * max_files and max_bytes. - * - * This function returns 0 if it's OK to do the rename, or, if quotas are - * exceeded, -EXDEV (if @old is a directory) or -EDQUOT. 
- */ -int ceph_quota_check_rename(struct ceph_mds_client *mdsc, - struct inode *old, struct inode *new) -{ - struct ceph_inode_info *ci_old = ceph_inode(old); - int ret = 0; - - if (ceph_quota_is_same_realm(old, new)) - return 0; - - /* - * Get the latest rstat for target directory (and for source, if a - * directory) - */ - ret = ceph_do_getattr(new, CEPH_STAT_RSTAT, false); - if (ret) - return ret; - - if (S_ISDIR(old->i_mode)) { - ret = ceph_do_getattr(old, CEPH_STAT_RSTAT, false); - if (ret) - return ret; - ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, - ci_old->i_rbytes); - if (!ret) - ret = check_quota_exceeded(new, - QUOTA_CHECK_MAX_FILES_OP, - ci_old->i_rfiles + - ci_old->i_rsubdirs); - if (ret) - ret = -EXDEV; - } else { - ret = check_quota_exceeded(new, QUOTA_CHECK_MAX_BYTES_OP, - i_size_read(old)); - if (!ret) - ret = check_quota_exceeded(new, - QUOTA_CHECK_MAX_FILES_OP, 1); - if (ret) - ret = -EDQUOT; - } - - return ret; -} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 33ba6f0aa55c..9b1b7f4cfdd4 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -831,6 +831,13 @@ static void destroy_caches(void) ceph_fscache_unregister(); } +static void __ceph_umount_begin(struct ceph_fs_client *fsc) +{ + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} + /* * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). 
@@ -843,9 +850,7 @@ static void ceph_umount_begin(struct super_block *sb) if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; - ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); - ceph_mdsc_force_umount(fsc->mdsc); - fsc->filp_gen++; // invalidate open files + __ceph_umount_begin(fsc); } static const struct super_operations ceph_super_ops = { @@ -1234,7 +1239,8 @@ int ceph_force_reconnect(struct super_block *sb) struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; - ceph_umount_begin(sb); + fsc->mount_state = CEPH_MOUNT_RECOVER; + __ceph_umount_begin(fsc); /* Make sure all page caches get invalidated. * see remove_session_caps_cb() */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 482473e4cce1..b62d8fee3b86 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -106,9 +106,8 @@ struct ceph_fs_client { struct ceph_mount_options *mount_options; struct ceph_client *client; - unsigned long mount_state; + int mount_state; - unsigned long last_auto_reconnect; bool blocklisted; bool have_copy_from2; @@ -129,6 +128,7 @@ struct ceph_fs_client { struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; struct dentry *debugfs_metric; + struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; #endif @@ -1222,14 +1222,13 @@ extern void ceph_handle_quota(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); +extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newlen); extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newlen); extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); -extern int ceph_quota_check_rename(struct ceph_mds_client *mdsc, - struct inode *old, struct inode *new); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); #endif /* 
_FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 2a3981337077..ee8e1f4d003e 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -42,6 +42,7 @@ struct ceph_vxattr { #define VXATTR_FLAG_READONLY (1<<0) #define VXATTR_FLAG_HIDDEN (1<<1) #define VXATTR_FLAG_RSTAT (1<<2) +#define VXATTR_FLAG_DIRSTAT (1<<3) /* layouts */ @@ -303,6 +304,36 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val, ci->i_snap_btime.tv_nsec); } +static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + + return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid); +} + +static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci, + char *val, size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + + return ceph_fmt_xattr(val, size, "client%lld", + ceph_client_gid(fsc->client)); +} + +static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + int issued; + + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + + return ceph_fmt_xattr(val, size, "%s/0x%x", + ceph_cap_string(issued), issued); +} + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." 
#_name2 @@ -347,9 +378,9 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_LAYOUT_FIELD(dir, layout, object_size), XATTR_LAYOUT_FIELD(dir, layout, pool), XATTR_LAYOUT_FIELD(dir, layout, pool_namespace), - XATTR_NAME_CEPH(dir, entries, 0), - XATTR_NAME_CEPH(dir, files, 0), - XATTR_NAME_CEPH(dir, subdirs, 0), + XATTR_NAME_CEPH(dir, entries, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, files, VXATTR_FLAG_DIRSTAT), + XATTR_NAME_CEPH(dir, subdirs, VXATTR_FLAG_DIRSTAT), XATTR_RSTAT_FIELD(dir, rentries), XATTR_RSTAT_FIELD(dir, rfiles), XATTR_RSTAT_FIELD(dir, rsubdirs), @@ -378,6 +409,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { .exists_cb = ceph_vxattrcb_snap_btime_exists, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, { .name = NULL, 0 } /* Required table terminator */ }; @@ -403,6 +441,31 @@ static struct ceph_vxattr ceph_file_vxattrs[] = { .exists_cb = ceph_vxattrcb_snap_btime_exists, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.caps", + .name_size = sizeof("ceph.caps"), + .getxattr_cb = ceph_vxattrcb_caps, + .exists_cb = NULL, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = NULL, 0 } /* Required table terminator */ +}; + +static struct ceph_vxattr ceph_common_vxattrs[] = { + { + .name = "ceph.cluster_fsid", + .name_size = sizeof("ceph.cluster_fsid"), + .getxattr_cb = ceph_vxattrcb_cluster_fsid, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, + { + .name = "ceph.client_id", + .name_size = sizeof("ceph.client_id"), + .getxattr_cb = ceph_vxattrcb_client_id, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; @@ -428,6 +491,13 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode, } } + vxattr = ceph_common_vxattrs; + while (vxattr->name) { + if (!strcmp(vxattr->name, name)) + return vxattr; + vxattr++; + } + return 
NULL; } @@ -837,6 +907,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, int mask = 0; if (vxattr->flags & VXATTR_FLAG_RSTAT) mask |= CEPH_STAT_RSTAT; + if (vxattr->flags & VXATTR_FLAG_DIRSTAT) + mask |= CEPH_CAP_FILE_SHARED; err = ceph_do_getattr(inode, mask, true); if (err) return err; @@ -950,6 +1022,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_pagelist *pagelist = NULL; int op = CEPH_MDS_OP_SETXATTR; int err; @@ -988,6 +1061,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (op == CEPH_MDS_OP_SETXATTR) { req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_args.setxattr.osdmap_epoch = + cpu_to_le32(osdc->osdmap->epoch); req->r_pagelist = pagelist; pagelist = NULL; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0afb6d59bad0..f802223e71ab 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -163,7 +163,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni, host->h_nsmhandle = nsm; host->h_addrbuf = nsm->sm_addrbuf; host->net = ni->net; - host->h_cred = get_cred(ni->cred), + host->h_cred = get_cred(ni->cred); strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename)); out: @@ -439,12 +439,7 @@ nlm_bind_host(struct nlm_host *host) * RPC rebind is required */ if ((clnt = host->h_rpcclnt) != NULL) { - if (time_after_eq(jiffies, host->h_nextrebind)) { - rpc_force_rebind(clnt); - host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %lu jiffies\n", - host->h_nextrebind - jiffies); - } + nlm_rebind_host(host); } else { unsigned long increment = nlmsvc_timeout; struct rpc_timeout timeparms = { @@ -494,13 +489,20 @@ nlm_bind_host(struct nlm_host *host) return clnt; } -/* - * Force a portmap lookup of the remote lockd port +/** + * 
nlm_rebind_host - If needed, force a portmap lookup of the peer's lockd port + * @host: NLM host handle for peer + * + * This is not needed when using a connection-oriented protocol, such as TCP. + * The existing autobind mechanism is sufficient to force a rebind when + * required, e.g. on connection state transitions. */ void nlm_rebind_host(struct nlm_host *host) { - dprintk("lockd: rebind host %s\n", host->h_name); + if (host->h_proto != IPPROTO_UDP) + return; + if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(host->h_rpcclnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4b8cc93913f7..ff5c4d0d6d13 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -571,7 +571,7 @@ static int nfs_start_lockd(struct nfs_server *server) 1 : 0, .net = clp->cl_net, .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, - .cred = current_cred(), + .cred = server->cred, }; if (nlm_init.nfs_version > 3) @@ -781,8 +781,8 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); - if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES) - server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES; + if (server->dtsize > NFS_MAX_FILE_IO_SIZE) + server->dtsize = NFS_MAX_FILE_IO_SIZE; if (server->dtsize > server->rsize) server->dtsize = server->rsize; @@ -985,7 +985,7 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (!server) return ERR_PTR(-ENOMEM); - server->cred = get_cred(current_cred()); + server->cred = get_cred(fc->cred); error = -ENOMEM; fattr = nfs_alloc_fattr(); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8a24fe20dccf..ef827ae193d2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode 
*dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -133,43 +131,55 @@ nfs_closedir(struct inode *inode, struct file *filp) struct nfs_cache_array_entry { u64 cookie; u64 ino; - struct qstr string; + const char *name; + unsigned int name_len; unsigned char d_type; }; struct nfs_cache_array { - int size; - int eof_index; u64 last_cookie; + unsigned int size; + unsigned char page_full : 1, + page_is_eof : 1, + cookies_are_ordered : 1; struct nfs_cache_array_entry array[]; }; -typedef struct { +struct nfs_readdir_descriptor { struct file *file; struct page *page; struct dir_context *ctx; - unsigned long page_index; - u64 *dir_cookie; + pgoff_t page_index; + u64 dir_cookie; u64 last_cookie; + u64 dup_cookie; loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; + unsigned long attr_gencount; unsigned int cache_entry_index; + signed char duped; bool plus; bool eof; -} nfs_readdir_descriptor_t; +}; -static -void nfs_readdir_init_array(struct page *page) 
+static void nfs_readdir_array_init(struct nfs_cache_array *array) +{ + memset(array, 0, sizeof(struct nfs_cache_array)); +} + +static void nfs_readdir_page_init_array(struct page *page, u64 last_cookie) { struct nfs_cache_array *array; array = kmap_atomic(page); - memset(array, 0, sizeof(struct nfs_cache_array)); - array->eof_index = -1; + nfs_readdir_array_init(array); + array->last_cookie = last_cookie; + array->cookies_are_ordered = 1; kunmap_atomic(array); } @@ -184,61 +194,177 @@ void nfs_readdir_clear_array(struct page *page) array = kmap_atomic(page); for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - array->size = 0; + kfree(array->array[i].name); + nfs_readdir_array_init(array); kunmap_atomic(array); } +static struct page * +nfs_readdir_page_array_alloc(u64 last_cookie, gfp_t gfp_flags) +{ + struct page *page = alloc_page(gfp_flags); + if (page) + nfs_readdir_page_init_array(page, last_cookie); + return page; +} + +static void nfs_readdir_page_array_free(struct page *page) +{ + if (page) { + nfs_readdir_clear_array(page); + put_page(page); + } +} + +static void nfs_readdir_array_set_eof(struct nfs_cache_array *array) +{ + array->page_is_eof = 1; + array->page_full = 1; +} + +static bool nfs_readdir_array_is_full(struct nfs_cache_array *array) +{ + return array->page_full; +} + /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in * nfs_clear_readdir_array() */ -static -int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int len) +static const char *nfs_readdir_copy_name(const char *name, unsigned int len) { - string->len = len; - string->name = kmemdup_nul(name, len, GFP_KERNEL); - if (string->name == NULL) - return -ENOMEM; + const char *ret = kmemdup_nul(name, len, GFP_KERNEL); + /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. 
*/ - kmemleak_not_leak(string->name); - string->hash = full_name_hash(NULL, name, len); + if (ret != NULL) + kmemleak_not_leak(ret); + return ret; +} + +/* + * Check that the next array entry lies entirely within the page bounds + */ +static int nfs_readdir_array_can_expand(struct nfs_cache_array *array) +{ + struct nfs_cache_array_entry *cache_entry; + + if (array->page_full) + return -ENOSPC; + cache_entry = &array->array[array->size + 1]; + if ((char *)cache_entry - (char *)array > PAGE_SIZE) { + array->page_full = 1; + return -ENOSPC; + } return 0; } static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = kmap(page); + struct nfs_cache_array *array; struct nfs_cache_array_entry *cache_entry; + const char *name; int ret; - cache_entry = &array->array[array->size]; + name = nfs_readdir_copy_name(entry->name, entry->len); + if (!name) + return -ENOMEM; - /* Check that this entry lies within the page bounds */ - ret = -ENOSPC; - if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + array = kmap_atomic(page); + ret = nfs_readdir_array_can_expand(array); + if (ret) { + kfree(name); goto out; + } + cache_entry = &array->array[array->size]; cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; cache_entry->d_type = entry->d_type; - ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); - if (ret) - goto out; + cache_entry->name_len = entry->len; + cache_entry->name = name; array->last_cookie = entry->cookie; + if (array->last_cookie <= cache_entry->cookie) + array->cookies_are_ordered = 0; array->size++; if (entry->eof != 0) - array->eof_index = array->size; + nfs_readdir_array_set_eof(array); out: - kunmap(page); + kunmap_atomic(array); return ret; } +static struct page *nfs_readdir_page_get_locked(struct address_space *mapping, + pgoff_t index, u64 last_cookie) +{ + struct page *page; + + page = grab_cache_page(mapping, index); + if (page && 
!PageUptodate(page)) { + nfs_readdir_page_init_array(page, last_cookie); + if (invalidate_inode_pages2_range(mapping, index + 1, -1) < 0) + nfs_zap_mapping(mapping->host, mapping); + SetPageUptodate(page); + } + + return page; +} + +static u64 nfs_readdir_page_last_cookie(struct page *page) +{ + struct nfs_cache_array *array; + u64 ret; + + array = kmap_atomic(page); + ret = array->last_cookie; + kunmap_atomic(array); + return ret; +} + +static bool nfs_readdir_page_needs_filling(struct page *page) +{ + struct nfs_cache_array *array; + bool ret; + + array = kmap_atomic(page); + ret = !nfs_readdir_array_is_full(array); + kunmap_atomic(array); + return ret; +} + +static void nfs_readdir_page_set_eof(struct page *page) +{ + struct nfs_cache_array *array; + + array = kmap_atomic(page); + nfs_readdir_array_set_eof(array); + kunmap_atomic(array); +} + +static void nfs_readdir_page_unlock_and_put(struct page *page) +{ + unlock_page(page); + put_page(page); +} + +static struct page *nfs_readdir_page_get_next(struct address_space *mapping, + pgoff_t index, u64 cookie) +{ + struct page *page; + + page = nfs_readdir_page_get_locked(mapping, index, cookie); + if (page) { + if (nfs_readdir_page_last_cookie(page) == cookie) + return page; + nfs_readdir_page_unlock_and_put(page); + } + return NULL; +} + static inline int is_32bit_api(void) { @@ -258,8 +384,8 @@ bool nfs_readdir_use_cookie(const struct file *filp) return true; } -static -int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { loff_t diff = desc->ctx->pos - desc->current_index; unsigned int index; @@ -267,13 +393,13 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index >= 0) + if (array->page_is_eof) goto out_eof; return -EAGAIN; } index = (unsigned int)diff; - 
*desc->dir_cookie = array->array[index].cookie; + desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; return 0; out_eof: @@ -290,41 +416,55 @@ nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi) return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags); } -static -int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) +static bool nfs_readdir_array_cookie_in_range(struct nfs_cache_array *array, + u64 cookie) +{ + if (!array->cookies_are_ordered) + return true; + /* Optimisation for monotonically increasing cookies */ + if (cookie >= array->last_cookie) + return false; + if (array->size && cookie < array->array[0].cookie) + return false; + return true; +} + +static int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, + struct nfs_readdir_descriptor *desc) { int i; loff_t new_pos; int status = -EAGAIN; + if (!nfs_readdir_array_cookie_in_range(array, desc->dir_cookie)) + goto check_eof; + for (i = 0; i < array->size; i++) { - if (array->array[i].cookie == *desc->dir_cookie) { + if (array->array[i].cookie == desc->dir_cookie) { struct nfs_inode *nfsi = NFS_I(file_inode(desc->file)); - struct nfs_open_dir_context *ctx = desc->file->private_data; new_pos = desc->current_index + i; - if (ctx->attr_gencount != nfsi->attr_gencount || + if (desc->attr_gencount != nfsi->attr_gencount || !nfs_readdir_inode_mapping_valid(nfsi)) { - ctx->duped = 0; - ctx->attr_gencount = nfsi->attr_gencount; + desc->duped = 0; + desc->attr_gencount = nfsi->attr_gencount; } else if (new_pos < desc->prev_index) { - if (ctx->duped > 0 - && ctx->dup_cookie == *desc->dir_cookie) { + if (desc->duped > 0 + && desc->dup_cookie == desc->dir_cookie) { if (printk_ratelimit()) { pr_notice("NFS: directory %pD2 contains a readdir loop." "Please contact your server vendor. 
" - "The file: %.*s has duplicate cookie %llu\n", - desc->file, array->array[i].string.len, - array->array[i].string.name, *desc->dir_cookie); + "The file: %s has duplicate cookie %llu\n", + desc->file, array->array[i].name, desc->dir_cookie); } status = -ELOOP; goto out; } - ctx->dup_cookie = *desc->dir_cookie; - ctx->duped = -1; + desc->dup_cookie = desc->dir_cookie; + desc->duped = -1; } if (nfs_readdir_use_cookie(desc->file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos = new_pos; desc->prev_index = new_pos; @@ -332,24 +472,24 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des return 0; } } - if (array->eof_index >= 0) { +check_eof: + if (array->page_is_eof) { status = -EBADCOOKIE; - if (*desc->dir_cookie == array->last_cookie) + if (desc->dir_cookie == array->last_cookie) desc->eof = true; } out: return status; } -static -int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) +static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) { struct nfs_cache_array *array; int status; - array = kmap(desc->page); + array = kmap_atomic(desc->page); - if (*desc->dir_cookie == 0) + if (desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); else status = nfs_readdir_search_for_cookie(array, desc); @@ -359,17 +499,29 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - kunmap(desc->page); + kunmap_atomic(array); return status; } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { - struct nfs_open_dir_context *ctx = file->private_data; - const 
struct cred *cred = ctx->cred; + struct inode *inode = file_inode(desc->file); + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = verf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -377,14 +529,13 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; @@ -395,7 +546,7 @@ error: return error; } -static int xdr_decode(nfs_readdir_descriptor_t *desc, +static int xdr_decode(struct nfs_readdir_descriptor *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { struct inode *inode = file_inode(desc->file); @@ -557,24 +708,23 @@ out: } /* Perform conversion from xdr to cache array */ -static -int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, - struct page **xdr_pages, struct page *page, unsigned int buflen) +static int nfs_readdir_page_filler(struct nfs_readdir_descriptor *desc, + struct nfs_entry *entry, + struct page **xdr_pages, + unsigned int buflen, + struct page **arrays, + size_t narrays) { + struct address_space *mapping = desc->file->f_mapping; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; - struct nfs_cache_array *array; - unsigned int count = 0; + struct page *scratch, *new, 
*page = *arrays; int status; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) return -ENOMEM; - if (buflen == 0) - goto out_nopages; - xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_page(&stream, scratch); @@ -583,209 +733,238 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en entry->label->len = NFS4_MAXLABELLEN; status = xdr_decode(desc, entry, &stream); - if (status != 0) { - if (status == -EAGAIN) - status = 0; + if (status != 0) break; - } - - count++; if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry, desc->dir_verifier); status = nfs_readdir_add_to_array(entry, page); - if (status != 0) - break; - } while (!entry->eof); + if (status != -ENOSPC) + continue; -out_nopages: - if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); - array->eof_index = array->size; + if (page->mapping != mapping) { + if (!--narrays) + break; + new = nfs_readdir_page_array_alloc(entry->prev_cookie, + GFP_KERNEL); + if (!new) + break; + arrays++; + *arrays = page = new; + } else { + new = nfs_readdir_page_get_next(mapping, + page->index + 1, + entry->prev_cookie); + if (!new) + break; + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + page = new; + } + status = nfs_readdir_add_to_array(entry, page); + } while (!status && !entry->eof); + + switch (status) { + case -EBADCOOKIE: + if (entry->eof) { + nfs_readdir_page_set_eof(page); + status = 0; + } + break; + case -ENOSPC: + case -EAGAIN: status = 0; - kunmap(page); + break; } + if (page != *arrays) + nfs_readdir_page_unlock_and_put(page); + put_page(scratch); return status; } -static -void nfs_readdir_free_pages(struct page **pages, unsigned int npages) +static void nfs_readdir_free_pages(struct page **pages, size_t npages) { - unsigned int i; - for (i = 0; i < npages; i++) - put_page(pages[i]); + while (npages--) + put_page(pages[npages]); + kfree(pages); } /* * nfs_readdir_alloc_pages() will allocate pages 
that must be freed with a call * to nfs_readdir_free_pages() */ -static -int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) +static struct page **nfs_readdir_alloc_pages(size_t npages) { - unsigned int i; + struct page **pages; + size_t i; + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; for (i = 0; i < npages; i++) { struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) goto out_freepages; pages[i] = page; } - return 0; + return pages; out_freepages: nfs_readdir_free_pages(pages, i); - return -ENOMEM; + return NULL; } -static -int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) +static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, + __be32 *verf_arg, __be32 *verf_res, + struct page **arrays, size_t narrays) { - struct page *pages[NFS_MAX_READDIR_PAGES]; - struct nfs_entry entry; - struct file *file = desc->file; - struct nfs_cache_array *array; + struct page **pages; + struct page *page = *arrays; + struct nfs_entry *entry; + size_t array_size; + struct inode *inode = file_inode(desc->file); + size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; - unsigned int array_size = ARRAY_SIZE(pages); - nfs_readdir_init_array(page); - - entry.prev_cookie = 0; - entry.cookie = desc->last_cookie; - entry.eof = 0; - entry.fh = nfs_alloc_fhandle(); - entry.fattr = nfs_alloc_fattr(); - entry.server = NFS_SERVER(inode); - if (entry.fh == NULL || entry.fattr == NULL) + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + entry->cookie = nfs_readdir_page_last_cookie(page); + entry->fh = nfs_alloc_fhandle(); + entry->fattr = nfs_alloc_fattr(); + entry->server = NFS_SERVER(inode); + if (entry->fh == NULL || entry->fattr == NULL) goto out; - entry.label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(entry.label)) { - status = PTR_ERR(entry.label); + entry->label = 
nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); + if (IS_ERR(entry->label)) { + status = PTR_ERR(entry->label); goto out; } - array = kmap(page); + array_size = (dtsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = nfs_readdir_alloc_pages(array_size); + if (!pages) + goto out_release_label; - status = nfs_readdir_alloc_pages(pages, array_size); - if (status < 0) - goto out_release_array; do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; + pglen = status; - status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen); - if (status < 0) { - if (status == -ENOSPC) - status = 0; + if (pglen == 0) { + nfs_readdir_page_set_eof(page); break; } - } while (array->eof_index < 0); + + status = nfs_readdir_page_filler(desc, entry, pages, pglen, + arrays, narrays); + } while (!status && nfs_readdir_page_needs_filling(page)); nfs_readdir_free_pages(pages, array_size); -out_release_array: - kunmap(page); - nfs4_label_free(entry.label); +out_release_label: + nfs4_label_free(entry->label); out: - nfs_free_fattr(entry.fattr); - nfs_free_fhandle(entry.fh); + nfs_free_fattr(entry->fattr); + nfs_free_fhandle(entry->fh); + kfree(entry); return status; } -/* - * Now we cache directories properly, by converting xdr information - * to an array that can be used for lookups later. This results in - * fewer cache pages, since we can store more information on each page. 
- * We only need to convert from xdr once so future lookups are much simpler - */ -static -int nfs_readdir_filler(void *data, struct page* page) -{ - nfs_readdir_descriptor_t *desc = data; - struct inode *inode = file_inode(desc->file); - int ret; - - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; - SetPageUptodate(page); - - if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { - /* Should never happen */ - nfs_zap_mapping(inode, inode->i_mapping); - } - unlock_page(page); - return 0; - error: - nfs_readdir_clear_array(page); - unlock_page(page); - return ret; -} - -static -void cache_page_release(nfs_readdir_descriptor_t *desc) +static void nfs_readdir_page_put(struct nfs_readdir_descriptor *desc) { put_page(desc->page); desc->page = NULL; } -static -struct page *get_cache_page(nfs_readdir_descriptor_t *desc) +static void +nfs_readdir_page_unlock_and_put_cached(struct nfs_readdir_descriptor *desc) { - return read_cache_page(desc->file->f_mapping, desc->page_index, - nfs_readdir_filler, desc); + unlock_page(desc->page); + nfs_readdir_page_put(desc); +} + +static struct page * +nfs_readdir_page_get_cached(struct nfs_readdir_descriptor *desc) +{ + return nfs_readdir_page_get_locked(desc->file->f_mapping, + desc->page_index, + desc->last_cookie); } /* * Returns 0 if desc->dir_cookie was found on page desc->page_index * and locks the page to prevent removal from the page cache. 
*/ -static -int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) +static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; - desc->page = get_cache_page(desc); - if (IS_ERR(desc->page)) - return PTR_ERR(desc->page); - res = lock_page_killable(desc->page); - if (res != 0) - goto error; - res = -EAGAIN; - if (desc->page->mapping != NULL) { - res = nfs_readdir_search_array(desc); - if (res == 0) { - nfsi->page_index = desc->page_index; - return 0; + desc->page = nfs_readdir_page_get_cached(desc); + if (!desc->page) + return -ENOMEM; + if (nfs_readdir_page_needs_filling(desc->page)) { + res = nfs_readdir_xdr_to_array(desc, nfsi->cookieverf, verf, + &desc->page, 1); + if (res < 0) { + nfs_readdir_page_unlock_and_put_cached(desc); + if (res == -EBADCOOKIE || res == -ENOTSYNC) { + invalidate_inode_pages2(desc->file->f_mapping); + desc->page_index = 0; + return -EAGAIN; + } + return res; } + memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); } - unlock_page(desc->page); -error: - cache_page_release(desc); + res = nfs_readdir_search_array(desc); + if (res == 0) { + nfsi->page_index = desc->page_index; + return 0; + } + nfs_readdir_page_unlock_and_put_cached(desc); return res; } +static bool nfs_readdir_dont_search_cache(struct nfs_readdir_descriptor *desc) +{ + struct address_space *mapping = desc->file->f_mapping; + struct inode *dir = file_inode(desc->file); + unsigned int dtsize = NFS_SERVER(dir)->dtsize; + loff_t size = i_size_read(dir); + + /* + * Default to uncached readdir if the page cache is empty, and + * we're looking for a non-zero cookie in a large directory. 
+ */ + return desc->dir_cookie != 0 && mapping->nrpages == 0 && size > dtsize; +} + /* Search for desc->dir_cookie from the beginning of the page cache */ -static inline -int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) +static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) { int res; - if (desc->page_index == 0) { - desc->current_index = 0; - desc->prev_index = 0; - desc->last_cookie = 0; - } + if (nfs_readdir_dont_search_cache(desc)) + return -EBADCOOKIE; + do { + if (desc->page_index == 0) { + desc->current_index = 0; + desc->prev_index = 0; + desc->last_cookie = 0; + } res = find_and_lock_cache_page(desc); } while (res == -EAGAIN); return res; @@ -794,43 +973,41 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static -int nfs_do_filldir(nfs_readdir_descriptor_t *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; - int i = 0; - int res = 0; - struct nfs_cache_array *array = NULL; - struct nfs_open_dir_context *ctx = file->private_data; + struct nfs_inode *nfsi = NFS_I(file_inode(file)); + struct nfs_cache_array *array; + unsigned int i = 0; array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; ent = &array->array[i]; - if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, + if (!dir_emit(desc->ctx, ent->name, ent->name_len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { desc->eof = true; break; } + memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); if (i < (array->size-1)) - *desc->dir_cookie = array->array[i+1].cookie; + desc->dir_cookie = array->array[i+1].cookie; else - *desc->dir_cookie = array->last_cookie; + desc->dir_cookie = array->last_cookie; if (nfs_readdir_use_cookie(file)) - desc->ctx->pos = *desc->dir_cookie; + desc->ctx->pos = desc->dir_cookie; else desc->ctx->pos++; - if (ctx->duped != 0) - 
ctx->duped = 1; + if (desc->duped != 0) + desc->duped = 1; } - if (array->eof_index >= 0) + if (array->page_is_eof) desc->eof = true; kunmap(desc->page); - dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", - (unsigned long long)*desc->dir_cookie, res); - return res; + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", + (unsigned long long)desc->dir_cookie); } /* @@ -845,40 +1022,41 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) * we should already have a complete representation of the * directory in the page cache by the time we get here. */ -static inline -int uncached_readdir(nfs_readdir_descriptor_t *desc) +static int uncached_readdir(struct nfs_readdir_descriptor *desc) { - struct page *page = NULL; - int status; - struct inode *inode = file_inode(desc->file); - struct nfs_open_dir_context *ctx = desc->file->private_data; + struct page **arrays; + size_t i, sz = 512; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; + int status = -ENOMEM; - dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", - (unsigned long long)*desc->dir_cookie); + dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %llu\n", + (unsigned long long)desc->dir_cookie); - page = alloc_page(GFP_HIGHUSER); - if (!page) { - status = -ENOMEM; + arrays = kcalloc(sz, sizeof(*arrays), GFP_KERNEL); + if (!arrays) + goto out; + arrays[0] = nfs_readdir_page_array_alloc(desc->dir_cookie, GFP_KERNEL); + if (!arrays[0]) goto out; - } desc->page_index = 0; - desc->last_cookie = *desc->dir_cookie; - desc->page = page; - ctx->duped = 0; + desc->last_cookie = desc->dir_cookie; + desc->duped = 0; - status = nfs_readdir_xdr_to_array(desc, page, inode); - if (status < 0) - goto out_release; + status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz); - status = nfs_do_filldir(desc); + for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { + desc->page = arrays[i]; + nfs_do_filldir(desc); + } + desc->page = NULL; - 
out_release: - nfs_readdir_clear_array(desc->page); - cache_page_release(desc); - out: - dfprintk(DIRCACHE, "NFS: %s: returns %d\n", - __func__, status); + + for (i = 0; i < sz && arrays[i]; i++) + nfs_readdir_page_array_free(arrays[i]); +out: + kfree(arrays); + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status); return status; } @@ -891,14 +1069,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); struct nfs_open_dir_context *dir_ctx = file->private_data; - nfs_readdir_descriptor_t my_desc = { - .file = file, - .ctx = ctx, - .dir_cookie = &dir_ctx->dir_cookie, - .plus = nfs_use_readdirplus(inode, ctx), - }, - *desc = &my_desc; - int res = 0; + struct nfs_readdir_descriptor *desc; + int res; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -910,10 +1082,27 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. 
*/ - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) { res = nfs_revalidate_mapping(inode, file->f_mapping); - if (res < 0) + if (res < 0) + goto out; + } + + res = -ENOMEM; + desc = kzalloc(sizeof(*desc), GFP_KERNEL); + if (!desc) goto out; + desc->file = file; + desc->ctx = ctx; + desc->plus = nfs_use_readdirplus(inode, ctx); + + spin_lock(&file->f_lock); + desc->dir_cookie = dir_ctx->dir_cookie; + desc->dup_cookie = dir_ctx->dup_cookie; + desc->duped = dir_ctx->duped; + desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); + spin_unlock(&file->f_lock); do { res = readdir_search_pagecache(desc); @@ -921,11 +1110,13 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && !desc->eof) { + if (desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) continue; + if (res == -EBADCOOKIE || res == -ENOTSYNC) + res = 0; } break; } @@ -940,15 +1131,21 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - res = nfs_do_filldir(desc); - unlock_page(desc->page); - cache_page_release(desc); - if (res < 0) - break; + nfs_do_filldir(desc); + nfs_readdir_page_unlock_and_put_cached(desc); } while (!desc->eof); + + spin_lock(&file->f_lock); + dir_ctx->dir_cookie = desc->dir_cookie; + dir_ctx->dup_cookie = desc->dup_cookie; + dir_ctx->duped = desc->duped; + dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); + spin_unlock(&file->f_lock); + + kfree(desc); + out: - if (res > 0) - res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); return res; } @@ -984,6 +1181,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else 
dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 4252ce633533..872112bffcab 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -740,16 +740,12 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - bool fail_return = false; u32 idx; /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { - if (idx+1 == fls->mirror_array_cnt) - fail_return = !check_device; - mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); if (!ds) continue; @@ -1056,7 +1052,7 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; - if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx)) + if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx)) ff_layout_send_layouterror(hdr->lseg); else pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); @@ -2284,7 +2280,6 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) struct sockaddr *sap = (struct sockaddr *)&da->da_addr; char portbuf[RPCBIND_MAXUADDRPLEN]; char addrbuf[RPCBIND_MAXUADDRLEN]; - char *netid; unsigned short port; int len, netid_len; __be32 *p; @@ -2294,18 +2289,13 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = ntohs(((struct sockaddr_in *)sap)->sin_port); - netid = "tcp"; - netid_len = 3; break; case AF_INET6: if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) return; port = 
ntohs(((struct sockaddr_in6 *)sap)->sin6_port); - netid = "tcp6"; - netid_len = 4; break; default: - /* we only support tcp and tcp6 */ WARN_ON_ONCE(1); return; } @@ -2313,8 +2303,9 @@ ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + netid_len = strlen(da->da_netid); p = xdr_reserve_space(xdr, 4 + netid_len); - xdr_encode_opaque(p, netid, netid_len); + xdr_encode_opaque(p, da->da_netid, netid_len); p = xdr_reserve_space(xdr, 4 + len); xdr_encode_opaque(p, addrbuf, len); diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 29ec8b09a52d..06894bcdea2d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -510,13 +510,12 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_tcp: - ctx->flags |= NFS_MOUNT_TCP; - ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; - break; case Opt_rdma: ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */ - ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(param->key); + ret = xprt_find_transport_ident(param->key); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; break; case Opt_acl: if (result.negated) @@ -670,11 +669,13 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; - ctx->nfs_server.protocol = XPRT_TRANSPORT_RDMA; - xprt_load_transport(param->string); + ret = xprt_find_transport_ident(param->string); + if (ret < 0) + goto out_bad_transport; + ctx->nfs_server.protocol = ret; break; default: - return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); + goto out_bad_transport; } ctx->protofamily = protofamily; @@ -697,7 +698,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_xprt_rdma: /* not used for side protocols */ 
default: - return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); + goto out_bad_transport; } ctx->mountfamily = mountfamily; break; @@ -787,6 +788,8 @@ out_invalid_address: return nfs_invalf(fc, "NFS: Bad IP address specified"); out_of_bounds: return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key); +out_bad_transport: + return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); } /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aa6493905bbe..522aa10a1a3e 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -229,7 +229,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1237,7 +1236,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1250,11 +1248,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); @@ -2180,7 +2173,7 @@ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); - wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0); + wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6673a77884d9..b840d0a91c9d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -56,12 +56,6 @@ static inline bool 
nfs_lookup_is_soft_revalidate(const struct dentry *dentry) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) -/* - * Maximum number of pages that readdir can use for creating - * a vmapped array of pages. - */ -#define NFS_MAX_READDIR_PAGES 8 - struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index f6676af37d5d..7fba7711e6b3 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -34,6 +34,7 @@ * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS_pagepad_sz (1) /* Page padding */ #define NFS_fhandle_sz (8) #define NFS_sattr_sz (8) #define NFS_filename_sz (1+(NFS2_MAXNAMLEN>>2)) @@ -56,11 +57,11 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2+1) -#define NFS_readres_sz (1+NFS_fattr_sz+1+1) +#define NFS_readlinkres_sz (2+NFS_pagepad_sz) +#define NFS_readres_sz (1+NFS_fattr_sz+1+NFS_pagepad_sz) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1+1) +#define NFS_readdirres_sz (1+NFS_pagepad_sz) #define NFS_statfsres_sz (1+NFS_info_sz) static int nfs_stat_to_errno(enum nfs_stat); @@ -592,8 +593,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS_readlinkres_sz - NFS_pagepad_sz); } /* @@ -628,8 +629,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; encode_readargs(xdr, args); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count, + NFS_readres_sz - 
NFS_pagepad_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -786,8 +787,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS_readdirres_sz - NFS_pagepad_sz); } /* diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2397ceedba8a..5c4e23abc345 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -154,14 +154,14 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, } static int -nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, - struct nfs_fh *fhandle, struct nfs_fattr *fattr, - struct nfs4_label *label) +__nfs3_proc_lookup(struct inode *dir, const char *name, size_t len, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + unsigned short task_flags) { struct nfs3_diropargs arg = { .fh = NFS_FH(dir), - .name = dentry->d_name.name, - .len = dentry->d_name.len + .name = name, + .len = len }; struct nfs3_diropres res = { .fh = fhandle, @@ -173,17 +173,11 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, .rpc_resp = &res, }; int status; - unsigned short task_flags = 0; - - /* Is this is an attribute revalidation, subject to softreval? */ - if (nfs_lookup_is_soft_revalidate(dentry)) - task_flags |= RPC_TASK_TIMEOUT; res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) return -ENOMEM; - dprintk("NFS call lookup %pd2\n", dentry); nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(dir), &msg, task_flags); nfs_refresh_inode(dir, res.dir_attr); @@ -198,6 +192,37 @@ nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, return status; } +static int +nfs3_proc_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fhandle, struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + unsigned short task_flags = 0; + + /* Is this is an attribute revalidation, subject to softreval? 
*/ + if (nfs_lookup_is_soft_revalidate(dentry)) + task_flags |= RPC_TASK_TIMEOUT; + + dprintk("NFS call lookup %pd2\n", dentry); + return __nfs3_proc_lookup(dir, dentry->d_name.name, + dentry->d_name.len, fhandle, fattr, + task_flags); +} + +static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label) +{ + const char dotdot[] = ".."; + const size_t len = strlen(dotdot); + unsigned short task_flags = 0; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; + + return __nfs3_proc_lookup(inode, dotdot, len, fhandle, fattr, + task_flags); +} + static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { @@ -637,37 +662,36 @@ out: * Also note that this implementation handles both plain readdir and * readdirplus. */ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - 
plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -680,8 +704,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } @@ -1004,6 +1028,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, .lookup = nfs3_proc_lookup, + .lookupp = nfs3_proc_lookupp, .access = nfs3_proc_access, .readlink = nfs3_proc_readlink, .create = nfs3_proc_create, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 69971f6c840d..ca10072644ff 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -33,6 +33,7 @@ * Declare the space requirements for NFS arguments and replies as * number of 32bit-words */ +#define NFS3_pagepad_sz (1) /* Page padding */ #define NFS3_fhandle_sz (1+16) #define NFS3_fh_sz (NFS3_fhandle_sz) /* shorthand */ #define NFS3_sattr_sz (15) @@ -69,13 +70,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+NFS3_pagepad_sz) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+NFS3_pagepad_sz) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) #define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+NFS3_pagepad_sz) #define 
NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -85,7 +86,8 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+\ + NFS3_pagepad_sz) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) static int nfs3_stat_to_errno(enum nfs_stat); @@ -909,8 +911,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->pglen, + NFS3_readlinkres_sz - NFS3_pagepad_sz); } /* @@ -939,7 +941,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; - unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; + unsigned int replen = args->replen ? 
args->replen : + NFS3_readres_sz - NFS3_pagepad_sz; encode_read3args(xdr, args); rpc_prepare_reply_pages(req, args->pages, args->pgbase, @@ -1239,8 +1242,8 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1281,8 +1284,8 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - rpc_prepare_reply_pages(req, args->pages, 0, - args->count, NFS3_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, args->count, + NFS3_readdirres_sz - NFS3_pagepad_sz); } /* @@ -1328,7 +1331,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, if (args->mask & (NFS_ACL | NFS_DFACL)) { rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, - ACL3_getaclres_sz); + ACL3_getaclres_sz - NFS3_pagepad_sz); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; } } @@ -1648,7 +1651,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; - result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2); + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 4fc61e3d098d..f3fd935620fc 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1173,14 +1173,12 @@ static int _nfs42_proc_setxattr(struct inode *inode, const char *name, } static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, - void *buf, size_t buflen) + void *buf, size_t buflen, struct page **pages, + size_t plen) { struct nfs_server *server = NFS_SERVER(inode); - struct page *pages[NFS4XATTR_MAXPAGES] = {}; 
struct nfs42_getxattrargs arg = { .fh = NFS_FH(inode), - .xattr_pages = pages, - .xattr_len = buflen, .xattr_name = name, }; struct nfs42_getxattrres res; @@ -1189,7 +1187,10 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, .rpc_argp = &arg, .rpc_resp = &res, }; - int ret, np; + ssize_t ret; + + arg.xattr_len = plen; + arg.xattr_pages = pages; ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0); @@ -1214,10 +1215,6 @@ static ssize_t _nfs42_proc_getxattr(struct inode *inode, const char *name, _copy_from_pages(buf, pages, 0, res.xattr_len); } - np = DIV_ROUND_UP(res.xattr_len, PAGE_SIZE); - while (--np >= 0) - __free_page(pages[np]); - return res.xattr_len; } @@ -1292,16 +1289,45 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name, void *buf, size_t buflen) { struct nfs4_exception exception = { }; - ssize_t err; + ssize_t err, np, i; + struct page **pages; + np = nfs_page_array_len(0, buflen ?: XATTR_SIZE_MAX); + pages = kmalloc_array(np, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (i = 0; i < np; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) { + np = i + 1; + err = -ENOMEM; + goto out; + } + } + + /* + * The GETXATTR op has no length field in the call, and the + * xattr data is at the end of the reply. + * + * There is no downside in using the page-aligned length. It will + * allow receiving and caching xattrs that are too large for the + * caller but still fit in the page-rounded value. 
+ */ do { - err = _nfs42_proc_getxattr(inode, name, buf, buflen); + err = _nfs42_proc_getxattr(inode, name, buf, buflen, + pages, np * PAGE_SIZE); if (err >= 0) break; err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry); +out: + while (--np >= 0) + __free_page(pages[np]); + kfree(pages); + return err; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index ea7dd8cbfac9..c8bad735e4c1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -191,7 +191,7 @@ #define encode_getxattr_maxsz (op_encode_hdr_maxsz + 1 + \ nfs4_xattr_name_maxsz) -#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + 1) +#define decode_getxattr_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) #define encode_setxattr_maxsz (op_encode_hdr_maxsz + \ 1 + nfs4_xattr_name_maxsz + 1) #define decode_setxattr_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) @@ -489,6 +489,12 @@ static int decode_getxattr(struct xdr_stream *xdr, return -EIO; len = be32_to_cpup(p); + + /* + * Only check against the page length here. The actual + * requested length may be smaller, but that is only + * checked against after possibly caching a valid reply. 
+ */ if (len > req->rq_rcv_buf.page_len) return -ERANGE; @@ -1019,56 +1025,80 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); } -static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res, - uint32_t *eof) +static int decode_read_plus_data(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res) { uint32_t count, recvd; uint64_t offset; __be32 *p; p = xdr_inline_decode(xdr, 8 + 4); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1; p = xdr_decode_hyper(p, &offset); count = be32_to_cpup(p); - recvd = xdr_align_data(xdr, res->count, count); - res->count += recvd; - - if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - *eof = 0; + recvd = xdr_align_data(xdr, res->count, xdr_align_size(count)); + if (recvd > count) + recvd = count; + if (res->count + recvd > args->count) { + if (args->count > res->count) + res->count += args->count - res->count; return 1; } - + res->count += recvd; + if (count > recvd) + return 1; return 0; } -static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *res, - uint32_t *eof) +static int decode_read_plus_hole(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res, uint32_t *eof) { uint64_t offset, length, recvd; __be32 *p; p = xdr_inline_decode(xdr, 8 + 8); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1; p = xdr_decode_hyper(p, &offset); p = xdr_decode_hyper(p, &length); + if (offset != args->offset + res->count) { + /* Server returned an out-of-sequence extent */ + if (offset > args->offset + res->count || + offset + length < args->offset + res->count) { + dprintk("NFS: server returned out of sequence extent: " + "offset/size = %llu/%llu != expected %llu\n", + (unsigned long long)offset, + (unsigned long long)length, + (unsigned long long)(args->offset + + res->count)); + return 1; + } + length 
-= args->offset + res->count - offset; + } + if (length + res->count > args->count) { + *eof = 0; + if (unlikely(res->count >= args->count)) + return 1; + length = args->count - res->count; + } recvd = xdr_expand_hole(xdr, res->count, length); res->count += recvd; - if (recvd < length) { - *eof = 0; + if (recvd < length) return 1; - } return 0; } static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) { + struct nfs_pgio_header *hdr = + container_of(res, struct nfs_pgio_header, res); + struct nfs_pgio_args *args = &hdr->args; uint32_t eof, segments, type; int status, i; __be32 *p; @@ -1081,6 +1111,7 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (unlikely(!p)) return -EIO; + res->count = 0; eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) @@ -1088,26 +1119,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) for (i = 0; i < segments; i++) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; + if (!p) + goto early_out; type = be32_to_cpup(p++); if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, res, &eof); + status = decode_read_plus_data(xdr, args, res); else if (type == NFS4_CONTENT_HOLE) - status = decode_read_plus_hole(xdr, res, &eof); + status = decode_read_plus_hole(xdr, args, res, &eof); else return -EINVAL; if (status < 0) return status; if (status > 0) - break; + goto early_out; } out: res->eof = eof; return 0; +early_out: + if (unlikely(!i)) + return -EIO; + res->eof = 0; + return 0; } static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) @@ -1476,18 +1512,16 @@ static void nfs4_xdr_enc_getxattr(struct rpc_rqst *req, struct xdr_stream *xdr, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; - size_t plen; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + replen = 
hdr.replen + op_decode_hdr_maxsz + 1; encode_getxattr(xdr, args->xattr_name, &hdr); - plen = args->xattr_len ? args->xattr_len : XATTR_SIZE_MAX; - - rpc_prepare_reply_pages(req, args->xattr_pages, 0, plen, - hdr.replen); - req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->xattr_len, + replen); encode_nops(&hdr); } @@ -1520,14 +1554,15 @@ static void nfs4_xdr_enc_listxattrs(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + replen = hdr.replen + op_decode_hdr_maxsz + 2 + 1; encode_listxattrs(xdr, args, &hdr); - rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, - hdr.replen); + rpc_prepare_reply_pages(req, args->xattr_pages, 0, args->count, replen); encode_nops(&hdr); } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index be7915c861ce..86acffe7335c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1153,7 +1153,7 @@ struct nfs_server *nfs4_create_server(struct fs_context *fc) if (!server) return ERR_PTR(-ENOMEM); - server->cred = get_cred(current_cred()); + server->cred = get_cred(fc->cred); auth_probe = ctx->auth_info.flavor_len < 1; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 65adbbbdb3a4..b2c48f7a8d1c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -184,6 +184,8 @@ static int nfs4_map_errors(int err) return -EPROTONOSUPPORT; case -NFS4ERR_FILE_OPEN: return -EBUSY; + case -NFS4ERR_NOT_SAME: + return -ENOTSYNC; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); @@ -4397,6 +4399,10 @@ static int _nfs4_proc_lookupp(struct inode *inode, .rpc_argp = &args, .rpc_resp = &res, }; + unsigned short task_flags = 0; + + if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL) + task_flags |= RPC_TASK_TIMEOUT; args.bitmask = nfs4_bitmask(server, label); @@ 
-4404,7 +4410,7 @@ static int _nfs4_proc_lookupp(struct inode *inode, dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, 0); + &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); return status; } @@ -4957,35 +4963,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); + if (!(server->caps & NFS_CAP_SECURITY_LABEL)) + args.bitmask = server->attr_bitmask_nl; + else + args.bitmask = server->attr_bitmask; + + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; - status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); + status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, + &res.seq_res, 0); if (status >= 0) { - 
memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -4995,19 +5006,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } @@ -5310,17 +5320,17 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) } #if defined CONFIG_NFS_V4_2 && defined CONFIG_NFS_V4_2_READ_PLUS -static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - if (server->caps & NFS_CAP_READ_PLUS) + /* Note: We don't use READ_PLUS with pNFS yet */ + if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; - else - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } #else -static void nfs42_read_plus_support(struct nfs_server *server, struct rpc_message *msg) +static void nfs42_read_plus_support(struct nfs_pgio_header *hdr, + struct rpc_message *msg) { - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } #endif /* CONFIG_NFS_V4_2 */ @@ -5330,7 +5340,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, 
hdr->timestamp = jiffies; if (!hdr->pgio_done_cb) hdr->pgio_done_cb = nfs4_read_done_cb; - nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg); + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + nfs42_read_plus_support(hdr, msg); nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0); } @@ -9654,6 +9665,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, if (res.notification != args.notify_types) pdev->nocache = 1; + trace_nfs4_getdeviceinfo(server, &pdev->dev_id, status); + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index b996ee23f1ba..3de425f59b3a 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -34,7 +34,7 @@ enum nfs4_slot_tbl_state { NFS4_SLOT_TBL_DRAINING, }; -#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) +#define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, BITS_PER_LONG) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ struct nfs4_slot *slots; /* seqid per slot */ diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 93f5c1678ec2..984cc42ee54d 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode) nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); + pnfs_destroy_layout_final(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); nfs4_xattr_cache_zap(inode); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 484c1da96dea..48d761e593fb 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -2189,6 +2189,81 @@ DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_done); DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_read_pagelist); DEFINE_PNFS_LAYOUT_EVENT(pnfs_mds_fallback_write_pagelist); +DECLARE_EVENT_CLASS(nfs4_deviceid_event, + TP_PROTO( + const struct 
nfs_client *clp, + const struct nfs4_deviceid *deviceid + ), + + TP_ARGS(clp, deviceid), + + TP_STRUCT__entry( + __string(dstaddr, clp->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __assign_str(dstaddr, clp->cl_hostname); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "deviceid=%s, dstaddr=%s", + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr) + ) +); +#define DEFINE_PNFS_DEVICEID_EVENT(name) \ + DEFINE_EVENT(nfs4_deviceid_event, name, \ + TP_PROTO(const struct nfs_client *clp, \ + const struct nfs4_deviceid *deviceid \ + ), \ + TP_ARGS(clp, deviceid)) +DEFINE_PNFS_DEVICEID_EVENT(nfs4_deviceid_free); + +DECLARE_EVENT_CLASS(nfs4_deviceid_status, + TP_PROTO( + const struct nfs_server *server, + const struct nfs4_deviceid *deviceid, + int status + ), + + TP_ARGS(server, deviceid, status), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, status) + __string(dstaddr, server->nfs_client->cl_hostname) + __array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE) + ), + + TP_fast_assign( + __entry->dev = server->s_dev; + __entry->status = status; + __assign_str(dstaddr, server->nfs_client->cl_hostname); + memcpy(__entry->deviceid, deviceid->data, + NFS4_DEVICEID4_SIZE); + ), + + TP_printk( + "dev=%02x:%02x: deviceid=%s, dstaddr=%s, status=%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE), + __get_str(dstaddr), + __entry->status + ) +); +#define DEFINE_PNFS_DEVICEID_STATUS(name) \ + DEFINE_EVENT(nfs4_deviceid_status, name, \ + TP_PROTO(const struct nfs_server *server, \ + const struct nfs4_deviceid *deviceid, \ + int status \ + ), \ + TP_ARGS(server, deviceid, status)) +DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo); +DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid); + DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event, TP_PROTO( const struct nfs_pgio_header *hdr diff --git a/fs/nfs/nfs4xdr.c 
b/fs/nfs/nfs4xdr.c index 2eabe5add344..ac6b79ee9355 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -84,6 +84,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ +#define pagepad_maxsz (1) #define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) #define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) @@ -215,14 +216,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, nfs4_fattr_bitmap_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + pagepad_maxsz) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz + 1) + decode_verifier_maxsz + pagepad_maxsz) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + pagepad_maxsz) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -284,14 +285,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1 + 1) + nfs4_fattr_bitmap_maxsz + 1 + pagepad_maxsz) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (1) + (pagepad_maxsz) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define 
decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -393,12 +394,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ 1 /* notification bitmap, word 0 */ + \ - 1 /* possible XDR padding */) + pagepad_maxsz /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ + pagepad_maxsz) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -2342,7 +2344,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_layoutget(xdr, args->lg_args, &hdr); rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, args->lg_args->layout.pglen, - hdr.replen); + hdr.replen - pagepad_maxsz); } encode_nops(&hdr); } @@ -2388,7 +2390,7 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_layoutget(xdr, args->lg_args, &hdr); rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, args->lg_args->layout.pglen, - hdr.replen); + hdr.replen - pagepad_maxsz); } encode_nops(&hdr); } @@ -2499,7 +2501,7 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_readlink(xdr, args, req, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->pglen, hdr.replen); + args->pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2520,7 +2522,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_readdir(xdr, args, req, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + args->count, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -2541,7 +2543,7 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct 
xdr_stream *xdr, encode_read(xdr, args, &hdr); rpc_prepare_reply_pages(req, args->pages, args->pgbase, - args->count, hdr.replen); + args->count, hdr.replen - pagepad_maxsz); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2588,7 +2590,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); rpc_prepare_reply_pages(req, args->acl_pages, 0, - args->acl_len, replen + 1); + args->acl_len, replen); encode_nops(&hdr); } @@ -2810,7 +2812,7 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, } rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, - PAGE_SIZE, replen + 1); + PAGE_SIZE, replen); encode_nops(&hdr); } @@ -3009,15 +3011,19 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->seq_args), }; + uint32_t replen; encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); + + replen = hdr.replen + op_decode_hdr_maxsz + 2; + encode_getdeviceinfo(xdr, args, &hdr); - /* set up reply kvec. Subtract notification bitmap max size (2) - * so that notification bitmap is put in xdr_buf tail */ + /* set up reply kvec. 
device_addr4 opaque data is read into the + * pages */ rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen, hdr.replen - 2); + args->pdev->pglen, replen); encode_nops(&hdr); } @@ -3039,7 +3045,7 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_layoutget(xdr, args, &hdr); rpc_prepare_reply_pages(req, args->layout.pages, 0, - args->layout.pglen, hdr.replen); + args->layout.pglen, hdr.replen - pagepad_maxsz); encode_nops(&hdr); } @@ -5331,11 +5337,11 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, res->acl_len = attrlen; /* Check for receive buffer overflow */ - if (res->acl_len > (xdr->nwords << 2) || + if (res->acl_len > xdr_stream_remaining(xdr) || res->acl_len + res->acl_data_offset > xdr->buf->page_len) { res->acl_flags |= NFS4_ACL_TRUNC; - dprintk("NFS: acl reply: attrlen %u > page_len %u\n", - attrlen, xdr->nwords << 2); + dprintk("NFS: acl reply: attrlen %u > page_len %zu\n", + attrlen, xdr_stream_remaining(xdr)); } } else status = -EOPNOTSUPP; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0e50b9d45c32..07f59dc8cb2e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -294,6 +294,7 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; + unsigned long i_state; if (!lo) return; @@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + i_state = inode->i_state; spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (i_state & (I_FREEING | I_CLEAR)) + wake_up_var(lo); } } @@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } } -void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode 
*nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); +static bool pnfs_layout_removed(struct nfs_inode *nfsi, + struct pnfs_layout_hdr *lo) +{ + bool ret; + + spin_lock(&nfsi->vfs_inode.i_lock); + ret = nfsi->layout != lo; + spin_unlock(&nfsi->vfs_inode.i_lock); + return ret; +} + +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + + if (lo) + wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2661c44c62db..bbd3de1025f2 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -51,6 +51,8 @@ struct nfs4_pnfs_ds_addr { size_t da_addrlen; struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *da_remotestr; /* human readable addr+port */ + const char *da_netid; + int da_transport; }; struct nfs4_pnfs_ds { @@ -266,6 +268,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, @@ -710,6 +713,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } +static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 537b80d693f1..ddbbf4fcda86 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -34,6 +34,8 @@ #include "internal.h" #include "pnfs.h" 
+#include "nfs4trace.h" + #define NFSDBG_FACILITY NFSDBG_PNFS /* @@ -192,24 +194,28 @@ nfs4_find_get_deviceid(struct nfs_server *server, d = __nfs4_find_get_deviceid(server, id, hash); if (d) - return d; + goto found; new = nfs4_get_device_info(server, id, cred, gfp_mask); - if (!new) + if (!new) { + trace_nfs4_find_deviceid(server, id, -ENOENT); return new; + } spin_lock(&nfs4_deviceid_lock); d = __nfs4_find_get_deviceid(server, id, hash); if (d) { spin_unlock(&nfs4_deviceid_lock); server->pnfs_curr_ld->free_deviceid_node(new); - return d; + } else { + atomic_inc(&new->ref); + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + spin_unlock(&nfs4_deviceid_lock); + d = new; } - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - atomic_inc(&new->ref); - spin_unlock(&nfs4_deviceid_lock); - - return new; +found: + trace_nfs4_find_deviceid(server, id, 0); + return d; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -278,6 +284,7 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) } if (!atomic_dec_and_test(&d->ref)) return false; + trace_nfs4_deviceid_free(d->nfs_client, &d->deviceid); d->ld->free_deviceid_node(d); return true; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 679767ac258d..2efcfdd348a1 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -661,6 +661,21 @@ _data_server_lookup_locked(const struct list_head *dsaddrs) return NULL; } +static struct nfs4_pnfs_ds_addr *nfs4_pnfs_ds_addr_alloc(gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = kzalloc(sizeof(*da), gfp_flags); + if (da) + INIT_LIST_HEAD(&da->da_node); + return da; +} + +static void nfs4_pnfs_ds_addr_free(struct nfs4_pnfs_ds_addr *da) +{ + kfree(da->da_remotestr); + kfree(da->da_netid); + kfree(da); +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { struct nfs4_pnfs_ds_addr *da; @@ -676,8 +691,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) struct nfs4_pnfs_ds_addr, da_node); list_del_init(&da->da_node); - kfree(da->da_remotestr); - kfree(da); + 
nfs4_pnfs_ds_addr_free(da); } kfree(ds->ds_remotestr); @@ -854,12 +868,17 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, if (!IS_ERR(clp)) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, .servername = clp->cl_hostname, }; + + if (da->da_transport != clp->cl_proto) + continue; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) + continue; /* Add this address as an alias */ rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_test_and_add_xprt, NULL); @@ -867,7 +886,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, } clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, + da->da_addrlen, da->da_transport, timeo, retrans); if (IS_ERR(clp)) continue; @@ -905,7 +924,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { struct xprt_create xprt_args = { - .ident = XPRT_TRANSPORT_TCP, + .ident = da->da_transport, .net = clp->cl_net, .dstaddr = (struct sockaddr *)&da->da_addr, .addrlen = da->da_addrlen, @@ -913,17 +932,21 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, }; struct nfs4_add_xprt_data xprtdata = { .clp = clp, - .cred = nfs4_get_clid_cred(clp), }; struct rpc_add_xprt_test rpcdata = { .add_xprt_test = clp->cl_mvops->session_trunk, .data = &xprtdata, }; + if (da->da_transport != clp->cl_proto) + continue; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) + continue; /** * Test this address for session trunking and * add as an alias */ + xprtdata.cred = nfs4_get_clid_cred(clp), rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, rpc_clnt_setup_test_and_add_xprt, &rpcdata); @@ -932,8 +955,9 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, } else { clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, - da->da_addrlen, 
IPPROTO_TCP, - timeo, retrans, minor_version); + da->da_addrlen, + da->da_transport, timeo, + retrans, minor_version); if (IS_ERR(clp)) continue; @@ -1021,55 +1045,26 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) struct nfs4_pnfs_ds_addr *da = NULL; char *buf, *portstr; __be16 port; - int nlen, rlen; + ssize_t nlen, rlen; int tmp[2]; - __be32 *p; - char *netid, *match_netid; - size_t len, match_netid_len; + char *netid; + size_t len; char *startsep = ""; char *endsep = ""; /* r_netid */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) + nlen = xdr_stream_decode_string_dup(xdr, &netid, XDR_MAX_NETOBJ, + gfp_flags); + if (unlikely(nlen < 0)) goto out_err; - nlen = be32_to_cpup(p++); - - p = xdr_inline_decode(xdr, nlen); - if (unlikely(!p)) - goto out_err; - - netid = kmalloc(nlen+1, gfp_flags); - if (unlikely(!netid)) - goto out_err; - - netid[nlen] = '\0'; - memcpy(netid, p, nlen); /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_free_netid; - rlen = be32_to_cpup(p); - - p = xdr_inline_decode(xdr, rlen); - if (unlikely(!p)) - goto out_free_netid; - /* port is ".ABC.DEF", 8 chars max */ - if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { - dprintk("%s: Invalid address, length %d\n", __func__, - rlen); + rlen = xdr_stream_decode_string_dup(xdr, &buf, INET6_ADDRSTRLEN + + IPV6_SCOPE_ID_LEN + 8, gfp_flags); + if (unlikely(rlen < 0)) goto out_free_netid; - } - buf = kmalloc(rlen + 1, gfp_flags); - if (!buf) { - dprintk("%s: Not enough memory\n", __func__); - goto out_free_netid; - } - buf[rlen] = '\0'; - memcpy(buf, p, rlen); /* replace port '.' 
with '-' */ portstr = strrchr(buf, '.'); @@ -1089,12 +1084,10 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) } *portstr = '\0'; - da = kzalloc(sizeof(*da), gfp_flags); + da = nfs4_pnfs_ds_addr_alloc(gfp_flags); if (unlikely(!da)) goto out_free_buf; - INIT_LIST_HEAD(&da->da_node); - if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, sizeof(da->da_addr))) { dprintk("%s: error parsing address %s\n", __func__, buf); @@ -1109,15 +1102,11 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) case AF_INET: ((struct sockaddr_in *)&da->da_addr)->sin_port = port; da->da_addrlen = sizeof(struct sockaddr_in); - match_netid = "tcp"; - match_netid_len = 3; break; case AF_INET6: ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; da->da_addrlen = sizeof(struct sockaddr_in6); - match_netid = "tcp6"; - match_netid_len = 4; startsep = "["; endsep = "]"; break; @@ -1128,12 +1117,15 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) goto out_free_da; } - if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { - dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", - __func__, netid, match_netid); + da->da_transport = xprt_find_transport_ident(netid); + if (da->da_transport < 0) { + dprintk("%s: ERROR: unknown r_netid \"%s\"\n", + __func__, netid); goto out_free_da; } + da->da_netid = netid; + /* save human readable address */ len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; da->da_remotestr = kzalloc(len, gfp_flags); @@ -1145,7 +1137,6 @@ nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); kfree(buf); - kfree(netid); return da; out_free_da: diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837f..73ab7c59d3a7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is 
syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. */ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/dt-bindings/dma/jz4775-dma.h b/include/dt-bindings/dma/jz4775-dma.h new file mode 100644 index 000000000000..8d27e2c69dca --- /dev/null +++ b/include/dt-bindings/dma/jz4775-dma.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * This header provides macros for JZ4775 DMA bindings. + * + * Copyright (c) 2020 周琰杰 (Zhou Yanjie) + */ + +#ifndef __DT_BINDINGS_DMA_JZ4775_DMA_H__ +#define __DT_BINDINGS_DMA_JZ4775_DMA_H__ + +/* + * Request type numbers for the JZ4775 DMA controller (written to the DRTn + * register for the channel). 
+ */ +#define JZ4775_DMA_I2S0_TX 0x6 +#define JZ4775_DMA_I2S0_RX 0x7 +#define JZ4775_DMA_AUTO 0x8 +#define JZ4775_DMA_SADC_RX 0x9 +#define JZ4775_DMA_UART3_TX 0x0e +#define JZ4775_DMA_UART3_RX 0x0f +#define JZ4775_DMA_UART2_TX 0x10 +#define JZ4775_DMA_UART2_RX 0x11 +#define JZ4775_DMA_UART1_TX 0x12 +#define JZ4775_DMA_UART1_RX 0x13 +#define JZ4775_DMA_UART0_TX 0x14 +#define JZ4775_DMA_UART0_RX 0x15 +#define JZ4775_DMA_SSI0_TX 0x16 +#define JZ4775_DMA_SSI0_RX 0x17 +#define JZ4775_DMA_MSC0_TX 0x1a +#define JZ4775_DMA_MSC0_RX 0x1b +#define JZ4775_DMA_MSC1_TX 0x1c +#define JZ4775_DMA_MSC1_RX 0x1d +#define JZ4775_DMA_MSC2_TX 0x1e +#define JZ4775_DMA_MSC2_RX 0x1f +#define JZ4775_DMA_PCM0_TX 0x20 +#define JZ4775_DMA_PCM0_RX 0x21 +#define JZ4775_DMA_SMB0_TX 0x24 +#define JZ4775_DMA_SMB0_RX 0x25 +#define JZ4775_DMA_SMB1_TX 0x26 +#define JZ4775_DMA_SMB1_RX 0x27 +#define JZ4775_DMA_SMB2_TX 0x28 +#define JZ4775_DMA_SMB2_RX 0x29 + +#endif /* __DT_BINDINGS_DMA_JZ4775_DMA_H__ */ diff --git a/include/dt-bindings/dma/qcom-gpi.h b/include/dt-bindings/dma/qcom-gpi.h new file mode 100644 index 000000000000..ebda2a37f52a --- /dev/null +++ b/include/dt-bindings/dma/qcom-gpi.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ +/* Copyright (c) 2020, Linaro Ltd. */ + +#ifndef __DT_BINDINGS_DMA_QCOM_GPI_H__ +#define __DT_BINDINGS_DMA_QCOM_GPI_H__ + +#define QCOM_GPI_SPI 1 +#define QCOM_GPI_UART 2 +#define QCOM_GPI_I2C 3 + +#endif /* __DT_BINDINGS_DMA_QCOM_GPI_H__ */ diff --git a/include/dt-bindings/dma/x2000-dma.h b/include/dt-bindings/dma/x2000-dma.h new file mode 100644 index 000000000000..db2cd4830b00 --- /dev/null +++ b/include/dt-bindings/dma/x2000-dma.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * This header provides macros for X2000 DMA bindings. 
+ * + * Copyright (c) 2020 周琰杰 (Zhou Yanjie) + */ + +#ifndef __DT_BINDINGS_DMA_X2000_DMA_H__ +#define __DT_BINDINGS_DMA_X2000_DMA_H__ + +/* + * Request type numbers for the X2000 DMA controller (written to the DRTn + * register for the channel). + */ +#define X2000_DMA_AUTO 0x8 +#define X2000_DMA_UART5_TX 0xa +#define X2000_DMA_UART5_RX 0xb +#define X2000_DMA_UART4_TX 0xc +#define X2000_DMA_UART4_RX 0xd +#define X2000_DMA_UART3_TX 0xe +#define X2000_DMA_UART3_RX 0xf +#define X2000_DMA_UART2_TX 0x10 +#define X2000_DMA_UART2_RX 0x11 +#define X2000_DMA_UART1_TX 0x12 +#define X2000_DMA_UART1_RX 0x13 +#define X2000_DMA_UART0_TX 0x14 +#define X2000_DMA_UART0_RX 0x15 +#define X2000_DMA_SSI0_TX 0x16 +#define X2000_DMA_SSI0_RX 0x17 +#define X2000_DMA_SSI1_TX 0x18 +#define X2000_DMA_SSI1_RX 0x19 +#define X2000_DMA_I2C0_TX 0x24 +#define X2000_DMA_I2C0_RX 0x25 +#define X2000_DMA_I2C1_TX 0x26 +#define X2000_DMA_I2C1_RX 0x27 +#define X2000_DMA_I2C2_TX 0x28 +#define X2000_DMA_I2C2_RX 0x29 +#define X2000_DMA_I2C3_TX 0x2a +#define X2000_DMA_I2C3_RX 0x2b +#define X2000_DMA_I2C4_TX 0x2c +#define X2000_DMA_I2C4_RX 0x2d +#define X2000_DMA_I2C5_TX 0x2e +#define X2000_DMA_I2C5_RX 0x2f +#define X2000_DMA_UART6_TX 0x30 +#define X2000_DMA_UART6_RX 0x31 +#define X2000_DMA_UART7_TX 0x32 +#define X2000_DMA_UART7_RX 0x33 +#define X2000_DMA_UART8_TX 0x34 +#define X2000_DMA_UART8_RX 0x35 +#define X2000_DMA_UART9_TX 0x36 +#define X2000_DMA_UART9_RX 0x37 +#define X2000_DMA_SADC_RX 0x38 + +#endif /* __DT_BINDINGS_DMA_X2000_DMA_H__ */ diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6728c2ee0205..71b5d481c653 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -32,8 +32,6 @@ struct ceph_auth_handshake { }; struct ceph_auth_client_ops { - const char *name; - /* * true if we are authenticated and can connect to * services. 
@@ -53,7 +51,9 @@ struct ceph_auth_client_ops { */ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len); /* * Create authorizer for connecting to a service, and verify @@ -69,7 +69,10 @@ struct ceph_auth_client_ops { void *challenge_buf, int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -95,11 +98,15 @@ struct ceph_auth_client { const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + int preferred_mode; /* CEPH_CON_MODE_* */ + int fallback_mode; /* ditto */ + struct mutex mutex; }; -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const struct ceph_crypto_key *key); +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes); extern void ceph_auth_destroy(struct ceph_auth_client *ac); extern void ceph_auth_reset(struct ceph_auth_client *ac); @@ -113,21 +120,22 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end); extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); - extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); -extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth); + +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); -extern 
int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a); int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, int challenge_buf_len); -extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type); @@ -147,4 +155,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, return auth->check_message_signature(auth, msg); return 0; } + +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len); +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len); +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int 
result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + #endif diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 999636d53cf2..3a47acd9cc14 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -8,7 +8,8 @@ * feature. Base case is 1 (first use). */ #define CEPH_FEATURE_INCARNATION_1 (0ull) -#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL<options->con_modes[0] != CEPH_CON_MODE_UNKNOWN; +} /* * snapshots diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 35d385296fbb..523fd0452856 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -64,7 +64,7 @@ static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) } extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); -extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2); extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); extern bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m); diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60b324efd1c4..0e6e9ad3c3bf 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -3,6 +3,7 @@ #define __FS_CEPH_MESSENGER_H #include +#include #include #include #include @@ -52,6 +53,23 @@ struct ceph_connection_operations { int (*sign_message) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg); + + /* msgr2 authentication exchange */ + int (*get_auth_request)(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int 
*authorizer_len); + int (*handle_auth_reply_more)(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_done)(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); + int (*handle_auth_bad_method)(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); }; /* use format string %s%lld */ @@ -235,14 +253,171 @@ struct ceph_msg { bool more_to_follow; bool needs_out_seq; int front_alloc_len; - unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; }; +/* + * connection states + */ +#define CEPH_CON_S_CLOSED 1 +#define CEPH_CON_S_PREOPEN 2 +#define CEPH_CON_S_V1_BANNER 3 +#define CEPH_CON_S_V1_CONNECT_MSG 4 +#define CEPH_CON_S_V2_BANNER_PREFIX 5 +#define CEPH_CON_S_V2_BANNER_PAYLOAD 6 +#define CEPH_CON_S_V2_HELLO 7 +#define CEPH_CON_S_V2_AUTH 8 +#define CEPH_CON_S_V2_AUTH_SIGNATURE 9 +#define CEPH_CON_S_V2_SESSION_CONNECT 10 +#define CEPH_CON_S_V2_SESSION_RECONNECT 11 +#define CEPH_CON_S_OPEN 12 +#define CEPH_CON_S_STANDBY 13 + +/* + * ceph_connection flag bits + */ +#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop + messages on errors */ +#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */ +#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed + work */ + /* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) +#define BASE_DELAY_INTERVAL (HZ / 4) +#define MAX_DELAY_INTERVAL (15 * HZ) + +struct ceph_connection_v1_info { + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + 
int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_more; /* there is more data after the kvecs */ + bool out_msg_done; + + struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ + + /* connection negotiation temps */ + u8 in_banner[CEPH_BANNER_MAX_LEN]; + struct ceph_entity_addr actual_peer_addr; + struct ceph_entity_addr peer_addr_for_me; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + + int in_base_pos; /* bytes read */ + + /* message in temps */ + u8 in_tag; /* protocol control byte */ + struct ceph_msg_header in_hdr; + __le64 in_temp_ack; /* for reading an ack */ + + /* message out temps */ + struct ceph_msg_header out_hdr; + __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ + + u32 connect_seq; /* identify the most recent connection + attempt for this session */ + u32 peer_global_seq; /* peer's global seq for this connection */ +}; + +#define CEPH_CRC_LEN 4 +#define CEPH_GCM_KEY_LEN 16 +#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce) +#define CEPH_GCM_BLOCK_LEN 16 +#define CEPH_GCM_TAG_LEN 16 + +#define CEPH_PREAMBLE_LEN 32 +#define CEPH_PREAMBLE_INLINE_LEN 48 +#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN +#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \ + CEPH_PREAMBLE_INLINE_LEN + \ + CEPH_GCM_TAG_LEN) +#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN) +#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN) + +#define CEPH_FRAME_MAX_SEGMENT_COUNT 4 + +struct ceph_frame_desc { + int fd_tag; /* FRAME_TAG_* */ + int fd_seg_cnt; + int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */ + int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT]; +}; + +struct ceph_gcm_nonce { + __le32 fixed; + __le64 counter __packed; +}; + +struct ceph_connection_v2_info { + struct iov_iter in_iter; + struct 
kvec in_kvecs[5]; /* recvmsg */ + struct bio_vec in_bvec; /* recvmsg (in_cursor) */ + int in_kvec_cnt; + int in_state; /* IN_S_* */ + + struct iov_iter out_iter; + struct kvec out_kvecs[8]; /* sendmsg */ + struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero), + sendmsg (out_enc_pages) */ + int out_kvec_cnt; + int out_state; /* OUT_S_* */ + + int out_zero; /* # of zero bytes to send */ + bool out_iter_sendpage; /* use sendpage if possible */ + + struct ceph_frame_desc in_desc; + struct ceph_msg_data_cursor in_cursor; + struct ceph_msg_data_cursor out_cursor; + + struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct crypto_aead *gcm_tfm; /* on-wire encryption */ + struct aead_request *gcm_req; + struct crypto_wait gcm_wait; + struct ceph_gcm_nonce in_gcm_nonce; + struct ceph_gcm_nonce out_gcm_nonce; + + struct page **out_enc_pages; + int out_enc_page_cnt; + int out_enc_resid; + int out_enc_i; + + int con_mode; /* CEPH_CON_MODE_* */ + + void *conn_bufs[16]; + int conn_buf_cnt; + + struct kvec in_sign_kvecs[8]; + struct kvec out_sign_kvecs[8]; + int in_sign_kvec_cnt; + int out_sign_kvec_cnt; + + u64 client_cookie; + u64 server_cookie; + u64 global_seq; + u64 connect_seq; + u64 peer_global_seq; + + u8 in_buf[CEPH_PREAMBLE_SECURE_LEN]; + u8 out_buf[CEPH_PREAMBLE_SECURE_LEN]; + struct { + u8 late_status; /* FRAME_LATE_STATUS_* */ + union { + struct { + u32 front_crc; + u32 middle_crc; + u32 data_crc; + } __packed; + u8 pad[CEPH_GCM_BLOCK_LEN - 1]; + }; + } out_epil; +}; /* * A single connection with another host. 
@@ -258,24 +433,16 @@ struct ceph_connection { struct ceph_messenger *msgr; + int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_addr peer_addr_for_me; - unsigned long flags; - unsigned long state; + unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ - + struct ceph_entity_addr peer_addr; /* peer address */ u64 peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - struct ceph_auth_handshake *auth; - int auth_retry; /* true if we need a newer authorizer */ struct mutex mutex; @@ -286,43 +453,80 @@ struct ceph_connection { u64 in_seq, in_seq_acked; /* last message received, acked */ - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ - struct ceph_msg_header out_hdr; + struct ceph_msg *in_msg; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ - bool out_msg_done; - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 - stamp */ - - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ - struct timespec64 
last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + union { + struct ceph_connection_v1_info v1; + struct ceph_connection_v2_info v2; + }; }; +extern struct page *ceph_zero_page; + +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag); +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag); +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag); + +void ceph_encode_my_addr(struct ceph_messenger *msgr); + +int ceph_tcp_connect(struct ceph_connection *con); +int ceph_con_close_socket(struct ceph_connection *con); +void ceph_con_reset_session(struct ceph_connection *con); + +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt); +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq); +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); + +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length); +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece); +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); + +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length); + +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr); +int ceph_addr_port(const struct ceph_entity_addr *addr); +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); + +void ceph_con_process_message(struct ceph_connection *con); +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip); +void ceph_con_get_out_msg(struct ceph_connection *con); + +/* messenger_v1.c */ +int 
ceph_con_v1_try_read(struct ceph_connection *con); +int ceph_con_v1_try_write(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v1_opened(struct ceph_connection *con); +void ceph_con_v1_reset_session(struct ceph_connection *con); +void ceph_con_v1_reset_protocol(struct ceph_connection *con); + +/* messenger_v2.c */ +int ceph_con_v2_try_read(struct ceph_connection *con); +int ceph_con_v2_try_write(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v2_opened(struct ceph_connection *con); +void ceph_con_v2_reset_session(struct ceph_connection *con); +void ceph_con_v2_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); @@ -330,7 +534,6 @@ extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); - extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 9e50aede46c8..f5e02f6c0655 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -8,24 +8,45 @@ #define CEPH_MON_PORT 6789 /* default monitor port */ -/* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 - /* * tcp connection banner. include a protocol version. and adjust * whenever the wire protocol changes. try to keep this string length * constant. */ #define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_LEN 9 #define CEPH_BANNER_MAX_LEN 30 +/* + * messenger V2 connection banner prefix. 
+ * The full banner string should have the form: "ceph v2\n" + * the 2 bytes are the length of the remaining banner. + */ +#define CEPH_BANNER_V2 "ceph v2\n" +#define CEPH_BANNER_V2_LEN 8 +#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16)) + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + + /* * Rollover-safe type and comparator for 32-bit sequence numbers. * Comparator returns -1, 0, or 1. @@ -61,11 +82,18 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 type; + __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */ __le32 nonce; /* unique id for process (e.g. pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); +static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs, + const struct ceph_entity_addr *rhs) +{ + return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) && + lhs->nonce == rhs->nonce; +} + struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; @@ -160,6 +188,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. 
higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. */ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index cad9acfbc320..5553019c3f07 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -251,8 +251,8 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) } struct ceph_osdmap *ceph_osdmap_alloc(void); -extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2); +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map); extern void ceph_osdmap_destroy(struct ceph_osdmap *map); diff --git a/include/linux/dma/k3-event-router.h b/include/linux/dma/k3-event-router.h new file mode 100644 index 000000000000..e3f88b2f87be --- /dev/null +++ b/include/linux/dma/k3-event-router.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com + */ + +#ifndef K3_EVENT_ROUTER_ +#define K3_EVENT_ROUTER_ + +#include + +struct k3_event_route_data { + void *priv; + int (*set_event)(void *priv, u32 event); +}; + +#endif /* K3_EVENT_ROUTER_ */ diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h index 1962f75fa2d3..36e22c5a0f29 100644 --- a/include/linux/dma/k3-psil.h +++ b/include/linux/dma/k3-psil.h @@ -50,6 +50,15 @@ enum psil_endpoint_type { * @channel_tpl: Desired throughput level for the channel * @pdma_acc32: 
ACC32 must be enabled on the PDMA side * @pdma_burst: BURST must be enabled on the PDMA side + * @mapped_channel_id: PKTDMA thread to channel mapping for mapped channels. + * The thread must be serviced by the specified channel if + * mapped_channel_id is >= 0 in case of PKTDMA + * @flow_start: PKTDMA flow range start of mapped channel. Unmapped + * channels use flow_id == chan_id + * @flow_num: PKTDMA flow count of mapped channel. Unmapped channels + * use flow_id == chan_id + * @default_flow_id: PKTDMA default (r)flow index of mapped channel. + * Must be within the flow range of the mapped channel. */ struct psil_endpoint_config { enum psil_endpoint_type ep_type; @@ -63,6 +72,13 @@ struct psil_endpoint_config { /* PDMA properties, valid for PSIL_EP_PDMA_* */ unsigned pdma_acc32:1; unsigned pdma_burst:1; + + /* PKTDMA mapped channel */ + int mapped_channel_id; + /* PKTDMA tflow and rflow ranges for mapped channel */ + u16 flow_start; + u16 flow_num; + u16 default_flow_id; }; int psil_set_new_ep_config(struct device *dev, const char *name, diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 5eb34ad973a7..e443be4d3b4b 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -41,6 +41,12 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, u32 k3_udma_glue_tx_get_hdesc_size(struct k3_udma_glue_tx_channel *tx_chn); u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn); +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); enum { K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0, @@ -130,5 +136,11 @@ int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 
flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn); +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); #endif /* K3_UDMA_GLUE_H_ */ diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h new file mode 100644 index 000000000000..f46dc3372f11 --- /dev/null +++ b/include/linux/dma/qcom-gpi-dma.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Linaro Limited + */ + +#ifndef QCOM_GPI_DMA_H +#define QCOM_GPI_DMA_H + +/** + * enum spi_transfer_cmd - spi transfer commands + */ +enum spi_transfer_cmd { + SPI_TX = 1, + SPI_RX, + SPI_DUPLEX, +}; + +/** + * struct gpi_spi_config - spi config for peripheral + * + * @loopback_en: spi loopback enable when set + * @clock_pol_high: clock polarity + * @data_pol_high: data polarity + * @pack_en: process tx/rx buffers as packed + * @word_len: spi word length + * @clk_div: source clock divider + * @clk_src: serial clock + * @cmd: spi cmd + * @fragmentation: keep CS asserted at end of sequence + * @cs: chip select toggle + * @set_config: set peripheral config + * @rx_len: receive length for buffer + */ +struct gpi_spi_config { + u8 set_config; + u8 loopback_en; + u8 clock_pol_high; + u8 data_pol_high; + u8 pack_en; + u8 word_len; + u8 fragmentation; + u8 cs; + u32 clk_div; + u32 clk_src; + enum spi_transfer_cmd cmd; + u32 rx_len; +}; + +enum i2c_op { + I2C_WRITE = 1, + I2C_READ, +}; + +/** + * struct gpi_i2c_config - i2c config for peripheral + * + * @pack_enable: process tx/rx buffers as packed + * @cycle_count: clock cycles to be sent + * @high_count: high period of clock + * @low_count: low period of clock + * @clk_div: source clock divider + * @addr: i2c bus address + * @stretch: 
stretch the clock at eot + * @set_config: set peripheral config + * @rx_len: receive length for buffer + * @op: i2c cmd + * @multi_msg: is part of multi i2c r-w msgs + */ +struct gpi_i2c_config { + u8 set_config; + u8 pack_enable; + u8 cycle_count; + u8 high_count; + u8 low_count; + u8 addr; + u8 stretch; + u16 clk_div; + u32 rx_len; + enum i2c_op op; + bool multi_msg; +}; + +#endif /* QCOM_GPI_DMA_H */ diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index dd357a747780..68130f5f599e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -357,11 +357,14 @@ struct dma_chan { * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id + * @chan_dma_dev: The channel is using custom/different dma-mapping + * compared to the parent dma_device */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; + bool chan_dma_dev; }; /** @@ -418,6 +421,9 @@ enum dma_slave_buswidth { * @slave_id: Slave requester id. Only valid for slave channels. The dma * slave peripheral will have unique id as dma requester which need to be * pass as slave config. + * @peripheral_config: peripheral configuration for programming peripheral + * for dmaengine transfer + * @peripheral_size: peripheral configuration buffer size * * This struct is passed in as configuration data to a DMA engine * in order to set up a certain channel for DMA transport at runtime. 
@@ -443,6 +449,8 @@ struct dma_slave_config { u32 dst_port_window_size; bool device_fc; unsigned int slave_id; + void *peripheral_config; + size_t peripheral_size; }; /** @@ -800,6 +808,7 @@ struct dma_filter { * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors + * @device_router_config: optional callback for DMA router configuration * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation @@ -874,6 +883,7 @@ struct dma_device { enum dma_residue_granularity residue_granularity; int (*device_alloc_chan_resources)(struct dma_chan *chan); + int (*device_router_config)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( @@ -1611,4 +1621,13 @@ dmaengine_get_direction_text(enum dma_transfer_direction dir) return "invalid"; } } + +static inline struct device *dmaengine_get_dma_device(struct dma_chan *chan) +{ + if (chan->dev->chan_dma_dev) + return &chan->dev->device; + + return chan->device->dev; +} + #endif /* DMAENGINE_H */ diff --git a/include/linux/mailbox/arm_mhuv2_message.h b/include/linux/mailbox/arm_mhuv2_message.h new file mode 100644 index 000000000000..821b9d96daa4 --- /dev/null +++ b/include/linux/mailbox/arm_mhuv2_message.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM MHUv2 Mailbox Message + * + * Copyright (C) 2020 Arm Ltd. + * Copyright (C) 2020 Linaro Ltd. 
+ */ + +#ifndef _LINUX_ARM_MHUV2_MESSAGE_H_ +#define _LINUX_ARM_MHUV2_MESSAGE_H_ + +#include <linux/types.h> + +/* Data structure for data-transfer protocol */ +struct arm_mhuv2_mbox_msg { + void *data; + size_t len; +}; + +#endif /* _LINUX_ARM_MHUV2_MESSAGE_H_ */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a2c6455ea3fa..681ed98e4ba8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -88,8 +93,8 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -157,7 +162,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d63cb862d58e..3327239fa2f9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -750,6 +750,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. 
*/ @@ -1744,8 +1758,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 658dc71d2901..39b022b92598 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -70,6 +70,7 @@ struct k3_ring; * @dma_dev: Master device which is using and accessing to the ring * memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations * should be done using this device. + * @asel: Address Space Select value for physical addresses */ struct k3_ring_cfg { u32 size; @@ -79,6 +80,7 @@ struct k3_ring_cfg { u32 flags; struct device *dma_dev; + u32 asel; }; #define K3_RINGACC_RING_ID_ANY (-1) @@ -250,4 +252,19 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem); u32 k3_ringacc_get_tisci_dev_id(struct k3_ring *ring); +/* DMA ring support */ +struct ti_sci_handle; + +/** + * struct k3_ringacc_init_data - Initialization data for DMA rings + */ +struct k3_ringacc_init_data { + const struct ti_sci_handle *tisci; + u32 tisci_dev_id; + u32 num_rings; +}; + +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data); + #endif /* __SOC_TI_K3_RINGACC_API_H_ */ diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9b35ce50cf2b..19b6dea27367 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -128,8 +128,8 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, 
unsigned int); -void xdr_terminate_string(struct xdr_buf *, const u32); -size_t xdr_buf_pagecount(struct xdr_buf *buf); +void xdr_terminate_string(const struct xdr_buf *, const u32); +size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); @@ -182,15 +182,14 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) * XDR buffer helper functions */ extern void xdr_shift_buf(struct xdr_buf *, size_t); -extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len); +extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); +extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); -extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); -extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); +extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); +extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); -extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); -extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); +extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); +extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); @@ -201,9 +200,9 @@ struct xdr_array2_desc { xdr_xcode_elem_t xcode; }; -extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); -extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, +extern int 
xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); @@ -251,9 +250,9 @@ extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); -extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); -extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); -extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); +extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int len); diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a603d48d2b2c..d2e97ee802af 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -330,6 +330,7 @@ struct xprt_class { struct rpc_xprt * (*setup)(struct xprt_create *); struct module *owner; char name[32]; + const char * netid[]; }; /* @@ -384,7 +385,7 @@ xprt_disable_swap(struct rpc_xprt *xprt) */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); +int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void 
xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 896aafc37b09..76e85e16854b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) -DECLARE_EVENT_CLASS(xprtrdma_reply_event, +DECLARE_EVENT_CLASS(xprtrdma_reply_class, TP_PROTO( const struct rpcrdma_rep *rep ), @@ -68,29 +68,30 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_event, TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, rep) - __field(const void *, r_xprt) __field(u32, xid) __field(u32, version) __field(u32, proc) + __string(addr, rpcrdma_addrstr(rep->rr_rxprt)) + __string(port, rpcrdma_portstr(rep->rr_rxprt)) ), TP_fast_assign( - __entry->rep = rep; - __entry->r_xprt = rep->rr_rxprt; __entry->xid = be32_to_cpu(rep->rr_xid); __entry->version = be32_to_cpu(rep->rr_vers); __entry->proc = be32_to_cpu(rep->rr_proc); + __assign_str(addr, rpcrdma_addrstr(rep->rr_rxprt)); + __assign_str(port, rpcrdma_portstr(rep->rr_rxprt)); ), - TP_printk("rxprt %p xid=0x%08x rep=%p: version %u proc %u", - __entry->r_xprt, __entry->xid, __entry->rep, - __entry->version, __entry->proc + TP_printk("peer=[%s]:%s xid=0x%08x version=%u proc=%u", + __get_str(addr), __get_str(port), + __entry->xid, __entry->version, __entry->proc ) ); #define DEFINE_REPLY_EVENT(name) \ - DEFINE_EVENT(xprtrdma_reply_event, name, \ + DEFINE_EVENT(xprtrdma_reply_class, \ + xprtrdma_reply_##name##_err, \ TP_PROTO( \ const struct rpcrdma_rep *rep \ ), \ @@ -261,41 +262,6 @@ DECLARE_EVENT_CLASS(xprtrdma_wrch_event, ), \ TP_ARGS(task, mr, nsegs)) -DECLARE_EVENT_CLASS(xprtrdma_frwr_done, - TP_PROTO( - const struct ib_wc *wc, - const struct rpcrdma_frwr *frwr - ), - - TP_ARGS(wc, frwr), - - TP_STRUCT__entry( - __field(u32, mr_id) - __field(unsigned int, status) - __field(unsigned int, vendor_err) - ), - - TP_fast_assign( - __entry->mr_id = frwr->fr_mr->res.id; - 
__entry->status = wc->status; - __entry->vendor_err = __entry->status ? wc->vendor_err : 0; - ), - - TP_printk( - "mr.id=%u: %s (%u/0x%x)", - __entry->mr_id, rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - -#define DEFINE_FRWR_DONE_EVENT(name) \ - DEFINE_EVENT(xprtrdma_frwr_done, name, \ - TP_PROTO( \ - const struct ib_wc *wc, \ - const struct rpcrdma_frwr *frwr \ - ), \ - TP_ARGS(wc, frwr)) - TRACE_DEFINE_ENUM(DMA_BIDIRECTIONAL); TRACE_DEFINE_ENUM(DMA_TO_DEVICE); TRACE_DEFINE_ENUM(DMA_FROM_DEVICE); @@ -308,7 +274,55 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_FROM_DEVICE, "FROM_DEVICE" }, \ { DMA_NONE, "NONE" }) -DECLARE_EVENT_CLASS(xprtrdma_mr, +DECLARE_EVENT_CLASS(xprtrdma_mr_class, + TP_PROTO( + const struct rpcrdma_mr *mr + ), + + TP_ARGS(mr), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, mr_id) + __field(int, nents) + __field(u32, handle) + __field(u32, length) + __field(u64, offset) + __field(u32, dir) + ), + + TP_fast_assign( + const struct rpcrdma_req *req = mr->mr_req; + const struct rpc_task *task = req->rl_slot.rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; + __entry->handle = mr->mr_handle; + __entry->length = mr->mr_length; + __entry->offset = mr->mr_offset; + __entry->dir = mr->mr_dir; + ), + + TP_printk("task:%u@%u mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)", + __entry->task_id, __entry->client_id, + __entry->mr_id, __entry->nents, __entry->length, + (unsigned long long)__entry->offset, __entry->handle, + xprtrdma_show_direction(__entry->dir) + ) +); + +#define DEFINE_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_mr_class, \ + xprtrdma_mr_##name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ + TP_ARGS(mr)) + +DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class, TP_PROTO( const struct rpcrdma_mr *mr ), @@ -340,45 +354,47 @@ 
DECLARE_EVENT_CLASS(xprtrdma_mr, ) ); -#define DEFINE_MR_EVENT(name) \ - DEFINE_EVENT(xprtrdma_mr, xprtrdma_mr_##name, \ - TP_PROTO( \ - const struct rpcrdma_mr *mr \ - ), \ +#define DEFINE_ANON_MR_EVENT(name) \ + DEFINE_EVENT(xprtrdma_anonymous_mr_class, \ + xprtrdma_mr_##name, \ + TP_PROTO( \ + const struct rpcrdma_mr *mr \ + ), \ TP_ARGS(mr)) -DECLARE_EVENT_CLASS(xprtrdma_cb_event, +DECLARE_EVENT_CLASS(xprtrdma_callback_class, TP_PROTO( + const struct rpcrdma_xprt *r_xprt, const struct rpc_rqst *rqst ), - TP_ARGS(rqst), + TP_ARGS(r_xprt, rqst), TP_STRUCT__entry( - __field(const void *, rqst) - __field(const void *, rep) - __field(const void *, req) __field(u32, xid) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->rqst = rqst; - __entry->req = rpcr_to_rdmar(rqst); - __entry->rep = rpcr_to_rdmar(rqst)->rl_reply; __entry->xid = be32_to_cpu(rqst->rq_xid); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("xid=0x%08x, rqst=%p req=%p rep=%p", - __entry->xid, __entry->rqst, __entry->req, __entry->rep + TP_printk("peer=[%s]:%s xid=0x%08x", + __get_str(addr), __get_str(port), __entry->xid ) ); -#define DEFINE_CB_EVENT(name) \ - DEFINE_EVENT(xprtrdma_cb_event, name, \ +#define DEFINE_CALLBACK_EVENT(name) \ + DEFINE_EVENT(xprtrdma_callback_class, \ + xprtrdma_cb_##name, \ TP_PROTO( \ + const struct rpcrdma_xprt *r_xprt, \ const struct rpc_rqst *rqst \ ), \ - TP_ARGS(rqst)) + TP_ARGS(r_xprt, rqst)) /** ** Connection events @@ -549,61 +565,33 @@ TRACE_EVENT(xprtrdma_createmrs, ) ); -TRACE_EVENT(xprtrdma_mr_get, +TRACE_EVENT(xprtrdma_nomrs_err, TP_PROTO( + const struct rpcrdma_xprt *r_xprt, const struct rpcrdma_req *req ), - TP_ARGS(req), + TP_ARGS(r_xprt, req), TP_STRUCT__entry( - __field(const void *, req) __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(u32, xid) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, 
rpcrdma_portstr(r_xprt)) ), TP_fast_assign( const struct rpc_rqst *rqst = &req->rl_slot; - __entry->req = req; __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("task:%u@%u xid=0x%08x req=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->req - ) -); - -TRACE_EVENT(xprtrdma_nomrs, - TP_PROTO( - const struct rpcrdma_req *req - ), - - TP_ARGS(req), - - TP_STRUCT__entry( - __field(const void *, req) - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(u32, xid) - ), - - TP_fast_assign( - const struct rpc_rqst *rqst = &req->rl_slot; - - __entry->req = req; - __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); - ), - - TP_printk("task:%u@%u xid=0x%08x req=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->req + TP_printk("peer=[%s]:%s task:%u@%u", + __get_str(addr), __get_str(port), + __entry->task_id, __entry->client_id ) ); @@ -735,8 +723,8 @@ TRACE_EVENT(xprtrdma_post_send, TP_ARGS(req), TP_STRUCT__entry( - __field(const void *, req) - __field(const void *, sc) + __field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, task_id) __field(unsigned int, client_id) __field(int, num_sge) @@ -745,20 +733,21 @@ TRACE_EVENT(xprtrdma_post_send, TP_fast_assign( const struct rpc_rqst *rqst = &req->rl_slot; + const struct rpcrdma_sendctx *sc = req->rl_sendctx; + __entry->cq_id = sc->sc_cid.ci_queue_id; + __entry->completion_id = sc->sc_cid.ci_completion_id; __entry->task_id = rqst->rq_task->tk_pid; __entry->client_id = rqst->rq_task->tk_client ? 
rqst->rq_task->tk_client->cl_clid : -1; - __entry->req = req; - __entry->sc = req->rl_sendctx; __entry->num_sge = req->rl_wr.num_sge; __entry->signaled = req->rl_wr.send_flags & IB_SEND_SIGNALED; ), - TP_printk("task:%u@%u req=%p sc=%p (%d SGE%s) %s", + TP_printk("task:%u@%u cq.id=%u cid=%d (%d SGE%s) %s", __entry->task_id, __entry->client_id, - __entry->req, __entry->sc, __entry->num_sge, - (__entry->num_sge == 1 ? "" : "s"), + __entry->cq_id, __entry->completion_id, + __entry->num_sge, (__entry->num_sge == 1 ? "" : "s"), (__entry->signaled ? "signaled" : "") ) ); @@ -771,15 +760,17 @@ TRACE_EVENT(xprtrdma_post_recv, TP_ARGS(rep), TP_STRUCT__entry( - __field(const void *, rep) + __field(u32, cq_id) + __field(int, completion_id) ), TP_fast_assign( - __entry->rep = rep; + __entry->cq_id = rep->rr_cid.ci_queue_id; + __entry->completion_id = rep->rr_cid.ci_completion_id; ), - TP_printk("rep=%p", - __entry->rep + TP_printk("cq.id=%d cid=%d", + __entry->cq_id, __entry->completion_id ) ); @@ -816,7 +807,7 @@ TRACE_EVENT(xprtrdma_post_recvs, ) ); -TRACE_EVENT(xprtrdma_post_linv, +TRACE_EVENT(xprtrdma_post_linv_err, TP_PROTO( const struct rpcrdma_req *req, int status @@ -825,19 +816,21 @@ TRACE_EVENT(xprtrdma_post_linv, TP_ARGS(req, status), TP_STRUCT__entry( - __field(const void *, req) + __field(unsigned int, task_id) + __field(unsigned int, client_id) __field(int, status) - __field(u32, xid) ), TP_fast_assign( - __entry->req = req; + const struct rpc_task *task = req->rl_slot.rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; __entry->status = status; - __entry->xid = be32_to_cpu(req->rl_slot.rq_xid); ), - TP_printk("req=%p xid=0x%08x status=%d", - __entry->req, __entry->xid, __entry->status + TP_printk("task:%u@%u status=%d", + __entry->task_id, __entry->client_id, __entry->status ) ); @@ -845,75 +838,12 @@ TRACE_EVENT(xprtrdma_post_linv, ** Completion events **/ -TRACE_EVENT(xprtrdma_wc_send, - TP_PROTO( - const struct 
rpcrdma_sendctx *sc, - const struct ib_wc *wc - ), - - TP_ARGS(sc, wc), - - TP_STRUCT__entry( - __field(const void *, req) - __field(const void *, sc) - __field(unsigned int, unmap_count) - __field(unsigned int, status) - __field(unsigned int, vendor_err) - ), - - TP_fast_assign( - __entry->req = sc->sc_req; - __entry->sc = sc; - __entry->unmap_count = sc->sc_unmap_count; - __entry->status = wc->status; - __entry->vendor_err = __entry->status ? wc->vendor_err : 0; - ), - - TP_printk("req=%p sc=%p unmapped=%u: %s (%u/0x%x)", - __entry->req, __entry->sc, __entry->unmap_count, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - -TRACE_EVENT(xprtrdma_wc_receive, - TP_PROTO( - const struct ib_wc *wc - ), - - TP_ARGS(wc), - - TP_STRUCT__entry( - __field(const void *, rep) - __field(u32, byte_len) - __field(unsigned int, status) - __field(u32, vendor_err) - ), - - TP_fast_assign( - __entry->rep = container_of(wc->wr_cqe, struct rpcrdma_rep, - rr_cqe); - __entry->status = wc->status; - if (wc->status) { - __entry->byte_len = 0; - __entry->vendor_err = wc->vendor_err; - } else { - __entry->byte_len = wc->byte_len; - __entry->vendor_err = 0; - } - ), - - TP_printk("rep=%p %u bytes: %s (%u/0x%x)", - __entry->rep, __entry->byte_len, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_fastreg); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_wake); -DEFINE_FRWR_DONE_EVENT(xprtrdma_wc_li_done); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_receive); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake); +DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done); TRACE_EVENT(xprtrdma_frwr_alloc, TP_PROTO( @@ -1036,9 +966,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr, DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); -DEFINE_MR_EVENT(unmap); 
-DEFINE_MR_EVENT(reminv); -DEFINE_MR_EVENT(recycle); + +DEFINE_ANON_MR_EVENT(unmap); +DEFINE_ANON_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, TP_PROTO( @@ -1066,17 +996,14 @@ TRACE_EVENT(xprtrdma_reply, TP_PROTO( const struct rpc_task *task, const struct rpcrdma_rep *rep, - const struct rpcrdma_req *req, unsigned int credits ), - TP_ARGS(task, rep, req, credits), + TP_ARGS(task, rep, credits), TP_STRUCT__entry( __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(const void *, rep) - __field(const void *, req) __field(u32, xid) __field(unsigned int, credits) ), @@ -1084,49 +1011,102 @@ TRACE_EVENT(xprtrdma_reply, TP_fast_assign( __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->rep = rep; - __entry->req = req; __entry->xid = be32_to_cpu(rep->rr_xid); __entry->credits = credits; ), - TP_printk("task:%u@%u xid=0x%08x, %u credits, rep=%p -> req=%p", + TP_printk("task:%u@%u xid=0x%08x credits=%u", __entry->task_id, __entry->client_id, __entry->xid, - __entry->credits, __entry->rep, __entry->req + __entry->credits ) ); -TRACE_EVENT(xprtrdma_defer_cmp, +DEFINE_REPLY_EVENT(vers); +DEFINE_REPLY_EVENT(rqst); +DEFINE_REPLY_EVENT(short); +DEFINE_REPLY_EVENT(hdr); + +TRACE_EVENT(xprtrdma_err_vers, TP_PROTO( - const struct rpcrdma_rep *rep + const struct rpc_rqst *rqst, + __be32 *min, + __be32 *max ), - TP_ARGS(rep), + TP_ARGS(rqst, min, max), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, min) + __field(u32, max) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->min = be32_to_cpup(min); + __entry->max = be32_to_cpup(max); + ), + + TP_printk("task:%u@%u xid=0x%08x versions=[%u, %u]", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->min, __entry->max + ) +); + 
+TRACE_EVENT(xprtrdma_err_chunk, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), TP_STRUCT__entry( __field(unsigned int, task_id) __field(unsigned int, client_id) - __field(const void *, rep) __field(u32, xid) ), TP_fast_assign( - __entry->task_id = rep->rr_rqst->rq_task->tk_pid; - __entry->client_id = rep->rr_rqst->rq_task->tk_client->cl_clid; - __entry->rep = rep; - __entry->xid = be32_to_cpu(rep->rr_xid); + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqst->rq_xid); ), - TP_printk("task:%u@%u xid=0x%08x rep=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->rep + TP_printk("task:%u@%u xid=0x%08x", + __entry->task_id, __entry->client_id, __entry->xid ) ); -DEFINE_REPLY_EVENT(xprtrdma_reply_vers); -DEFINE_REPLY_EVENT(xprtrdma_reply_rqst); -DEFINE_REPLY_EVENT(xprtrdma_reply_short); -DEFINE_REPLY_EVENT(xprtrdma_reply_hdr); +TRACE_EVENT(xprtrdma_err_unrecognized, + TP_PROTO( + const struct rpc_rqst *rqst, + __be32 *procedure + ), + + TP_ARGS(rqst, procedure), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, procedure) + ), + + TP_fast_assign( + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client->cl_clid; + __entry->procedure = be32_to_cpup(procedure); + ), + + TP_printk("task:%u@%u xid=0x%08x procedure=%u", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->procedure + ) +); TRACE_EVENT(xprtrdma_fixup, TP_PROTO( @@ -1187,6 +1167,28 @@ TRACE_EVENT(xprtrdma_decode_seg, ) ); +TRACE_EVENT(xprtrdma_mrs_zap, + TP_PROTO( + const struct rpc_task *task + ), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + ), + + TP_fast_assign( + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + ), + + TP_printk("task:%u@%u", + __entry->task_id, 
__entry->client_id + ) +); + /** ** Callback events **/ @@ -1219,36 +1221,8 @@ TRACE_EVENT(xprtrdma_cb_setup, ) ); -DEFINE_CB_EVENT(xprtrdma_cb_call); -DEFINE_CB_EVENT(xprtrdma_cb_reply); - -TRACE_EVENT(xprtrdma_leaked_rep, - TP_PROTO( - const struct rpc_rqst *rqst, - const struct rpcrdma_rep *rep - ), - - TP_ARGS(rqst, rep), - - TP_STRUCT__entry( - __field(unsigned int, task_id) - __field(unsigned int, client_id) - __field(u32, xid) - __field(const void *, rep) - ), - - TP_fast_assign( - __entry->task_id = rqst->rq_task->tk_pid; - __entry->client_id = rqst->rq_task->tk_client->cl_clid; - __entry->xid = be32_to_cpu(rqst->rq_xid); - __entry->rep = rep; - ), - - TP_printk("task:%u@%u xid=0x%08x rep=%p", - __entry->task_id, __entry->client_id, __entry->xid, - __entry->rep - ) -); +DEFINE_CALLBACK_EVENT(call); +DEFINE_CALLBACK_EVENT(reply); /** ** Server-side RPC/RDMA events diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h index fdcdfe414223..236d437947bc 100644 --- a/include/uapi/linux/idxd.h +++ b/include/uapi/linux/idxd.h @@ -26,6 +26,9 @@ #define IDXD_OP_FLAG_DRDBK 0x4000 #define IDXD_OP_FLAG_DSTS 0x8000 +/* IAX */ +#define IDXD_OP_FLAG_RD_SRC2_AECS 0x010000 + /* Opcode */ enum dsa_opcode { DSA_OPCODE_NOOP = 0, @@ -47,6 +50,14 @@ enum dsa_opcode { DSA_OPCODE_CFLUSH = 0x20, }; +enum iax_opcode { + IAX_OPCODE_NOOP = 0, + IAX_OPCODE_DRAIN = 2, + IAX_OPCODE_MEMMOVE, + IAX_OPCODE_DECOMPRESS = 0x42, + IAX_OPCODE_COMPRESS, +}; + /* Completion record status */ enum dsa_completion_status { DSA_COMP_NONE = 0, @@ -80,6 +91,33 @@ enum dsa_completion_status { DSA_COMP_TRANSLATION_FAIL, }; +enum iax_completion_status { + IAX_COMP_NONE = 0, + IAX_COMP_SUCCESS, + IAX_COMP_PAGE_FAULT_IR = 0x04, + IAX_COMP_OUTBUF_OVERFLOW, + IAX_COMP_BAD_OPCODE = 0x10, + IAX_COMP_INVALID_FLAGS, + IAX_COMP_NOZERO_RESERVE, + IAX_COMP_INVALID_SIZE, + IAX_COMP_OVERLAP_BUFFERS = 0x16, + IAX_COMP_INT_HANDLE_INVAL = 0x19, + IAX_COMP_CRA_XLAT, + IAX_COMP_CRA_ALIGN, + IAX_COMP_ADDR_ALIGN, + 
IAX_COMP_PRIV_BAD, + IAX_COMP_TRAFFIC_CLASS_CONF, + IAX_COMP_PFAULT_RDBA, + IAX_COMP_HW_ERR1, + IAX_COMP_HW_ERR_DRB, + IAX_COMP_TRANSLATION_FAIL, + IAX_COMP_PRS_TIMEOUT, + IAX_COMP_WATCHDOG, + IAX_COMP_INVALID_COMP_FLAG = 0x30, + IAX_COMP_INVALID_FILTER_FLAG, + IAX_COMP_INVALID_NUM_ELEMS = 0x33, +}; + #define DSA_COMP_STATUS_MASK 0x7f #define DSA_COMP_STATUS_WRITE 0x80 @@ -163,6 +201,28 @@ struct dsa_hw_desc { }; } __attribute__((packed)); +struct iax_hw_desc { + uint32_t pasid:20; + uint32_t rsvd:11; + uint32_t priv:1; + uint32_t flags:24; + uint32_t opcode:8; + uint64_t completion_addr; + uint64_t src1_addr; + uint64_t dst_addr; + uint32_t src1_size; + uint16_t int_handle; + union { + uint16_t compr_flags; + uint16_t decompr_flags; + }; + uint64_t src2_addr; + uint32_t max_dst_size; + uint32_t src2_size; + uint32_t filter_flags; + uint32_t num_inputs; +} __attribute__((packed)); + struct dsa_raw_desc { uint64_t field[8]; } __attribute__((packed)); @@ -223,4 +283,23 @@ struct dsa_raw_completion_record { uint64_t field[4]; } __attribute__((packed)); +struct iax_completion_record { + volatile uint8_t status; + uint8_t error_code; + uint16_t rsvd; + uint32_t bytes_completed; + uint64_t fault_addr; + uint32_t invalid_flags; + uint32_t rsvd2; + uint32_t output_size; + uint8_t output_bits; + uint8_t rsvd3; + uint16_t rsvd4; + uint64_t rsvd5[4]; +} __attribute__((packed)); + +struct iax_raw_completion_record { + uint64_t field[8]; +} __attribute__((packed)); + #endif diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o 
messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,28 +21,31 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } /* * setup, teardown. */ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -54,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: @@ -145,31 +152,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + 
void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); + return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p + ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +240,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -240,12 +251,13 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: 
mutex_unlock(&ac->mutex); @@ -264,7 +276,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } @@ -281,19 +293,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) { - int ret = 0; + int ret; mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: mutex_unlock(&ac->mutex); return ret; } -EXPORT_SYMBOL(ceph_auth_create_authorizer); +EXPORT_SYMBOL(__ceph_auth_get_authorizer); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { @@ -301,20 +332,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int 
ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, @@ -332,13 +349,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } @@ -352,3 +374,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? 
CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, 
+ const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = 
ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if 
(ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. */ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; @@ -116,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b52732337ca6..9815cfe42af0 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); static int ceph_x_is_authenticated(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; - int need; + int missing; + int need; /* missing + need renewal */ ceph_x_validate_tickets(ac, &need); - dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return (ac->want_keys & xi->have_keys) == ac->want_keys; + missing = ac->want_keys & ~xi->have_keys; + WARN_ON((need & missing) != missing); + dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, missing, !missing); + return !missing; } static int ceph_x_should_authenticate(struct ceph_auth_client *ac) @@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac) int need; 
ceph_x_validate_tickets(ac, &need); - dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", - ac->want_keys, need, xi->have_keys); - return need != 0; + dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__, + ac->want_keys, xi->have_keys, need, !!need); + return !!need; } static int ceph_x_encrypt_offset(void) @@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, dout(" decrypted %d bytes\n", ret); dend = dp + ret; - tkt_struct_v = ceph_decode_8(&dp); + ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad); if (tkt_struct_v != 1) goto bad; @@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; + ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); new_expires = ktime_get_real_seconds() + validity.tv_sec; @@ -265,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) return ret; } @@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, } } au->service = th->service; + WARN_ON(!th->secret_id); au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; @@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th) static bool have_key(struct ceph_x_ticket_handler *th) { - if (th->have_key) { - if (ktime_get_real_seconds() >= th->expires) - th->have_key = false; + if (th->have_key && ktime_get_real_seconds() >= th->expires) { + 
dout("ticket %d (%s) secret_id %llu expired\n", th->service, + ceph_entity_type_name(th->service), th->secret_id); + th->have_key = false; } return th->have_key; @@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_info *xi = ac->private; int need; struct ceph_x_request_header *head = buf; + void *p; int ret; struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); @@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return PTR_ERR(th); ceph_x_validate_tickets(ac, &need); - - dout("build_request want %x have %x need %x\n", - ac->want_keys, xi->have_keys, need); + dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys, + xi->have_keys, need); if (need & CEPH_ENTITY_TYPE_AUTH) { struct ceph_x_authenticate *auth = (void *)(head + 1); - void *p = auth + 1; void *enc_buf = xi->auth_authorizer.enc_buf; struct ceph_x_challenge_blob *blob = enc_buf + ceph_x_encrypt_offset(); u64 *u; + p = auth + 1; if (p > end) return -ERANGE; @@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -534,39 +539,117 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } if (need) { - void *p = head + 1; - struct ceph_x_service_ticket_request *req; - - if (p > end) - return -ERANGE; - head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - + dout(" get_principal_session_key\n"); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; - ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, - 
xi->auth_authorizer.buf->vec.iov_len); - req = p; - req->keys = cpu_to_le32(need); - p += sizeof(*req); + p = buf; + ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY, + e_range); + ceph_encode_copy_safe(&p, end, + xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len, e_range); + ceph_encode_8_safe(&p, end, 1, e_range); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } return 0; + +e_range: + return -ERANGE; +} + +static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) */ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); 
+ if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; } static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; - struct ceph_x_reply_header *head = buf; struct ceph_x_ticket_handler *th; int len = end - buf; + void *p; int op; int ret; @@ -587,22 +670,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } - op = le16_to_cpu(head->op); - result = le32_to_cpu(head->result); + p = buf; + ceph_decode_16_safe(&p, end, op, e_inval); + ceph_decode_32_safe(&p, end, result, e_inval); dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, - buf + sizeof(*head), end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, - buf + sizeof(*head), end); + + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -613,6 +699,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (ac->want_keys == xi->have_keys) return 0; return -EAGAIN; + +e_inval: + return -EINVAL; } static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) @@ -678,40 +767,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * 
CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -726,29 +819,76 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } -static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) { - struct ceph_x_authorizer *au = (void *)a; - void *p = 
au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + void *dp, *dend; + u8 struct_v; + int len; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); if (ret < 0) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); + dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; + +e_inval: + return -EINVAL; +} + +static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_authorizer *au = (void *)a; + u64 nonce_plus_one; + int ret; + + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) + return ret; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; + } + + return 0; } static void ceph_x_reset(struct 
ceph_auth_client *ac) @@ -785,8 +925,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (!IS_ERR(th)) + if (IS_ERR(th)) + return; + + if (th->have_key) { + dout("ticket %d (%s) secret_id %llu invalidated\n", + th->service, ceph_entity_type_name(th->service), + th->secret_id); th->have_key = false; + } } static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, @@ -911,7 +1058,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + 
static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { 
seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include #include +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * cryptographic secret */ diff --git a/net/ceph/decode.c b/net/ceph/decode.c index eea529595a7a..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + +#include 
#include @@ -82,3 +85,101 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; +} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = 
get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index af0f1fa24937..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,71 +82,51 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; } } -static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct 
ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -157,12 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -184,7 +158,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -219,10 +193,13 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); 
-static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -254,9 +231,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -266,9 +243,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -372,7 +349,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -390,7 +367,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +383,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -416,7 +393,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -450,13 +427,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -474,8 +453,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -497,104 +474,14 @@ static int ceph_tcp_connect(struct ceph_connection *con) return 0; } -/* - * If @buf is NULL, discard up to @len bytes. 
- */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. 
- */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - /* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -607,12 +494,34 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. */ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; } +static void ceph_con_reset_protocol(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + ceph_con_close_socket(con); + if (con->in_msg) { + WARN_ON(con->in_msg->con != con); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + } + if (con->out_msg) { + WARN_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); +} + /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. 
@@ -623,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -632,31 +542,22 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void reset_connection(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { - /* reset connection, out_queue, msg_ and connect_seq */ - /* discard existing out_queue and msg_seq */ - dout("reset_connection %p\n", con); + dout("%s con %p\n", __func__, con); + + WARN_ON(con->in_msg); + WARN_ON(con->out_msg); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); - - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - - con->connect_seq = 0; con->out_seq = 0; - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } con->in_seq = 0; con->in_seq_acked = 0; - con->out_skip = 0; + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -666,17 +567,17 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); - reset_connection(con); - con->peer_global_seq = 0; + ceph_con_reset_protocol(con); + ceph_con_reset_session(con); cancel_con(con); - con_close_socket(con); 
mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); @@ -691,8 +592,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -709,7 +610,10 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + + return ceph_con_v1_opened(con); } /* @@ -732,16 +636,15 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -753,48 +656,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) return ret; } -static void con_out_kvec_reset(struct ceph_connection *con) +/* + * Discard messages that have been acked by the server. 
+ */ +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { - BUG_ON(con->out_skip); + struct ceph_msg *msg; + u64 seq; - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} + dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq); + while (!list_empty(&con->out_sent)) { + msg = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + WARN_ON(msg->needs_out_seq); + seq = le64_to_cpu(msg->hdr.seq); + if (seq > ack_seq) + break; -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) -{ - int index = con->out_kvec_left; - - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); - - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); + } } /* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. + * Discard messages that have been requeued in con_fault(), up to + * reconnect_seq. This avoids gratuitously resending messages that + * the server had received and handled prior to reconnect. 
*/ -static int con_out_kvec_skip(struct ceph_connection *con) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; + struct ceph_msg *msg; + u64 seq; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; + dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq); + while (!list_empty(&con->out_queue)) { + msg = list_first_entry(&con->out_queue, struct ceph_msg, + list_head); + if (msg->needs_out_seq) + break; + seq = le64_to_cpu(msg->hdr.seq); + if (seq > reconnect_seq) + break; + + dout("%s con %p discarding msg %p seq %llu\n", __func__, con, + msg, seq); + ceph_msg_remove(msg); } - - return skip; } #ifdef CONFIG_BLOCK @@ -1113,10 +1021,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { - struct ceph_msg_data_cursor *cursor = &msg->cursor; - BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(!msg->num_data_items); @@ -1132,9 +1039,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. 
*/ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1173,8 +1080,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. */ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1210,328 +1116,8 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(msg, (size_t)data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. 
- */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. */ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - BUG_ON(list_empty(&con->out_queue)); - m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); - con->out_msg = m; - BUG_ON(m->con != con); - - /* put message on sent list */ - ceph_msg_get(m); - list_move_tail(&m->list_head, &con->out_sent); - - /* - * only assign outgoing seq # if we haven't sent this message - * yet. if it is requeued, resend with it's original seq. - */ - if (m->needs_out_seq) { - m->hdr.seq = cpu_to_le64(++con->out_seq); - m->needs_out_seq = false; - - if (con->ops->reencode_message) - con->ops->reencode_message(m); - } - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = 
crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. 
- */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1542,257 +1128,8 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. 
- * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = 
min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, 
end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} - -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} - -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1808,7 +1145,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1819,7 +1156,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1977,8 +1314,17 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. 
+ */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2000,521 +1346,12 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. - */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (addr_is_blank(&con->msgr->inst.addr)) { - int port = addr_port(&con->msgr->inst.addr); - - memcpy(&con->msgr->inst.addr.in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(&con->msgr->inst.addr, port); - encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(&con->msgr->inst.addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). 
- */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - reset_connection(con); - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. 
- */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_err("%s%lld %s connection reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - reset_connection(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. 
- */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - reset_connection(con); - return -1; - } - - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. 
- */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. - */ -static void process_ack(struct ceph_connection *con) -{ - struct ceph_msg *m; - u64 ack = le64_to_cpu(con->in_temp_ack); - u64 seq; - bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); - struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; - - /* - * In the reconnect case, con_fault() has requeued messages - * in out_sent. We should cleanup old messages according to - * the reconnect seq. - */ - while (!list_empty(list)) { - m = list_first_entry(list, struct ceph_msg, list_head); - if (reconnect && m->needs_out_seq) - break; - seq = le64_to_cpu(m->hdr.seq); - if (seq > ack) - break; - dout("got ack for seq %llu type %d at %p\n", seq, - le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; - ceph_msg_remove(m); - } - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = 
!ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. - */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify 
seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? */ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; 
- } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. %u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. 
*/ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2528,12 +1365,13 @@ static void process_message(struct ceph_connection *con) con->in_seq++; mutex_unlock(&con->mutex); - dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); @@ -2541,264 +1379,6 @@ static void process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -static int try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %lu\n", con, con->state); - if (con->state != CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - /* open the socket first? 
*/ - if (con->state == CON_STATE_PREOPEN) { - BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? */ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CON_STATE_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. 
- */ -static int try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start on %p state %lu\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CON_STATE_NEGOTIATING; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CON_STATE_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); - con->state = CON_STATE_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - process_message(con); - if (con->state == CON_STATE_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. 
@@ -2811,6 +1391,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2836,27 +1419,30 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2867,15 +1453,15 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2891,11 +1477,11 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
*/ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -2923,21 +1509,24 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } - ret = try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2947,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2974,64 +1566,54 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); - con_close_socket(con); + 
ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } - if (con->in_msg) { - BUG_ON(con->in_msg->con != con); - ceph_msg_put(con->in_msg); - con->in_msg = NULL; - } - if (con->out_msg) { - BUG_ON(con->out_msg->con != con); - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } - /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
*/ - con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + con->state = CEPH_CON_S_PREOPEN; + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; - con_flag_set(con, CON_FLAG_BACKOFF); + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3042,26 +1624,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr, { spin_lock_init(&msgr->global_seq_lock); - if (myaddr) - msgr->inst.addr = *myaddr; + if (myaddr) { + memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, + sizeof(msgr->inst.addr.in_addr)); + ceph_addr_set_port(&msgr->inst.addr, 0); + } - /* select a random nonce */ - msgr->inst.addr.type = 0; - get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); - encode_my_addr(msgr); + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. 
+ */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; + + /* generate a random non-zero nonce */ + do { + get_random_bytes(&msgr->inst.addr.nonce, + sizeof(msgr->inst.addr.nonce)); + } while (!msgr->inst.addr.nonce); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } -EXPORT_SYMBOL(ceph_messenger_init); void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } -EXPORT_SYMBOL(ceph_messenger_fini); static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { @@ -3075,17 +1666,19 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; - con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + con->state = CEPH_CON_S_PREOPEN; + con->v1.connect_seq++; + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. + * + * Consumes a ref on @msg. 
*/ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { @@ -3096,7 +1689,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3119,7 +1712,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3137,36 +1730,30 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } + + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + 
else + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } @@ -3184,25 +1771,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } @@ -3215,10 +1794,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3424,9 +2003,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct 
ceph_connection *con, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { - struct ceph_msg_header *hdr = &con->in_hdr; int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; @@ -3437,7 +2016,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; @@ -3458,7 +2037,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } - memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); + memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); @@ -3471,6 +2050,39 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) return ret; } +void ceph_con_get_out_msg(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + BUG_ON(list_empty(&con->out_queue)); + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + WARN_ON(msg->con != con); + + /* + * Put the message on "sent" list using a ref from ceph_con_send(). + * It is put when the message is acked or revoked. + */ + list_move_tail(&msg->list_head, &con->out_sent); + + /* + * Only assign outgoing seq # if we haven't sent this message + * yet. If it is requeued, resend with its original seq. + */ + if (msg->needs_out_seq) { + msg->hdr.seq = cpu_to_le64(++con->out_seq); + msg->needs_out_seq = false; + + if (con->ops->reencode_message) + con->ops->reencode_message(msg); + } + + /* + * Get a ref for out_msg. It is put when we are done sending the + * message or in case of a fault. + */ + WARN_ON(con->out_msg); + con->out_msg = ceph_msg_get(msg); +} /* * Free a generically kmalloc'd message. 
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 000000000000..04f653b3c897 --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1506 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes. + */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. 
+ */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->v1.out_skip); + + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->v1.out_kvec_left; + + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); + + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int skip = 0; + + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->v1.out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. 
*/ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? 
*/ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->v1.out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + con->v1.out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. 
+ */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); + + con->v1.out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->v1.connect_seq, global_seq, proto); + + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. 
+ * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); + if (ret <= 0) + goto out; + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; + } + } + con->v1.out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); + + if (size == con->v1.out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->v1.out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. 
+ */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->v1.in_base_pos = 0; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->v1.in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->v1.in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->v1.in_banner); + if (ret <= 0) + goto out; + + size = sizeof(con->v1.actual_peer_addr); + end += size; + 
ret = read_partial(con, end, size, &con->v1.actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.actual_peer_addr); + + size = sizeof(con->v1.peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->v1.peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); + + size = sizeof(con->v1.in_reply); + end = size; + ret = read_partial(con, end, size, &con->v1.in_reply); + if (ret <= 0) + goto out; + + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->v1.auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->v1.auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. 
note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(ceph_addr_is_blank(&con->v1.actual_peer_addr) && + con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warn("wrong peer, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->v1.actual_peer_addr), + le32_to_cpu(con->v1.actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, + &con->v1.peer_addr_for_me.in_addr, + sizeof(con->v1.peer_addr_for_me.in_addr)); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(my_addr)); + } + + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; + u64 server_feat = le64_to_cpu(con->v1.in_reply.features); + int ret; + + dout("process_connect on %p tag %d\n", con, con->v1.in_tag); + + if (con->v1.auth) { + int len = le32_to_cpu(con->v1.in_reply.authorizer_len); + + /* + * Any connection that defines ->get_authorizer() + * should also define ->add_authorizer_challenge() and + * ->verify_authorizer_reply(). + * + * See get_connect_authorizer(). 
+ */ + if (con->v1.in_reply.tag == + CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ret = con->ops->add_authorizer_challenge( + con, con->v1.auth->authorizer_reply_buf, len); + if (ret < 0) + return ret; + + con_out_kvec_reset(con); + __prepare_write_connect(con); + prepare_read_connect(con); + return 0; + } + + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } + } + } + + switch (con->v1.in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->v1.out_connect.protocol_version), + le32_to_cpu(con->v1.in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->v1.auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->v1.auth_retry); + if (con->v1.auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESSION to indicate + * that they must have reset their session, and may have + * dropped messages. 
+ */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->v1.in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. 
+ */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->v1.in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); + + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. 
+ */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof(con->v1.in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->v1.in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. + */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->v1.in_temp_ack); + + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return 
ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof(con->v1.in_hdr); + end = size; + ret = read_partial(con, end, size, &con->v1.in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->v1.in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->v1.in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->v1.in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->v1.in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } 
+ + /* allocate message? */ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON(!con->in_msg ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. 
%u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. 
+ */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->v1.in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + 
process_ack(con); + goto more; + } + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad tag %d\n", con->v1.in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->v1.in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? */ + if (con->v1.out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->v1.out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->v1.out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! 
*/ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->v1.out_skip); + /* footer */ + if (con->v1.out_msg_done) { + con->v1.out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->v1.out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->v1.out_skip += msg->cursor.total_resid; + if (msg->middle) + con->v1.out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->v1.out_kvec_bytes, con->v1.out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); + + /* skip rest of message */ + con->v1.in_base_pos = con->v1.in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->v1.in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection 
*con) +{ + return con->v1.connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->v1.connect_seq = 0; + con->v1.peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->v1.out_skip = 0; +} diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..c1ebb2aa08b5 --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov + */ + +#include + +#include +#include /* for crypto_memneq() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define 
OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? 
"discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. + */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. 
+ * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + 
con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + 
con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + 
middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). 
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+	void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+	void *start = p;
+	int i;
+
+	/* unused segment slots must go out as zeros */
+	memset(p, 0, CEPH_PREAMBLE_LEN);
+
+	ceph_encode_8(&p, desc->fd_tag);
+	ceph_encode_8(&p, desc->fd_seg_cnt);
+	for (i = 0; i < desc->fd_seg_cnt; i++) {
+		ceph_encode_32(&p, desc->fd_lens[i]);
+		ceph_encode_16(&p, desc->fd_aligns[i]);
+	}
+
+	/* crc sits in the last CEPH_CRC_LEN bytes and covers all before it */
+	put_unaligned_le32(crc32c(0, start, crcp - start), crcp);
+}
+
+/*
+ * Decode the frame preamble at @p into @desc and sanity check it.
+ *
+ * The preamble crc is verified first.  Then the tag, the segment
+ * count and a (length, alignment) pair for each present segment are
+ * decoded; slots past fd_seg_cnt stay zero from the memset().
+ *
+ * Segment lengths are 32-bit values on the wire but are handled as
+ * signed ints in this file (init_frame_desc() fills them from an int
+ * array and they are printed with %d).  A wire value with the top bit
+ * set therefore turns negative and would pass a plain "> max" check,
+ * so negative lengths are rejected explicitly.
+ *
+ * Return: 0 on success, -EBADMSG on preamble crc mismatch, -EINVAL
+ * on a bad segment count or segment length.
+ */
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+	void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+	u32 crc, expected_crc;
+	int i;
+
+	crc = crc32c(0, p, crcp - p);
+	expected_crc = get_unaligned_le32(crcp);
+	if (crc != expected_crc) {
+		pr_err("bad preamble crc, calculated %u, expected %u\n",
+		       crc, expected_crc);
+		return -EBADMSG;
+	}
+
+	memset(desc, 0, sizeof(*desc));
+
+	desc->fd_tag = ceph_decode_8(&p);
+	desc->fd_seg_cnt = ceph_decode_8(&p);
+	if (desc->fd_seg_cnt < 1 ||
+	    desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+		pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+		return -EINVAL;
+	}
+	for (i = 0; i < desc->fd_seg_cnt; i++) {
+		desc->fd_lens[i] = ceph_decode_32(&p);
+		desc->fd_aligns[i] = ceph_decode_16(&p);
+	}
+
+	/*
+	 * This would fire for FRAME_TAG_WAIT (it has one empty
+	 * segment), but we should never get it as client.
+	 */
+	if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+		pr_err("last segment empty\n");
+		return -EINVAL;
+	}
+
+	/* each length must be within [0, per-segment maximum] */
+	if (desc->fd_lens[0] < 0 ||
+	    desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+		pr_err("bad control segment length %d\n", desc->fd_lens[0]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[1] < 0 ||
+	    desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+		pr_err("bad front segment length %d\n", desc->fd_lens[1]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[2] < 0 ||
+	    desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+		pr_err("bad middle segment length %d\n", desc->fd_lens[2]);
+		return -EINVAL;
+	}
+	if (desc->fd_lens[3] < 0 ||
+	    desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+		pr_err("bad data segment length %d\n", desc->fd_lens[3]);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+	con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED :
+						 FRAME_LATE_STATUS_COMPLETE;
+	/* crcs are converted to little-endian in place */
+	cpu_to_le32s(&con->v2.out_epil.front_crc);
+	cpu_to_le32s(&con->v2.out_epil.middle_crc);
+	cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+	memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+	con->v2.out_epil.late_status = aborted ? 
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = 
get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected %u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + u8 *session_key, int session_key_len, + u8 *con_secret, int con_secret_len) +{ + unsigned int noio_flag; + void *p; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if 
(!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + p = con_secret; + WARN_ON((unsigned long)p & crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, p, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + p += CEPH_GCM_KEY_LEN; + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, 
+ crypto_req_done, &con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, p, CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, p + CEPH_GCM_IV_LEN, CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + return ret; + + for (i = 0; i < kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + return ret; + } + + ret = crypto_shash_final(desc, hmac); + if (ret) + return ret; + + shash_desc_zero(desc); + return 0; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? 
crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + /* the nonce is bumped only after a successful operation */ + gcm_inc_nonce(nonce); + return 0; +} + /* Fill bv with the page, offset and length of the next piece of data at the cursor. Zero-length data items are skipped first; the cursor is not advanced past the returned piece. */ +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + /* Count the scatterlist entries needed for buf: one per page if buf is a vmalloc address (expected to be page-aligned), a single one otherwise, plus one more if buf_len needs padding. Returns 0 for an empty buffer. */ +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + /* Count the scatterlist entries needed for the data left at the cursor: one per piece returned by get_bvec_at(), plus one more if the total length needs padding. The cursor is advanced to the end of the data in the process. Returns 0 if there is no data left. */ +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 
1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + /* Fill scatterlist entries for buf starting at *sg and leave *sg pointing past them: one entry per page for a vmalloc address, a single entry otherwise, then one entry for pad if buf_len needs padding. Does nothing for an empty buffer. The entries used match what calc_sg_cnt() counts. */ +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + /* Cursor counterpart of init_sgs(): one entry per piece of data returned by get_bvec_at(), then one entry for pad if the total length needs padding. The cursor is advanced to the end of the data. */ +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + /* Build in sgt a scatterlist covering the front, middle and data of msg, each followed by its pad buffer when padding is needed, and ending with one entry for epilogue (extended by the auth tag length if add_tag is set). If all three segments are empty, sgt is left untouched and 0 is returned. Returns 0 or the sg_alloc_table() error. */ +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); 
+ if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + /* every entry but the final one, reserved for the epilogue, is filled by now */ + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0)); + return 0; +} + /* Decrypt in place the CEPH_PREAMBLE_SECURE_LEN bytes at the start of in_buf. Returns the gcm_crypt() result. */ +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + /* Decrypt in place the part of the control body beyond CEPH_PREAMBLE_INLINE_LEN, found in in_kvecs[0], together with its padding and auth tag, found in in_buf. Returns the gcm_crypt() result. */ +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + /* Decrypt the tail of in_msg in place; the pad areas and the epilogue (with auth tag) are taken from in_buf. The sg table is freed on both success and failure. Returns 0 or an error from setup_message_sgs() or gcm_crypt(). */ +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + /* Queue our banner: CEPH_BANNER_V2, a 16-bit payload length and the supported and required feature masks. The buffer is also added to the out sign kvecs. Returns 0 or -ENOMEM. */ +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, 
CEPH_BANNER_V2_LEN); + /* the payload consists of the two 64-bit feature masks */ ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. + */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + /* a single crc covers the whole control body: the base part, then extdata */ + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + /* without extdata, base is already laid out in wire order */ + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + /* otherwise extdata is slotted in between the base control body and the crc */ + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + /* Encrypt in place a preamble whose control body fits entirely in the inline buffer, zero-filling any unused inline space first, and queue the resulting CEPH_PREAMBLE_SECURE_LEN bytes. Returns 0 or the gcm_crypt() error. */ +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? 
*/ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + /* the sg spans the auth tag as well; only the bytes in front of it are input */ + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + /* first pass: preamble plus inline part, with the tag written to the slot at the very end of base */ + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? 
*/ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + /* queue in wire order: the preamble tag goes out ahead of the remainder */ + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + /* Encode into base the preamble of a control frame with the given tag and a single segment of ctrl_len + extdata_len bytes, then queue the frame in the form required by the connection mode and set CEPH_CON_F_WRITE_PENDING. Returns -EINVAL (with a warning) if base is a vmalloc address, if ctrl_len is zero, or if extdata or signing is requested on a secure connection; otherwise 0 or the error from the secure head helpers. */ +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + /* __prepare_control() with no extdata and no signing. */ +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + /* Queue a HELLO frame carrying the client entity type and the address of the peer; the frame is also added to the out sign kvecs. Returns 0, -ENOMEM or the __prepare_control() error. */ +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != 
CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + /* Have the get_auth_request op fill in an auth request (called with con->mutex dropped) and queue it as a signed AUTH_REQUEST frame, with a private copy of the authorizer as extdata. Returns -EAGAIN if the connection is no longer in the HELLO state once the mutex is retaken, -ENOMEM on allocation failure, or the error from the op or from __prepare_control(). */ +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + /* the op runs without con->mutex, so the state is rechecked once the mutex is retaken */ + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + /* Pass reply to the handle_auth_reply_more op (called with con->mutex dropped) and queue its response as a signed AUTH_REQUEST_MORE frame, with the authorizer it returns as extdata. Returns -EAGAIN if the connection is no longer in the AUTH state once the mutex is retaken, -ENOMEM on allocation failure, or the error from the op or from __prepare_control(). */ +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + /* the op runs without con->mutex, so the state is rechecked once the mutex is retaken */ + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, 
authorizer, authorizer_len, true); +} + /* Queue an AUTH_SIGNATURE frame whose body is the hmac_sha256() result over the in sign kvecs. Returns 0, -ENOMEM or the error from hmac_sha256() or prepare_control(). */ +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false)); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + /* Queue a CLIENT_IDENT frame carrying our address, the peer address, global_id, global_seq, the feature masks of the client, zero flags and the client cookie; a nonzero client cookie is generated first if none is set. The server cookie, connect_seq and peer_global_seq are expected to still be zero. Returns 0, -ENOMEM or the prepare_control() error. */ +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + /* zero stands for no cookie here, so draw again until the value is nonzero */ + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + /* marker byte + 32-bit count + two addresses + six 64-bit fields */ + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, global_id); + 
ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + /* Queue a SESSION_RECONNECT frame carrying our address, the client and server cookies, global_seq, connect_seq and in_seq. Both cookies, connect_seq and peer_global_seq are expected to be nonzero. Returns 0, -ENOMEM or the prepare_control() error. */ +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + /* marker byte + 32-bit count + one address + five 64-bit fields */ + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + /* Queue a KEEPALIVE2 frame carrying the current real time. The frame is built in out_buf and queued after reset_out_kvecs(). Returns the prepare_control() result. */ +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_KEEPALIVE2, 
con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + /* Record in_seq as acked and queue an ACK frame carrying that value. The frame is built in out_buf and queued after reset_out_kvecs(). Returns the prepare_control() result. */ +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + /* Encode the plain-mode epilogue into out_epil via encode_epilogue_plain() and queue it. */ +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. + */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + /* the message header goes out as the control body of the head, unsigned and without extdata */ + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + /* data only: front and middle count as used but empty */ + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? 
-1 : 0; + } + /* with data present the epilogue is not queued here; without data it is queued now and the message is complete */ + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... + */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + /* the message header is sent as the control body of a small secure head */ + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. 
+ */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + /* source sgs: the message segments, the zero page wherever padding is needed, then the epilogue */ + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + /* destination: freshly allocated pages sized for the whole encrypted tail */ + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + /* the pages are recorded on the connection here; the out label below frees only the two sg tables */ + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + /* tail_len includes the auth tag, which is output only */ + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + /* Encode the preamble and header of out_msg into out_buf, passing the refreshed in_seq_acked to fill_header2(), then queue the message according to the connection mode and set CEPH_CON_F_WRITE_PENDING. Returns 0 or the prepare_message_secure() error. */ +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + /* bring in_seq_acked up to in_seq before it is written into the header */ + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, 
CEPH_CON_F_WRITE_PENDING); + return 0; +} + /* Set up to read the banner prefix (CEPH_BANNER_V2_PREFIX_LEN bytes) into a new buffer, which is also added to the in sign kvecs, and enter the BANNER_PREFIX state. Returns 0 or -ENOMEM. */ +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + /* Set up to read payload_len bytes of banner payload into a new buffer, which is also added to the in sign kvecs, and enter the BANNER_PAYLOAD state. Returns 0 or -ENOMEM. */ +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + /* Set up to read the next preamble into in_buf; the length read depends on whether the connection is in secure mode. */ +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + /* Set up to read a control body and its crc. In the HELLO and AUTH states the whole frame, preamble included, is gathered in one new buffer that is added to the in sign kvecs. Otherwise the body is read into the inline area of in_buf if it fits or into a new buffer if not, and the crc into the start of in_buf. Returns 0 or -ENOMEM. */ +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + /* Set up to read the part of a control body that does not fit inline, followed by its padding and auth tag. The inline part already in in_buf is copied to the start of a new buffer, the remainder is read in right after it, and the padding and tag are read into in_buf. Returns 0 or -ENOMEM. */ +static int prepare_read_control_remainder(struct 
ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + /* start the new buffer with the inline part taken from in_buf */ + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + /* Start reading the data of in_msg: seed the data crc in plain mode, initialize the in cursor over the whole data length and arm the first bvec. */ +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + /* Account for the bvec that was just read, folding it into the data crc in plain mode, and arm the next one. Once the cursor has no data left, set up to read the data padding (secure mode only) and the epilogue instead. */ +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + /* Finish skipping a message: bump in_seq and go back to reading a preamble. */ +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + /* Arrange for the tail of the incoming message, sized from the segment lengths in in_desc, to be skipped. If there is no tail, the skip is finished right away. */ +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + /* Check the banner prefix that was just read and set up to read the payload whose 16-bit length follows the banner. Returns -EINVAL with error_msg set if the banner is not CEPH_BANNER_V2, otherwise the prepare_read_banner_payload() result. */ +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + /* Decode the supported and required feature masks of the server from the banner payload. Returns -EINVAL on a decode error or if either side requires a feature the other does not support; otherwise queues our hello, enters the HELLO state and sets up to read a preamble. */ +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, 
server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + /* a feature we require is missing from what the server supports */ + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + /* a feature the server requires is missing from what we support */ if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + /* Handle the hello of the server: check that the entity type it reports matches peer_name, adopt the address it sees us at if our own address is still blank, then queue the auth request and enter the AUTH state. Returns -EINVAL on an unexpected state, decode error or wrong peer type, or the error from address decoding or prepare_auth_request(). */ +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. 
If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + /* whichever branch ran, my_addr is expected to be non-blank with no port by now */ + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + /* -EAGAIN is passed up without an error message */ if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + /* Decode an auth_bad_method frame (the protocol that was used, the result, and the lists of allowed protocols and allowed modes, at most 8 entries each) and pass it to the handle_auth_bad_method op, which is called with con->mutex dropped. Returns -EINVAL on an unexpected state, decode error or oversized list, -EAGAIN if the connection is no longer in the AUTH state once the mutex is retaken, otherwise the result of the op. */ +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > 
ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key[CEPH_KEY_LEN]; + u8 con_secret[CEPH_MAX_CON_SECRET_LEN]; + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, 
end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + return ret; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + return ret; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + return 0; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + 
pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} +/* + * Handle a SERVER_IDENT frame (only valid in + * CEPH_CON_S_V2_SESSION_CONNECT): check that the peer's address matches + * con->peer_addr and that it supports our required features, record + * global_id, global_seq, features and cookie, and move the connection + * to CEPH_CON_S_OPEN. + * + * Returns 0 on success or a negative error code. + */ +static int process_server_ident(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to?
*/ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in process_hello(). + */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} +/* + * Handle an IDENT_MISSING_FEATURES frame (only valid in + * CEPH_CON_S_V2_SESSION_CONNECT): log the feature bits the server + * reports as missing. Always fails with -EINVAL. + */ +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); +
con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} +/* + * Handle a SESSION_RECONNECT_OK frame (only valid in + * CEPH_CON_S_V2_SESSION_RECONNECT): pass the decoded seq to + * ceph_con_discard_requeued() and move the connection to + * CEPH_CON_S_OPEN. Returns 0 or -EINVAL. + */ +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} +/* + * Handle a SESSION_RETRY frame (only valid in + * CEPH_CON_S_V2_SESSION_RECONNECT): set our connect_seq to the decoded + * value plus one and call prepare_session_reconnect() again. + * Returns 0 or a negative error code. + */ +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} +/* + * Handle a SESSION_RETRY_GLOBAL frame (only valid in + * CEPH_CON_S_V2_SESSION_RECONNECT): take a new global_seq from + * ceph_get_global_seq() based on the decoded value and call + * prepare_session_reconnect() again. Returns 0 or a negative error + * code. + */ +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); +
WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} +/* + * Handle a SESSION_RESET frame (only valid in + * CEPH_CON_S_V2_SESSION_RECONNECT). Only a full reset is accepted. + * Resets the session, calls the optional con->ops->peer_reset() with + * con->mutex dropped, then calls prepare_client_ident() and moves to + * CEPH_CON_S_V2_SESSION_CONNECT. + * + * Returns 0 on success, -EAGAIN if con->state changed while con->mutex + * was dropped, or another negative error code. + */ +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} +/* + * Handle a KEEPALIVE2_ACK frame (only valid in CEPH_CON_S_OPEN): store + * the decoded timestamp in con->last_keepalive_ack. Returns 0 or + * -EINVAL. + */ +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} +/* + * Handle an ACK frame (only valid in CEPH_CON_S_OPEN): pass the decoded + * seq to ceph_con_discard_sent(). Returns 0 or -EINVAL. + */ +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} +/* + * Dispatch a control frame spanning [p, end) to the handler for its + * tag (con->v2.in_desc.fd_tag). If the handler succeeds, set up the + * read of the next preamble. An unknown tag fails with -EINVAL. + * + * Returns 0 on success or the handler's error code. + */ +static int process_control(struct ceph_connection *con, void *p, void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + }
+ + prepare_read_preamble(con); + return 0; +} + +/* + * Check the sequence number in the message header at p (old messages + * are skipped, a gap is an error), pass the header's ack_seq to + * ceph_con_discard_sent() and call ceph_con_in_msg_alloc() for the + * incoming message. + * + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} +/* + * Call ceph_con_process_message() for the message that was just read + * and, if the connection is still in CEPH_CON_S_OPEN afterwards, set + * up the read of the next preamble. Returns 0 or -EAGAIN. + */ +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex.
+ */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} +/* + * Handle the first segment of a frame, fd_lens[0] bytes starting at p. + * Anything but a message frame goes to process_control(). For a + * message frame the header is processed and the input kvecs are set up + * for front and middle (plus their padding in secure mode), followed + * either by the data (IN_S_PREPARE_READ_DATA) or by the epilogue + * (IN_S_HANDLE_EPILOGUE). + * + * Returns 0 on success or a negative error code. + */ +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ?
CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} +/* + * Handle a preamble that was just read into con->v2.in_buf: decrypt it + * in secure mode, decode it into con->v2.in_desc and continue with the + * control segment. In plain mode the control segment still has to be + * read. In secure mode only the part exceeding + * CEPH_PREAMBLE_INLINE_LEN (if any) has to be read; otherwise the + * segment is handled right away. + * + * Returns 0 on success or a negative error code. + */ +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} +/* + * Plain mode only: verify the crc of the control segment and handle it. + * In CEPH_CON_S_V2_AUTH the segment is first copied into a buffer + * obtained from alloc_conn_buf() and handled from there. + * + * Returns 0 on success or a negative error code. + */ +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} +/* + * Secure mode only: decrypt the remainder of the control segment and + * handle the whole segment, which starts CEPH_PREAMBLE_INLINE_LEN + * bytes before the remainder. + * + * Returns 0 on success or a negative error code. + */ +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} +/* + * Handle the epilogue of a message: in secure mode decrypt the message + * and decode the epilogue, in plain mode decode the epilogue and verify + * the front, middle and data crcs it carries. Then call + * process_message(). + * + * Returns 0 on success or a negative error code. + */ +static int handle_epilogue(struct
ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} +/* + * Finish skipping incoming bytes: in secure mode the incoming GCM nonce + * is incremented before calling __finish_skip(). + */ +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} +/* + * Run the handler selected by con->state and con->v2.in_state once + * con->v2.in_iter has been drained. The handler is expected to set up + * con->v2.in_iter for the next read. + * + * Returns 1 if con->v2.in_iter was populated, the handler's error code + * if it failed, -EINVAL on an unexpected state, or -ENODATA if nothing + * was populated. + */ +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case
IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} +/* + * Read entry point: alternate between ceph_tcp_recv() and + * populate_in_iter() until one of them returns <= 0 and return that + * value. Does nothing (returns 0) in CEPH_CON_S_PREOPEN. If processing + * fails with something other than -EAGAIN and no error message was + * set, a generic one is set. + */ +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). + */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} +/* + * Start sending the data of con->out_msg: seed the data crc with -1, + * initialize the output cursor, queue the first bvec and switch to + * OUT_S_QUEUE_DATA_CONT. + */ +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} +/* + * Fold the bvec in con->v2.out_bvec into the data crc and advance the + * cursor past it. Queue the next bvec if data remains, otherwise queue + * the plain epilogue and switch to OUT_S_FINISH_MESSAGE. + */ +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state !=
OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} +/* + * Queue the next page of con->v2.out_enc_pages (at most PAGE_SIZE bytes + * of what is left in out_enc_resid). After the last page is queued, + * switch to OUT_S_FINISH_MESSAGE. + */ +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. + */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} +/* + * While con->v2.out_zero is non-zero, queue another bvec of zeros and + * stay in OUT_S_QUEUE_ZEROS. Once it reaches zero, queue the plain + * epilogue for the revoked message and switch to OUT_S_FINISH_MESSAGE. + */ +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke.
+ */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} +/* + * Done with the current outgoing message: release the encryption pages + * (if any), drop the reference on con->out_msg (if still set) and + * switch to OUT_S_GET_NEXT. + */ +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here in both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} +/* + * Set up con->v2.out_iter with the next thing to send once it has been + * drained. Outside of CEPH_CON_S_OPEN nothing is queued here. In + * CEPH_CON_S_OPEN the current message is continued according to + * con->v2.out_state; once in OUT_S_GET_NEXT, a pending keepalive, the + * next message from con->out_queue or an ack is prepared, in that + * order. + * + * Returns 1 if con->v2.out_iter was populated, 0 if nothing is pending + * (CEPH_CON_F_WRITE_PENDING is cleared), or a negative error code. + */ +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} +/* + * Write entry point. In CEPH_CON_S_PREOPEN the banner is prepared and + * the socket is connected first. Then, with the socket corked, + * alternate between ceph_tcp_send() and populate_out_iter() until one + * of them returns <= 0 and return that value. If processing fails + * with something other than -EAGAIN and no error message was set, a + * generic one is set. + */ +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident).
+ */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} +/* + * Extend crc over zero_len bytes read from ceph_zero_page, at most + * PAGE_SIZE bytes at a time, and return the result. + */ +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} +/* + * Revoke helper: the last resid bytes of front were not sent. Compute + * front_crc over the sent bytes followed by resid zeros, drop the + * unsent bytes from con->v2.out_iter and account for them with + * out_zero_add(). + */ +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} +/* + * Revoke helper: same as prepare_zero_front(), but for the last resid + * bytes of middle and middle_crc. + */ +static void prepare_zero_middle(struct
ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} +/* + * Revoke helper: none of the data was sent. Compute data_crc over + * data_len zeros and account for all of the data with out_zero_add(). + */ +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} +/* + * Revoke con->out_msg in OUT_S_QUEUE_DATA, i.e. while head, front and + * middle are queued in a kvec iter and no data has been queued yet. + * What is left in con->v2.out_iter tells whether we were sending head, + * front or middle. Whatever was not sent of front and middle, and all + * of the data, is replaced with zeros. If we were still sending head, + * the rest of head stays in con->v2.out_iter and zeros follow in + * OUT_S_QUEUE_ZEROS. + */ +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +}
+/* + * Revoke con->out_msg in OUT_S_QUEUE_DATA_CONT, i.e. while a piece of + * data (con->v2.out_bvec) is queued in a bvec iter. The part of the + * piece that was sent is folded into data_crc; the rest of the data is + * treated as zeros for the crc, dropped from con->v2.out_iter and + * replaced with zeros. + */ +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} +/* + * Revoke con->out_msg in OUT_S_FINISH_MESSAGE, i.e. while the last kvec + * iter of the message is queued. What is left in con->v2.out_iter + * tells whether we were sending head, front, middle or the epilogue. + * Nothing is done for an empty message or if only the epilogue is + * left. Otherwise the unsent part of front and middle is replaced with + * zeros and the queued epilogue is dropped from con->v2.out_iter. + */ +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) {
+ resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, 
resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void 
ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index c4cf2529d08b..b9d54ed9f338 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). 
*/ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - num_mon = ceph_decode_32(&p); + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i = 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, &inst->addr); - if (err) - goto bad; + *p += feat_struct_len; /* skip persistent_features */ + + ret = ceph_start_decoding(p, end, 1, 
"mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + + *p += feat_struct_len; /* skip optional_features */ } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + ceph_decode_32_safe(p, end, num_mon, e_inval); + + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); + if (num_mon > CEPH_MAX_MON) + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; + } + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -96,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } @@ -190,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a 
keepalive to ensure that in case of an early fault + * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -476,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1052,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? + CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1062,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1089,8 +1167,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = 
ceph_auth_init(cl->options->name, - cl->options->key); + monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; @@ -1194,30 +1272,22 @@ static void finish_hunting(struct ceph_mon_client *monc) } } -static void handle_auth_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) +static void finish_auth(struct ceph_mon_client *monc, int auth_err, + bool was_authed) { - int ret; - int was_auth = 0; + dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed); + WARN_ON(auth_err > 0); - mutex_lock(&monc->mutex); - was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; - ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, - msg->front.iov_len, - monc->m_auth->front.iov_base, - monc->m_auth->front_alloc_len); - if (ret > 0) { - __send_prepared_auth_request(monc, ret); - goto out; + if (auth_err) { + monc->client->auth_err = auth_err; + wake_up_all(&monc->client->auth_wq); + return; } - finish_hunting(monc); - - if (ret < 0) { - monc->client->auth_err = ret; - } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { - dout("authenticated, starting session\n"); + if (!was_authed && ceph_auth_is_authenticated(monc->auth)) { + dout("%s authenticated, starting session global_id %llu\n", + __func__, monc->auth->global_id); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; monc->client->msgr.inst.name.num = @@ -1229,11 +1299,27 @@ static void handle_auth_reply(struct ceph_mon_client *monc, pr_info("mon%d %s session established\n", monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr)); } +} -out: +static void handle_auth_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, + msg->front.iov_len, + monc->m_auth->front.iov_base, 
+ monc->m_auth->front_alloc_len); + if (ret > 0) { + __send_prepared_auth_request(monc, ret); + } else { + finish_auth(monc, ret, was_authed); + finish_hunting(monc); + } mutex_unlock(&monc->mutex); - if (monc->client->auth_err < 0) - wake_up_all(&monc->client->auth_wq); } static int __validate_auth(struct ceph_mon_client *monc) @@ -1262,6 +1348,88 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + 
+static int mon_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ @@ -1412,4 +1580,8 @@ static const struct ceph_connection_operations mon_con_ops = { .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..61229c5e22cb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5575,6 +5577,7 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
@@ -5586,23 +5589,12 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; struct ceph_auth_handshake *auth = &o->o_auth; + int ret; - if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(auth->authorizer); - auth->authorizer = NULL; - } - if (!auth->authorizer) { - int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } else { - int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); - if (ret) - return ERR_PTR(ret); - } - *proto = ac->protocol; + ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + force_new, proto, NULL, NULL); + if (ret) + return ERR_PTR(ret); return auth; } @@ -5623,8 +5615,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) @@ -5637,6 +5632,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + 
return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5674,4 +5743,8 @@ static const struct ceph_connection_operations osd_con_ops = { .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = 
osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. 
*/ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * 
decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3259120462ed..612f0a641f4c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1251,10 +1251,7 @@ void rpc_prepare_reply_pages(struct rpc_rqst *req, struct page **pages, unsigned int base, unsigned int len, unsigned int hdrsize) { - /* Subtract one to force an extra word of buffer space for the - * payload's XDR pad to fall into the rcv_buf's tail iovec. 
- */ - hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign - 1; + hdrsize += RPC_REPHDRSIZE + req->rq_cred->cr_auth->au_ralign; xdr_inline_pages(&req->rq_rcv_buf, hdrsize << 2, pages, base, len); trace_rpc_xdr_reply_pages(req->rq_task, &req->rq_rcv_buf); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index fd9bca242724..56029e3af6ff 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -128,13 +128,13 @@ static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *n return 0; len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", xprt->debugfs->d_name.name); - if (len > sizeof(name)) + if (len >= sizeof(name)) return -1; if (*nump == 0) strcpy(link, "xprt"); else { len = snprintf(link, sizeof(link), "xprt%d", *nump); - if (len > sizeof(link)) + if (len >= sizeof(link)) return -1; } debugfs_create_symlink(link, clnt->cl_debugfs, name); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f06d7c315017..cf702a5f7fe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -675,6 +675,23 @@ struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_wake_up_next); +/** + * rpc_wake_up_locked - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + */ +static void rpc_wake_up_locked(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_locked(queue, task); + } +} + /** * rpc_wake_up - wake up all rpc_tasks * @queue: rpc_wait_queue on which the tasks are sleeping @@ -683,26 +700,29 @@ EXPORT_SYMBOL_GPL(rpc_wake_up_next); */ void rpc_wake_up(struct rpc_wait_queue *queue) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - rpc_wake_up_task_queue_locked(queue, 
task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_locked(queue); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up); +/** + * rpc_wake_up_status_locked - wake up all rpc_tasks and set their status value. + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + */ +static void rpc_wake_up_status_locked(struct rpc_wait_queue *queue, int status) +{ + struct rpc_task *task; + + for (;;) { + task = __rpc_find_next_queued(queue); + if (task == NULL) + break; + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + } +} + /** * rpc_wake_up_status - wake up all rpc_tasks and set their status value. * @queue: rpc_wait_queue on which the tasks are sleeping @@ -712,23 +732,8 @@ EXPORT_SYMBOL_GPL(rpc_wake_up); */ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) { - struct list_head *head; - spin_lock(&queue->lock); - head = &queue->tasks[queue->maxpriority]; - for (;;) { - while (!list_empty(head)) { - struct rpc_task *task; - task = list_first_entry(head, - struct rpc_task, - u.tk_wait.list); - task->tk_status = status; - rpc_wake_up_task_queue_locked(queue, task); - } - if (head == &queue->tasks[0]) - break; - head--; - } + rpc_wake_up_status_locked(queue, status); spin_unlock(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_status); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 757560a3b06b..3964ff74ee51 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -193,9 
+191,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, tail->iov_base = buf + offset; tail->iov_len = buflen - offset; - if ((xdr->page_len & 3) == 0) - tail->iov_len -= sizeof(__be32); - xdr->buflen += len; } EXPORT_SYMBOL_GPL(xdr_inline_pages); @@ -228,6 +223,9 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, BUG_ON(pgfrom_base <= pgto_base); + if (!len) + return; + pgto = pages + (pgto_base >> PAGE_SHIFT); pgfrom = pages + (pgfrom_base >> PAGE_SHIFT); @@ -266,26 +264,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. 
@@ -310,6 +288,9 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, BUG_ON(pgto_base <= pgfrom_base); + if (!len) + return; + pgto_base += len; pgfrom_base += len; @@ -351,46 +332,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy = tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -408,6 +349,9 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) char *vto; size_t copy; + if (!len) + return; + pgto = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -452,6 +396,9 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) char *vfrom; size_t copy; + if (!len) + return; + pgfrom = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -475,18 +422,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - 
base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -507,122 +478,367 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } -/** - * xdr_shrink_bufhead - * @buf: xdr_buf - * @len: bytes to remove from buf->head[0] - * - * Shrinks XDR buffer's header kvec buf->head[0] by - * 'len' bytes. The extra data is not lost, but is instead - * moved into the inlined pages and/or the tail. 
- */ -static unsigned int -xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +static unsigned int xdr_buf_pages_fill_sparse(const struct xdr_buf *buf, + unsigned int buflen, gfp_t gfp) { - struct kvec *head, *tail; - size_t copy, offs; - unsigned int pglen = buf->page_len; - unsigned int result; + unsigned int i, npages, pagelen; - result = 0; - tail = buf->tail; - head = buf->head; + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return buflen; + if (buflen <= buf->head->iov_len) + return buflen; + pagelen = buflen - buf->head->iov_len; + if (pagelen > buf->page_len) + pagelen = buf->page_len; + npages = (pagelen + buf->page_base + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < npages; i++) { + if (!buf->pages[i]) + continue; + buf->pages[i] = alloc_page(gfp); + if (likely(buf->pages[i])) + continue; + buflen -= pagelen; + pagelen = i << PAGE_SHIFT; + if (pagelen > buf->page_base) + buflen += pagelen - buf->page_base; + break; + } + return buflen; +} - WARN_ON_ONCE(len > head->iov_len); - if (len > head->iov_len) - len = head->iov_len; +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space, newlen; - /* Shift the tail first */ - if (tail->iov_len != 0) { - if (tail->iov_len > len) { - copy = tail->iov_len - len; - memmove((char *)tail->iov_base + len, - tail->iov_base, copy); - result += copy; + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + newlen = xdr_buf_pages_fill_sparse(buf, buf->len + free_space, + GFP_KERNEL); + free_space = newlen - buf->len; + buf->len = newlen; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct 
xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_head_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0, pgto = 0; + unsigned int talen = 0, tato = 0; + + if (base >= head->iov_len) + return; + if (len > head->iov_len - base) + len = head->iov_len - base; + if (to >= buf->page_len + head->iov_len) { + tato = to - buf->page_len - head->iov_len; + talen = len; + } else if (to >= head->iov_len) { + pgto = to - head->iov_len; + pglen = len; + if (pgto + pglen > buf->page_len) { + talen = pgto + pglen - buf->page_len; + pglen -= talen; } - /* Copy from the inlined pages into the tail */ - copy = len; - if (copy 
> pglen) - copy = pglen; - offs = len - copy; - if (offs >= tail->iov_len) - copy = 0; - else if (copy > tail->iov_len - offs) - copy = tail->iov_len - offs; - if (copy != 0) { - _copy_from_pages((char *)tail->iov_base + offs, - buf->pages, - buf->page_base + pglen + offs - len, - copy); - result += copy; - } - /* Do we also need to copy data from the head into the tail ? */ - if (len > pglen) { - offs = copy = len - pglen; - if (copy > tail->iov_len) - copy = tail->iov_len; - memcpy(tail->iov_base, - (char *)head->iov_base + - head->iov_len - offs, - copy); - result += copy; + } else { + pglen = len - to; + if (pglen > buf->page_len) { + talen = pglen - buf->page_len; + pglen = buf->page_len; } } - /* Now handle pages */ - if (pglen != 0) { - if (pglen > len) - _shift_data_right_pages(buf->pages, - buf->page_base + len, - buf->page_base, - pglen - len); - copy = len; - if (len > pglen) - copy = pglen; - _copy_to_pages(buf->pages, buf->page_base, - (char *)head->iov_base + head->iov_len - len, - copy); - result += copy; - } - head->iov_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - return result; + len -= talen; + base += len; + if (talen + tato > tail->iov_len) + talen = tail->iov_len > tato ? 
tail->iov_len - tato : 0; + memcpy(tail->iov_base + tato, head->iov_base + base, talen); + + len -= pglen; + base -= pglen; + _copy_to_pages(buf->pages, buf->page_base + pgto, head->iov_base + base, + pglen); + + base -= len; + memmove(head->iov_base + to, head->iov_base + base, len); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + +static void xdr_buf_head_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + + if (!shift) + return; + if (base >= head->iov_len) { + xdr_buf_pages_shift_right(buf, head->iov_len - base, len, + shift); + return; + } + if (base + len > head->iov_len) + xdr_buf_pages_shift_right(buf, 0, base + len - head->iov_len, + shift); + xdr_buf_head_copy_right(buf, base, len, shift); +} + +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + 
buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + 
return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_bufhead * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new length of buf->head[0] + * + * Shrinks XDR buffer's header kvec buf->head[0], setting it to + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. + */ +static unsigned int xdr_shrink_bufhead(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + unsigned int shift, buflen = max(buf->len, len); + + WARN_ON_ONCE(len > head->iov_len); + if (head->iov_len > buflen) { + buf->buflen -= head->iov_len - buflen; + head->iov_len = buflen; + } + if (len >= head->iov_len) + return 0; + shift = head->iov_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_head_shift_right(buf, len, buflen - len, shift); + head->iov_len = len; + buf->buflen -= shift; + buf->len -= shift; + return shift; +} + +/** + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes + * @buf: xdr_buf + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? 
*/ - if (buf->len > buf->buflen) - buf->len = buf->buflen; - - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void xdr_shift_buf(struct xdr_buf *buf, size_t len) { - xdr_shrink_bufhead(buf, len); + xdr_shrink_bufhead(buf, buf->head->iov_len - len); } EXPORT_SYMBOL_GPL(xdr_shift_buf); @@ -636,6 +852,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -969,19 +1197,31 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b } EXPORT_SYMBOL_GPL(xdr_write_pages); -static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, - unsigned int len) +static unsigned int xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov, + unsigned int base, unsigned int len) { if (len > iov->iov_len) len = iov->iov_len; - xdr->p = (__be32*)iov->iov_base; + if (unlikely(base > len)) + base = len; + xdr->p = (__be32*)(iov->iov_base + base); xdr->end = (__be32*)(iov->iov_base + len); xdr->iov = iov; xdr->page_ptr = NULL; + return len - base; } -static int xdr_set_page_base(struct xdr_stream *xdr, - unsigned int base, unsigned int len) +static 
unsigned int xdr_set_tail_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + + xdr_stream_set_pos(xdr, base + buf->page_len + buf->head->iov_len); + return xdr_set_iov(xdr, buf->tail, base, len); +} + +static unsigned int xdr_set_page_base(struct xdr_stream *xdr, + unsigned int base, unsigned int len) { unsigned int pgnr; unsigned int maxlen; @@ -990,12 +1230,15 @@ static int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) - return -EINVAL; - maxlen -= base; + if (base >= maxlen) { + base = maxlen; + maxlen = 0; + } else + maxlen -= base; if (len > maxlen) len = maxlen; + xdr_stream_page_set_pos(xdr, base); base += xdr->buf->page_base; pgnr = base >> PAGE_SHIFT; @@ -1010,14 +1253,16 @@ static int xdr_set_page_base(struct xdr_stream *xdr, pgend = PAGE_SIZE; xdr->end = (__be32*)(kaddr + pgend); xdr->iov = NULL; - return 0; + return len; } static void xdr_set_page(struct xdr_stream *xdr, unsigned int base, unsigned int len) { - if (xdr_set_page_base(xdr, base, len) < 0) - xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2); + if (xdr_set_page_base(xdr, base, len) == 0) { + base -= xdr->buf->page_len; + xdr_set_tail_base(xdr, base, len); + } } static void xdr_set_next_page(struct xdr_stream *xdr) @@ -1026,17 +1271,18 @@ static void xdr_set_next_page(struct xdr_stream *xdr) newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT; newbase -= xdr->buf->page_base; - - xdr_set_page(xdr, newbase, PAGE_SIZE); + if (newbase < xdr->buf->page_len) + xdr_set_page_base(xdr, newbase, xdr_stream_remaining(xdr)); + else + xdr_set_tail_base(xdr, 0, xdr_stream_remaining(xdr)); } static bool xdr_set_next_buffer(struct xdr_stream *xdr) { if (xdr->page_ptr != NULL) xdr_set_next_page(xdr); - else if (xdr->iov == xdr->buf->head) { - xdr_set_page(xdr, 0, PAGE_SIZE); - } + else if (xdr->iov == xdr->buf->head) + xdr_set_page(xdr, 0, xdr_stream_remaining(xdr)); return xdr->p != 
xdr->end; } @@ -1053,12 +1299,9 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, xdr->buf = buf; xdr_reset_scratch_buffer(xdr); xdr->nwords = XDR_QUADLEN(buf->len); - if (buf->head[0].iov_len != 0) - xdr_set_iov(xdr, buf->head, buf->len); - else if (buf->page_len != 0) - xdr_set_page_base(xdr, 0, buf->len); - else - xdr_set_iov(xdr, buf->head, buf->len); + if (xdr_set_iov(xdr, buf->head, 0, buf->len) == 0 && + xdr_set_page_base(xdr, 0, buf->len) == 0) + xdr_set_iov(xdr, buf->tail, 0, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; @@ -1158,14 +1401,13 @@ static void xdr_realign_pages(struct xdr_stream *xdr) struct xdr_buf *buf = xdr->buf; struct kvec *iov = buf->head; unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; /* Realign pages to current pointer position */ if (iov->iov_len > cur) { - offset = iov->iov_len - cur; - copied = xdr_shrink_bufhead(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_bufhead(buf, cur); + trace_rpc_xdr_alignment(xdr, cur, copied); + xdr_set_page(xdr, 0, buf->page_len); } } @@ -1173,8 +1415,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) { struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); - unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1188,125 +1429,103 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); - xdr->nwords = XDR_QUADLEN(buf->len - cur); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); } return 
len; } /** - * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * xdr_read_pages - align page-based XDR data to current pointer position * @xdr: pointer to xdr_stream struct * @len: number of bytes of page data * * Moves data beyond the current pointer position from the XDR head[] buffer - * into the page list. Any data that lies beyond current position + "len" - * bytes is moved into the XDR tail[]. + * into the page list. Any data that lies beyond current position + @len + * bytes is moved into the XDR tail[]. The xdr_stream current position is + * then advanced past that data to align to the next XDR object in the tail. * * Returns the number of XDR encoded bytes now contained in the pages */ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) { - struct xdr_buf *buf = xdr->buf; - struct kvec *iov; - unsigned int nwords; - unsigned int end; - unsigned int padding; + unsigned int nwords = XDR_QUADLEN(len); + unsigned int base, end, pglen; - len = xdr_align_pages(xdr, len); - if (len == 0) + pglen = xdr_align_pages(xdr, nwords << 2); + if (pglen == 0) return 0; - nwords = XDR_QUADLEN(len); - padding = (nwords << 2) - len; - xdr->iov = iov = buf->tail; - /* Compute remaining message length. */ - end = ((xdr->nwords - nwords) << 2) + padding; - if (end > iov->iov_len) - end = iov->iov_len; - /* - * Position current pointer at beginning of tail, and - * set remaining message length. - */ - xdr->p = (__be32 *)((char *)iov->iov_base + padding); - xdr->end = (__be32 *)((char *)iov->iov_base + end); - xdr->page_ptr = NULL; - xdr->nwords = XDR_QUADLEN(end - padding); - return len; + base = (nwords << 2) - pglen; + end = xdr_stream_remaining(xdr) - pglen; + + xdr_set_tail_base(xdr, base, end); + return len <= pglen ? 
len : pglen; } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; - if (length < bytes) - bytes = length; + + if (from >= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; - xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, PAGE_SIZE); + xdr->buf->len -= shift; + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned 
int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr->nwords << 2; + to = xdr_align_size(offset + length); - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } + /* Could the hole be behind us? */ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); - - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, PAGE_SIZE); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); @@ -1335,8 +1554,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1493,7 +1711,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1502,8 +1721,7 @@ static void 
__read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); + _copy_from_pages(obj, subbuf->pages, subbuf->page_base, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1511,7 +1729,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1524,7 +1743,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1533,8 +1753,7 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->page_len); - if (this_len) - _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); + _copy_to_pages(subbuf->pages, subbuf->page_base, obj, this_len); len -= this_len; obj += this_len; this_len = min_t(unsigned int, len, subbuf->tail[0].iov_len); @@ -1542,7 +1761,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned 
int len) { struct xdr_buf subbuf; int status; @@ -1555,8 +1775,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1569,8 +1788,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1579,9 +1797,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1773,9 +1990,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1784,9 +2000,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1796,9 +2011,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); 
-int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; @@ -1966,10 +2181,8 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); if (ret > 0) { - char *s = kmalloc(ret + 1, gfp_flags); + char *s = kmemdup_nul(p, ret, gfp_flags); if (s != NULL) { - memcpy(s, p, ret); - s[ret] = '\0'; *str = s; return strlen(s); } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,33 +151,94 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); -/** - * xprt_load_transport - load a transport implementation - * @transport_name: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *transport_name) +static void +xprt_class_release(const struct xprt_class *t) { - struct xprt_class *t; - int result; + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_ident_locked(int ident) +{ + const struct xprt_class *t; - result = 0; - spin_lock(&xprt_list_lock); list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; + if (t->ident != ident) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_ident(int ident) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_ident_locked(ident); + spin_unlock(&xprt_list_lock); + return t; +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const 
char *netid) +{ + const struct xprt_class *t; + unsigned int i; + + list_for_each_entry(t, &xprt_list, list) { + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; } } - spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + return NULL; } -EXPORT_SYMBOL_GPL(xprt_load_transport); + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } + spin_unlock(&xprt_list_lock); + return t; +} + +/** + * xprt_find_transport_ident - convert a netid into a transport identifier + * @netid: transport to load + * + * Returns: + * > 0: transport identifier + * -ENOENT: transport module not available + */ +int xprt_find_transport_ident(const char *netid) +{ + const struct xprt_class *t; + int ret; + + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + ret = t->ident; + xprt_class_release(t); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); static void xprt_clear_locked(struct rpc_xprt *xprt) { @@ -1896,21 +1957,17 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { struct rpc_xprt *xprt; - struct xprt_class *t; + const struct xprt_class *t; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (t->ident == args->ident) { - spin_unlock(&xprt_list_lock); - goto found; - } + t = xprt_class_find_by_ident(args->ident); + if (!t) { + dprintk("RPC: transport (%d) not supported\n", args->ident); + return ERR_PTR(-EIO); } - spin_unlock(&xprt_list_lock); - dprintk("RPC: transport (%d) not supported\n", args->ident); - 
return ERR_PTR(-EIO); -found: xprt = t->setup(args); + xprt_class_release(t); + if (IS_ERR(xprt)) goto out; if (args->flags & XPRT_CREATE_NO_IDLE_TIMEOUT) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index c92c1aac270a..946edf2db646 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2020, Oracle and/or its affiliates. * * Support for backward direction RPCs on RPC/RDMA. */ @@ -82,7 +82,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; - trace_xprtrdma_cb_reply(rqst); + trace_xprtrdma_cb_reply(r_xprt, rqst); return 0; } @@ -260,7 +260,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, */ req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - trace_xprtrdma_cb_call(rqst); + trace_xprtrdma_cb_call(r_xprt, rqst); /* Queue rqst for ULP's callback service */ bc_serv = xprt->bc_serv; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 44888f5badef..baca49fe83af 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -65,18 +65,23 @@ void frwr_release_mr(struct rpcrdma_mr *mr) kfree(mr); } +static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) +{ + if (mr->mr_device) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + mr->mr_device = NULL; + } +} + static void frwr_mr_recycle(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } + frwr_mr_unmap(r_xprt, mr); spin_lock(&r_xprt->rx_buf.rb_lock); list_del(&mr->mr_all); @@ -86,6 +91,16 @@ static 
void frwr_mr_recycle(struct rpcrdma_mr *mr) frwr_release_mr(mr); } +static void frwr_mr_put(struct rpcrdma_mr *mr) +{ + frwr_mr_unmap(mr->mr_xprt, mr); + + /* The MR is returned to the req's MR free list instead + * of to the xprt's MR free list. No spinlock is needed. + */ + rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); +} + /* frwr_reset - Place MRs back on the free list * @req: request to reset * @@ -101,7 +116,7 @@ void frwr_reset(struct rpcrdma_req *req) struct rpcrdma_mr *mr; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -130,7 +145,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; - mr->mr_dir = DMA_NONE; + mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); @@ -315,6 +330,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; + mr->mr_device = ep->re_id->device; ibmr = mr->frwr.fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); @@ -341,7 +357,6 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, return seg; out_dmamap_err: - mr->mr_dir = DMA_NONE; trace_xprtrdma_frwr_sgerr(mr, i); return ERR_PTR(-EIO); @@ -363,12 +378,21 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct rpcrdma_frwr, fr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, frwr); + trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_frwr *frwr) +{ + struct rpc_rdma_cid *cid = &frwr->fr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = frwr->fr_mr->res.id; +} + /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: 
controlling transport instance @@ -385,6 +409,7 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -395,6 +420,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; + frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -404,7 +430,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); + return ib_post_send(ep->re_id->qp, post_wr, NULL); } /** @@ -420,18 +446,17 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_reminv(mr); - rpcrdma_mr_put(mr); + frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } } -static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) +static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { if (wc->status != IB_WC_SUCCESS) frwr_mr_recycle(mr); else - rpcrdma_mr_put(mr); + frwr_mr_put(mr); } /** @@ -448,8 +473,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -469,8 +494,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - 
trace_xprtrdma_wc_li_wake(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -490,6 +515,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, **prev, *last; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -509,6 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -534,7 +561,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -547,7 +574,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); @@ -574,10 +601,10 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep = mr->mr_req->rl_reply; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, frwr); - __frwr_release_mr(wc, mr); + trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + frwr_mr_done(wc, mr); - /* Ensure @rep is generated before __frwr_release_mr */ + /* Ensure @rep is generated before frwr_mr_done */ smp_rmb(); rpcrdma_complete_rqst(rep); @@ -597,6 +624,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; + struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; @@ -614,6 +642,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; + frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -639,13 +668,13 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, &bad_wr); if (!rc) return; /* Recycle MRs in the LOCAL_INV chain that did not get posted. 
*/ - trace_xprtrdma_post_linv(req, rc); + trace_xprtrdma_post_linv_err(req, rc); while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 0f5120c7668f..8f5d0cb68360 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2014-2017 Oracle. All rights reserved. + * Copyright (c) 2014-2020, Oracle and/or its affiliates. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -179,6 +179,31 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, r_xprt->rx_ep->re_max_inline_recv; } +/* ACL likes to be lazy in allocating pages. For TCP, these + * pages can be allocated during receive processing. Not true + * for RDMA, which must always provision receive buffers + * up front. + */ +static noinline int +rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) +{ + struct page **ppages; + int len; + + len = buf->page_len; + ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); + while (len > 0) { + if (!*ppages) + *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + if (!*ppages) + return -ENOBUFS; + ppages++; + len -= PAGE_SIZE; + } + + return 0; +} + /* Split @vec on page boundaries into SGEs. FMR registers pages, not * a byte range. Other modes coalesce these SGEs into a single MR * when they can. 
@@ -233,15 +258,6 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. - */ - if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { - if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); - if (!*ppages) - return -ENOBUFS; - } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); @@ -315,7 +331,6 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, *mr = rpcrdma_mr_get(r_xprt); if (!*mr) goto out_getmr_err; - trace_xprtrdma_mr_get(req); (*mr)->mr_req = req; } @@ -323,7 +338,7 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt, return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr); out_getmr_err: - trace_xprtrdma_nomrs(req); + trace_xprtrdma_nomrs_err(r_xprt, req); xprt_wait_for_buffer_space(&r_xprt->rx_xprt); rpcrdma_mrs_refresh(r_xprt); return ERR_PTR(-EAGAIN); @@ -867,6 +882,12 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) __be32 *p; int ret; + if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) { + ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf); + if (ret) + return ret; + } + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf), rqst); @@ -1322,20 +1343,13 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %s: server reports " - "version error (%u-%u), xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1)), - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_vers(rqst, p, p + 1); break; case err_chunk: - dprintk("RPC: %s: server reports " - "header decoding error, xid %08x\n", 
__func__, - be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_chunk(rqst); break; default: - dprintk("RPC: %s: server reports " - "unrecognized error %d, xid %08x\n", __func__, - be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); + trace_xprtrdma_err_unrecognized(rqst, p); } return -EIO; @@ -1376,7 +1390,7 @@ out: return; out_badheader: - trace_xprtrdma_reply_hdr(rep); + trace_xprtrdma_reply_hdr_err(rep); r_xprt->rx_stats.bad_reply_count++; rqst->rq_task->tk_status = status; status = 0; @@ -1450,14 +1464,12 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + if (unlikely(req->rl_reply)) rpcrdma_recv_buffer_put(req->rl_reply); - } req->rl_reply = rep; rep->rr_rqst = rqst; - trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + trace_xprtrdma_reply(rqst->rq_task, rep, credits); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) frwr_reminv(rep, &req->rl_registered); @@ -1469,16 +1481,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) return; out_badversion: - trace_xprtrdma_reply_vers(rep); + trace_xprtrdma_reply_vers_err(rep); goto out; out_norqst: spin_unlock(&xprt->queue_lock); - trace_xprtrdma_reply_rqst(rep); + trace_xprtrdma_reply_rqst_err(rep); goto out; out_shortreply: - trace_xprtrdma_reply_short(rep); + trace_xprtrdma_reply_short_err(rep); out: rpcrdma_recv_buffer_put(rep); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..78d29d1bcc20 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -599,11 +599,12 @@ static void xprt_rdma_free(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - if (!list_empty(&req->rl_registered)) - frwr_unmap_sync(r_xprt, req); + if (unlikely(!list_empty(&req->rl_registered))) { + 
trace_xprtrdma_mrs_zap(task); + frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req); + } /* XXX: If the RPC is completing because of a signal and * not because a reply was received, we ought to ensure @@ -768,6 +769,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ad6e2e4994ce..ec912cf9c618 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -167,7 +167,7 @@ static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_send(sc, wc); + trace_xprtrdma_wc_send(wc, &sc->sc_cid); rpcrdma_sendctx_put_locked(r_xprt, sc); rpcrdma_flush_disconnect(r_xprt, wc); } @@ -186,7 +186,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_receive(wc); + trace_xprtrdma_wc_receive(wc, &rep->rr_cid); --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -643,6 +643,9 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) return NULL; sc->sc_cqe.done = rpcrdma_wc_send; + sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; + sc->sc_cid.ci_completion_id = + atomic_inc_return(&ep->re_completion_ids); return sc; } @@ -972,6 +975,9 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) goto out_free_regbuf; + rep->rr_cid.ci_completion_id = + atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); + xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; @@ -1178,25 +1184,6 @@ 
rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) return mr; } -/** - * rpcrdma_mr_put - DMA unmap an MR and release it - * @mr: MR to release - * - */ -void rpcrdma_mr_put(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - if (mr->mr_dir != DMA_NONE) { - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - mr->mr_dir = DMA_NONE; - } - - rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs); -} - /** * rpcrdma_buffer_get - Get a request buffer * @buffers: Buffer pool from which to obtain a buffer @@ -1411,6 +1398,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!rep) break; + rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; trace_xprtrdma_post_recv(rep); rep->rr_recv_wr.next = wr; wr = &rep->rr_recv_wr; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 43974ef39a50..94b28657aeeb 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -53,6 +53,7 @@ #include /* RDMA verbs api */ #include /* rpc_xprt */ +#include /* completion IDs */ #include /* RPC/RDMA protocol */ #include /* xprt parameters */ @@ -93,6 +94,8 @@ struct rpcrdma_ep { unsigned int re_max_requests; /* depends on device */ unsigned int re_inline_send; /* negotiated */ unsigned int re_inline_recv; /* negotiated */ + + atomic_t re_completion_ids; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -180,6 +183,8 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; + struct rpc_rdma_cid rr_cid; + __be32 rr_xid; __be32 rr_vers; __be32 rr_proc; @@ -211,6 +216,7 @@ enum { struct rpcrdma_req; struct rpcrdma_sendctx { struct ib_cqe sc_cqe; + struct rpc_rdma_cid sc_cid; struct rpcrdma_req *sc_req; unsigned int sc_unmap_count; struct ib_sge sc_sges[]; @@ -225,6 +231,7 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; + struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union 
{ struct ib_reg_wr fr_regwr; @@ -236,6 +243,7 @@ struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; @@ -466,7 +474,6 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); -void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c56a66cdf4ac 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -433,7 +433,8 @@ xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, if (ret <= 0) goto sock_err; xs_flush_bvec(buf->bvec, ret, seek + buf->page_base); - offset += ret - buf->page_base; + ret -= buf->page_base; + offset += ret; if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) goto out; if (ret != want) @@ -3059,6 +3060,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3069,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3078,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3087,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /**