From 667da76b1c8c506d067d96dccc9574ba26a8b910 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 15 Jan 2022 08:45:05 +0100 Subject: [PATCH 001/186] IB/mthca: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32 bits case will also fail for the same reason. Simplify code and remove some dead code accordingly. [1]: https://lore.kernel.org/r/YL3vSPK5DXTNvgdx@infradead.org Link: https://lore.kernel.org/r/03c66fe5c2a81dbb29349ebf9af631e5ea216ec4.1642232675.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_main.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index f507c4cd46d3..b54bc8865dae 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -939,12 +939,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (err) { - dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n"); - err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (err) { - dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_free_res; - } + dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); + goto err_free_res; } /* We can handle large RDMA requests, so allow larger segments. */ From ac491992f3705178e18eb185f392bf3fa2f6f54d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 9 Jan 2022 09:35:18 +0100 Subject: [PATCH 002/186] RDMA/pvrdma: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32 bits case will also fail for the same reason. Simplify code and remove some dead code accordingly. [1]: https://lore.kernel.org/r/YL3vSPK5DXTNvgdx@infradead.org Link: https://lore.kernel.org/r/10c29cb45d14fb8ed89cf1308c4a9a7d445c52eb.1641717275.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 105f3a155939..343288b02792 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -811,12 +811,10 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, } /* Enable 64-Bit DMA */ - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) != 0) { - ret = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); - if (ret != 0) { - dev_err(&pdev->dev, "dma_set_mask failed\n"); - goto err_free_resource; - } + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(&pdev->dev, "dma_set_mask failed\n"); + goto err_free_resource; } dma_set_max_seg_size(&pdev->dev, UINT_MAX); pci_set_master(pdev); From 44c3aa585cf8ab57b018fe75f6fb64a72a0a9932 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:47:49 +0100 Subject: [PATCH 003/186] RDMA/rtrs: Remove empty line after bracket Fix: CHECK:BRACES: Blank lines aren't necessary after an open brace '{' Link: https://lore.kernel.org/r/20220114154753.983568-2-haris.iqbal@ionos.com Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4da889103a5f..60fa0b0160f4 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -479,7 +479,6 @@ static int rtrs_str_to_sockaddr(const char *addr, size_t len, */ int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len) { - switch (addr->sa_family) { case AF_IB: return scnprintf(buf, len, "gid:%pI6", From b73627eaf43522f6618bc7e72b283d309bf00a0e Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:47:51 +0100 Subject: [PATCH 004/186] RDMA/rtrs-clt: Reflow text so lines don't end with a '(' Fix: CHECK:OPEN_ENDED_LINE: Lines should not end with a '(' Link: https://lore.kernel.org/r/20220114154753.983568-4-haris.iqbal@ionos.com Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c index b4fa473b7888..d3c436ead694 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c @@ -156,8 +156,7 @@ static DEVICE_ATTR_RW(mpath_policy); static ssize_t add_path_show(struct device *dev, struct device_attribute *attr, char *page) { - return sysfs_emit( - page, + return sysfs_emit(page, "Usage: echo [@] > %s\n\n*addr ::= [ ip: | gid: ]\n", attr->attr.name); } From b962fee5c2665d05ba3e2068756609d40dd312ea Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 14 Jan 2022 16:47:52 +0100 Subject: [PATCH 005/186] RDMA/rtrs-clt: Update one outdated comment in path_it_deinit() The skip_list is used for both MIN_INFLIGHT and MIN_LATENCY. Link: https://lore.kernel.org/r/20220114154753.983568-5-haris.iqbal@ionos.com Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 7c3f98e57889..a86616246586 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -917,7 +917,7 @@ static inline void path_it_deinit(struct path_it *it) { struct list_head *skip, *tmp; /* - * The skip_list is used only for the MIN_INFLIGHT policy. + * The skip_list is used only for the MIN_INFLIGHT and MIN_LATENCY policies. * We need to remove paths from it, so that next IO can insert * paths (->mp_skip_entry) into a skip_list again. */ From c1289d5d8502d62e5bc50ff066c9d6daabfc3264 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 14 Jan 2022 16:47:53 +0100 Subject: [PATCH 006/186] RDMA/rtrs-clt: Do stop and failover outside reconnect work. We can't do instant reconnect, not to DDoS server, but we should stop and failover earlier, so there is less service interruption. To avoid deadlock, as error_recovery is called from different callback like rdma event or hb error handler, add a new err recovery_work. Link: https://lore.kernel.org/r/20220114154753.983568-6-haris.iqbal@ionos.com Signed-off-by: Jack Wang Reviewed-by: Aleksei Marov Reviewed-by: Md Haris Iqbal Signed-off-by: Md Haris Iqbal Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 40 ++++++++++++++------------ drivers/infiniband/ulp/rtrs/rtrs-clt.h | 1 + 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index a86616246586..b696aa4abae4 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -297,6 +297,7 @@ static bool rtrs_clt_change_state_from_to(struct rtrs_clt_path *clt_path, return changed; } +static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_path *clt_path); static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con) { struct rtrs_clt_path *clt_path = to_clt_path(con->c.path); @@ -304,16 +305,7 @@ static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con) if (rtrs_clt_change_state_from_to(clt_path, RTRS_CLT_CONNECTED, RTRS_CLT_RECONNECTING)) { - struct rtrs_clt_sess *clt = clt_path->clt; - unsigned int delay_ms; - - /* - * Normal scenario, reconnect if we were successfully connected - */ - delay_ms = clt->reconnect_delay_sec * 1000; - queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, - msecs_to_jiffies(delay_ms + - prandom_u32() % RTRS_RECONNECT_SEED)); + queue_work(rtrs_wq, &clt_path->err_recovery_work); } else { /* * Error can happen just on establishing new connection, @@ -1511,6 +1503,22 @@ static void rtrs_clt_init_hb(struct rtrs_clt_path *clt_path) static void rtrs_clt_reconnect_work(struct work_struct *work); static void rtrs_clt_close_work(struct work_struct *work); +static void rtrs_clt_err_recovery_work(struct work_struct *work) +{ + struct rtrs_clt_path *clt_path; + struct rtrs_clt_sess *clt; + int delay_ms; + + clt_path = container_of(work, struct rtrs_clt_path, err_recovery_work); + clt = clt_path->clt; + delay_ms = clt->reconnect_delay_sec * 1000; + rtrs_clt_stop_and_destroy_conns(clt_path); + queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, + msecs_to_jiffies(delay_ms + + prandom_u32() % + RTRS_RECONNECT_SEED)); +} + static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt, const struct rtrs_addr *path, size_t con_num, u32 nr_poll_queues) @@ -1562,6 +1570,7 @@ static struct rtrs_clt_path *alloc_path(struct rtrs_clt_sess *clt, clt_path->state = RTRS_CLT_CONNECTING; atomic_set(&clt_path->connected_cnt, 0); INIT_WORK(&clt_path->close_work, rtrs_clt_close_work); + INIT_WORK(&clt_path->err_recovery_work, rtrs_clt_err_recovery_work); INIT_DELAYED_WORK(&clt_path->reconnect_dwork, rtrs_clt_reconnect_work); rtrs_clt_init_hb(clt_path); @@ -2326,6 +2335,7 @@ static void rtrs_clt_close_work(struct work_struct *work) clt_path = container_of(work, struct rtrs_clt_path, close_work); + cancel_work_sync(&clt_path->err_recovery_work); cancel_delayed_work_sync(&clt_path->reconnect_dwork); rtrs_clt_stop_and_destroy_conns(clt_path); rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CLOSED, NULL); @@ -2638,7 +2648,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work) { struct rtrs_clt_path *clt_path; struct rtrs_clt_sess *clt; - unsigned int delay_ms; int err; clt_path = container_of(to_delayed_work(work), struct rtrs_clt_path, @@ -2655,8 +2664,6 @@ static void rtrs_clt_reconnect_work(struct work_struct *work) } clt_path->reconnect_attempts++; - /* Stop everything */ - rtrs_clt_stop_and_destroy_conns(clt_path); msleep(RTRS_RECONNECT_BACKOFF); if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_CONNECTING, NULL)) { err = init_path(clt_path); @@ -2669,11 +2676,7 @@ static void rtrs_clt_reconnect_work(struct work_struct *work) reconnect_again: if (rtrs_clt_change_state_get_old(clt_path, RTRS_CLT_RECONNECTING, NULL)) { clt_path->stats->reconnects.fail_cnt++; - delay_ms = clt->reconnect_delay_sec * 1000; - queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, - msecs_to_jiffies(delay_ms + - prandom_u32() % - RTRS_RECONNECT_SEED)); + queue_work(rtrs_wq, &clt_path->err_recovery_work); } } @@ -2905,6 +2908,7 @@ int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_path *clt_path) &old_state); if (changed) { clt_path->reconnect_attempts = 0; + rtrs_clt_stop_and_destroy_conns(clt_path); queue_delayed_work(rtrs_wq, &clt_path->reconnect_dwork, 0); } if (changed || old_state == RTRS_CLT_RECONNECTING) { diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index d1b18a154ae0..f848c0392d98 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -134,6 +134,7 @@ struct rtrs_clt_path { struct rtrs_clt_io_req *reqs; struct delayed_work reconnect_dwork; struct work_struct close_work; + struct work_struct err_recovery_work; unsigned int reconnect_attempts; bool established; struct rtrs_rbuf *rbufs; From 32a88d16615c2be295571c29273c4ac94cb75309 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 18 Jan 2022 09:35:02 +0200 Subject: [PATCH 007/186] RDMA/core: Set MR type in ib_reg_user_mr Add missing assignment of MR type to IB_MR_TYPE_USER. Fixes: 33006bd4f37f ("IB/core: Introduce ib_reg_user_mr") Link: https://lore.kernel.org/r/be2e91bcd6e52dc36be289ae92f30d3a5cc6dcb1.1642491047.git.leonro@nvidia.com Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c18634bec212..e821dc94a43e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2153,6 +2153,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return mr; mr->device = pd->device; + mr->type = IB_MR_TYPE_USER; mr->pd = pd; mr->dm = NULL; atomic_inc(&pd->usecnt); From 84aa6c3963b702002b57ba5bdb9110960550a2ec Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:00:48 +0200 Subject: [PATCH 008/186] RDMA/mlx5: Delete get_num_static_uars function There is no need to keep get_num_static_uars in the headers file as it is not shared and used only once. Link: https://lore.kernel.org/r/11d78568c3c6ba588ee8465e0d10d96145fc825c.1642960830.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ------ drivers/infiniband/hw/mlx5/qp.c | 3 ++- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index cbc20e400be0..af68bce056a7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1539,12 +1539,6 @@ static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_suppor MLX5_UARS_IN_PAGE : 1; } -static inline int get_num_static_uars(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi) -{ - return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; -} - extern void *xlt_emergency_page; int bfregn_to_uar_index(struct mlx5_ib_dev *dev, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 29475cf8c7c3..15ef663dfef8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -615,7 +615,8 @@ enum { static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { - return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * + bfregi->num_static_sys_pages * MLX5_NON_FP_BFREGS_PER_UAR; } static int num_med_bfreg(struct mlx5_ib_dev *dev, From bd660922ab612b37f8110e491d8afa02718197df Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:50 +0200 Subject: [PATCH 009/186] RDMA/mlx5: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/3ab153e25c7ea59599022dc7fe3c409fcfe1aac1.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ib_virt.c | 1 - drivers/infiniband/hw/mlx5/mem.c | 1 - drivers/infiniband/hw/mlx5/qp.c | 1 - drivers/infiniband/hw/mlx5/srq.c | 1 - 4 files changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index f2f62875d072..afeb5e53254f 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include "mlx5_ib.h" diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 844545064c9e..6191aa833ac2 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include #include "mlx5_ib.h" diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 15ef663dfef8..c69ead3bb0d3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -31,7 +31,6 @@ */ #include -#include #include #include #include diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 191c4ee7db62..09b365a98bbf 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -3,7 +3,6 @@ * Copyright (c) 2013-2018, Mellanox Technologies inc. All rights reserved. */ -#include #include #include #include From b74525f21e33aba6360978d2af81e920a1806c78 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:51 +0200 Subject: [PATCH 010/186] RDMA/core: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/e412c83b45b6ebdd937886cc9c2cc7c8abcc34fa.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 1 - drivers/infiniband/core/cache.c | 1 - drivers/infiniband/core/cma_configfs.c | 1 - drivers/infiniband/core/cq.c | 1 - drivers/infiniband/core/iwpm_util.h | 1 - drivers/infiniband/core/sa_query.c | 1 - 6 files changed, 6 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 65e3e7df8a4b..f253295795f0 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index f6aa1a964573..4084d05a4510 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -34,7 +34,6 @@ */ #include -#include #include #include #include diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 9ac16e0db761..de8a2d5d741c 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 433b426729d4..a70876a0a231 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -2,7 +2,6 @@ /* * Copyright (c) 2015 HGST, a Western Digital Company. */ -#include #include #include #include diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index 3a42ad43056e..d6fc8402158a 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -33,7 +33,6 @@ #ifndef _IWPM_UTIL_H #define _IWPM_UTIL_H -#include #include #include #include diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 74ecd7456a11..8dc7d1f4b35d 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -32,7 +32,6 @@ * SOFTWARE. */ -#include #include #include #include From 75eeaed44813cd046d0e53a120387a1c786cc20b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:52 +0200 Subject: [PATCH 011/186] RDMA/hfi1: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/c53a257079bc93dac036155b793073939d17ddba.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/affinity.c | 1 - drivers/infiniband/hw/hfi1/debugfs.c | 1 - drivers/infiniband/hw/hfi1/device.c | 1 - drivers/infiniband/hw/hfi1/fault.c | 1 - drivers/infiniband/hw/hfi1/firmware.c | 1 - 5 files changed, 5 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 98c813ba4304..706b3b659713 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -5,7 +5,6 @@ #include #include -#include #include #include diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 22a3cdb940be..80ba1e53c068 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c index 68a184c39941..8ceff7141baf 100644 --- a/drivers/infiniband/hw/hfi1/device.c +++ b/drivers/infiniband/hw/hfi1/device.c @@ -4,7 +4,6 @@ */ #include -#include #include #include diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c index e2e4f9f6fae2..3af77a0840ab 100644 --- a/drivers/infiniband/hw/hfi1/fault.c +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 31e63e245ea9..aa15a5cc7cf3 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -5,7 +5,6 @@ #include #include -#include #include #include From 8a110fc9df0314b33146d05ebdcdc9486ce4a570 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:53 +0200 Subject: [PATCH 012/186] RDMA/mlx4: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/fa91416f41e195f072fcdb8ad0ea085cee006502.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/alias_GUID.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index e2e1f5daddc4..111fa88a3be4 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include From c8e2d59bf4ddebf4ceafd4e902e572a045e6ae10 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:54 +0200 Subject: [PATCH 013/186] RDMA/mthca: Delete useless module.h include There is no need in include of module.h in the following file. Link: https://lore.kernel.org/r/ab856f40804d67905a655bc85e480d96ff66e46e.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_profile.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c index 7ea970774839..69af65f1b332 100644 --- a/drivers/infiniband/hw/mthca/mthca_profile.c +++ b/drivers/infiniband/hw/mthca/mthca_profile.c @@ -31,8 +31,6 @@ * SOFTWARE. */ -#include -#include #include #include From fffa617a0facede2e4cd69cb63c3f58162bffcbc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:55 +0200 Subject: [PATCH 014/186] RDMA/qib: Delete useless module.h include There is no need in include of module.h in the following file. Link: https://lore.kernel.org/r/72ab68466d1d22846f47ac058e332bbe27ce188b.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_fs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index a0c5f3bdc324..a973905afd13 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -32,7 +32,6 @@ * SOFTWARE. */ -#include #include #include #include From cad4c6caadf3f3c044e5ee75fb406df697b464ec Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:56 +0200 Subject: [PATCH 015/186] RDMA/usnic: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/745480bafd6f63c97a7049f34d84ef17dbc167d6.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_debugfs.c | 1 - drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 1 - drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 1 - drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 1 - drivers/infiniband/hw/usnic/usnic_transport.c | 1 - drivers/infiniband/hw/usnic/usnic_vnic.c | 1 - 6 files changed, 6 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c index e5a3f02fb078..10a8cd5ba076 100644 --- a/drivers/infiniband/hw/usnic/usnic_debugfs.c +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -32,7 +32,6 @@ */ #include -#include #include "usnic.h" #include "usnic_log.h" diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c index 3b60fa9cb58d..59bfbfaee325 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -32,7 +32,6 @@ */ #include #include -#include #include #include "usnic_log.h" diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 7d868f033bbf..fdb63a8fb997 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -31,7 +31,6 @@ * */ -#include #include #include diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 5a0e26cd648e..d3a9670bf971 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -30,7 +30,6 @@ * SOFTWARE. * */ -#include #include #include #include diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c index 82dd810bc000..dc37066900a5 100644 --- a/drivers/infiniband/hw/usnic/usnic_transport.c +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -32,7 +32,6 @@ */ #include #include -#include #include #include diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c index ebe08f348453..0c47f73aaed5 100644 --- a/drivers/infiniband/hw/usnic/usnic_vnic.c +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include "usnic_ib.h" From d7b887ab5d42d43404c1990363aac8714d4a5411 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:57 +0200 Subject: [PATCH 016/186] RDMA/rxe: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/8bdb652b01f2316bc57b456fb8c60bfbffe6cc64.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.h | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index fb9066e6f5f0..30fbdf3bc76a 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -12,7 +12,6 @@ #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include diff --git a/drivers/infiniband/sw/rxe/rxe_mmap.c b/drivers/infiniband/sw/rxe/rxe_mmap.c index 035f226af133..9149b6095429 100644 --- a/drivers/infiniband/sw/rxe/rxe_mmap.c +++ b/drivers/infiniband/sw/rxe/rxe_mmap.c @@ -4,7 +4,6 @@ * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ -#include #include #include #include From 163b4c12cd29509cc0971c02b8cc64ce26e22328 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:58 +0200 Subject: [PATCH 017/186] RDMA/ipoib: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/a5acab89181b55fd640369b3829cc49e0320302f.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 1 - drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 5b05cf3837da..ea16ba5d8da6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -32,7 +32,6 @@ #include #include /* For ARPHRD_xxx */ -#include #include #include "ipoib.h" diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 0322dc75396f..4bd161e86f8d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -30,7 +30,6 @@ * SOFTWARE. */ -#include #include #include From f156b944e5f406cdbdca031d4d25d47828b9a97b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:02:59 +0200 Subject: [PATCH 018/186] RDMA/iser: Delete useless module.h include There is no need in include of module.h in the following files. Link: https://lore.kernel.org/r/bdbf940ca5edbbc649153fa15737b779c073c498.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Acked-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_memory.c | 1 - drivers/infiniband/ulp/iser/iser_verbs.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 660982625488..f1fd05d8609d 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -30,7 +30,6 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include #include #include #include diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 8bf87b073d9b..8893bc27d8f5 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -32,7 +32,6 @@ * SOFTWARE. */ #include -#include #include #include From 9b1b61c5fb847f20eb3f5c6d76170c257df6bcf1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jan 2022 20:03:00 +0200 Subject: [PATCH 019/186] RDMA/opa: Delete useless module.h include There is no need in include of module.h in the following file. Link: https://lore.kernel.org/r/9384bc21bc3b60b340e04746568746dd4cdfa468.1642960861.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c index aeff68f582d3..071f35711468 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c @@ -50,7 +50,6 @@ * netdev functionality. */ -#include #include #include From 7df1023970d53e0540b665e5c90b4d4a330a3113 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:30 -0600 Subject: [PATCH 020/186] RDMA/rxe: Move rxe_mcast_add/delete to rxe_mcast.c Move rxe_mcast_add and rxe_mcast_delete from rxe_net.c to rxe_mcast.c, make static and remove declarations from rxe_loc.h. Link: https://lore.kernel.org/r/20220127213755.31697-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 2 -- drivers/infiniband/sw/rxe/rxe_mcast.c | 18 ++++++++++++++++++ drivers/infiniband/sw/rxe/rxe_net.c | 18 ------------------ 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index b1e174afb1d4..bcec33c3c3b7 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -106,8 +106,6 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb); int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct sk_buff *skb); const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); -int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid); -int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid); /* rxe_qp.c */ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index bd1ac88b8700..e5689c161984 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -7,6 +7,24 @@ #include "rxe.h" #include "rxe_loc.h" +static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) +{ + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + + return dev_mc_add(rxe->ndev, ll_addr); +} + +static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) +{ + unsigned char ll_addr[ETH_ALEN]; + + ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); + + return dev_mc_del(rxe->ndev, ll_addr); +} + /* caller should hold mc_grp_pool->pool_lock */ static struct rxe_mc_grp *create_grp(struct rxe_dev *rxe, struct rxe_pool *pool, diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index be72bdbfb4ba..a8cfa7160478 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -20,24 +20,6 @@ static struct rxe_recv_sockets recv_sockets; -int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) -{ - unsigned char ll_addr[ETH_ALEN]; - - ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - - return dev_mc_add(rxe->ndev, ll_addr); -} - -int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) -{ - unsigned char ll_addr[ETH_ALEN]; - - ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - - return dev_mc_del(rxe->ndev, ll_addr); -} - static struct dst_entry *rxe_find_route4(struct net_device *ndev, struct in_addr *saddr, struct in_addr *daddr) From 758c7f1e9cc9f1d1da480dcde34430ae56888c76 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:31 -0600 Subject: [PATCH 021/186] RDMA/rxe: Move rxe_mcast_attach/detach to rxe_mcast.c Move rxe_mcast_attach and rxe_mcast_detach from rxe_verbs.c to rxe_mcast.c, Make non-static and add declarations to rxe_loc.h. Make the subroutines in rxe_mcast.c referenced by these routines static and remove their declarations from rxe_loc.h. Link: https://lore.kernel.org/r/20220127213755.31697-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 12 ++------- drivers/infiniband/sw/rxe/rxe_mcast.c | 36 +++++++++++++++++++++++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 26 ------------------- 3 files changed, 33 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index bcec33c3c3b7..dc606241f0d6 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -40,18 +40,10 @@ void rxe_cq_disable(struct rxe_cq *cq); void rxe_cq_cleanup(struct rxe_pool_elem *arg); /* rxe_mcast.c */ -int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, - struct rxe_mc_grp **grp_p); - -int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - struct rxe_mc_grp *grp); - -int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - union ib_gid *mgid); - void rxe_drop_all_mcast_groups(struct rxe_qp *qp); - void rxe_mc_cleanup(struct rxe_pool_elem *arg); +int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); +int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); /* rxe_mmap.c */ struct rxe_mmap_info { diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index e5689c161984..f86e32f4e77f 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -52,8 +52,8 @@ static struct rxe_mc_grp *create_grp(struct rxe_dev *rxe, return grp; } -int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, - struct rxe_mc_grp **grp_p) +static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mc_grp **grp_p) { int err; struct rxe_mc_grp *grp; @@ -81,7 +81,7 @@ done: return 0; } -int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, +static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_mc_grp *grp) { int err; @@ -125,8 +125,8 @@ out: return err; } -int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - union ib_gid *mgid) +static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid) { struct rxe_mc_grp *grp; struct rxe_mc_elem *elem, *tmp; @@ -194,3 +194,29 @@ void rxe_mc_cleanup(struct rxe_pool_elem *elem) rxe_drop_key(grp); rxe_mcast_delete(rxe, &grp->mgid); } + +int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + int err; + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mc_grp *grp; + + /* takes a ref on grp if successful */ + err = rxe_mcast_get_grp(rxe, mgid, &grp); + if (err) + return err; + + err = rxe_mcast_add_grp_elem(rxe, qp, grp); + + rxe_drop_ref(grp); + return err; +} + +int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) +{ + struct rxe_dev *rxe = to_rdev(ibqp->device); + struct rxe_qp *qp = to_rqp(ibqp); + + return rxe_mcast_drop_grp_elem(rxe, qp, mgid); +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 915ad6664321..f7682541f9af 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -999,32 +999,6 @@ static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, return n; } -static int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) -{ - int err; - struct rxe_dev *rxe = to_rdev(ibqp->device); - struct rxe_qp *qp = to_rqp(ibqp); - struct rxe_mc_grp *grp; - - /* takes a ref on grp if successful */ - err = rxe_mcast_get_grp(rxe, mgid, &grp); - if (err) - return err; - - err = rxe_mcast_add_grp_elem(rxe, qp, grp); - - rxe_drop_ref(grp); - return err; -} - -static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) -{ - struct rxe_dev *rxe = to_rdev(ibqp->device); - struct rxe_qp *qp = to_rqp(ibqp); - - return rxe_mcast_drop_grp_elem(rxe, qp, mgid); -} - static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { From 02e3524474b857d52f9005cd33717670dc966e8b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:32 -0600 Subject: [PATCH 022/186] RDMA/rxe: Rename rxe_mc_grp and rxe_mc_elem Rename rxe_mc_grp to rxe_mcg. Rename rxe_mc_elem to rxe_mca. These can be read 'multicast group' and 'multicast attachment'. 'elem' collided with the use of elem in rxe pools and was a little confusing. Link: https://lore.kernel.org/r/20220127213755.31697-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 26 +++++++++++++------------- drivers/infiniband/sw/rxe/rxe_pool.c | 10 +++++----- drivers/infiniband/sw/rxe/rxe_recv.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index f86e32f4e77f..949784198d80 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -26,12 +26,12 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) } /* caller should hold mc_grp_pool->pool_lock */ -static struct rxe_mc_grp *create_grp(struct rxe_dev *rxe, +static struct rxe_mcg *create_grp(struct rxe_dev *rxe, struct rxe_pool *pool, union ib_gid *mgid) { int err; - struct rxe_mc_grp *grp; + struct rxe_mcg *grp; grp = rxe_alloc_locked(&rxe->mc_grp_pool); if (!grp) @@ -53,10 +53,10 @@ static struct rxe_mc_grp *create_grp(struct rxe_dev *rxe, } static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, - struct rxe_mc_grp **grp_p) + struct rxe_mcg **grp_p) { int err; - struct rxe_mc_grp *grp; + struct rxe_mcg *grp; struct rxe_pool *pool = &rxe->mc_grp_pool; if (rxe->attr.max_mcast_qp_attach == 0) @@ -82,10 +82,10 @@ done: } static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - struct rxe_mc_grp *grp) + struct rxe_mcg *grp) { int err; - struct rxe_mc_elem *elem; + struct rxe_mca *elem; /* check to see of the qp is already a member of the group */ spin_lock_bh(&qp->grp_lock); @@ -128,8 +128,8 @@ out: static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, union ib_gid *mgid) { - struct rxe_mc_grp *grp; - struct rxe_mc_elem *elem, *tmp; + struct rxe_mcg *grp; + struct rxe_mca *elem, *tmp; grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); if (!grp) @@ -162,8 +162,8 @@ err1: void rxe_drop_all_mcast_groups(struct rxe_qp *qp) { - struct rxe_mc_grp *grp; - struct rxe_mc_elem *elem; + struct rxe_mcg *grp; + struct rxe_mca *elem; while (1) { spin_lock_bh(&qp->grp_lock); @@ -171,7 +171,7 @@ void rxe_drop_all_mcast_groups(struct rxe_qp *qp) spin_unlock_bh(&qp->grp_lock); break; } - elem = list_first_entry(&qp->grp_list, struct rxe_mc_elem, + elem = list_first_entry(&qp->grp_list, struct rxe_mca, grp_list); list_del(&elem->grp_list); spin_unlock_bh(&qp->grp_lock); @@ -188,7 +188,7 @@ void rxe_drop_all_mcast_groups(struct rxe_qp *qp) void rxe_mc_cleanup(struct rxe_pool_elem *elem) { - struct rxe_mc_grp *grp = container_of(elem, typeof(*grp), elem); + struct rxe_mcg *grp = container_of(elem, typeof(*grp), elem); struct rxe_dev *rxe = grp->rxe; rxe_drop_key(grp); @@ -200,7 +200,7 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) int err; struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); - struct rxe_mc_grp *grp; + struct rxe_mcg *grp; /* takes a ref on grp if successful */ err = rxe_mcast_get_grp(rxe, mgid, &grp); diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 4cb003885e00..63c594173565 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -83,17 +83,17 @@ static const struct rxe_type_info { }, [RXE_TYPE_MC_GRP] = { .name = "rxe-mc_grp", - .size = sizeof(struct rxe_mc_grp), - .elem_offset = offsetof(struct rxe_mc_grp, elem), + .size = sizeof(struct rxe_mcg), + .elem_offset = offsetof(struct rxe_mcg, elem), .cleanup = rxe_mc_cleanup, .flags = RXE_POOL_KEY, - .key_offset = offsetof(struct rxe_mc_grp, mgid), + .key_offset = offsetof(struct rxe_mcg, mgid), .key_size = sizeof(union ib_gid), }, [RXE_TYPE_MC_ELEM] = { .name = "rxe-mc_elem", - .size = sizeof(struct rxe_mc_elem), - .elem_offset = offsetof(struct rxe_mc_elem, elem), + .size = sizeof(struct rxe_mca), + .elem_offset = offsetof(struct rxe_mca, elem), }, }; diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 6a6cc1fa90e4..7ff6b53555f4 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -233,8 +233,8 @@ static inline void rxe_rcv_pkt(struct rxe_pkt_info *pkt, struct sk_buff *skb) static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) { struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); - struct rxe_mc_grp *mcg; - struct rxe_mc_elem *mce; + struct rxe_mcg *mcg; + struct rxe_mca *mce; struct rxe_qp *qp; union ib_gid dgid; int err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index e48969e8d4c8..388b7dc23dd7 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -353,7 +353,7 @@ struct rxe_mw { u64 length; }; -struct rxe_mc_grp { +struct rxe_mcg { struct rxe_pool_elem elem; spinlock_t mcg_lock; /* guard group */ struct rxe_dev *rxe; @@ -364,12 +364,12 @@ struct rxe_mc_grp { u16 pkey; }; -struct rxe_mc_elem { +struct rxe_mca { struct rxe_pool_elem elem; struct list_head qp_list; struct list_head grp_list; struct rxe_qp *qp; - struct rxe_mc_grp *grp; + struct rxe_mcg *grp; }; struct rxe_port { From f9f484605779fbdc1dcbb820834ace80d8211cad Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:33 -0600 Subject: [PATCH 023/186] RDMA/rxe: Enforce IBA o10-2.2.3 Add code to check if a QP is attached to one or more multicast groups when destroy_qp is called and return an error if so. Link: https://lore.kernel.org/r/20220127213755.31697-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 9 +-------- drivers/infiniband/sw/rxe/rxe_mcast.c | 2 ++ drivers/infiniband/sw/rxe/rxe_qp.c | 14 ++++++++++++++ drivers/infiniband/sw/rxe/rxe_verbs.c | 5 +++++ drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index dc606241f0d6..052beaaacf43 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -101,26 +101,19 @@ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); /* rxe_qp.c */ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init); - int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct ib_qp_init_attr *init, struct rxe_create_qp_resp __user *uresp, struct ib_pd *ibpd, struct ib_udata *udata); - int rxe_qp_to_init(struct rxe_qp *qp, struct ib_qp_init_attr *init); - int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); - int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata); - int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); - void rxe_qp_error(struct rxe_qp *qp); - +int rxe_qp_chk_destroy(struct rxe_qp *qp); void rxe_qp_destroy(struct rxe_qp *qp); - void rxe_qp_cleanup(struct rxe_pool_elem *elem); static inline int qp_num(struct rxe_qp *qp) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 949784198d80..34e3c52f0b72 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -114,6 +114,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, grp->num_qp++; elem->qp = qp; elem->grp = grp; + atomic_inc(&qp->mcg_num); list_add(&elem->qp_list, &grp->qp_list); list_add(&elem->grp_list, &qp->grp_list); @@ -143,6 +144,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, list_del(&elem->qp_list); list_del(&elem->grp_list); grp->num_qp--; + atomic_dec(&qp->mcg_num); spin_unlock_bh(&grp->mcg_lock); spin_unlock_bh(&qp->grp_lock); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 5018b9387694..3325c2400e2f 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -770,6 +770,20 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) return 0; } +int rxe_qp_chk_destroy(struct rxe_qp *qp) +{ + /* See IBA o10-2.2.3 + * An attempt to destroy a QP while attached to a mcast group + * will fail immediately. + */ + if (atomic_read(&qp->mcg_num)) { + pr_debug("Attempt to destroy QP while attached to multicast group\n"); + return -EBUSY; + } + + return 0; +} + /* called by the destroy qp verb */ void rxe_qp_destroy(struct rxe_qp *qp) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f7682541f9af..9f0aef4b649d 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -493,6 +493,11 @@ static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); + int ret; + + ret = rxe_qp_chk_destroy(qp); + if (ret) + return ret; rxe_qp_destroy(qp); rxe_drop_index(qp); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 388b7dc23dd7..4910d0782e33 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -235,6 +235,7 @@ struct rxe_qp { /* list of mcast groups qp has joined (for cleanup) */ struct list_head grp_list; spinlock_t grp_lock; /* guard grp_list */ + atomic_t mcg_num; struct sk_buff_head req_pkts; struct sk_buff_head resp_pkts; From 8a7fa872ff7922db1477208ba27cae6dd7e48012 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:34 -0600 Subject: [PATCH 024/186] RDMA/rxe: Remove rxe_drop_all_macst_groups With o10-2.2.3 enforced rxe_drop_all_mcast_groups is completely unnecessary. Remove it and references to it. Link: https://lore.kernel.org/r/20220127213755.31697-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 1 - drivers/infiniband/sw/rxe/rxe_mcast.c | 26 -------------------------- drivers/infiniband/sw/rxe/rxe_qp.c | 2 -- 3 files changed, 29 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 052beaaacf43..af40e3c212fb 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -40,7 +40,6 @@ void rxe_cq_disable(struct rxe_cq *cq); void rxe_cq_cleanup(struct rxe_pool_elem *arg); /* rxe_mcast.c */ -void rxe_drop_all_mcast_groups(struct rxe_qp *qp); void rxe_mc_cleanup(struct rxe_pool_elem *arg); int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 34e3c52f0b72..39a41daa7a6b 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -162,32 +162,6 @@ err1: return -EINVAL; } -void rxe_drop_all_mcast_groups(struct rxe_qp *qp) -{ - struct rxe_mcg *grp; - struct rxe_mca *elem; - - while (1) { - spin_lock_bh(&qp->grp_lock); - if (list_empty(&qp->grp_list)) { - spin_unlock_bh(&qp->grp_lock); - break; - } - elem = list_first_entry(&qp->grp_list, struct rxe_mca, - grp_list); - list_del(&elem->grp_list); - spin_unlock_bh(&qp->grp_lock); - - grp = elem->grp; - spin_lock_bh(&grp->mcg_lock); - list_del(&elem->qp_list); - grp->num_qp--; - spin_unlock_bh(&grp->mcg_lock); - rxe_drop_ref(grp); - rxe_drop_ref(elem); - } -} - void rxe_mc_cleanup(struct rxe_pool_elem *elem) { struct rxe_mcg *grp = container_of(elem, typeof(*grp), elem); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 3325c2400e2f..b975a5ca8db0 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -812,8 +812,6 @@ static void rxe_qp_do_cleanup(struct work_struct *work) { struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); - rxe_drop_all_mcast_groups(qp); - if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); From d3f6899b0b5617e8900d6b1ae60414e611b1a0f1 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 27 Jan 2022 15:37:35 -0600 Subject: [PATCH 025/186] RDMA/rxe: Remove qp->grp_lock and qp->grp_list Since it is no longer required to cleanup attachments to multicast groups when a QP is destroyed qp->grp_lock and qp->grp_list are no longer needed and are removed. Link: https://lore.kernel.org/r/20220127213755.31697-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 8 -------- drivers/infiniband/sw/rxe/rxe_qp.c | 3 --- drivers/infiniband/sw/rxe/rxe_verbs.h | 5 ----- 3 files changed, 16 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 39a41daa7a6b..9336295c4ee2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -88,7 +88,6 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_mca *elem; /* check to see of the qp is already a member of the group */ - spin_lock_bh(&qp->grp_lock); spin_lock_bh(&grp->mcg_lock); list_for_each_entry(elem, &grp->qp_list, qp_list) { if (elem->qp == qp) { @@ -113,16 +112,13 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, grp->num_qp++; elem->qp = qp; - elem->grp = grp; atomic_inc(&qp->mcg_num); list_add(&elem->qp_list, &grp->qp_list); - list_add(&elem->grp_list, &qp->grp_list); err = 0; out: spin_unlock_bh(&grp->mcg_lock); - spin_unlock_bh(&qp->grp_lock); return err; } @@ -136,18 +132,15 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, if (!grp) goto err1; - spin_lock_bh(&qp->grp_lock); spin_lock_bh(&grp->mcg_lock); list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { if (elem->qp == qp) { list_del(&elem->qp_list); - list_del(&elem->grp_list); grp->num_qp--; atomic_dec(&qp->mcg_num); spin_unlock_bh(&grp->mcg_lock); - spin_unlock_bh(&qp->grp_lock); rxe_drop_ref(elem); rxe_drop_ref(grp); /* ref held by QP */ rxe_drop_ref(grp); /* ref from get_key */ @@ -156,7 +149,6 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, } spin_unlock_bh(&grp->mcg_lock); - spin_unlock_bh(&qp->grp_lock); rxe_drop_ref(grp); /* ref from get_key */ err1: return -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index b975a5ca8db0..5f270cbf18c6 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -188,9 +188,6 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, break; } - INIT_LIST_HEAD(&qp->grp_list); - - spin_lock_init(&qp->grp_lock); spin_lock_init(&qp->state_lock); atomic_set(&qp->ssn, 0); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 4910d0782e33..55f8ed2bc621 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -232,9 +232,6 @@ struct rxe_qp { struct rxe_av pri_av; struct rxe_av alt_av; - /* list of mcast groups qp has joined (for cleanup) */ - struct list_head grp_list; - spinlock_t grp_lock; /* guard grp_list */ atomic_t mcg_num; struct sk_buff_head req_pkts; @@ -368,9 +365,7 @@ struct rxe_mcg { struct rxe_mca { struct rxe_pool_elem elem; struct list_head qp_list; - struct list_head grp_list; struct rxe_qp *qp; - struct rxe_mcg *grp; }; struct rxe_port { From 0d9c00117b8a57a361b27f7bd94284c94155f039 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 30 Jan 2022 22:57:47 +0000 Subject: [PATCH 026/186] RDMA/mlx4: remove redundant assignment to variable nreq Variable nreq is being assigned a value that is never read. The assignment is redundant and can be removed. Link: https://lore.kernel.org/r/20220130225747.8114-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/srq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 6a381751c0d8..c4cf91235eee 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -320,7 +320,6 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; - nreq = 0; goto out; } From b1377cc37f6bebd57ce8747b7e16163a475af295 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Wed, 29 Dec 2021 11:44:38 +0800 Subject: [PATCH 027/186] RDMA/rxe: Check the last packet by RXE_END_MASK It's wrong to check the last packet by RXE_COMP_MASK because the flag is to indicate if responder needs to generate a completion. Fixes: 9fcd67d1772c ("IB/rxe: increment msn only when completing a request") Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20211229034438.1854908-1-yangx.jy@fujitsu.com Signed-off-by: Xiao Yang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index e8f435fa6e4d..380934e38923 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -814,6 +814,10 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) return RESPST_ERR_INVALIDATE_RKEY; } + if (pkt->mask & RXE_END_MASK) + /* We successfully processed this new request. */ + qp->resp.msn++; + /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; qp->resp.ack_psn = qp->resp.psn; @@ -821,11 +825,9 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; - if (pkt->mask & RXE_COMP_MASK) { - /* We successfully processed this new request. */ - qp->resp.msn++; + if (pkt->mask & RXE_COMP_MASK) return RESPST_COMPLETE; - } else if (qp_type(qp) == IB_QPT_RC) + else if (qp_type(qp) == IB_QPT_RC) return RESPST_ACKNOWLEDGE; else return RESPST_CLEANUP; From 83483055321f1e5e45abc1444c49d916e1c52bc4 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 2 Feb 2022 13:19:20 -0600 Subject: [PATCH 028/186] RDMA/irdma: Refactor DCB bits in prep for DSCP support Rename dcb flag to dcb_vlan_mode in irdma_device struct. Add a new helper function, irdma_set_qos_info, to set the VSI QoS information passed by the PCI driver. Link: https://lore.kernel.org/r/20220202191921.1638-3-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 4 ++-- drivers/infiniband/hw/irdma/ctrl.c | 33 ++++++++++++++++---------- drivers/infiniband/hw/irdma/i40iw_if.c | 2 +- drivers/infiniband/hw/irdma/main.c | 6 ++++- drivers/infiniband/hw/irdma/main.h | 2 +- drivers/infiniband/hw/irdma/verbs.c | 4 ++-- 6 files changed, 32 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 6dea0a49d171..6ff18004ecaa 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -2200,7 +2200,7 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev, /* set our node specific transport info */ cm_node->ipv4 = cm_info->ipv4; cm_node->vlan_id = cm_info->vlan_id; - if (cm_node->vlan_id >= VLAN_N_VID && iwdev->dcb) + if (cm_node->vlan_id >= VLAN_N_VID && iwdev->dcb_vlan_mode) cm_node->vlan_id = 0; cm_node->tos = cm_info->tos; cm_node->user_pri = cm_info->user_pri; @@ -3959,7 +3959,7 @@ int irdma_create_listen(struct iw_cm_id *cm_id, int backlog) } } - if (cm_info.vlan_id >= VLAN_N_VID && iwdev->dcb) + if (cm_info.vlan_id >= VLAN_N_VID && iwdev->dcb_vlan_mode) cm_info.vlan_id = 0; cm_info.backlog = backlog; cm_info.cm_id = cm_id; diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 3141a9c85de5..ef1d6ad0f1fe 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -70,6 +70,25 @@ void irdma_sc_suspend_resume_qps(struct irdma_sc_vsi *vsi, u8 op) } } +static void irdma_set_qos_info(struct irdma_sc_vsi *vsi, + struct irdma_l2params *l2p) +{ + u8 i; + + vsi->qos_rel_bw = l2p->vsi_rel_bw; + vsi->qos_prio_type = l2p->vsi_prio_type; + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { + if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) + vsi->qos[i].qs_handle = l2p->qs_handle_list[i]; + vsi->qos[i].traffic_class = l2p->up2tc[i]; + vsi->qos[i].rel_bw = + l2p->tc_info[vsi->qos[i].traffic_class].rel_bw; + vsi->qos[i].prio_type = + l2p->tc_info[vsi->qos[i].traffic_class].prio_type; + vsi->qos[i].valid = false; + } +} + /** * irdma_change_l2params - given the new l2 parameters, change all qp * @vsi: RDMA VSI pointer @@ -88,6 +107,7 @@ void irdma_change_l2params(struct irdma_sc_vsi *vsi, return; vsi->tc_change_pending = false; + irdma_set_qos_info(vsi, l2params); irdma_sc_suspend_resume_qps(vsi, IRDMA_OP_RESUME); } @@ -1845,7 +1865,6 @@ static void irdma_null_ws_reset(struct irdma_sc_vsi *vsi) void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, struct irdma_vsi_init_info *info) { - struct irdma_l2params *l2p; int i; vsi->dev = info->dev; @@ -1858,18 +1877,8 @@ void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) vsi->fcn_id = info->dev->hmc_fn_id; - l2p = info->params; - vsi->qos_rel_bw = l2p->vsi_rel_bw; - vsi->qos_prio_type = l2p->vsi_prio_type; + irdma_set_qos_info(vsi, info->params); for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { - if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) - vsi->qos[i].qs_handle = l2p->qs_handle_list[i]; - vsi->qos[i].traffic_class = info->params->up2tc[i]; - vsi->qos[i].rel_bw = - l2p->tc_info[vsi->qos[i].traffic_class].rel_bw; - vsi->qos[i].prio_type = - l2p->tc_info[vsi->qos[i].traffic_class].prio_type; - vsi->qos[i].valid = false; mutex_init(&vsi->qos[i].qos_mutex); INIT_LIST_HEAD(&vsi->qos[i].qplist); } diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index 43e962b97d6a..8b5bd773da5a 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -138,7 +138,7 @@ static int i40iw_open(struct i40e_info *cdev_info, struct i40e_client *client) if (last_qset == IRDMA_NO_QSET) last_qset = qset; else if ((qset != last_qset) && (qset != IRDMA_NO_QSET)) - iwdev->dcb = true; + iwdev->dcb_vlan_mode = true; } if (irdma_rt_init_hw(iwdev, &l2params)) { diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 9fab29039f1c..179667b3e0b8 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -108,8 +108,9 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event l2params.tc_changed = true; ibdev_dbg(&iwdev->ibdev, "CLNT: TC Change\n"); ice_get_qos_params(pf, &qos_info); - iwdev->dcb = qos_info.num_tc > 1; irdma_fill_qos_info(&l2params, &qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = qos_info.num_tc > 1; irdma_change_l2params(&iwdev->vsi, &l2params); } else if (*event->type & BIT(IIDC_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", @@ -283,6 +284,9 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ l2params.mtu = iwdev->netdev->mtu; ice_get_qos_params(pf, &qos_info); irdma_fill_qos_info(&l2params, &qos_info); + if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) + iwdev->dcb_vlan_mode = l2params.num_tc > 1; + if (irdma_rt_init_hw(iwdev, &l2params)) { err = -EIO; goto err_rt_init; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index cb218cab79ac..cf6732bf70a1 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -345,7 +345,7 @@ struct irdma_device { u8 iw_status; bool roce_mode:1; bool roce_dcqcn_en:1; - bool dcb:1; + bool dcb_vlan_mode:1; bool iw_ooo:1; enum init_completion_state init_state; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 460e757d3fe6..7b144e57578e 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1189,7 +1189,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (ret) return ret; - if (vlan_id >= VLAN_N_VID && iwdev->dcb) + if (vlan_id >= VLAN_N_VID && iwdev->dcb_vlan_mode) vlan_id = 0; if (vlan_id < VLAN_N_VID) { udp_info->insert_vlan_tag = true; @@ -4229,7 +4229,7 @@ static int irdma_create_ah(struct ib_ah *ibah, goto error; } - if (ah_info->vlan_tag >= VLAN_N_VID && iwdev->dcb) + if (ah_info->vlan_tag >= VLAN_N_VID && iwdev->dcb_vlan_mode) ah_info->vlan_tag = 0; if (ah_info->vlan_tag < VLAN_N_VID) { From 4b860c9169dcb8aead7e1744327c52a253f5c5fd Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Wed, 2 Feb 2022 13:19:21 -0600 Subject: [PATCH 029/186] RDMA/irdma: Add support for DSCP Add DSCP support for the Intel Ethernet 800 Series devices. Setup VSI DSCP info when PCI driver indicates DSCP mode during driver probe or as notification event. Link: https://lore.kernel.org/r/20220202191921.1638-4-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 20 ++++++++++++++++---- drivers/infiniband/hw/irdma/cm.h | 7 +++++++ drivers/infiniband/hw/irdma/ctrl.c | 6 ++++++ drivers/infiniband/hw/irdma/main.c | 8 ++++++-- drivers/infiniband/hw/irdma/osdep.h | 1 + drivers/infiniband/hw/irdma/type.h | 4 ++++ 6 files changed, 40 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 6ff18004ecaa..abc101b5fd42 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -2209,8 +2209,12 @@ irdma_make_cm_node(struct irdma_cm_core *cm_core, struct irdma_device *iwdev, ibdev_warn(&iwdev->ibdev, "application TOS[%d] and remote client TOS[%d] mismatch\n", listener->tos, cm_info->tos); - cm_node->tos = max(listener->tos, cm_info->tos); - cm_node->user_pri = rt_tos2priority(cm_node->tos); + if (iwdev->vsi.dscp_mode) { + cm_node->user_pri = listener->user_pri; + } else { + cm_node->tos = max(listener->tos, cm_info->tos); + cm_node->user_pri = rt_tos2priority(cm_node->tos); + } ibdev_dbg(&iwdev->ibdev, "DCB: listener: TOS:[%d] UP:[%d]\n", cm_node->tos, cm_node->user_pri); @@ -3835,7 +3839,11 @@ int irdma_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_info.cm_id = cm_id; cm_info.qh_qpid = iwdev->vsi.ilq->qp_id; cm_info.tos = cm_id->tos; - cm_info.user_pri = rt_tos2priority(cm_id->tos); + if (iwdev->vsi.dscp_mode) + cm_info.user_pri = + iwqp->sc_qp.vsi->dscp_map[irdma_tos2dscp(cm_info.tos)]; + else + cm_info.user_pri = rt_tos2priority(cm_id->tos); if (iwqp->sc_qp.dev->ws_add(iwqp->sc_qp.vsi, cm_info.user_pri)) return -ENOMEM; @@ -3977,7 +3985,11 @@ int irdma_create_listen(struct iw_cm_id *cm_id, int backlog) cm_id->provider_data = cm_listen_node; cm_listen_node->tos = cm_id->tos; - cm_listen_node->user_pri = rt_tos2priority(cm_id->tos); + if (iwdev->vsi.dscp_mode) + cm_listen_node->user_pri = + iwdev->vsi.dscp_map[irdma_tos2dscp(cm_id->tos)]; + else + cm_listen_node->user_pri = rt_tos2priority(cm_id->tos); cm_info.user_pri = cm_listen_node->user_pri; if (!cm_listen_node->reused_node) { if (wildcard) { diff --git a/drivers/infiniband/hw/irdma/cm.h b/drivers/infiniband/hw/irdma/cm.h index 3bf42728e9b7..19c284975fc7 100644 --- a/drivers/infiniband/hw/irdma/cm.h +++ b/drivers/infiniband/hw/irdma/cm.h @@ -384,6 +384,13 @@ int irdma_schedule_cm_timer(struct irdma_cm_node *cm_node, struct irdma_puda_buf *sqbuf, enum irdma_timer_type type, int send_retrans, int close_when_complete); + +static inline u8 irdma_tos2dscp(u8 tos) +{ +#define IRDMA_DSCP_VAL GENMASK(7, 2) + return (u8)FIELD_GET(IRDMA_DSCP_VAL, tos); +} + int irdma_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int irdma_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int irdma_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index ef1d6ad0f1fe..94a9c26ac83f 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -77,6 +77,12 @@ static void irdma_set_qos_info(struct irdma_sc_vsi *vsi, vsi->qos_rel_bw = l2p->vsi_rel_bw; vsi->qos_prio_type = l2p->vsi_prio_type; + vsi->dscp_mode = l2p->dscp_mode; + if (l2p->dscp_mode) { + memcpy(vsi->dscp_map, l2p->dscp_map, sizeof(vsi->dscp_map)); + for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) + l2p->up2tc[i] = i; + } for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) { if (vsi->dev->hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) vsi->qos[i].qs_handle = l2p->qs_handle_list[i]; diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 179667b3e0b8..97625263899f 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -79,6 +79,10 @@ static void irdma_fill_qos_info(struct irdma_l2params *l2params, } for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++) l2params->up2tc[i] = qos_info->up2tc[i]; + if (qos_info->pfc_mode == IIDC_DSCP_PFC_MODE) { + l2params->dscp_mode = true; + memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); + } } static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event) @@ -110,7 +114,7 @@ static void irdma_iidc_event_handler(struct ice_pf *pf, struct iidc_event *event ice_get_qos_params(pf, &qos_info); irdma_fill_qos_info(&l2params, &qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = qos_info.num_tc > 1; + iwdev->dcb_vlan_mode = qos_info.num_tc > 1 && !l2params.dscp_mode; irdma_change_l2params(&iwdev->vsi, &l2params); } else if (*event->type & BIT(IIDC_EVENT_CRIT_ERR)) { ibdev_warn(&iwdev->ibdev, "ICE OICR event notification: oicr = 0x%08x\n", @@ -285,7 +289,7 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ ice_get_qos_params(pf, &qos_info); irdma_fill_qos_info(&l2params, &qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) - iwdev->dcb_vlan_mode = l2params.num_tc > 1; + iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; if (irdma_rt_init_hw(iwdev, &l2params)) { err = -EIO; diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index 63d8bb3a6903..6e28e437b6fb 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 9483bb3e10ea..4290a2ce810a 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -611,6 +611,8 @@ struct irdma_sc_vsi { struct irdma_ws_node *tc_node); u8 qos_rel_bw; u8 qos_prio_type; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; + bool dscp_mode:1; }; struct irdma_sc_dev { @@ -735,11 +737,13 @@ struct irdma_l2params { u16 qs_handle_list[IRDMA_MAX_USER_PRIORITY]; u16 mtu; u8 up2tc[IRDMA_MAX_USER_PRIORITY]; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; u8 num_tc; u8 vsi_rel_bw; u8 vsi_prio_type; bool mtu_changed:1; bool tc_changed:1; + bool dscp_mode:1; }; struct irdma_vsi_init_info { From 68cdd3d2af6964dae2f8d9b53ee94f740dcbda35 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:28:44 -0800 Subject: [PATCH 030/186] cxl: Rename CXL_MEM to CXL_PCI The cxl_mem module was renamed cxl_pci in commit 21e9f76733a8 ("cxl: Rename mem to pci"). In preparation for adding an ancillary driver for cxl_memdev devices (registered on the cxl bus by cxl_pci), go ahead and rename CONFIG_CXL_MEM to CONFIG_CXL_PCI. Free up the CXL_MEM name for that new driver to manage CXL.mem endpoint operations. Suggested-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Ben Widawsky Link: https://lore.kernel.org/r/164298412409.3018233.12407355692407890752.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/Kconfig | 23 ++++++++++++----------- drivers/cxl/Makefile | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 67c91378f2dd..ef05e96f8f97 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -13,25 +13,26 @@ menuconfig CXL_BUS if CXL_BUS -config CXL_MEM - tristate "CXL.mem: Memory Devices" +config CXL_PCI + tristate "PCI manageability" default CXL_BUS help - The CXL.mem protocol allows a device to act as a provider of - "System RAM" and/or "Persistent Memory" that is fully coherent - as if the memory was attached to the typical CPU memory - controller. + The CXL specification defines a "CXL memory device" sub-class in the + PCI "memory controller" base class of devices. Device's identified by + this class code provide support for volatile and / or persistent + memory to be mapped into the system address map (Host-managed Device + Memory (HDM)). - Say 'y/m' to enable a driver that will attach to CXL.mem devices for - configuration and management primarily via the mailbox interface. See - Chapter 2.3 Type 3 CXL Device in the CXL 2.0 specification for more - details. + Say 'y/m' to enable a driver that will attach to CXL memory expander + devices enumerated by the memory device class code for configuration + and management primarily via the mailbox interface. See Chapter 2.3 + Type 3 CXL Device in the CXL 2.0 specification for more details. If unsure say 'm'. config CXL_MEM_RAW_COMMANDS bool "RAW Command Interface for Memory Devices" - depends on CXL_MEM + depends on CXL_PCI help Enable CXL RAW command interface. diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index d1aaabc940f3..cf07ae6cea17 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CXL_BUS) += core/ -obj-$(CONFIG_CXL_MEM) += cxl_pci.o +obj-$(CONFIG_CXL_PCI) += cxl_pci.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o From 229e8828c206be8fa0a159f6ff71e3b1a0484233 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Mon, 31 Jan 2022 15:51:45 -0800 Subject: [PATCH 031/186] cxl/pci: Implement Interface Ready Timeout The original driver implementation used the doorbell timeout for the Mailbox Interface Ready bit to piggy back off of, since the latter does not have a defined timeout. This functionality, introduced in commit 8adaf747c9f0 ("cxl/mem: Find device capabilities"), needs improvement as the recent "Add Mailbox Ready Time" ECN timeout indicates that the mailbox ready time can be significantly longer that 2 seconds. While the specification limits the maximum timeout to 256s, the cxl_pci driver gives up on the mailbox after 60s. This value corresponds with important timeout values already present in the kernel. A module parameter is provided as an emergency override and represents the default Linux policy for all devices. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron [djbw: add modparam, drop check_device_status()] Co-developed-by: Dan Williams Link: https://lore.kernel.org/r/164367306565.208548.1932299464604450843.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/pci.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 8dc91fd3396a..cc0cdd7e9de3 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #include +#include #include +#include #include #include #include @@ -35,6 +37,20 @@ /* CXL 2.0 - 8.2.8.4 */ #define CXL_MAILBOX_TIMEOUT_MS (2 * HZ) +/* + * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to + * dictate how long to wait for the mailbox to become ready. The new + * field allows the device to tell software the amount of time to wait + * before mailbox ready. This field per the spec theoretically allows + * for up to 255 seconds. 255 seconds is unreasonably long, its longer + * than the maximum SATA port link recovery wait. Default to 60 seconds + * until someone builds a CXL device that needs more time in practice. + */ +static unsigned short mbox_ready_timeout = 60; +module_param(mbox_ready_timeout, ushort, 0644); +MODULE_PARM_DESC(mbox_ready_timeout, + "seconds to wait for mailbox ready status"); + static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds) { const unsigned long start = jiffies; @@ -281,6 +297,25 @@ static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *c static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds) { const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET); + unsigned long timeout; + u64 md_status; + + timeout = jiffies + mbox_ready_timeout * HZ; + do { + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + if (md_status & CXLMDEV_MBOX_IF_READY) + break; + if (msleep_interruptible(100)) + break; + } while (!time_after(jiffies, timeout)); + + if (!(md_status & CXLMDEV_MBOX_IF_READY)) { + dev_err(cxlds->dev, + "timeout awaiting mailbox ready, device state:%s%s\n", + md_status & CXLMDEV_DEV_FATAL ? " fatal" : "", + md_status & CXLMDEV_FW_HALT ? " firmware-halt" : ""); + return -EIO; + } cxlds->mbox_send = cxl_pci_mbox_send; cxlds->payload_size = From 4f195ee73ade1adf8326e5ed5fb271da51778991 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:28:54 -0800 Subject: [PATCH 032/186] cxl/pci: Defer mailbox status checks to command timeouts Device status can change without warning at any point in time. This effectively means that no amount of status checking before a command is submitted can guarantee that the device is not in an error condition when the command is later submitted. The clearest signal that a device is not able to process commands is if it fails to process commands. With the above understanding in hand, update cxl_pci_setup_mailbox() to validate the readiness of the mailbox once at the beginning of time, and then use timeouts and busy sequencing errors as the only occasions to report status. Just as before, unless and until the driver gains a reset recovery path, doorbell clearing failures by the device are fatal to mailbox operations. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298413480.3018233.9643395389297971819.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/pci.c | 134 ++++++++++++---------------------------------- 1 file changed, 33 insertions(+), 101 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index cc0cdd7e9de3..baeea0c2e619 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -73,14 +73,16 @@ static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds) return 0; } -static void cxl_pci_mbox_timeout(struct cxl_dev_state *cxlds, - struct cxl_mbox_cmd *mbox_cmd) -{ - struct device *dev = cxlds->dev; +#define cxl_err(dev, status, msg) \ + dev_err_ratelimited(dev, msg ", device state %s%s\n", \ + status & CXLMDEV_DEV_FATAL ? " fatal" : "", \ + status & CXLMDEV_FW_HALT ? " firmware-halt" : "") - dev_dbg(dev, "Mailbox command (opcode: %#x size: %zub) timed out\n", - mbox_cmd->opcode, mbox_cmd->size_in); -} +#define cxl_cmd_err(dev, cmd, status, msg) \ + dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \ + (cmd)->opcode, \ + status & CXLMDEV_DEV_FATAL ? " fatal" : "", \ + status & CXLMDEV_FW_HALT ? " firmware-halt" : "") /** * __cxl_pci_mbox_send_cmd() - Execute a mailbox command @@ -134,7 +136,11 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds, /* #1 */ if (cxl_doorbell_busy(cxlds)) { - dev_err_ratelimited(dev, "Mailbox re-busy after acquiring\n"); + u64 md_status = + readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + + cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, + "mailbox queue busy"); return -EBUSY; } @@ -160,7 +166,9 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds, /* #5 */ rc = cxl_pci_mbox_wait_for_doorbell(cxlds); if (rc == -ETIMEDOUT) { - cxl_pci_mbox_timeout(cxlds, mbox_cmd); + u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + + cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout"); return rc; } @@ -198,98 +206,13 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds, return 0; } -/** - * cxl_pci_mbox_get() - Acquire exclusive access to the mailbox. - * @cxlds: The device state to gain access to. - * - * Context: Any context. Takes the mbox_mutex. - * Return: 0 if exclusive access was acquired. - */ -static int cxl_pci_mbox_get(struct cxl_dev_state *cxlds) -{ - struct device *dev = cxlds->dev; - u64 md_status; - int rc; - - mutex_lock_io(&cxlds->mbox_mutex); - - /* - * XXX: There is some amount of ambiguity in the 2.0 version of the spec - * around the mailbox interface ready (8.2.8.5.1.1). The purpose of the - * bit is to allow firmware running on the device to notify the driver - * that it's ready to receive commands. It is unclear if the bit needs - * to be read for each transaction mailbox, ie. the firmware can switch - * it on and off as needed. Second, there is no defined timeout for - * mailbox ready, like there is for the doorbell interface. - * - * Assumptions: - * 1. The firmware might toggle the Mailbox Interface Ready bit, check - * it for every command. - * - * 2. If the doorbell is clear, the firmware should have first set the - * Mailbox Interface Ready bit. Therefore, waiting for the doorbell - * to be ready is sufficient. - */ - rc = cxl_pci_mbox_wait_for_doorbell(cxlds); - if (rc) { - dev_warn(dev, "Mailbox interface not ready\n"); - goto out; - } - - md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); - if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) { - dev_err(dev, "mbox: reported doorbell ready, but not mbox ready\n"); - rc = -EBUSY; - goto out; - } - - /* - * Hardware shouldn't allow a ready status but also have failure bits - * set. Spit out an error, this should be a bug report - */ - rc = -EFAULT; - if (md_status & CXLMDEV_DEV_FATAL) { - dev_err(dev, "mbox: reported ready, but fatal\n"); - goto out; - } - if (md_status & CXLMDEV_FW_HALT) { - dev_err(dev, "mbox: reported ready, but halted\n"); - goto out; - } - if (CXLMDEV_RESET_NEEDED(md_status)) { - dev_err(dev, "mbox: reported ready, but reset needed\n"); - goto out; - } - - /* with lock held */ - return 0; - -out: - mutex_unlock(&cxlds->mbox_mutex); - return rc; -} - -/** - * cxl_pci_mbox_put() - Release exclusive access to the mailbox. - * @cxlds: The device state to communicate with. - * - * Context: Any context. Expects mbox_mutex to be held. - */ -static void cxl_pci_mbox_put(struct cxl_dev_state *cxlds) -{ - mutex_unlock(&cxlds->mbox_mutex); -} - static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd) { int rc; - rc = cxl_pci_mbox_get(cxlds); - if (rc) - return rc; - + mutex_lock_io(&cxlds->mbox_mutex); rc = __cxl_pci_mbox_send_cmd(cxlds, cmd); - cxl_pci_mbox_put(cxlds); + mutex_unlock(&cxlds->mbox_mutex); return rc; } @@ -310,11 +233,20 @@ static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds) } while (!time_after(jiffies, timeout)); if (!(md_status & CXLMDEV_MBOX_IF_READY)) { - dev_err(cxlds->dev, - "timeout awaiting mailbox ready, device state:%s%s\n", - md_status & CXLMDEV_DEV_FATAL ? " fatal" : "", - md_status & CXLMDEV_FW_HALT ? " firmware-halt" : ""); - return -EIO; + cxl_err(cxlds->dev, md_status, + "timeout awaiting mailbox ready"); + return -ETIMEDOUT; + } + + /* + * A command may be in flight from a previous driver instance, + * think kexec, do one doorbell wait so that + * __cxl_pci_mbox_send_cmd() can assume that it is the only + * source for future doorbell busy events. + */ + if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) { + cxl_err(cxlds->dev, md_status, "timeout awaiting mailbox idle"); + return -ETIMEDOUT; } cxlds->mbox_send = cxl_pci_mbox_send; From 46c6ad27625ca00f59903585e41667d7a45b4eb8 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:00 -0800 Subject: [PATCH 033/186] cxl: Flesh out register names Get a better naming scheme in place for upcoming additions. By dropping redundant usages of CXL and DVSEC where appropriate we can get more concise and also more grepable defines. Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Ben Widawsky Link: https://lore.kernel.org/r/164298414022.3018233.15522855498759815097.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/pci.c | 14 +++++++------- drivers/cxl/pci.h | 19 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index baeea0c2e619..0981d8f375ad 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -370,10 +370,10 @@ static int cxl_map_regs(struct cxl_dev_state *cxlds, struct cxl_register_map *ma static void cxl_decode_regblock(u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { - map->block_offset = - ((u64)reg_hi << 32) | (reg_lo & CXL_REGLOC_ADDR_MASK); - map->barno = FIELD_GET(CXL_REGLOC_BIR_MASK, reg_lo); - map->reg_type = FIELD_GET(CXL_REGLOC_RBI_MASK, reg_lo); + map->block_offset = ((u64)reg_hi << 32) | + (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + map->barno = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); } /** @@ -394,15 +394,15 @@ static int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, int regloc, i; regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL, - PCI_DVSEC_ID_CXL_REGLOC_DVSEC_ID); + CXL_DVSEC_REG_LOCATOR); if (!regloc) return -ENXIO; pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); - regloc += PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET; - regblocks = (regloc_size - PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET) / 8; + regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; + regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; for (i = 0; i < regblocks; i++, regloc += 8) { u32 reg_lo, reg_hi; diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h index 7d3e4bf06b45..29b8eaef3a0a 100644 --- a/drivers/cxl/pci.h +++ b/drivers/cxl/pci.h @@ -7,17 +7,21 @@ /* * See section 8.1 Configuration Space Registers in the CXL 2.0 - * Specification + * Specification. Names are taken straight from the specification with "CXL" and + * "DVSEC" redundancies removed. When obvious, abbreviations may be used. */ #define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20) #define PCI_DVSEC_VENDOR_ID_CXL 0x1E98 -#define PCI_DVSEC_ID_CXL 0x0 -#define PCI_DVSEC_ID_CXL_REGLOC_DVSEC_ID 0x8 -#define PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET 0xC +/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ +#define CXL_DVSEC_PCIE_DEVICE 0 -/* BAR Indicator Register (BIR) */ -#define CXL_REGLOC_BIR_MASK GENMASK(2, 0) +/* CXL 2.0 8.1.9: Register Locator DVSEC */ +#define CXL_DVSEC_REG_LOCATOR 8 +#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC +#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16) /* Register Block Identifier (RBI) */ enum cxl_regloc_type { @@ -28,7 +32,4 @@ enum cxl_regloc_type { CXL_REGLOC_RBI_TYPES }; -#define CXL_REGLOC_RBI_MASK GENMASK(15, 8) -#define CXL_REGLOC_ADDR_MASK GENMASK(31, 16) - #endif /* __CXL_PCI_H__ */ From 8baa787b93dbda6b24081297b934e8edd886d4bb Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:05 -0800 Subject: [PATCH 034/186] cxl/pci: Add new DVSEC definitions In preparation for properly supporting memory active timeout, and later on, other attributes obtained from DVSEC fields, add the full list of DVSEC identifiers from the CXL 2.0 specification. Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron (v1) Signed-off-by: Ben Widawsky Link: https://lore.kernel.org/r/164298414567.3018233.12005290051592771878.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/pci.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h index 29b8eaef3a0a..8ae2b4adc59d 100644 --- a/drivers/cxl/pci.h +++ b/drivers/cxl/pci.h @@ -16,6 +16,21 @@ /* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ #define CXL_DVSEC_PCIE_DEVICE 0 +/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ +#define CXL_DVSEC_FUNCTION_MAP 2 + +/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */ +#define CXL_DVSEC_PORT_EXTENSIONS 3 + +/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ +#define CXL_DVSEC_PORT_GPF 4 + +/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ +#define CXL_DVSEC_DEVICE_GPF 5 + +/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */ +#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7 + /* CXL 2.0 8.1.9: Register Locator DVSEC */ #define CXL_DVSEC_REG_LOCATOR 8 #define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC From 303ebc1b1741b6a18349d8e5753c2d25fdb41a21 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:10 -0800 Subject: [PATCH 035/186] cxl/acpi: Map component registers for Root Ports This implements the TODO in cxl_acpi for mapping component registers. cxl_acpi becomes the second consumer of CXL register block enumeration (cxl_pci being the first). Moving the functionality to cxl_core allows both of these drivers to use the functionality. Equally importantly it allows cxl_core to use the functionality in the future. CXL 2.0 root ports are similar to CXL 2.0 Downstream Ports with the main distinction being they're a part of the CXL 2.0 host bridge. While mapping their component registers is not immediately useful for the CXL drivers, the movement of register block enumeration into core is a vital step towards HDM decoder programming. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron [djbw: fix cxl_regmap_to_base() failure cases] Link: https://lore.kernel.org/r/164298415080.3018233.14694957480228676592.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 13 ++++++++-- drivers/cxl/core/regs.c | 56 +++++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 4 +++ drivers/cxl/pci.c | 52 -------------------------------------- drivers/cxl/pci.h | 9 +++++++ 5 files changed, 80 insertions(+), 54 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 3163167ecc3a..c656a49a11a9 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -7,6 +7,7 @@ #include #include #include "cxl.h" +#include "pci.h" /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ #define CFMWS_INTERLEAVE_WAYS(x) (1 << (x)->interleave_ways) @@ -134,11 +135,13 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, __mock int match_add_root_ports(struct pci_dev *pdev, void *data) { + resource_size_t creg = CXL_RESOURCE_NONE; struct cxl_walk_context *ctx = data; struct pci_bus *root_bus = ctx->root; struct cxl_port *port = ctx->port; int type = pci_pcie_type(pdev); struct device *dev = ctx->dev; + struct cxl_register_map map; u32 lnkcap, port_num; int rc; @@ -152,9 +155,15 @@ __mock int match_add_root_ports(struct pci_dev *pdev, void *data) &lnkcap) != PCIBIOS_SUCCESSFUL) return 0; - /* TODO walk DVSEC to find component register base */ + /* The driver doesn't rely on component registers for Root Ports yet. */ + rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (!rc) + dev_info(&pdev->dev, "No component register block found\n"); + + creg = cxl_regmap_to_base(pdev, &map); + port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - rc = cxl_add_dport(port, &pdev->dev, port_num, CXL_RESOURCE_NONE); + rc = cxl_add_dport(port, &pdev->dev, port_num, creg); if (rc) { ctx->error = rc; return rc; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index e37e23bf4355..0d63758e2605 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -5,6 +5,7 @@ #include #include #include +#include /** * DOC: cxl registers @@ -247,3 +248,58 @@ int cxl_map_device_regs(struct pci_dev *pdev, return 0; } EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, CXL); + +static void cxl_decode_regblock(u32 reg_lo, u32 reg_hi, + struct cxl_register_map *map) +{ + map->block_offset = ((u64)reg_hi << 32) | + (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + map->barno = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); +} + +/** + * cxl_find_regblock() - Locate register blocks by type + * @pdev: The CXL PCI device to enumerate. + * @type: Register Block Indicator id + * @map: Enumeration output, clobbered on error + * + * Return: 0 if register block enumerated, negative error code otherwise + * + * A CXL DVSEC may point to one or more register blocks, search for them + * by @type. + */ +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map) +{ + u32 regloc_size, regblocks; + int regloc, i; + + map->block_offset = U64_MAX; + regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL, + CXL_DVSEC_REG_LOCATOR); + if (!regloc) + return -ENXIO; + + pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); + regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + + regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; + regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + + for (i = 0; i < regblocks; i++, regloc += 8) { + u32 reg_lo, reg_hi; + + pci_read_config_dword(pdev, regloc, ®_lo); + pci_read_config_dword(pdev, regloc + 4, ®_hi); + + cxl_decode_regblock(reg_lo, reg_hi, map); + + if (map->reg_type == type) + return 0; + } + + map->block_offset = U64_MAX; + return -ENODEV; +} +EXPORT_SYMBOL_NS_GPL(cxl_find_regblock, CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index a5a0be3f088b..6288a6c1fc5c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -145,6 +145,10 @@ int cxl_map_device_regs(struct pci_dev *pdev, struct cxl_device_regs *regs, struct cxl_register_map *map); +enum cxl_regloc_type; +int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, + struct cxl_register_map *map); + #define CXL_RESOURCE_NONE ((resource_size_t) -1) #define CXL_TARGET_STRLEN 20 diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0981d8f375ad..7d5f0c873aba 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -367,58 +367,6 @@ static int cxl_map_regs(struct cxl_dev_state *cxlds, struct cxl_register_map *ma return 0; } -static void cxl_decode_regblock(u32 reg_lo, u32 reg_hi, - struct cxl_register_map *map) -{ - map->block_offset = ((u64)reg_hi << 32) | - (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); - map->barno = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); - map->reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); -} - -/** - * cxl_find_regblock() - Locate register blocks by type - * @pdev: The CXL PCI device to enumerate. - * @type: Register Block Indicator id - * @map: Enumeration output, clobbered on error - * - * Return: 0 if register block enumerated, negative error code otherwise - * - * A CXL DVSEC may point to one or more register blocks, search for them - * by @type. - */ -static int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, - struct cxl_register_map *map) -{ - u32 regloc_size, regblocks; - int regloc, i; - - regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL, - CXL_DVSEC_REG_LOCATOR); - if (!regloc) - return -ENXIO; - - pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); - regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); - - regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; - regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; - - for (i = 0; i < regblocks; i++, regloc += 8) { - u32 reg_lo, reg_hi; - - pci_read_config_dword(pdev, regloc, ®_lo); - pci_read_config_dword(pdev, regloc + 4, ®_hi); - - cxl_decode_regblock(reg_lo, reg_hi, map); - - if (map->reg_type == type) - return 0; - } - - return -ENODEV; -} - static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map) { diff --git a/drivers/cxl/pci.h b/drivers/cxl/pci.h index 8ae2b4adc59d..0623bb85f30a 100644 --- a/drivers/cxl/pci.h +++ b/drivers/cxl/pci.h @@ -47,4 +47,13 @@ enum cxl_regloc_type { CXL_REGLOC_RBI_TYPES }; +static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, + struct cxl_register_map *map) +{ + if (map->block_offset == U64_MAX) + return CXL_RESOURCE_NONE; + + return pci_resource_start(pdev, map->barno) + map->block_offset; +} + #endif /* __CXL_PCI_H__ */ From c57cae78bfa6a8535d4baade451107b0577c2750 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:15 -0800 Subject: [PATCH 036/186] cxl: Introduce module_cxl_driver Many CXL drivers simply want to register and unregister themselves. module_driver already supported this. A simple wrapper around that reduces a decent amount of boilerplate in upcoming patches. Suggested-by: Dan Williams Reviewed-by: Dan Williams Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298415591.3018233.13608495220547681412.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/cxl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6288a6c1fc5c..38779409a419 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -308,6 +308,9 @@ int __cxl_driver_register(struct cxl_driver *cxl_drv, struct module *owner, #define cxl_driver_register(x) __cxl_driver_register(x, THIS_MODULE, KBUILD_MODNAME) void cxl_driver_unregister(struct cxl_driver *cxl_drv); +#define module_cxl_driver(__cxl_driver) \ + module_driver(__cxl_driver, cxl_driver_register, cxl_driver_unregister) + #define CXL_DEVICE_NVDIMM_BRIDGE 1 #define CXL_DEVICE_NVDIMM 2 From 0ff0af18216436d0151af4e410400c7a19ca9437 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:29:21 -0800 Subject: [PATCH 037/186] cxl/core/port: Rename bus.c to port.c Given it is dominated by port infrastructure, and will only acquire more, rename bus.c to port.c. Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164298416136.3018233.15442880970000855425.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/driver-api/cxl/memory-devices.rst | 4 ++-- drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/{bus.c => port.c} | 0 tools/testing/cxl/Kbuild | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename drivers/cxl/core/{bus.c => port.c} (100%) diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index 3b8f41395f6b..c8f7a16cd0e3 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -36,10 +36,10 @@ CXL Core .. kernel-doc:: drivers/cxl/cxl.h :internal: -.. kernel-doc:: drivers/cxl/core/bus.c +.. kernel-doc:: drivers/cxl/core/port.c :doc: cxl core -.. kernel-doc:: drivers/cxl/core/bus.c +.. kernel-doc:: drivers/cxl/core/port.c :identifiers: .. kernel-doc:: drivers/cxl/core/pmem.c diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 40ab50318daf..a90202ac88d2 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_CXL_BUS) += cxl_core.o ccflags-y += -I$(srctree)/drivers/cxl -cxl_core-y := bus.o +cxl_core-y := port.o cxl_core-y += pmem.o cxl_core-y += regs.o cxl_core-y += memdev.o diff --git a/drivers/cxl/core/bus.c b/drivers/cxl/core/port.c similarity index 100% rename from drivers/cxl/core/bus.c rename to drivers/cxl/core/port.c diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 1acdf2fc31c5..3299fb0977b2 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -25,7 +25,7 @@ cxl_pmem-y += config_check.o obj-m += cxl_core.o -cxl_core-y := $(CXL_CORE_SRC)/bus.o +cxl_core-y := $(CXL_CORE_SRC)/port.o cxl_core-y += $(CXL_CORE_SRC)/pmem.o cxl_core-y += $(CXL_CORE_SRC)/regs.o cxl_core-y += $(CXL_CORE_SRC)/memdev.o From c3bca8d4bb3ff77b8784cdc794eb1f8f89b10fb5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:29:26 -0800 Subject: [PATCH 038/186] cxl/decoder: Hide physical address information from non-root Just like /proc/iomem, CXL physical address information is reserved for root only. Reviewed-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298416650.3018233.450720006145238709.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 3f9b98ecd18b..c5e74c6f04e8 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -49,7 +49,7 @@ static ssize_t start_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%#llx\n", cxld->range.start); } -static DEVICE_ATTR_RO(start); +static DEVICE_ATTR_ADMIN_RO(start); static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) From 608135db1b790170d22848815c4671407af74e37 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:31 -0800 Subject: [PATCH 039/186] cxl/core: Convert decoder range to resource CXL decoders manage address ranges in a hierarchical fashion whereby a leaf is a unique subregion of its parent decoder (midlevel or root). It therefore makes sense to use the resource API for handling this. Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron (v1) Signed-off-by: Ben Widawsky Link: https://lore.kernel.org/r/164298417191.3018233.5201055578165414714.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 22 ++++++++-------------- drivers/cxl/core/port.c | 23 +++++++++++++++++++++-- drivers/cxl/cxl.h | 8 ++++++-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index c656a49a11a9..da70f1836db6 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -108,10 +108,8 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, cxld->flags = cfmws_to_decoder_flags(cfmws->restrictions); cxld->target_type = CXL_DECODER_EXPANDER; - cxld->range = (struct range){ - .start = cfmws->base_hpa, - .end = cfmws->base_hpa + cfmws->window_size - 1, - }; + cxld->platform_res = (struct resource)DEFINE_RES_MEM(cfmws->base_hpa, + cfmws->window_size); cxld->interleave_ways = CFMWS_INTERLEAVE_WAYS(cfmws); cxld->interleave_granularity = CFMWS_INTERLEAVE_GRANULARITY(cfmws); @@ -121,14 +119,13 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, else rc = cxl_decoder_autoremove(dev, cxld); if (rc) { - dev_err(dev, "Failed to add decoder for %#llx-%#llx\n", - cfmws->base_hpa, - cfmws->base_hpa + cfmws->window_size - 1); + dev_err(dev, "Failed to add decoder for %pr\n", + &cxld->platform_res); return 0; } - dev_dbg(dev, "add: %s node: %d range %#llx-%#llx\n", - dev_name(&cxld->dev), phys_to_target_node(cxld->range.start), - cfmws->base_hpa, cfmws->base_hpa + cfmws->window_size - 1); + dev_dbg(dev, "add: %s node: %d range %pr\n", dev_name(&cxld->dev), + phys_to_target_node(cxld->platform_res.start), + &cxld->platform_res); return 0; } @@ -270,10 +267,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) cxld->interleave_ways = 1; cxld->interleave_granularity = PAGE_SIZE; cxld->target_type = CXL_DECODER_EXPANDER; - cxld->range = (struct range) { - .start = 0, - .end = -1, - }; + cxld->platform_res = (struct resource)DEFINE_RES_MEM(0, 0); device_lock(&port->dev); dport = list_first_entry(&port->dports, typeof(*dport), list); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5e74c6f04e8..63c76cb2a2ec 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -46,8 +46,14 @@ static ssize_t start_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_decoder *cxld = to_cxl_decoder(dev); + u64 start; - return sysfs_emit(buf, "%#llx\n", cxld->range.start); + if (is_root_decoder(dev)) + start = cxld->platform_res.start; + else + start = cxld->decoder_range.start; + + return sysfs_emit(buf, "%#llx\n", start); } static DEVICE_ATTR_ADMIN_RO(start); @@ -55,8 +61,14 @@ static ssize_t size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct cxl_decoder *cxld = to_cxl_decoder(dev); + u64 size; - return sysfs_emit(buf, "%#llx\n", range_len(&cxld->range)); + if (is_root_decoder(dev)) + size = resource_size(&cxld->platform_res); + else + size = range_len(&cxld->decoder_range); + + return sysfs_emit(buf, "%#llx\n", size); } static DEVICE_ATTR_RO(size); @@ -546,6 +558,13 @@ int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) if (rc) return rc; + /* + * Platform decoder resources should show up with a reasonable name. All + * other resources are just sub ranges within the main decoder resource. + */ + if (is_root_decoder(dev)) + cxld->platform_res.name = dev_name(dev); + return device_add(dev); } EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 38779409a419..bfd95acea66c 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -179,7 +179,8 @@ enum cxl_decoder_type { * struct cxl_decoder - CXL address range decode configuration * @dev: this decoder's device * @id: kernel device name id - * @range: address range considered by this decoder + * @platform_res: address space resources considered by root decoder + * @decoder_range: address space resources considered by midlevel decoder * @interleave_ways: number of cxl_dports in this decode * @interleave_granularity: data stride per dport * @target_type: accelerator vs expander (type2 vs type3) selector @@ -190,7 +191,10 @@ enum cxl_decoder_type { struct cxl_decoder { struct device dev; int id; - struct range range; + union { + struct resource platform_res; + struct range decoder_range; + }; int interleave_ways; int interleave_granularity; enum cxl_decoder_type target_type; From d54c1bbe2d34e301382968d8b05bd8162e8f60fb Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Mon, 31 Jan 2022 13:33:13 -0800 Subject: [PATCH 040/186] cxl/core/port: Clarify decoder creation Add wrappers for the creation of decoder objects at the root level and switch level, and keep the core helper private to cxl/core/port.c. Root decoders are static descriptors conveyed from platform firmware (e.g. ACPI CFMWS). Switch decoders are CXL standard decoders enumerated via the HDM decoder capability structure. The base address for the HDM decoder capability structure may be conveyed either by PCIe or platform firmware (ACPI CEDT.CHBS). Additionally, the kdoc descriptions for these helpers and their dependencies is updated. Signed-off-by: Ben Widawsky [djbw: fixup changelog, clarify kdoc] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164366463014.111117.9714595404002687111.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 4 +- drivers/cxl/core/port.c | 83 +++++++++++++++++++++++++++++++++++++---- drivers/cxl/cxl.h | 16 +++++++- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index da70f1836db6..0b267eabb15e 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -102,7 +102,7 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, for (i = 0; i < CFMWS_INTERLEAVE_WAYS(cfmws); i++) target_map[i] = cfmws->interleave_targets[i]; - cxld = cxl_decoder_alloc(root_port, CFMWS_INTERLEAVE_WAYS(cfmws)); + cxld = cxl_root_decoder_alloc(root_port, CFMWS_INTERLEAVE_WAYS(cfmws)); if (IS_ERR(cxld)) return 0; @@ -260,7 +260,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) * dport. Disable the range until the first CXL region is enumerated / * activated. */ - cxld = cxl_decoder_alloc(port, 1); + cxld = cxl_switch_decoder_alloc(port, 1); if (IS_ERR(cxld)) return PTR_ERR(cxld); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 63c76cb2a2ec..88ffec71464a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -495,13 +495,26 @@ out_unlock: return rc; } -struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, int nr_targets) +/** + * cxl_decoder_alloc - Allocate a new CXL decoder + * @port: owning port of this decoder + * @nr_targets: downstream targets accessible by this decoder. All upstream + * ports and root ports must have at least 1 target. + * + * A port should contain one or more decoders. Each of those decoders enable + * some address space for CXL.mem utilization. A decoder is expected to be + * configured by the caller before registering. + * + * Return: A new cxl decoder to be registered by cxl_decoder_add() + */ +static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) { struct cxl_decoder *cxld; struct device *dev; int rc = 0; - if (nr_targets > CXL_DECODER_MAX_INTERLEAVE || nr_targets < 1) + if (nr_targets > CXL_DECODER_MAX_INTERLEAVE || nr_targets == 0) return ERR_PTR(-EINVAL); cxld = kzalloc(struct_size(cxld, target, nr_targets), GFP_KERNEL); @@ -519,20 +532,74 @@ struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, int nr_targets) device_set_pm_not_required(dev); dev->parent = &port->dev; dev->bus = &cxl_bus_type; - - /* root ports do not have a cxl_port_type parent */ - if (port->dev.parent->type == &cxl_port_type) - dev->type = &cxl_decoder_switch_type; + if (is_cxl_root(port)) + cxld->dev.type = &cxl_decoder_root_type; else - dev->type = &cxl_decoder_root_type; + cxld->dev.type = &cxl_decoder_switch_type; return cxld; err: kfree(cxld); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(cxl_decoder_alloc, CXL); +/** + * cxl_root_decoder_alloc - Allocate a root level decoder + * @port: owning CXL root of this decoder + * @nr_targets: static number of downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'CXL root' decoder is one that decodes from a top-level / static platform + * firmware description of CXL resources into a CXL standard decode + * topology. + */ +struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + if (!is_cxl_root(port)) + return ERR_PTR(-EINVAL); + + return cxl_decoder_alloc(port, nr_targets); +} +EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); + +/** + * cxl_switch_decoder_alloc - Allocate a switch level decoder + * @port: owning CXL switch port of this decoder + * @nr_targets: max number of dynamically addressable downstream targets + * + * Return: A new cxl decoder to be registered by cxl_decoder_add(). A + * 'switch' decoder is any decoder that can be enumerated by PCIe + * topology and the HDM Decoder Capability. This includes the decoders + * that sit between Switch Upstream Ports / Switch Downstream Ports and + * Host Bridges / Root Ports. + */ +struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets) +{ + if (is_cxl_root(port)) + return ERR_PTR(-EINVAL); + + return cxl_decoder_alloc(port, nr_targets); +} +EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); + +/** + * cxl_decoder_add - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_decoder_alloc() + * @target_map: A list of downstream ports that this decoder can direct memory + * traffic to. These numbers should correspond with the port number + * in the PCIe Link Capabilities structure. + * + * Certain types of decoders may not have any targets. The main example of this + * is an endpoint device. A more awkward example is a hostbridge whose root + * ports get hot added (technically possible, though unlikely). + * + * Context: Process context. Takes and releases the cxld's device lock. + * + * Return: Negative error code if the decoder wasn't properly configured; else + * returns 0. + */ int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) { struct cxl_port *port; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index bfd95acea66c..621a70e023c1 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -278,6 +278,17 @@ struct cxl_dport { struct list_head list; }; +/* + * The platform firmware device hosting the root is also the top of the + * CXL port topology. All other CXL ports have another CXL port as their + * parent and their ->uport / host device is out-of-line of the port + * ancestry. + */ +static inline bool is_cxl_root(struct cxl_port *port) +{ + return port->uport == port->dev.parent; +} + struct cxl_port *to_cxl_port(struct device *dev); struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, @@ -288,7 +299,10 @@ int cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, struct cxl_decoder *to_cxl_decoder(struct device *dev); bool is_root_decoder(struct device *dev); -struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, int nr_targets); +struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); +struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, + unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); From d621bc2e7282f9955033a6359877fd4ac4be60e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:29:42 -0800 Subject: [PATCH 041/186] cxl/core: Fix cxl_probe_component_regs() error message Fix a '\n' vs '/n' typo. Fixes: 08422378c4ad ("cxl/pci: Add HDM decoder capabilities") Acked-by: Ben Widawsky Link: https://lore.kernel.org/r/164298418268.3018233.17790073375430834911.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/regs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 0d63758e2605..12a6cbddf110 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -50,7 +50,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) { dev_err(dev, - "Couldn't locate the CXL.cache and CXL.mem capability array header./n"); + "Couldn't locate the CXL.cache and CXL.mem capability array header.\n"); return; } From d2b61ed2ff63fd9f294db8399c7a680ea7fe8a23 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:47 -0800 Subject: [PATCH 042/186] cxl/core/port: Make passthrough decoder init implicit Unused CXL decoders, or ports which use a passthrough decoder (no HDM decoder registers) are expected to be initialized in a specific way. Since upcoming drivers will want the same initialization, and it was already a requirement to have consumers of the API configure the decoder specific to their needs, initialize to this passthrough state by default. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298418778.3018233.13573986275832546547.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 5 ----- drivers/cxl/core/port.c | 9 ++++++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 0b267eabb15e..4e8086525edc 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -264,11 +264,6 @@ static int add_host_bridge_uport(struct device *match, void *arg) if (IS_ERR(cxld)) return PTR_ERR(cxld); - cxld->interleave_ways = 1; - cxld->interleave_granularity = PAGE_SIZE; - cxld->target_type = CXL_DECODER_EXPANDER; - cxld->platform_res = (struct resource)DEFINE_RES_MEM(0, 0); - device_lock(&port->dev); dport = list_first_entry(&port->dports, typeof(*dport), list); device_unlock(&port->dev); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 88ffec71464a..73ff42a05473 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -505,7 +505,8 @@ out_unlock: * some address space for CXL.mem utilization. A decoder is expected to be * configured by the caller before registering. * - * Return: A new cxl decoder to be registered by cxl_decoder_add() + * Return: A new cxl decoder to be registered by cxl_decoder_add(). The decoder + * is initialized to be a "passthrough" decoder. */ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, unsigned int nr_targets) @@ -537,6 +538,12 @@ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, else cxld->dev.type = &cxl_decoder_switch_type; + /* Pre initialize an "empty" decoder */ + cxld->interleave_ways = 1; + cxld->interleave_granularity = PAGE_SIZE; + cxld->target_type = CXL_DECODER_EXPANDER; + cxld->platform_res = (struct resource)DEFINE_RES_MEM(0, 0); + return cxld; err: kfree(cxld); From 53fa1bff3426344d466d91e81f076eab677d0ece Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:29:53 -0800 Subject: [PATCH 043/186] cxl/core: Track port depth In preparation for proving CXL subsystem usage of the device_lock() order track the depth of ports with the expectation that shallower port locks can be held over deeper port locks. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298419321.3018233.4469731547378993606.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 2 ++ drivers/cxl/cxl.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 73ff42a05473..f287d87da6d6 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -362,6 +362,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, if (IS_ERR(port)) return port; + if (parent_port) + port->depth = parent_port->depth + 1; dev = &port->dev; if (parent_port) rc = dev_set_name(dev, "port%d", port->id); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 621a70e023c1..7ade555076bc 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -252,6 +252,7 @@ struct cxl_walk_context { * @dports: cxl_dport instances referenced by decoders * @decoder_ida: allocator for decoder ids * @component_reg_phys: component register capability base address (optional) + * @depth: How deep this port is relative to the root. depth 0 is the root. */ struct cxl_port { struct device dev; @@ -260,6 +261,7 @@ struct cxl_port { struct list_head dports; struct ida decoder_ida; resource_size_t component_reg_phys; + unsigned int depth; }; /** From 3c5b903955251ea464fca383a42d981e33004df6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 11:50:09 -0800 Subject: [PATCH 044/186] cxl: Prove CXL locking When CONFIG_PROVE_LOCKING is enabled the 'struct device' definition gets an additional mutex that is not clobbered by lockdep_set_novalidate_class() like the typical device_lock(). This allows for local annotation of subsystem locks with mutex_lock_nested() per the subsystem's object/lock hierarchy. For CXL, this primarily needs the ability to lock ports by depth and child objects of ports by their parent parent-port lock. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164365853422.99383.1052399160445197427.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 10 ++--- drivers/cxl/core/pmem.c | 4 +- drivers/cxl/core/port.c | 47 +++++++++++++++++------ drivers/cxl/cxl.h | 81 ++++++++++++++++++++++++++++++++++++++++ drivers/cxl/pmem.c | 12 +++--- drivers/nvdimm/nd-core.h | 2 +- lib/Kconfig.debug | 23 ++++++++++++ 7 files changed, 154 insertions(+), 25 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 4e8086525edc..93d1dc56892a 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -176,14 +176,14 @@ static struct cxl_dport *find_dport_by_dev(struct cxl_port *port, struct device { struct cxl_dport *dport; - device_lock(&port->dev); + cxl_device_lock(&port->dev); list_for_each_entry(dport, &port->dports, list) if (dport->dport == dev) { - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); return dport; } - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); return NULL; } @@ -264,9 +264,9 @@ static int add_host_bridge_uport(struct device *match, void *arg) if (IS_ERR(cxld)) return PTR_ERR(cxld); - device_lock(&port->dev); + cxl_device_lock(&port->dev); dport = list_first_entry(&port->dports, typeof(*dport), list); - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); single_port_map[0] = dport->port_id; diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index b5fca97b0a07..40b3f5030496 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -115,10 +115,10 @@ static void unregister_nvb(void *_cxl_nvb) * work to flush. Once the state has been changed to 'dead' then no new * work can be queued by user-triggered bind. */ - device_lock(&cxl_nvb->dev); + cxl_device_lock(&cxl_nvb->dev); flush = cxl_nvb->state != CXL_NVB_NEW; cxl_nvb->state = CXL_NVB_DEAD; - device_unlock(&cxl_nvb->dev); + cxl_device_unlock(&cxl_nvb->dev); /* * Even though the device core will trigger device_release_driver() diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index f287d87da6d6..9285cdb734b2 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -111,7 +111,7 @@ static ssize_t target_list_show(struct device *dev, ssize_t offset = 0; int i, rc = 0; - device_lock(dev); + cxl_device_lock(dev); for (i = 0; i < cxld->interleave_ways; i++) { struct cxl_dport *dport = cxld->target[i]; struct cxl_dport *next = NULL; @@ -127,7 +127,7 @@ static ssize_t target_list_show(struct device *dev, break; offset += rc; } - device_unlock(dev); + cxl_device_unlock(dev); if (rc < 0) return rc; @@ -214,6 +214,12 @@ bool is_root_decoder(struct device *dev) } EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL); +bool is_cxl_decoder(struct device *dev) +{ + return dev->type->release == cxl_decoder_release; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_decoder, CXL); + struct cxl_decoder *to_cxl_decoder(struct device *dev) { if (dev_WARN_ONCE(dev, dev->type->release != cxl_decoder_release, @@ -235,10 +241,10 @@ static void cxl_port_release(struct device *dev) struct cxl_port *port = to_cxl_port(dev); struct cxl_dport *dport, *_d; - device_lock(dev); + cxl_device_lock(dev); list_for_each_entry_safe(dport, _d, &port->dports, list) cxl_dport_release(dport); - device_unlock(dev); + cxl_device_unlock(dev); ida_free(&cxl_port_ida, port->id); kfree(port); } @@ -254,6 +260,12 @@ static const struct device_type cxl_port_type = { .groups = cxl_port_attribute_groups, }; +bool is_cxl_port(struct device *dev) +{ + return dev->type == &cxl_port_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_port, CXL); + struct cxl_port *to_cxl_port(struct device *dev) { if (dev_WARN_ONCE(dev, dev->type != &cxl_port_type, @@ -261,13 +273,14 @@ struct cxl_port *to_cxl_port(struct device *dev) return NULL; return container_of(dev, struct cxl_port, dev); } +EXPORT_SYMBOL_NS_GPL(to_cxl_port, CXL); static void unregister_port(void *_port) { struct cxl_port *port = _port; struct cxl_dport *dport; - device_lock(&port->dev); + cxl_device_lock(&port->dev); list_for_each_entry(dport, &port->dports, list) { char link_name[CXL_TARGET_STRLEN]; @@ -276,7 +289,7 @@ static void unregister_port(void *_port) continue; sysfs_remove_link(&port->dev.kobj, link_name); } - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); device_unregister(&port->dev); } @@ -407,7 +420,7 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) { struct cxl_dport *dup; - device_lock(&port->dev); + cxl_device_lock(&port->dev); dup = find_dport(port, new->port_id); if (dup) dev_err(&port->dev, @@ -416,7 +429,7 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) dev_name(dup->dport)); else list_add_tail(&new->list, &port->dports); - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); return dup ? -EEXIST : 0; } @@ -475,7 +488,7 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, if (!target_map) return 0; - device_lock(&port->dev); + cxl_device_lock(&port->dev); if (list_empty(&port->dports)) { rc = -EINVAL; goto out_unlock; @@ -492,7 +505,7 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, } out_unlock: - device_unlock(&port->dev); + cxl_device_unlock(&port->dev); return rc; } @@ -717,15 +730,27 @@ static int cxl_bus_match(struct device *dev, struct device_driver *drv) static int cxl_bus_probe(struct device *dev) { - return to_cxl_drv(dev->driver)->probe(dev); + int rc; + + /* + * Take the CXL nested lock since the driver core only holds + * @dev->mutex and not @dev->lockdep_mutex. + */ + cxl_nested_lock(dev); + rc = to_cxl_drv(dev->driver)->probe(dev); + cxl_nested_unlock(dev); + + return rc; } static void cxl_bus_remove(struct device *dev) { struct cxl_driver *cxl_drv = to_cxl_drv(dev->driver); + cxl_nested_lock(dev); if (cxl_drv->remove) cxl_drv->remove(dev); + cxl_nested_unlock(dev); } struct bus_type cxl_bus_type = { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 7ade555076bc..6a38d2e1f3dd 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -291,6 +291,7 @@ static inline bool is_cxl_root(struct cxl_port *port) return port->uport == port->dev.parent; } +bool is_cxl_port(struct device *dev); struct cxl_port *to_cxl_port(struct device *dev); struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, @@ -301,6 +302,7 @@ int cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, struct cxl_decoder *to_cxl_decoder(struct device *dev); bool is_root_decoder(struct device *dev); +bool is_cxl_decoder(struct device *dev); struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, @@ -353,4 +355,83 @@ struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd); #ifndef __mock #define __mock static #endif + +#ifdef CONFIG_PROVE_CXL_LOCKING +enum cxl_lock_class { + CXL_ANON_LOCK, + CXL_NVDIMM_LOCK, + CXL_NVDIMM_BRIDGE_LOCK, + CXL_PORT_LOCK, + /* + * Be careful to add new lock classes here, CXL_PORT_LOCK is + * extended by the port depth, so a maximum CXL port topology + * depth would need to be defined first. + */ +}; + +static inline void cxl_nested_lock(struct device *dev) +{ + if (is_cxl_port(dev)) { + struct cxl_port *port = to_cxl_port(dev); + + mutex_lock_nested(&dev->lockdep_mutex, + CXL_PORT_LOCK + port->depth); + } else if (is_cxl_decoder(dev)) { + struct cxl_port *port = to_cxl_port(dev->parent); + + /* + * A decoder is the immediate child of a port, so set + * its lock class equal to other child device siblings. + */ + mutex_lock_nested(&dev->lockdep_mutex, + CXL_PORT_LOCK + port->depth + 1); + } else if (is_cxl_nvdimm_bridge(dev)) + mutex_lock_nested(&dev->lockdep_mutex, CXL_NVDIMM_BRIDGE_LOCK); + else if (is_cxl_nvdimm(dev)) + mutex_lock_nested(&dev->lockdep_mutex, CXL_NVDIMM_LOCK); + else + mutex_lock_nested(&dev->lockdep_mutex, CXL_ANON_LOCK); +} + +static inline void cxl_nested_unlock(struct device *dev) +{ + mutex_unlock(&dev->lockdep_mutex); +} + +static inline void cxl_device_lock(struct device *dev) +{ + /* + * For double lock errors the lockup will happen before lockdep + * warns at cxl_nested_lock(), so assert explicitly. + */ + lockdep_assert_not_held(&dev->lockdep_mutex); + + device_lock(dev); + cxl_nested_lock(dev); +} + +static inline void cxl_device_unlock(struct device *dev) +{ + cxl_nested_unlock(dev); + device_unlock(dev); +} +#else +static inline void cxl_nested_lock(struct device *dev) +{ +} + +static inline void cxl_nested_unlock(struct device *dev) +{ +} + +static inline void cxl_device_lock(struct device *dev) +{ + device_lock(dev); +} + +static inline void cxl_device_unlock(struct device *dev) +{ + device_unlock(dev); +} +#endif #endif /* __CXL_H__ */ diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index b65a272a2d6d..15ad666ab03e 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -43,7 +43,7 @@ static int cxl_nvdimm_probe(struct device *dev) if (!cxl_nvb) return -ENXIO; - device_lock(&cxl_nvb->dev); + cxl_device_lock(&cxl_nvb->dev); if (!cxl_nvb->nvdimm_bus) { rc = -ENXIO; goto out; @@ -68,7 +68,7 @@ static int cxl_nvdimm_probe(struct device *dev) dev_set_drvdata(dev, nvdimm); rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm); out: - device_unlock(&cxl_nvb->dev); + cxl_device_unlock(&cxl_nvb->dev); put_device(&cxl_nvb->dev); return rc; @@ -233,7 +233,7 @@ static void cxl_nvb_update_state(struct work_struct *work) struct nvdimm_bus *victim_bus = NULL; bool release = false, rescan = false; - device_lock(&cxl_nvb->dev); + cxl_device_lock(&cxl_nvb->dev); switch (cxl_nvb->state) { case CXL_NVB_ONLINE: if (!online_nvdimm_bus(cxl_nvb)) { @@ -251,7 +251,7 @@ static void cxl_nvb_update_state(struct work_struct *work) default: break; } - device_unlock(&cxl_nvb->dev); + cxl_device_unlock(&cxl_nvb->dev); if (release) device_release_driver(&cxl_nvb->dev); @@ -327,9 +327,9 @@ static int cxl_nvdimm_bridge_reset(struct device *dev, void *data) return 0; cxl_nvb = to_cxl_nvdimm_bridge(dev); - device_lock(dev); + cxl_device_lock(dev); cxl_nvb->state = CXL_NVB_NEW; - device_unlock(dev); + cxl_device_unlock(dev); return 0; } diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index a11850dd475d..2650a852eeaf 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -185,7 +185,7 @@ static inline void devm_nsio_disable(struct device *dev, } #endif -#ifdef CONFIG_PROVE_LOCKING +#ifdef CONFIG_PROVE_NVDIMM_LOCKING extern struct class *nd_class; enum { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 14b89aa37c5c..7dea203964f7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1516,6 +1516,29 @@ config CSD_LOCK_WAIT_DEBUG include the IPI handler function currently executing (if any) and relevant stack traces. +choice + prompt "Lock debugging: prove subsystem device_lock() correctness" + depends on PROVE_LOCKING + help + For subsystems that have instrumented their usage of the device_lock() + with nested annotations, enable lock dependency checking. The locking + hierarchy 'subclass' identifiers are not compatible across + sub-systems, so only one can be enabled at a time. + +config PROVE_NVDIMM_LOCKING + bool "NVDIMM" + depends on LIBNVDIMM + help + Enable lockdep to validate nd_device_lock() usage. + +config PROVE_CXL_LOCKING + bool "CXL" + depends on CXL_BUS + help + Enable lockdep to validate cxl_device_lock() usage. + +endchoice + endmenu # lock debugging config TRACE_IRQFLAGS From 86c8ea0f3b32aae6d824bdc0d835b6a9361dc912 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 15:35:18 -0800 Subject: [PATCH 045/186] cxl/core/port: Use dedicated lock for decoder target list Lockdep reports: ====================================================== WARNING: possible circular locking dependency detected 5.16.0-rc1+ #142 Tainted: G OE ------------------------------------------------------ cxl/1220 is trying to acquire lock: ffff979b85475460 (kn->active#144){++++}-{0:0}, at: __kernfs_remove+0x1ab/0x1e0 but task is already holding lock: ffff979b87ab38e8 (&dev->lockdep_mutex#2/4){+.+.}-{3:3}, at: cxl_remove_ep+0x50c/0x5c0 [cxl_core] ...where cxl_remove_ep() is a helper that wants to delete ports while holding a lock on the host device for that port. That sets up a lockdep violation whereby target_list_show() can not rely holding the decoder's device lock while walking the target_list. Switch to a dedicated seqlock for this purpose. Reported-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164367209095.208169.1171673319121271280.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 30 +++++++++++++++++++++++------- drivers/cxl/cxl.h | 2 ++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9285cdb734b2..fc5d86222bc3 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -104,14 +104,11 @@ static ssize_t target_type_show(struct device *dev, } static DEVICE_ATTR_RO(target_type); -static ssize_t target_list_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t emit_target_list(struct cxl_decoder *cxld, char *buf) { - struct cxl_decoder *cxld = to_cxl_decoder(dev); ssize_t offset = 0; int i, rc = 0; - cxl_device_lock(dev); for (i = 0; i < cxld->interleave_ways; i++) { struct cxl_dport *dport = cxld->target[i]; struct cxl_dport *next = NULL; @@ -124,13 +121,29 @@ static ssize_t target_list_show(struct device *dev, rc = sysfs_emit_at(buf, offset, "%d%s", dport->port_id, next ? "," : ""); if (rc < 0) - break; + return rc; offset += rc; } - cxl_device_unlock(dev); + + return offset; +} + +static ssize_t target_list_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + ssize_t offset; + unsigned int seq; + int rc; + + do { + seq = read_seqbegin(&cxld->target_lock); + rc = emit_target_list(cxld, buf); + } while (read_seqretry(&cxld->target_lock, seq)); if (rc < 0) return rc; + offset = rc; rc = sysfs_emit_at(buf, offset, "\n"); if (rc < 0) @@ -494,15 +507,17 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, goto out_unlock; } + write_seqlock(&cxld->target_lock); for (i = 0; i < cxld->nr_targets; i++) { struct cxl_dport *dport = find_dport(port, target_map[i]); if (!dport) { rc = -ENXIO; - goto out_unlock; + break; } cxld->target[i] = dport; } + write_sequnlock(&cxld->target_lock); out_unlock: cxl_device_unlock(&port->dev); @@ -543,6 +558,7 @@ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, cxld->id = rc; cxld->nr_targets = nr_targets; + seqlock_init(&cxld->target_lock); dev = &cxld->dev; device_initialize(dev); device_set_pm_not_required(dev); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6a38d2e1f3dd..e79162999088 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -185,6 +185,7 @@ enum cxl_decoder_type { * @interleave_granularity: data stride per dport * @target_type: accelerator vs expander (type2 vs type3) selector * @flags: memory type capabilities and locking + * @target_lock: coordinate coherent reads of the target list * @nr_targets: number of elements in @target * @target: active ordered target list in current decoder configuration */ @@ -199,6 +200,7 @@ struct cxl_decoder { int interleave_granularity; enum cxl_decoder_type target_type; unsigned long flags; + seqlock_t target_lock; int nr_targets; struct cxl_dport *target[]; }; From 5ff7316f6fea4798c66b1ba953d1ebe6617503e4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 08:44:52 -0800 Subject: [PATCH 046/186] cxl/port: Introduce cxl_port_to_pci_bus() Add a helper for converting a PCI enumerated cxl_port into the pci_bus that hosts its dports. For switch ports this is trivial, but for root ports there is no generic way to go from a platform defined host bridge device, like ACPI0016 to its corresponding pci_bus. Rather than spill ACPI goop outside of the cxl_acpi driver, just arrange for it to register an xarray translation from the uport device to the corresponding pci_bus. This is in preparation for centralizing dport enumeration in the core. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164364745633.85488.9744017377155103992.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 14 +++++++++----- drivers/cxl/core/port.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 3 +++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 93d1dc56892a..ab2b76532272 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -225,17 +225,21 @@ static int add_host_bridge_uport(struct device *match, void *arg) return 0; } + /* + * Note that this lookup already succeeded in + * to_cxl_host_bridge(), so no need to check for failure here + */ + pci_root = acpi_pci_find_root(bridge->handle); + rc = devm_cxl_register_pci_bus(host, match, pci_root->bus); + if (rc) + return rc; + port = devm_cxl_add_port(host, match, dport->component_reg_phys, root_port); if (IS_ERR(port)) return PTR_ERR(port); dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev)); - /* - * Note that this lookup already succeeded in - * to_cxl_host_bridge(), so no need to check for failure here - */ - pci_root = acpi_pci_find_root(bridge->handle); ctx = (struct cxl_walk_context){ .dev = host, .root = pci_root->bus, diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fc5d86222bc3..2a4230d685d5 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -25,6 +25,7 @@ */ static DEFINE_IDA(cxl_port_ida); +static DEFINE_XARRAY(cxl_root_buses); static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -418,6 +419,42 @@ err: } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, CXL); +struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port) +{ + /* There is no pci_bus associated with a CXL platform-root port */ + if (is_cxl_root(port)) + return NULL; + + if (dev_is_pci(port->uport)) { + struct pci_dev *pdev = to_pci_dev(port->uport); + + return pdev->subordinate; + } + + return xa_load(&cxl_root_buses, (unsigned long)port->uport); +} +EXPORT_SYMBOL_NS_GPL(cxl_port_to_pci_bus, CXL); + +static void unregister_pci_bus(void *uport) +{ + xa_erase(&cxl_root_buses, (unsigned long)uport); +} + +int devm_cxl_register_pci_bus(struct device *host, struct device *uport, + struct pci_bus *bus) +{ + int rc; + + if (dev_is_pci(uport)) + return -EINVAL; + + rc = xa_insert(&cxl_root_buses, (unsigned long)uport, bus, GFP_KERNEL); + if (rc) + return rc; + return devm_add_action_or_reset(host, unregister_pci_bus, uport); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, CXL); + static struct cxl_dport *find_dport(struct cxl_port *port, int id) { struct cxl_dport *dport; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index e79162999088..4d4cc8292137 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -295,6 +295,9 @@ static inline bool is_cxl_root(struct cxl_port *port) bool is_cxl_port(struct device *dev); struct cxl_port *to_cxl_port(struct device *dev); +int devm_cxl_register_pci_bus(struct device *host, struct device *uport, + struct pci_bus *bus); +struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port); struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, struct cxl_port *parent_port); From a46cfc0f011ce77d120e1cdbf973f733d18f0105 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 16:34:40 -0800 Subject: [PATCH 047/186] cxl/pmem: Introduce a find_cxl_root() helper In preparation for switch port enumeration while also preserving the potential for multi-domain / multi-root CXL topologies. Introduce a 'struct device' generic mechanism for retrieving a root CXL port, if one is registered. Note that the only known multi-domain CXL configurations are running the cxl_test unit test on a system that also publishes an ACPI0017 device. With this in hand the nvdimm-bridge lookup can be with device_find_child() instead of bus_find_device() + custom mocked lookup infrastructure in cxl_test. The mechanism looks for a 2nd level port since the root level topology is platform-firmware specific and the 2nd level down follows standard PCIe topology expectations. The cxl_acpi 2nd level is associated with a PCIe Root Port. Reported-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164367562182.225521.9488555616768096049.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/pmem.c | 14 +++++++--- drivers/cxl/core/port.c | 49 +++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 1 + tools/testing/cxl/Kbuild | 2 -- tools/testing/cxl/mock_pmem.c | 24 ----------------- 5 files changed, 60 insertions(+), 30 deletions(-) delete mode 100644 tools/testing/cxl/mock_pmem.c diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index 40b3f5030496..8de240c4d96b 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -57,24 +57,30 @@ bool is_cxl_nvdimm_bridge(struct device *dev) } EXPORT_SYMBOL_NS_GPL(is_cxl_nvdimm_bridge, CXL); -__mock int match_nvdimm_bridge(struct device *dev, const void *data) +static int match_nvdimm_bridge(struct device *dev, void *data) { return is_cxl_nvdimm_bridge(dev); } struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd) { + struct cxl_port *port = find_cxl_root(&cxl_nvd->dev); struct device *dev; - dev = bus_find_device(&cxl_bus_type, NULL, cxl_nvd, match_nvdimm_bridge); + if (!port) + return NULL; + + dev = device_find_child(&port->dev, NULL, match_nvdimm_bridge); + put_device(&port->dev); + if (!dev) return NULL; + return to_cxl_nvdimm_bridge(dev); } EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, CXL); -static struct cxl_nvdimm_bridge * -cxl_nvdimm_bridge_alloc(struct cxl_port *port) +static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port) { struct cxl_nvdimm_bridge *cxl_nvb; struct device *dev; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2a4230d685d5..af7a515e4572 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -455,6 +455,55 @@ int devm_cxl_register_pci_bus(struct device *host, struct device *uport, } EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, CXL); +/* Find a 2nd level CXL port that has a dport that is an ancestor of @match */ +static int match_root_child(struct device *dev, const void *match) +{ + const struct device *iter = NULL; + struct cxl_port *port, *parent; + struct cxl_dport *dport; + + if (!is_cxl_port(dev)) + return 0; + + port = to_cxl_port(dev); + if (is_cxl_root(port)) + return 0; + + parent = to_cxl_port(port->dev.parent); + if (!is_cxl_root(parent)) + return 0; + + cxl_device_lock(&port->dev); + list_for_each_entry(dport, &port->dports, list) { + iter = match; + while (iter) { + if (iter == dport->dport) + goto out; + iter = iter->parent; + } + } +out: + cxl_device_unlock(&port->dev); + + return !!iter; +} + +struct cxl_port *find_cxl_root(struct device *dev) +{ + struct device *port_dev; + struct cxl_port *root; + + port_dev = bus_find_device(&cxl_bus_type, NULL, dev, match_root_child); + if (!port_dev) + return NULL; + + root = to_cxl_port(port_dev->parent); + get_device(&root->dev); + put_device(port_dev); + return root; +} +EXPORT_SYMBOL_NS_GPL(find_cxl_root, CXL); + static struct cxl_dport *find_dport(struct cxl_port *port, int id) { struct cxl_dport *dport; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 4d4cc8292137..61b0db526fa2 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -304,6 +304,7 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, int cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, resource_size_t component_reg_phys); +struct cxl_port *find_cxl_root(struct device *dev); struct cxl_decoder *to_cxl_decoder(struct device *dev); bool is_root_decoder(struct device *dev); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3299fb0977b2..ddaee8a2c418 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -32,6 +32,4 @@ cxl_core-y += $(CXL_CORE_SRC)/memdev.o cxl_core-y += $(CXL_CORE_SRC)/mbox.o cxl_core-y += config_check.o -cxl_core-y += mock_pmem.o - obj-m += test/ diff --git a/tools/testing/cxl/mock_pmem.c b/tools/testing/cxl/mock_pmem.c deleted file mode 100644 index f7315e6f52c0..000000000000 --- a/tools/testing/cxl/mock_pmem.c +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ -#include -#include "test/mock.h" -#include - -int match_nvdimm_bridge(struct device *dev, const void *data) -{ - int index, rc = 0; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - const struct cxl_nvdimm *cxl_nvd = data; - - if (ops) { - if (dev->type == &cxl_nvdimm_bridge_type && - (ops->is_mock_dev(dev->parent->parent) == - ops->is_mock_dev(cxl_nvd->dev.parent->parent))) - rc = 1; - } else - rc = dev->type == &cxl_nvdimm_bridge_type; - - put_cxl_mock_ops(index); - - return rc; -} From c978f1b10aba8ce4f8e1f6fcc86b174e08a6e7f7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 17:07:38 -0800 Subject: [PATCH 048/186] cxl/port: Up-level cxl_add_dport() locking requirements to the caller In preparation for moving dport enumeration into the core, require the port device lock to be acquired by the caller. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164367759016.324231.105551648350470000.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 2 ++ drivers/cxl/core/port.c | 3 +-- tools/testing/cxl/mock_acpi.c | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index ab2b76532272..5d848b77d8e8 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -342,7 +342,9 @@ static int add_host_bridge_dport(struct device *match, void *arg) return 0; } + cxl_device_lock(&root_port->dev); rc = cxl_add_dport(root_port, match, uid, ctx.chbcr); + cxl_device_unlock(&root_port->dev); if (rc) { dev_err(host, "failed to add downstream port: %s\n", dev_name(match)); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index af7a515e4572..369cc52e0837 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -519,7 +519,7 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) { struct cxl_dport *dup; - cxl_device_lock(&port->dev); + device_lock_assert(&port->dev); dup = find_dport(port, new->port_id); if (dup) dev_err(&port->dev, @@ -528,7 +528,6 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) dev_name(dup->dport)); else list_add_tail(&new->list, &port->dports); - cxl_device_unlock(&port->dev); return dup ? -EEXIST : 0; } diff --git a/tools/testing/cxl/mock_acpi.c b/tools/testing/cxl/mock_acpi.c index 4c8a493ace56..c953e3ab6494 100644 --- a/tools/testing/cxl/mock_acpi.c +++ b/tools/testing/cxl/mock_acpi.c @@ -57,7 +57,9 @@ static int match_add_root_port(struct pci_dev *pdev, void *data) /* TODO walk DVSEC to find component register base */ port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); + cxl_device_lock(&port->dev); rc = cxl_add_dport(port, &pdev->dev, port_num, CXL_RESOURCE_NONE); + cxl_device_unlock(&port->dev); if (rc) { dev_err(dev, "failed to add dport: %s (%d)\n", dev_name(&pdev->dev), rc); @@ -78,7 +80,9 @@ static int mock_add_root_port(struct platform_device *pdev, void *data) struct device *dev = ctx->dev; int rc; + cxl_device_lock(&port->dev); rc = cxl_add_dport(port, &pdev->dev, pdev->id, CXL_RESOURCE_NONE); + cxl_device_unlock(&port->dev); if (rc) { dev_err(dev, "failed to add dport: %s (%d)\n", dev_name(&pdev->dev), rc); From af9cae9facc2de773b4aa59916913cfd6e18bdd0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:30:25 -0800 Subject: [PATCH 049/186] cxl/pci: Rename pci.h to cxlpci.h Similar to the mem.h rename, if the core wants to reuse definitions from drivers/cxl/pci.h it is unable to use as that collides with archs that have an arch/$arch/include/asm/pci.h, like MIPS. Reported-by: kernel test robot Acked-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298422510.3018233.14693126572756675563.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/regs.c | 2 +- drivers/cxl/{pci.h => cxlpci.h} | 1 + drivers/cxl/pci.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) rename drivers/cxl/{pci.h => cxlpci.h} (99%) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 5d848b77d8e8..8d4fd7534e1e 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -6,8 +6,8 @@ #include #include #include +#include "cxlpci.h" #include "cxl.h" -#include "pci.h" /* Encode defined in CXL 2.0 8.2.5.12.7 HDM Decoder Control Register */ #define CFMWS_INTERLEAVE_WAYS(x) (1 << (x)->interleave_ways) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 12a6cbddf110..65d7f5880671 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include /** * DOC: cxl registers diff --git a/drivers/cxl/pci.h b/drivers/cxl/cxlpci.h similarity index 99% rename from drivers/cxl/pci.h rename to drivers/cxl/cxlpci.h index 0623bb85f30a..eb00f597a157 100644 --- a/drivers/cxl/pci.h +++ b/drivers/cxl/cxlpci.h @@ -2,6 +2,7 @@ /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #ifndef __CXL_PCI_H__ #define __CXL_PCI_H__ +#include "cxl.h" #define CXL_MEMORY_PROGIF 0x10 diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 7d5f0c873aba..8b435b875b65 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -10,7 +10,7 @@ #include #include #include "cxlmem.h" -#include "pci.h" +#include "cxlpci.h" #include "cxl.h" /** From 98d2d3a264543680281fd8a4e6ae490ca26b4f85 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 18:10:04 -0800 Subject: [PATCH 050/186] cxl/core: Generalize dport enumeration in the core The core houses infrastructure for decoder resources. A CXL port's dports are more closely related to decoder infrastructure than topology enumeration. Implement generic PCI based dport enumeration in the core, i.e. arrange for existing root port enumeration from cxl_acpi to share code with switch port enumeration which just amounts to a small difference in a pci_walk_bus() invocation once the appropriate 'struct pci_bus' has been retrieved. Set the convention that decoder objects are registered after all dports are enumerated. This enables userspace to know when the CXL core is finished establishing 'dportX' links underneath the 'portX' object. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164368114191.354031.5270501846455462665.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 67 +++------------------- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/pci.c | 101 ++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 91 +++++++++++++++++------------- drivers/cxl/cxl.h | 16 ++---- drivers/cxl/cxlpci.h | 1 + tools/testing/cxl/Kbuild | 3 +- tools/testing/cxl/mock_acpi.c | 78 -------------------------- tools/testing/cxl/test/cxl.c | 67 +++++++++++++++------- tools/testing/cxl/test/mock.c | 45 +++++++-------- tools/testing/cxl/test/mock.h | 6 +- 11 files changed, 238 insertions(+), 238 deletions(-) create mode 100644 drivers/cxl/core/pci.c diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 8d4fd7534e1e..259441245687 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -130,48 +130,6 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, return 0; } -__mock int match_add_root_ports(struct pci_dev *pdev, void *data) -{ - resource_size_t creg = CXL_RESOURCE_NONE; - struct cxl_walk_context *ctx = data; - struct pci_bus *root_bus = ctx->root; - struct cxl_port *port = ctx->port; - int type = pci_pcie_type(pdev); - struct device *dev = ctx->dev; - struct cxl_register_map map; - u32 lnkcap, port_num; - int rc; - - if (pdev->bus != root_bus) - return 0; - if (!pci_is_pcie(pdev)) - return 0; - if (type != PCI_EXP_TYPE_ROOT_PORT) - return 0; - if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, - &lnkcap) != PCIBIOS_SUCCESSFUL) - return 0; - - /* The driver doesn't rely on component registers for Root Ports yet. */ - rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); - if (!rc) - dev_info(&pdev->dev, "No component register block found\n"); - - creg = cxl_regmap_to_base(pdev, &map); - - port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - rc = cxl_add_dport(port, &pdev->dev, port_num, creg); - if (rc) { - ctx->error = rc; - return rc; - } - ctx->count++; - - dev_dbg(dev, "add dport%d: %s\n", port_num, dev_name(&pdev->dev)); - - return 0; -} - static struct cxl_dport *find_dport_by_dev(struct cxl_port *port, struct device *dev) { struct cxl_dport *dport; @@ -210,7 +168,6 @@ static int add_host_bridge_uport(struct device *match, void *arg) struct device *host = root_port->dev.parent; struct acpi_device *bridge = to_cxl_host_bridge(host, match); struct acpi_pci_root *pci_root; - struct cxl_walk_context ctx; int single_port_map[1], rc; struct cxl_decoder *cxld; struct cxl_dport *dport; @@ -240,18 +197,10 @@ static int add_host_bridge_uport(struct device *match, void *arg) return PTR_ERR(port); dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev)); - ctx = (struct cxl_walk_context){ - .dev = host, - .root = pci_root->bus, - .port = port, - }; - pci_walk_bus(pci_root->bus, match_add_root_ports, &ctx); - - if (ctx.count == 0) - return -ENODEV; - if (ctx.error) - return ctx.error; - if (ctx.count > 1) + rc = devm_cxl_port_enumerate_dports(host, port); + if (rc < 0) + return rc; + if (rc > 1) return 0; /* TODO: Scan CHBCR for HDM Decoder resources */ @@ -311,9 +260,9 @@ static int cxl_get_chbcr(union acpi_subtable_headers *header, void *arg, static int add_host_bridge_dport(struct device *match, void *arg) { - int rc; acpi_status status; unsigned long long uid; + struct cxl_dport *dport; struct cxl_chbs_context ctx; struct cxl_port *root_port = arg; struct device *host = root_port->dev.parent; @@ -343,12 +292,12 @@ static int add_host_bridge_dport(struct device *match, void *arg) } cxl_device_lock(&root_port->dev); - rc = cxl_add_dport(root_port, match, uid, ctx.chbcr); + dport = devm_cxl_add_dport(host, root_port, match, uid, ctx.chbcr); cxl_device_unlock(&root_port->dev); - if (rc) { + if (IS_ERR(dport)) { dev_err(host, "failed to add downstream port: %s\n", dev_name(match)); - return rc; + return PTR_ERR(dport); } dev_dbg(host, "add dport%llu: %s\n", uid, dev_name(match)); return 0; diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index a90202ac88d2..91057f0ec763 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -7,3 +7,4 @@ cxl_core-y += pmem.o cxl_core-y += regs.o cxl_core-y += memdev.o cxl_core-y += mbox.o +cxl_core-y += pci.o diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c new file mode 100644 index 000000000000..c5a9e03ed477 --- /dev/null +++ b/drivers/cxl/core/pci.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2021 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include "core.h" + +/** + * DOC: cxl core pci + * + * Compute Express Link protocols are layered on top of PCIe. CXL core provides + * a set of helpers for CXL interactions which occur via PCIe. + */ + +struct cxl_walk_context { + struct pci_bus *bus; + struct device *host; + struct cxl_port *port; + int type; + int error; + int count; +}; + +static int match_add_dports(struct pci_dev *pdev, void *data) +{ + struct cxl_walk_context *ctx = data; + struct cxl_port *port = ctx->port; + int type = pci_pcie_type(pdev); + struct cxl_register_map map; + struct cxl_dport *dport; + u32 lnkcap, port_num; + int rc; + + if (pdev->bus != ctx->bus) + return 0; + if (!pci_is_pcie(pdev)) + return 0; + if (type != ctx->type) + return 0; + if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, + &lnkcap)) + return 0; + + rc = cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + dev_dbg(&port->dev, "failed to find component registers\n"); + + port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); + cxl_device_lock(&port->dev); + dport = devm_cxl_add_dport(ctx->host, port, &pdev->dev, port_num, + cxl_regmap_to_base(pdev, &map)); + cxl_device_unlock(&port->dev); + if (IS_ERR(dport)) { + ctx->error = PTR_ERR(dport); + return PTR_ERR(dport); + } + ctx->count++; + + dev_dbg(&port->dev, "add dport%d: %s\n", port_num, dev_name(&pdev->dev)); + + return 0; +} + +/** + * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port + * @host: devm context + * @port: cxl_port whose ->uport is the upstream of dports to be enumerated + * + * Returns a positive number of dports enumerated or a negative error + * code. + */ +int devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port) +{ + struct pci_bus *bus = cxl_port_to_pci_bus(port); + struct cxl_walk_context ctx; + int type; + + if (!bus) + return -ENXIO; + + if (pci_is_root_bus(bus)) + type = PCI_EXP_TYPE_ROOT_PORT; + else + type = PCI_EXP_TYPE_DOWNSTREAM; + + ctx = (struct cxl_walk_context) { + .host = host, + .port = port, + .bus = bus, + .type = type, + }; + pci_walk_bus(bus, match_add_dports, &ctx); + + if (ctx.count == 0) + return -ENODEV; + if (ctx.error) + return ctx.error; + return ctx.count; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_port_enumerate_dports, CXL); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 369cc52e0837..fee9c7affef4 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -243,22 +243,10 @@ struct cxl_decoder *to_cxl_decoder(struct device *dev) } EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL); -static void cxl_dport_release(struct cxl_dport *dport) -{ - list_del(&dport->list); - put_device(dport->dport); - kfree(dport); -} - static void cxl_port_release(struct device *dev) { struct cxl_port *port = to_cxl_port(dev); - struct cxl_dport *dport, *_d; - cxl_device_lock(dev); - list_for_each_entry_safe(dport, _d, &port->dports, list) - cxl_dport_release(dport); - cxl_device_unlock(dev); ida_free(&cxl_port_ida, port->id); kfree(port); } @@ -292,18 +280,7 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_port, CXL); static void unregister_port(void *_port) { struct cxl_port *port = _port; - struct cxl_dport *dport; - cxl_device_lock(&port->dev); - list_for_each_entry(dport, &port->dports, list) { - char link_name[CXL_TARGET_STRLEN]; - - if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", - dport->port_id) >= CXL_TARGET_STRLEN) - continue; - sysfs_remove_link(&port->dev.kobj, link_name); - } - cxl_device_unlock(&port->dev); device_unregister(&port->dev); } @@ -532,51 +509,87 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) return dup ? -EEXIST : 0; } +static void cxl_dport_remove(void *data) +{ + struct cxl_dport *dport = data; + struct cxl_port *port = dport->port; + + put_device(dport->dport); + cxl_device_lock(&port->dev); + list_del(&dport->list); + cxl_device_unlock(&port->dev); +} + +static void cxl_dport_unlink(void *data) +{ + struct cxl_dport *dport = data; + struct cxl_port *port = dport->port; + char link_name[CXL_TARGET_STRLEN]; + + sprintf(link_name, "dport%d", dport->port_id); + sysfs_remove_link(&port->dev.kobj, link_name); +} + /** - * cxl_add_dport - append downstream port data to a cxl_port + * devm_cxl_add_dport - append downstream port data to a cxl_port + * @host: devm context for allocations * @port: the cxl_port that references this dport * @dport_dev: firmware or PCI device representing the dport * @port_id: identifier for this dport in a decoder's target list * @component_reg_phys: optional location of CXL component registers * - * Note that all allocations and links are undone by cxl_port deletion - * and release. + * Note that dports are appended to the devm release action's of the + * either the port's host (for root ports), or the port itself (for + * switch ports) */ -int cxl_add_dport(struct cxl_port *port, struct device *dport_dev, int port_id, - resource_size_t component_reg_phys) +struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, + struct device *dport_dev, int port_id, + resource_size_t component_reg_phys) { char link_name[CXL_TARGET_STRLEN]; struct cxl_dport *dport; int rc; + if (!host->driver) { + dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n", + dev_name(dport_dev)); + return ERR_PTR(-ENXIO); + } + if (snprintf(link_name, CXL_TARGET_STRLEN, "dport%d", port_id) >= CXL_TARGET_STRLEN) - return -EINVAL; + return ERR_PTR(-EINVAL); - dport = kzalloc(sizeof(*dport), GFP_KERNEL); + dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL); if (!dport) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&dport->list); - dport->dport = get_device(dport_dev); + dport->dport = dport_dev; dport->port_id = port_id; dport->component_reg_phys = component_reg_phys; dport->port = port; rc = add_dport(port, dport); if (rc) - goto err; + return ERR_PTR(rc); + + get_device(dport_dev); + rc = devm_add_action_or_reset(host, cxl_dport_remove, dport); + if (rc) + return ERR_PTR(rc); rc = sysfs_create_link(&port->dev.kobj, &dport_dev->kobj, link_name); if (rc) - goto err; + return ERR_PTR(rc); - return 0; -err: - cxl_dport_release(dport); - return rc; + rc = devm_add_action_or_reset(host, cxl_dport_unlink, dport); + if (rc) + return ERR_PTR(rc); + + return dport; } -EXPORT_SYMBOL_NS_GPL(cxl_add_dport, CXL); +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL); static int decoder_populate_targets(struct cxl_decoder *cxld, struct cxl_port *port, int *target_map) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 61b0db526fa2..0754c68ccd33 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -236,14 +236,6 @@ struct cxl_nvdimm { struct nvdimm *nvdimm; }; -struct cxl_walk_context { - struct device *dev; - struct pci_bus *root; - struct cxl_port *port; - int error; - int count; -}; - /** * struct cxl_port - logical collection of upstream port devices and * downstream port devices to construct a CXL memory @@ -295,17 +287,17 @@ static inline bool is_cxl_root(struct cxl_port *port) bool is_cxl_port(struct device *dev); struct cxl_port *to_cxl_port(struct device *dev); +struct pci_bus; int devm_cxl_register_pci_bus(struct device *host, struct device *uport, struct pci_bus *bus); struct pci_bus *cxl_port_to_pci_bus(struct cxl_port *port); struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, struct cxl_port *parent_port); - -int cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, - resource_size_t component_reg_phys); struct cxl_port *find_cxl_root(struct device *dev); - +struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, + struct device *dport, int port_id, + resource_size_t component_reg_phys); struct cxl_decoder *to_cxl_decoder(struct device *dev); bool is_root_decoder(struct device *dev); bool is_cxl_decoder(struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index eb00f597a157..103636fda198 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -57,4 +57,5 @@ static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, return pci_resource_start(pdev, map->barno) + map->block_offset; } +int devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port); #endif /* __CXL_PCI_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index ddaee8a2c418..61123544aa49 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -3,8 +3,8 @@ ldflags-y += --wrap=acpi_table_parse_cedt ldflags-y += --wrap=is_acpi_device_node ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root -ldflags-y += --wrap=pci_walk_bus ldflags-y += --wrap=nvdimm_bus_register +ldflags-y += --wrap=devm_cxl_port_enumerate_dports DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl @@ -30,6 +30,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pmem.o cxl_core-y += $(CXL_CORE_SRC)/regs.o cxl_core-y += $(CXL_CORE_SRC)/memdev.o cxl_core-y += $(CXL_CORE_SRC)/mbox.o +cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += config_check.o obj-m += test/ diff --git a/tools/testing/cxl/mock_acpi.c b/tools/testing/cxl/mock_acpi.c index c953e3ab6494..55813de26d46 100644 --- a/tools/testing/cxl/mock_acpi.c +++ b/tools/testing/cxl/mock_acpi.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include "test/mock.h" @@ -34,80 +33,3 @@ out: put_cxl_mock_ops(index); return found; } - -static int match_add_root_port(struct pci_dev *pdev, void *data) -{ - struct cxl_walk_context *ctx = data; - struct pci_bus *root_bus = ctx->root; - struct cxl_port *port = ctx->port; - int type = pci_pcie_type(pdev); - struct device *dev = ctx->dev; - u32 lnkcap, port_num; - int rc; - - if (pdev->bus != root_bus) - return 0; - if (!pci_is_pcie(pdev)) - return 0; - if (type != PCI_EXP_TYPE_ROOT_PORT) - return 0; - if (pci_read_config_dword(pdev, pci_pcie_cap(pdev) + PCI_EXP_LNKCAP, - &lnkcap) != PCIBIOS_SUCCESSFUL) - return 0; - - /* TODO walk DVSEC to find component register base */ - port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - cxl_device_lock(&port->dev); - rc = cxl_add_dport(port, &pdev->dev, port_num, CXL_RESOURCE_NONE); - cxl_device_unlock(&port->dev); - if (rc) { - dev_err(dev, "failed to add dport: %s (%d)\n", - dev_name(&pdev->dev), rc); - ctx->error = rc; - return rc; - } - ctx->count++; - - dev_dbg(dev, "add dport%d: %s\n", port_num, dev_name(&pdev->dev)); - - return 0; -} - -static int mock_add_root_port(struct platform_device *pdev, void *data) -{ - struct cxl_walk_context *ctx = data; - struct cxl_port *port = ctx->port; - struct device *dev = ctx->dev; - int rc; - - cxl_device_lock(&port->dev); - rc = cxl_add_dport(port, &pdev->dev, pdev->id, CXL_RESOURCE_NONE); - cxl_device_unlock(&port->dev); - if (rc) { - dev_err(dev, "failed to add dport: %s (%d)\n", - dev_name(&pdev->dev), rc); - ctx->error = rc; - return rc; - } - ctx->count++; - - dev_dbg(dev, "add dport%d: %s\n", pdev->id, dev_name(&pdev->dev)); - - return 0; -} - -int match_add_root_ports(struct pci_dev *dev, void *data) -{ - int index, rc; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - struct platform_device *pdev = (struct platform_device *) dev; - - if (ops && ops->is_mock_port(pdev)) - rc = mock_add_root_port(pdev, data); - else - rc = match_add_root_port(dev, data); - - put_cxl_mock_ops(index); - - return rc; -} diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 736d99006fb7..ef002e909d38 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -317,6 +317,19 @@ static bool is_mock_bridge(struct device *dev) for (i = 0; i < ARRAY_SIZE(cxl_host_bridge); i++) if (dev == &cxl_host_bridge[i]->dev) return true; + return false; +} + +static bool is_mock_port(struct device *dev) +{ + int i; + + if (is_mock_bridge(dev)) + return true; + + for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) + if (dev == &cxl_root_port[i]->dev) + return true; return false; } @@ -366,26 +379,6 @@ static struct acpi_pci_root mock_pci_root[NR_CXL_HOST_BRIDGES] = { }, }; -static struct platform_device *mock_cxl_root_port(struct pci_bus *bus, int index) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mock_pci_bus); i++) - if (bus == &mock_pci_bus[i]) - return cxl_root_port[index + i * NR_CXL_ROOT_PORTS]; - return NULL; -} - -static bool is_mock_port(struct platform_device *pdev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) - if (pdev == cxl_root_port[i]) - return true; - return false; -} - static bool is_mock_bus(struct pci_bus *bus) { int i; @@ -405,16 +398,47 @@ static struct acpi_pci_root *mock_acpi_pci_find_root(acpi_handle handle) return &mock_pci_root[host_bridge_index(adev)]; } +static int mock_cxl_port_enumerate_dports(struct device *host, + struct cxl_port *port) +{ + struct device *dev = &port->dev; + int i; + + for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) { + struct platform_device *pdev = cxl_root_port[i]; + struct cxl_dport *dport; + + if (pdev->dev.parent != port->uport) + continue; + + cxl_device_lock(&port->dev); + dport = devm_cxl_add_dport(host, port, &pdev->dev, pdev->id, + CXL_RESOURCE_NONE); + cxl_device_unlock(&port->dev); + + if (IS_ERR(dport)) { + dev_err(dev, "failed to add dport: %s (%ld)\n", + dev_name(&pdev->dev), PTR_ERR(dport)); + return PTR_ERR(dport); + } + + dev_dbg(dev, "add dport%d: %s\n", pdev->id, + dev_name(&pdev->dev)); + } + + return 0; +} + static struct cxl_mock_ops cxl_mock_ops = { .is_mock_adev = is_mock_adev, .is_mock_bridge = is_mock_bridge, .is_mock_bus = is_mock_bus, .is_mock_port = is_mock_port, .is_mock_dev = is_mock_dev, - .mock_port = mock_cxl_root_port, .acpi_table_parse_cedt = mock_acpi_table_parse_cedt, .acpi_evaluate_integer = mock_acpi_evaluate_integer, .acpi_pci_find_root = mock_acpi_pci_find_root, + .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; @@ -598,3 +622,4 @@ module_init(cxl_test_init); module_exit(cxl_test_exit); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); +MODULE_IMPORT_NS(CXL); diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 17408f892df4..56b4b7d734bc 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "mock.h" static LIST_HEAD(mock); @@ -114,32 +116,6 @@ struct acpi_pci_root *__wrap_acpi_pci_find_root(acpi_handle handle) } EXPORT_SYMBOL_GPL(__wrap_acpi_pci_find_root); -void __wrap_pci_walk_bus(struct pci_bus *bus, - int (*cb)(struct pci_dev *, void *), void *userdata) -{ - int index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_bus(bus)) { - int rc, i; - - /* - * Simulate 2 root ports per host-bridge and no - * depth recursion. - */ - for (i = 0; i < 2; i++) { - rc = cb((struct pci_dev *) ops->mock_port(bus, i), - userdata); - if (rc) - break; - } - } else - pci_walk_bus(bus, cb, userdata); - - put_cxl_mock_ops(index); -} -EXPORT_SYMBOL_GPL(__wrap_pci_walk_bus); - struct nvdimm_bus * __wrap_nvdimm_bus_register(struct device *dev, struct nvdimm_bus_descriptor *nd_desc) @@ -155,5 +131,22 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); +int __wrap_devm_cxl_port_enumerate_dports(struct device *host, + struct cxl_port *port) +{ + int rc, index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops && ops->is_mock_port(port->uport)) + rc = ops->devm_cxl_port_enumerate_dports(host, port); + else + rc = devm_cxl_port_enumerate_dports(host, port); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_port_enumerate_dports, CXL); + MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS(ACPI); +MODULE_IMPORT_NS(CXL); diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 15ed0fd877e4..99e7ff38090d 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -2,6 +2,7 @@ #include #include +#include struct cxl_mock_ops { struct list_head list; @@ -15,10 +16,11 @@ struct cxl_mock_ops { struct acpi_object_list *arguments, unsigned long long *data); struct acpi_pci_root *(*acpi_pci_find_root)(acpi_handle handle); - struct platform_device *(*mock_port)(struct pci_bus *bus, int index); bool (*is_mock_bus)(struct pci_bus *bus); - bool (*is_mock_port)(struct platform_device *pdev); + bool (*is_mock_port)(struct device *dev); bool (*is_mock_dev)(struct device *dev); + int (*devm_cxl_port_enumerate_dports)(struct device *host, + struct cxl_port *port); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From d17d0540a0dbf109210f7b57a37571e2978da0fa Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 1 Feb 2022 12:24:30 -0800 Subject: [PATCH 051/186] cxl/core/hdm: Add CXL standard decoder enumeration to the core Unlike the decoder enumeration for "root decoders" described by platform firmware, standard decoders can be enumerated from the component registers space once the base address has been identified (via PCI, ACPI, or another mechanism). Add common infrastructure for HDM (Host-managed-Device-Memory) Decoder enumeration and share it between host-bridge, upstream switch port, and cxl_test defined decoders. The locking model for switch level decoders is to hold the port lock over the enumeration. This facilitates moving the dport and decoder enumeration to a 'port' driver. For now, the only enumerator of decoder resources is the cxl_acpi root driver. Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164374688404.395335.9239248252443123526.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 43 ++---- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 2 + drivers/cxl/core/hdm.c | 248 ++++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 57 ++++++-- drivers/cxl/core/regs.c | 5 +- drivers/cxl/cxl.h | 33 ++++- drivers/cxl/cxlmem.h | 8 ++ tools/testing/cxl/Kbuild | 4 + tools/testing/cxl/test/cxl.c | 29 ++++ tools/testing/cxl/test/mock.c | 50 +++++++ tools/testing/cxl/test/mock.h | 3 + 12 files changed, 434 insertions(+), 49 deletions(-) create mode 100644 drivers/cxl/core/hdm.c diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 259441245687..8c2ced91518b 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -168,10 +168,10 @@ static int add_host_bridge_uport(struct device *match, void *arg) struct device *host = root_port->dev.parent; struct acpi_device *bridge = to_cxl_host_bridge(host, match); struct acpi_pci_root *pci_root; - int single_port_map[1], rc; - struct cxl_decoder *cxld; struct cxl_dport *dport; + struct cxl_hdm *cxlhdm; struct cxl_port *port; + int rc; if (!bridge) return 0; @@ -200,37 +200,24 @@ static int add_host_bridge_uport(struct device *match, void *arg) rc = devm_cxl_port_enumerate_dports(host, port); if (rc < 0) return rc; - if (rc > 1) - return 0; - - /* TODO: Scan CHBCR for HDM Decoder resources */ - - /* - * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability - * Structure) single ported host-bridges need not publish a decoder - * capability when a passthrough decode can be assumed, i.e. all - * transactions that the uport sees are claimed and passed to the single - * dport. Disable the range until the first CXL region is enumerated / - * activated. - */ - cxld = cxl_switch_decoder_alloc(port, 1); - if (IS_ERR(cxld)) - return PTR_ERR(cxld); - cxl_device_lock(&port->dev); - dport = list_first_entry(&port->dports, typeof(*dport), list); - cxl_device_unlock(&port->dev); + if (rc == 1) { + rc = devm_cxl_add_passthrough_decoder(host, port); + goto out; + } - single_port_map[0] = dport->port_id; + cxlhdm = devm_cxl_setup_hdm(host, port); + if (IS_ERR(cxlhdm)) { + rc = PTR_ERR(cxlhdm); + goto out; + } - rc = cxl_decoder_add(cxld, single_port_map); + rc = devm_cxl_enumerate_decoders(host, cxlhdm); if (rc) - put_device(&cxld->dev); - else - rc = cxl_decoder_autoremove(host, cxld); + dev_err(&port->dev, "Couldn't enumerate decoders (%d)\n", rc); - if (rc == 0) - dev_dbg(host, "add: %s\n", dev_name(&cxld->dev)); +out: + cxl_device_unlock(&port->dev); return rc; } diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 91057f0ec763..6d37cd78b151 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -8,3 +8,4 @@ cxl_core-y += regs.o cxl_core-y += memdev.o cxl_core-y += mbox.o cxl_core-y += pci.o +cxl_core-y += hdm.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index e0c9aacc4e9c..1a50c0fc399c 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -14,6 +14,8 @@ struct cxl_mem_query_commands; int cxl_query_cmd(struct cxl_memdev *cxlmd, struct cxl_mem_query_commands __user *q); int cxl_send_cmd(struct cxl_memdev *cxlmd, struct cxl_send_command __user *s); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); int cxl_memdev_init(void); void cxl_memdev_exit(void); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c new file mode 100644 index 000000000000..84f4ed288a88 --- /dev/null +++ b/drivers/cxl/core/hdm.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include + +#include "cxlmem.h" +#include "core.h" + +/** + * DOC: cxl core hdm + * + * Compute Express Link Host Managed Device Memory, starting with the + * CXL 2.0 specification, is managed by an array of HDM Decoder register + * instances per CXL port and per CXL endpoint. Define common helpers + * for enumerating these registers and capabilities. + */ + +static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, + int *target_map) +{ + int rc; + + rc = cxl_decoder_add_locked(cxld, target_map); + if (rc) { + put_device(&cxld->dev); + dev_err(&port->dev, "Failed to add decoder\n"); + return rc; + } + + rc = cxl_decoder_autoremove(&port->dev, cxld); + if (rc) + return rc; + + dev_dbg(&cxld->dev, "Added to port %s\n", dev_name(&port->dev)); + + return 0; +} + +/* + * Per the CXL specification (8.2.5.12 CXL HDM Decoder Capability Structure) + * single ported host-bridges need not publish a decoder capability when a + * passthrough decode can be assumed, i.e. all transactions that the uport sees + * are claimed and passed to the single dport. Disable the range until the first + * CXL region is enumerated / activated. + */ +int devm_cxl_add_passthrough_decoder(struct device *host, struct cxl_port *port) +{ + struct cxl_decoder *cxld; + struct cxl_dport *dport; + int single_port_map[1]; + + cxld = cxl_switch_decoder_alloc(port, 1); + if (IS_ERR(cxld)) + return PTR_ERR(cxld); + + device_lock_assert(&port->dev); + + dport = list_first_entry(&port->dports, typeof(*dport), list); + single_port_map[0] = dport->port_id; + + return add_hdm_decoder(port, cxld, single_port_map); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_passthrough_decoder, CXL); + +static void parse_hdm_decoder_caps(struct cxl_hdm *cxlhdm) +{ + u32 hdm_cap; + + hdm_cap = readl(cxlhdm->regs.hdm_decoder + CXL_HDM_DECODER_CAP_OFFSET); + cxlhdm->decoder_count = cxl_hdm_decoder_count(hdm_cap); + cxlhdm->target_count = + FIELD_GET(CXL_HDM_DECODER_TARGET_COUNT_MASK, hdm_cap); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_11_8, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(11, 8); + if (FIELD_GET(CXL_HDM_DECODER_INTERLEAVE_14_12, hdm_cap)) + cxlhdm->interleave_mask |= GENMASK(14, 12); +} + +static void __iomem *map_hdm_decoder_regs(struct cxl_port *port, + void __iomem *crb) +{ + struct cxl_component_reg_map map; + + cxl_probe_component_regs(&port->dev, crb, &map); + if (!map.hdm_decoder.valid) { + dev_err(&port->dev, "HDM decoder registers invalid\n"); + return IOMEM_ERR_PTR(-ENXIO); + } + + return crb + map.hdm_decoder.offset; +} + +/** + * devm_cxl_setup_hdm - map HDM decoder component registers + * @host: devm context for allocations + * @port: cxl_port to map + */ +struct cxl_hdm *devm_cxl_setup_hdm(struct device *host, struct cxl_port *port) +{ + struct device *dev = &port->dev; + void __iomem *crb, *hdm; + struct cxl_hdm *cxlhdm; + + cxlhdm = devm_kzalloc(host, sizeof(*cxlhdm), GFP_KERNEL); + if (!cxlhdm) + return ERR_PTR(-ENOMEM); + + cxlhdm->port = port; + crb = devm_cxl_iomap_block(host, port->component_reg_phys, + CXL_COMPONENT_REG_BLOCK_SIZE); + if (!crb) { + dev_err(dev, "No component registers mapped\n"); + return ERR_PTR(-ENXIO); + } + + hdm = map_hdm_decoder_regs(port, crb); + if (IS_ERR(hdm)) + return ERR_CAST(hdm); + cxlhdm->regs.hdm_decoder = hdm; + + parse_hdm_decoder_caps(cxlhdm); + if (cxlhdm->decoder_count == 0) { + dev_err(dev, "Spec violation. Caps invalid\n"); + return ERR_PTR(-ENXIO); + } + + return cxlhdm; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); + +static int to_interleave_granularity(u32 ctrl) +{ + int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IG_MASK, ctrl); + + return 256 << val; +} + +static int to_interleave_ways(u32 ctrl) +{ + int val = FIELD_GET(CXL_HDM_DECODER0_CTRL_IW_MASK, ctrl); + + switch (val) { + case 0 ... 4: + return 1 << val; + case 8 ... 10: + return 3 << (val - 8); + default: + return 0; + } +} + +static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, + void __iomem *hdm, int which) +{ + u64 size, base; + u32 ctrl; + int i; + union { + u64 value; + unsigned char target_id[8]; + } target_list; + + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(which)); + base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); + size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); + + if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) + size = 0; + + cxld->decoder_range = (struct range) { + .start = base, + .end = base + size - 1, + }; + + /* switch decoders are always enabled if committed */ + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) { + cxld->flags |= CXL_DECODER_F_ENABLE; + if (ctrl & CXL_HDM_DECODER0_CTRL_LOCK) + cxld->flags |= CXL_DECODER_F_LOCK; + } + cxld->interleave_ways = to_interleave_ways(ctrl); + cxld->interleave_granularity = to_interleave_granularity(ctrl); + + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl)) + cxld->target_type = CXL_DECODER_EXPANDER; + else + cxld->target_type = CXL_DECODER_ACCELERATOR; + + target_list.value = + ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); + for (i = 0; i < cxld->interleave_ways; i++) + target_map[i] = target_list.target_id[i]; +} + +/** + * devm_cxl_enumerate_decoders - add decoder objects per HDM register set + * @host: devm allocation context + * @cxlhdm: Structure to populate with HDM capabilities + */ +int devm_cxl_enumerate_decoders(struct device *host, struct cxl_hdm *cxlhdm) +{ + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + struct cxl_port *port = cxlhdm->port; + int i, committed; + u32 ctrl; + + /* + * Since the register resource was recently claimed via request_region() + * be careful about trusting the "not-committed" status until the commit + * timeout has elapsed. The commit timeout is 10ms (CXL 2.0 + * 8.2.5.12.20), but double it to be tolerant of any clock skew between + * host and target. + */ + for (i = 0, committed = 0; i < cxlhdm->decoder_count; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(i)); + if (ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED) + committed++; + } + + /* ensure that future checks of committed can be trusted */ + if (committed != cxlhdm->decoder_count) + msleep(20); + + for (i = 0; i < cxlhdm->decoder_count; i++) { + int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; + int rc, target_count = cxlhdm->target_count; + struct cxl_decoder *cxld; + + cxld = cxl_switch_decoder_alloc(port, target_count); + if (IS_ERR(cxld)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxld); + } + + init_hdm_decoder(cxld, target_map, cxlhdm->regs.hdm_decoder, i); + rc = add_hdm_decoder(port, cxld, target_map); + if (rc) { + dev_warn(&port->dev, + "Failed to add decoder to port\n"); + return rc; + } + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fee9c7affef4..4dfb9df9e648 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -594,16 +594,15 @@ EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL); static int decoder_populate_targets(struct cxl_decoder *cxld, struct cxl_port *port, int *target_map) { - int rc = 0, i; + int i, rc = 0; if (!target_map) return 0; - cxl_device_lock(&port->dev); - if (list_empty(&port->dports)) { - rc = -EINVAL; - goto out_unlock; - } + device_lock_assert(&port->dev); + + if (list_empty(&port->dports)) + return -EINVAL; write_seqlock(&cxld->target_lock); for (i = 0; i < cxld->nr_targets; i++) { @@ -617,9 +616,6 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, } write_sequnlock(&cxld->target_lock); -out_unlock: - cxl_device_unlock(&port->dev); - return rc; } @@ -721,7 +717,7 @@ struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); /** - * cxl_decoder_add - Add a decoder with targets + * cxl_decoder_add_locked - Add a decoder with targets * @cxld: The cxl decoder allocated by cxl_decoder_alloc() * @target_map: A list of downstream ports that this decoder can direct memory * traffic to. These numbers should correspond with the port number @@ -731,12 +727,15 @@ EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); * is an endpoint device. A more awkward example is a hostbridge whose root * ports get hot added (technically possible, though unlikely). * - * Context: Process context. Takes and releases the cxld's device lock. + * This is the locked variant of cxl_decoder_add(). + * + * Context: Process context. Expects the device lock of the port that owns the + * @cxld to be held. * * Return: Negative error code if the decoder wasn't properly configured; else * returns 0. */ -int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) +int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) { struct cxl_port *port; struct device *dev; @@ -770,6 +769,40 @@ int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) return device_add(dev); } +EXPORT_SYMBOL_NS_GPL(cxl_decoder_add_locked, CXL); + +/** + * cxl_decoder_add - Add a decoder with targets + * @cxld: The cxl decoder allocated by cxl_decoder_alloc() + * @target_map: A list of downstream ports that this decoder can direct memory + * traffic to. These numbers should correspond with the port number + * in the PCIe Link Capabilities structure. + * + * This is the unlocked variant of cxl_decoder_add_locked(). + * See cxl_decoder_add_locked(). + * + * Context: Process context. Takes and releases the device lock of the port that + * owns the @cxld. + */ +int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map) +{ + struct cxl_port *port; + int rc; + + if (WARN_ON_ONCE(!cxld)) + return -EINVAL; + + if (WARN_ON_ONCE(IS_ERR(cxld))) + return PTR_ERR(cxld); + + port = to_cxl_port(cxld->dev.parent); + + cxl_device_lock(&port->dev); + rc = cxl_decoder_add_locked(cxld, target_map); + cxl_device_unlock(&port->dev); + + return rc; +} EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL); static void cxld_unregister(void *dev) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 65d7f5880671..718b6b0ae4b3 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -159,9 +159,8 @@ void cxl_probe_device_regs(struct device *dev, void __iomem *base, } EXPORT_SYMBOL_NS_GPL(cxl_probe_device_regs, CXL); -static void __iomem *devm_cxl_iomap_block(struct device *dev, - resource_size_t addr, - resource_size_t length) +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length) { void __iomem *ret_val; struct resource *res; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 0754c68ccd33..c127d5c0ac96 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -17,6 +17,9 @@ * (port-driver, region-driver, nvdimm object-drivers... etc). */ +/* CXL 2.0 8.2.4 CXL Component Register Layout and Definition */ +#define CXL_COMPONENT_REG_BLOCK_SIZE SZ_64K + /* CXL 2.0 8.2.5 CXL.cache and CXL.mem Registers*/ #define CXL_CM_OFFSET 0x1000 #define CXL_CM_CAP_HDR_OFFSET 0x0 @@ -36,11 +39,23 @@ #define CXL_HDM_DECODER_CAP_OFFSET 0x0 #define CXL_HDM_DECODER_COUNT_MASK GENMASK(3, 0) #define CXL_HDM_DECODER_TARGET_COUNT_MASK GENMASK(7, 4) -#define CXL_HDM_DECODER0_BASE_LOW_OFFSET 0x10 -#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET 0x14 -#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET 0x18 -#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET 0x1c -#define CXL_HDM_DECODER0_CTRL_OFFSET 0x20 +#define CXL_HDM_DECODER_INTERLEAVE_11_8 BIT(8) +#define CXL_HDM_DECODER_INTERLEAVE_14_12 BIT(9) +#define CXL_HDM_DECODER_CTRL_OFFSET 0x4 +#define CXL_HDM_DECODER_ENABLE BIT(1) +#define CXL_HDM_DECODER0_BASE_LOW_OFFSET(i) (0x20 * (i) + 0x10) +#define CXL_HDM_DECODER0_BASE_HIGH_OFFSET(i) (0x20 * (i) + 0x14) +#define CXL_HDM_DECODER0_SIZE_LOW_OFFSET(i) (0x20 * (i) + 0x18) +#define CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(i) (0x20 * (i) + 0x1c) +#define CXL_HDM_DECODER0_CTRL_OFFSET(i) (0x20 * (i) + 0x20) +#define CXL_HDM_DECODER0_CTRL_IG_MASK GENMASK(3, 0) +#define CXL_HDM_DECODER0_CTRL_IW_MASK GENMASK(7, 4) +#define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) +#define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) +#define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_TYPE BIT(12) +#define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) +#define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) static inline int cxl_hdm_decoder_count(u32 cap_hdr) { @@ -162,7 +177,8 @@ int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, #define CXL_DECODER_F_TYPE2 BIT(2) #define CXL_DECODER_F_TYPE3 BIT(3) #define CXL_DECODER_F_LOCK BIT(4) -#define CXL_DECODER_F_MASK GENMASK(4, 0) +#define CXL_DECODER_F_ENABLE BIT(5) +#define CXL_DECODER_F_MASK GENMASK(5, 0) enum cxl_decoder_type { CXL_DECODER_ACCELERATOR = 2, @@ -306,7 +322,12 @@ struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); +int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); +struct cxl_hdm; +struct cxl_hdm *devm_cxl_setup_hdm(struct device *host, struct cxl_port *port); +int devm_cxl_enumerate_decoders(struct device *host, struct cxl_hdm *cxlhdm); +int devm_cxl_add_passthrough_decoder(struct device *host, struct cxl_port *port); extern struct bus_type cxl_bus_type; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 8d96d009ad90..fca2d1b5f6ff 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -264,4 +264,12 @@ int cxl_mem_create_range_info(struct cxl_dev_state *cxlds); struct cxl_dev_state *cxl_dev_state_create(struct device *dev); void set_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); void clear_exclusive_cxl_commands(struct cxl_dev_state *cxlds, unsigned long *cmds); + +struct cxl_hdm { + struct cxl_component_regs regs; + unsigned int decoder_count; + unsigned int target_count; + unsigned int interleave_mask; + struct cxl_port *port; +}; #endif /* __CXL_MEM_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 61123544aa49..3045d7cba0db 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -5,6 +5,9 @@ ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=devm_cxl_port_enumerate_dports +ldflags-y += --wrap=devm_cxl_setup_hdm +ldflags-y += --wrap=devm_cxl_add_passthrough_decoder +ldflags-y += --wrap=devm_cxl_enumerate_decoders DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl @@ -31,6 +34,7 @@ cxl_core-y += $(CXL_CORE_SRC)/regs.o cxl_core-y += $(CXL_CORE_SRC)/memdev.o cxl_core-y += $(CXL_CORE_SRC)/mbox.o cxl_core-y += $(CXL_CORE_SRC)/pci.o +cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += config_check.o obj-m += test/ diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index ef002e909d38..81c09380c537 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "mock.h" #define NR_CXL_HOST_BRIDGES 4 @@ -398,6 +399,31 @@ static struct acpi_pci_root *mock_acpi_pci_find_root(acpi_handle handle) return &mock_pci_root[host_bridge_index(adev)]; } +static struct cxl_hdm *mock_cxl_setup_hdm(struct device *host, + struct cxl_port *port) +{ + struct cxl_hdm *cxlhdm = devm_kzalloc(&port->dev, sizeof(*cxlhdm), GFP_KERNEL); + + if (!cxlhdm) + return ERR_PTR(-ENOMEM); + + cxlhdm->port = port; + return cxlhdm; +} + +static int mock_cxl_add_passthrough_decoder(struct device *host, + struct cxl_port *port) +{ + dev_err(&port->dev, "unexpected passthrough decoder for cxl_test\n"); + return -EOPNOTSUPP; +} + +static int mock_cxl_enumerate_decoders(struct device *host, + struct cxl_hdm *cxlhdm) +{ + return 0; +} + static int mock_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port) { @@ -439,6 +465,9 @@ static struct cxl_mock_ops cxl_mock_ops = { .acpi_evaluate_integer = mock_acpi_evaluate_integer, .acpi_pci_find_root = mock_acpi_pci_find_root, .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, + .devm_cxl_setup_hdm = mock_cxl_setup_hdm, + .devm_cxl_add_passthrough_decoder = mock_cxl_add_passthrough_decoder, + .devm_cxl_enumerate_decoders = mock_cxl_enumerate_decoders, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 56b4b7d734bc..18d3b65e2a9b 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -131,6 +131,56 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); +struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct device *host, + struct cxl_port *port) +{ + int index; + struct cxl_hdm *cxlhdm; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops && ops->is_mock_port(port->uport)) + cxlhdm = ops->devm_cxl_setup_hdm(host, port); + else + cxlhdm = devm_cxl_setup_hdm(host, port); + put_cxl_mock_ops(index); + + return cxlhdm; +} +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_setup_hdm, CXL); + +int __wrap_devm_cxl_add_passthrough_decoder(struct device *host, + struct cxl_port *port) +{ + int rc, index; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops && ops->is_mock_port(port->uport)) + rc = ops->devm_cxl_add_passthrough_decoder(host, port); + else + rc = devm_cxl_add_passthrough_decoder(host, port); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_passthrough_decoder, CXL); + +int __wrap_devm_cxl_enumerate_decoders(struct device *host, + struct cxl_hdm *cxlhdm) +{ + int rc, index; + struct cxl_port *port = cxlhdm->port; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops && ops->is_mock_port(port->uport)) + rc = ops->devm_cxl_enumerate_decoders(host, cxlhdm); + else + rc = devm_cxl_enumerate_decoders(host, cxlhdm); + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_enumerate_decoders, CXL); + int __wrap_devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port) { diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 99e7ff38090d..15e48063ea4b 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -21,6 +21,9 @@ struct cxl_mock_ops { bool (*is_mock_dev)(struct device *dev); int (*devm_cxl_port_enumerate_dports)(struct device *host, struct cxl_port *port); + struct cxl_hdm *(*devm_cxl_setup_hdm)(struct device *host, struct cxl_port *port); + int (*devm_cxl_add_passthrough_decoder)(struct device *host, struct cxl_port *port); + int (*devm_cxl_enumerate_decoders)(struct device *host, struct cxl_hdm *hdm); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From 83fbdbe4c18678eebe63fc49913f149a5afde057 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:30:41 -0800 Subject: [PATCH 052/186] cxl/core: Emit modalias for CXL devices In order to enable libkmod lookups for CXL device objects to their corresponding module, add 'modalias' to the base attribute of CXL devices. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164298424120.3018233.15611905873808708542.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 9 +++++++++ drivers/cxl/core/port.c | 26 ++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 0b6a2e6e8fbb..6d8cbf3355b5 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -34,6 +34,15 @@ Description: the same value communicated in the DEVTYPE environment variable for uevents for devices on the "cxl" bus. +What: /sys/bus/cxl/devices/*/modalias +Date: December, 2021 +KernelVersion: v5.18 +Contact: linux-cxl@vger.kernel.org +Description: + CXL device objects export the modalias attribute which mirrors + the same value communicated in the MODALIAS environment variable + for uevents for devices on the "cxl" bus. + What: /sys/bus/cxl/devices/portX/uport Date: June, 2021 KernelVersion: v5.14 diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 4dfb9df9e648..0bbd8fb8f35d 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -34,8 +34,25 @@ static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(devtype); +static int cxl_device_id(struct device *dev) +{ + if (dev->type == &cxl_nvdimm_bridge_type) + return CXL_DEVICE_NVDIMM_BRIDGE; + if (dev->type == &cxl_nvdimm_type) + return CXL_DEVICE_NVDIMM; + return 0; +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, CXL_MODALIAS_FMT "\n", cxl_device_id(dev)); +} +static DEVICE_ATTR_RO(modalias); + static struct attribute *cxl_base_attributes[] = { &dev_attr_devtype.attr, + &dev_attr_modalias.attr, NULL, }; @@ -855,15 +872,6 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv) } EXPORT_SYMBOL_NS_GPL(cxl_driver_unregister, CXL); -static int cxl_device_id(struct device *dev) -{ - if (dev->type == &cxl_nvdimm_bridge_type) - return CXL_DEVICE_NVDIMM_BRIDGE; - if (dev->type == &cxl_nvdimm_type) - return CXL_DEVICE_NVDIMM; - return 0; -} - static int cxl_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { return add_uevent_var(env, "MODALIAS=" CXL_MODALIAS_FMT, From 54cdbf845cf719c09b45ae588cba469aabb3159c Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 1 Feb 2022 13:07:51 -0800 Subject: [PATCH 053/186] cxl/port: Add a driver for 'struct cxl_port' objects The need for a CXL port driver and a dedicated cxl_bus_type is driven by a need to simultaneously support 2 independent physical memory decode domains (cache coherent CXL.mem and uncached PCI.mmio) that also intersect at a single PCIe device node. A CXL Port is a device that advertises a CXL Component Register block with an "HDM Decoder Capability Structure". >From Documentation/driver-api/cxl/memory-devices.rst: Similar to how a RAID driver takes disk objects and assembles them into a new logical device, the CXL subsystem is tasked to take PCIe and ACPI objects and assemble them into a CXL.mem decode topology. The need for runtime configuration of the CXL.mem topology is also similar to RAID in that different environments with the same hardware configuration may decide to assemble the topology in contrasting ways. One may choose performance (RAID0) striping memory across multiple Host Bridges and endpoints while another may opt for fault tolerance and disable any striping in the CXL.mem topology. The port driver identifies whether an endpoint Memory Expander is connected to a CXL topology. If an active (bound to the 'cxl_port' driver) CXL Port is not found at every PCIe Switch Upstream port and an active "root" CXL Port then the device is just a plain PCIe endpoint only capable of participating in PCI.mmio and DMA cycles, not CXL.mem coherent interleave sets. The 'cxl_port' driver lets the CXL subsystem leverage driver-core infrastructure for setup and teardown of register resources and communicating device activation status to userspace. The cxl_bus_type can rendezvous the async arrival of platform level CXL resources (via the 'cxl_acpi' driver) with the asynchronous enumeration of Memory Expander endpoints, while also implementing a hierarchical locking model independent of the associated 'struct pci_dev' locking model. The locking for dport and decoder enumeration is now handled in the core rather than callers. For now the port driver only enumerates and registers CXL resources (downstream port metadata and decoder resources) later it will be used to take action on its decoders in response to CXL.mem region provisioning requests. Note1: cxlpci.h has long depended on pci.h, but port.c was the first to not include pci.h. Carry that dependency in cxlpci.h. Note2: cxl port enumeration and probing complicates CXL subsystem init to the point that it helps to have centralized debug logging of probe events in cxl_bus_probe(). Reported-by: kernel test robot Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Co-developed-by: Dan Williams Link: https://lore.kernel.org/r/164374948116.464348.1772618057599155408.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- .../driver-api/cxl/memory-devices.rst | 302 ++++++++++++++++++ drivers/cxl/Kconfig | 5 + drivers/cxl/Makefile | 2 + drivers/cxl/acpi.c | 26 +- drivers/cxl/core/pci.c | 2 - drivers/cxl/core/port.c | 34 +- drivers/cxl/cxl.h | 4 + drivers/cxl/cxlpci.h | 1 + drivers/cxl/port.c | 63 ++++ tools/testing/cxl/Kbuild | 5 + tools/testing/cxl/test/cxl.c | 2 - 11 files changed, 415 insertions(+), 31 deletions(-) create mode 100644 drivers/cxl/port.c diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index c8f7a16cd0e3..3498d38d7cbd 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -14,6 +14,303 @@ that optionally define a device's contribution to an interleaved address range across multiple devices underneath a host-bridge or interleaved across host-bridges. +CXL Bus: Theory of Operation +============================ +Similar to how a RAID driver takes disk objects and assembles them into a new +logical device, the CXL subsystem is tasked to take PCIe and ACPI objects and +assemble them into a CXL.mem decode topology. The need for runtime configuration +of the CXL.mem topology is also similar to RAID in that different environments +with the same hardware configuration may decide to assemble the topology in +contrasting ways. One may choose performance (RAID0) striping memory across +multiple Host Bridges and endpoints while another may opt for fault tolerance +and disable any striping in the CXL.mem topology. + +Platform firmware enumerates a menu of interleave options at the "CXL root port" +(Linux term for the top of the CXL decode topology). From there, PCIe topology +dictates which endpoints can participate in which Host Bridge decode regimes. +Each PCIe Switch in the path between the root and an endpoint introduces a point +at which the interleave can be split. For example platform firmware may say at a +given range only decodes to 1 one Host Bridge, but that Host Bridge may in turn +interleave cycles across multiple Root Ports. An intervening Switch between a +port and an endpoint may interleave cycles across multiple Downstream Switch +Ports, etc. + +Here is a sample listing of a CXL topology defined by 'cxl_test'. The 'cxl_test' +module generates an emulated CXL topology of 2 Host Bridges each with 2 Root +Ports. Each of those Root Ports are connected to 2-way switches with endpoints +connected to those downstream ports for a total of 8 endpoints:: + + # cxl list -BEMPu -b cxl_test + { + "bus":"root3", + "provider":"cxl_test", + "ports:root3":[ + { + "port":"port5", + "host":"cxl_host_bridge.1", + "ports:port5":[ + { + "port":"port8", + "host":"cxl_switch_uport.1", + "endpoints:port8":[ + { + "endpoint":"endpoint9", + "host":"mem2", + "memdev":{ + "memdev":"mem2", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x1", + "numa_node":1, + "host":"cxl_mem.1" + } + }, + { + "endpoint":"endpoint15", + "host":"mem6", + "memdev":{ + "memdev":"mem6", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x5", + "numa_node":1, + "host":"cxl_mem.5" + } + } + ] + }, + { + "port":"port12", + "host":"cxl_switch_uport.3", + "endpoints:port12":[ + { + "endpoint":"endpoint17", + "host":"mem8", + "memdev":{ + "memdev":"mem8", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x7", + "numa_node":1, + "host":"cxl_mem.7" + } + }, + { + "endpoint":"endpoint13", + "host":"mem4", + "memdev":{ + "memdev":"mem4", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x3", + "numa_node":1, + "host":"cxl_mem.3" + } + } + ] + } + ] + }, + { + "port":"port4", + "host":"cxl_host_bridge.0", + "ports:port4":[ + { + "port":"port6", + "host":"cxl_switch_uport.0", + "endpoints:port6":[ + { + "endpoint":"endpoint7", + "host":"mem1", + "memdev":{ + "memdev":"mem1", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "numa_node":0, + "host":"cxl_mem.0" + } + }, + { + "endpoint":"endpoint14", + "host":"mem5", + "memdev":{ + "memdev":"mem5", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x4", + "numa_node":0, + "host":"cxl_mem.4" + } + } + ] + }, + { + "port":"port10", + "host":"cxl_switch_uport.2", + "endpoints:port10":[ + { + "endpoint":"endpoint16", + "host":"mem7", + "memdev":{ + "memdev":"mem7", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x6", + "numa_node":0, + "host":"cxl_mem.6" + } + }, + { + "endpoint":"endpoint11", + "host":"mem3", + "memdev":{ + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + } + ] + } + ] + } + ] + } + +In that listing each "root", "port", and "endpoint" object correspond a kernel +'struct cxl_port' object. A 'cxl_port' is a device that can decode CXL.mem to +its descendants. So "root" claims non-PCIe enumerable platform decode ranges and +decodes them to "ports", "ports" decode to "endpoints", and "endpoints" +represent the decode from SPA (System Physical Address) to DPA (Device Physical +Address). + +Continuing the RAID analogy, disks have both topology metadata and on device +metadata that determine RAID set assembly. CXL Port topology and CXL Port link +status is metadata for CXL.mem set assembly. The CXL Port topology is enumerated +by the arrival of a CXL.mem device. I.e. unless and until the PCIe core attaches +the cxl_pci driver to a CXL Memory Expander there is no role for CXL Port +objects. Conversely for hot-unplug / removal scenarios, there is no need for +the Linux PCI core to tear down switch-level CXL resources because the endpoint +->remove() event cleans up the port data that was established to support that +Memory Expander. + +The port metadata and potential decode schemes that a give memory device may +participate can be determined via a command like:: + + # cxl list -BDMu -d root -m mem3 + { + "bus":"root3", + "provider":"cxl_test", + "decoders:root3":[ + { + "decoder":"decoder3.1", + "resource":"0x8030000000", + "size":"512.00 MiB (536.87 MB)", + "volatile_capable":true, + "nr_targets":2 + }, + { + "decoder":"decoder3.3", + "resource":"0x8060000000", + "size":"512.00 MiB (536.87 MB)", + "pmem_capable":true, + "nr_targets":2 + }, + { + "decoder":"decoder3.0", + "resource":"0x8020000000", + "size":"256.00 MiB (268.44 MB)", + "volatile_capable":true, + "nr_targets":1 + }, + { + "decoder":"decoder3.2", + "resource":"0x8050000000", + "size":"256.00 MiB (268.44 MB)", + "pmem_capable":true, + "nr_targets":1 + } + ], + "memdevs:root3":[ + { + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + ] + } + +...which queries the CXL topology to ask "given CXL Memory Expander with a kernel +device name of 'mem3' which platform level decode ranges may this device +participate". A given expander can participate in multiple CXL.mem interleave +sets simultaneously depending on how many decoder resource it has. In this +example mem3 can participate in one or more of a PMEM interleave that spans to +Host Bridges, a PMEM interleave that targets a single Host Bridge, a Volatile +memory interleave that spans 2 Host Bridges, and a Volatile memory interleave +that only targets a single Host Bridge. + +Conversely the memory devices that can participate in a given platform level +decode scheme can be determined via a command like the following:: + + # cxl list -MDu -d 3.2 + [ + { + "memdevs":[ + { + "memdev":"mem1", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "numa_node":0, + "host":"cxl_mem.0" + }, + { + "memdev":"mem5", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x4", + "numa_node":0, + "host":"cxl_mem.4" + }, + { + "memdev":"mem7", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x6", + "numa_node":0, + "host":"cxl_mem.6" + }, + { + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + ] + }, + { + "root decoders":[ + { + "decoder":"decoder3.2", + "resource":"0x8050000000", + "size":"256.00 MiB (268.44 MB)", + "pmem_capable":true, + "nr_targets":1 + } + ] + } + ] + +...where the naming scheme for decoders is "decoder.". + Driver Infrastructure ===================== @@ -28,6 +325,11 @@ CXL Memory Device .. kernel-doc:: drivers/cxl/pci.c :internal: +CXL Port +-------- +.. kernel-doc:: drivers/cxl/port.c + :doc: cxl port + CXL Core -------- .. kernel-doc:: drivers/cxl/cxl.h diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index ef05e96f8f97..4f4f7587f6ca 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -77,4 +77,9 @@ config CXL_PMEM provisioning the persistent memory capacity of CXL memory expanders. If unsure say 'm'. + +config CXL_PORT + default CXL_BUS + tristate + endif diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index cf07ae6cea17..56fcac2323cb 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -3,7 +3,9 @@ obj-$(CONFIG_CXL_BUS) += core/ obj-$(CONFIG_CXL_PCI) += cxl_pci.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o +obj-$(CONFIG_CXL_PORT) += cxl_port.o cxl_pci-y := pci.o cxl_acpi-y := acpi.o cxl_pmem-y := pmem.o +cxl_port-y := port.o diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 8c2ced91518b..82591642ea90 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -169,7 +169,6 @@ static int add_host_bridge_uport(struct device *match, void *arg) struct acpi_device *bridge = to_cxl_host_bridge(host, match); struct acpi_pci_root *pci_root; struct cxl_dport *dport; - struct cxl_hdm *cxlhdm; struct cxl_port *port; int rc; @@ -197,28 +196,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) return PTR_ERR(port); dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev)); - rc = devm_cxl_port_enumerate_dports(host, port); - if (rc < 0) - return rc; - cxl_device_lock(&port->dev); - if (rc == 1) { - rc = devm_cxl_add_passthrough_decoder(host, port); - goto out; - } - - cxlhdm = devm_cxl_setup_hdm(host, port); - if (IS_ERR(cxlhdm)) { - rc = PTR_ERR(cxlhdm); - goto out; - } - - rc = devm_cxl_enumerate_decoders(host, cxlhdm); - if (rc) - dev_err(&port->dev, "Couldn't enumerate decoders (%d)\n", rc); - -out: - cxl_device_unlock(&port->dev); - return rc; + return 0; } struct cxl_chbs_context { @@ -278,9 +256,7 @@ static int add_host_bridge_dport(struct device *match, void *arg) return 0; } - cxl_device_lock(&root_port->dev); dport = devm_cxl_add_dport(host, root_port, match, uid, ctx.chbcr); - cxl_device_unlock(&root_port->dev); if (IS_ERR(dport)) { dev_err(host, "failed to add downstream port: %s\n", dev_name(match)); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c5a9e03ed477..8ec5f74da679 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -47,10 +47,8 @@ static int match_add_dports(struct pci_dev *pdev, void *data) dev_dbg(&port->dev, "failed to find component registers\n"); port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - cxl_device_lock(&port->dev); dport = devm_cxl_add_dport(ctx->host, port, &pdev->dev, port_num, cxl_regmap_to_base(pdev, &map)); - cxl_device_unlock(&port->dev); if (IS_ERR(dport)) { ctx->error = PTR_ERR(dport); return PTR_ERR(dport); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 0bbd8fb8f35d..a66284b7eb1b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -40,6 +40,11 @@ static int cxl_device_id(struct device *dev) return CXL_DEVICE_NVDIMM_BRIDGE; if (dev->type == &cxl_nvdimm_type) return CXL_DEVICE_NVDIMM; + if (is_cxl_port(dev)) { + if (is_cxl_root(to_cxl_port(dev))) + return CXL_DEVICE_ROOT; + return CXL_DEVICE_PORT; + } return 0; } @@ -298,6 +303,9 @@ static void unregister_port(void *_port) { struct cxl_port *port = _port; + if (!is_cxl_root(port)) + device_lock_assert(port->dev.parent); + device_unregister(&port->dev); } @@ -526,15 +534,34 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) return dup ? -EEXIST : 0; } +/* + * Since root-level CXL dports cannot be enumerated by PCI they are not + * enumerated by the common port driver that acquires the port lock over + * dport add/remove. Instead, root dports are manually added by a + * platform driver and cond_cxl_root_lock() is used to take the missing + * port lock in that case. + */ +static void cond_cxl_root_lock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + cxl_device_lock(&port->dev); +} + +static void cond_cxl_root_unlock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + cxl_device_unlock(&port->dev); +} + static void cxl_dport_remove(void *data) { struct cxl_dport *dport = data; struct cxl_port *port = dport->port; put_device(dport->dport); - cxl_device_lock(&port->dev); + cond_cxl_root_lock(port); list_del(&dport->list); - cxl_device_unlock(&port->dev); + cond_cxl_root_unlock(port); } static void cxl_dport_unlink(void *data) @@ -587,7 +614,9 @@ struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, dport->component_reg_phys = component_reg_phys; dport->port = port; + cond_cxl_root_lock(port); rc = add_dport(port, dport); + cond_cxl_root_unlock(port); if (rc) return ERR_PTR(rc); @@ -895,6 +924,7 @@ static int cxl_bus_probe(struct device *dev) rc = to_cxl_drv(dev->driver)->probe(dev); cxl_nested_unlock(dev); + dev_dbg(dev, "probe: %d\n", rc); return rc; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c127d5c0ac96..2b24eb56618f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -163,6 +163,8 @@ int cxl_map_device_regs(struct pci_dev *pdev, enum cxl_regloc_type; int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); #define CXL_RESOURCE_NONE ((resource_size_t) -1) #define CXL_TARGET_STRLEN 20 @@ -354,6 +356,8 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); #define CXL_DEVICE_NVDIMM_BRIDGE 1 #define CXL_DEVICE_NVDIMM 2 +#define CXL_DEVICE_PORT 3 +#define CXL_DEVICE_ROOT 4 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") #define CXL_MODALIAS_FMT "cxl:t%d" diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 103636fda198..47640f19e899 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -2,6 +2,7 @@ /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #ifndef __CXL_PCI_H__ #define __CXL_PCI_H__ +#include #include "cxl.h" #define CXL_MEMORY_PROGIF 0x10 diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c new file mode 100644 index 000000000000..daa4c3c33aed --- /dev/null +++ b/drivers/cxl/port.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include + +#include "cxlmem.h" +#include "cxlpci.h" + +/** + * DOC: cxl port + * + * The port driver enumerates dport via PCI and scans for HDM + * (Host-managed-Device-Memory) decoder resources via the + * @component_reg_phys value passed in by the agent that registered the + * port. All descendant ports of a CXL root port (described by platform + * firmware) are managed in this drivers context. Each driver instance + * is responsible for tearing down the driver context of immediate + * descendant ports. The locking for this is validated by + * CONFIG_PROVE_CXL_LOCKING. + * + * The primary service this driver provides is presenting APIs to other + * drivers to utilize the decoders, and indicating to userspace (via bind + * status) the connectivity of the CXL.mem protocol throughout the + * PCIe topology. + */ + +static int cxl_port_probe(struct device *dev) +{ + struct cxl_port *port = to_cxl_port(dev); + struct cxl_hdm *cxlhdm; + int rc; + + rc = devm_cxl_port_enumerate_dports(dev, port); + if (rc < 0) + return rc; + + if (rc == 1) + return devm_cxl_add_passthrough_decoder(dev, port); + + cxlhdm = devm_cxl_setup_hdm(dev, port); + if (IS_ERR(cxlhdm)) + return PTR_ERR(cxlhdm); + + rc = devm_cxl_enumerate_decoders(dev, cxlhdm); + if (rc) { + dev_err(dev, "Couldn't enumerate decoders (%d)\n", rc); + return rc; + } + + return 0; +} + +static struct cxl_driver cxl_port_driver = { + .name = "cxl_port", + .probe = cxl_port_probe, + .id = CXL_DEVICE_PORT, +}; + +module_cxl_driver(cxl_port_driver); +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_PORT); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3045d7cba0db..27ae13e23e79 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -26,6 +26,11 @@ obj-m += cxl_pmem.o cxl_pmem-y := $(CXL_SRC)/pmem.o cxl_pmem-y += config_check.o +obj-m += cxl_port.o + +cxl_port-y := $(CXL_SRC)/port.o +cxl_port-y += config_check.o + obj-m += cxl_core.o cxl_core-y := $(CXL_CORE_SRC)/port.o diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 81c09380c537..ce6ace286fc7 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -437,10 +437,8 @@ static int mock_cxl_port_enumerate_dports(struct device *host, if (pdev->dev.parent != port->uport) continue; - cxl_device_lock(&port->dev); dport = devm_cxl_add_dport(host, port, &pdev->dev, pdev->id, CXL_RESOURCE_NONE); - cxl_device_unlock(&port->dev); if (IS_ERR(dport)) { dev_err(dev, "failed to add dport: %s (%ld)\n", From 664bf115833c2d4ee717ab63f4e6e72a25c66e77 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 1 Feb 2022 13:23:14 -0800 Subject: [PATCH 054/186] cxl/core/port: Remove @host argument for dport + decoder enumeration Now that dport and decoder enumeration is centralized in the port driver, the @host argument for these helpers can be made implicit. For the root port the host is the port's uport device (ACPI0017 for cxl_acpi), and for all other descendant ports the devm context is the parent of @port. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164375043390.484143.17617734732003230076.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/hdm.c | 12 +++++------- drivers/cxl/core/pci.c | 7 ++----- drivers/cxl/core/port.c | 9 +++++++-- drivers/cxl/cxl.h | 8 ++++---- drivers/cxl/cxlpci.h | 2 +- drivers/cxl/port.c | 8 ++++---- tools/testing/cxl/test/cxl.c | 14 +++++--------- tools/testing/cxl/test/mock.c | 28 ++++++++++++---------------- tools/testing/cxl/test/mock.h | 9 ++++----- 10 files changed, 45 insertions(+), 54 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 82591642ea90..683f2ca32c97 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -256,7 +256,7 @@ static int add_host_bridge_dport(struct device *match, void *arg) return 0; } - dport = devm_cxl_add_dport(host, root_port, match, uid, ctx.chbcr); + dport = devm_cxl_add_dport(root_port, match, uid, ctx.chbcr); if (IS_ERR(dport)) { dev_err(host, "failed to add downstream port: %s\n", dev_name(match)); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 84f4ed288a88..80280db316c0 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -44,7 +44,7 @@ static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, * are claimed and passed to the single dport. Disable the range until the first * CXL region is enumerated / activated. */ -int devm_cxl_add_passthrough_decoder(struct device *host, struct cxl_port *port) +int devm_cxl_add_passthrough_decoder(struct cxl_port *port) { struct cxl_decoder *cxld; struct cxl_dport *dport; @@ -93,21 +93,20 @@ static void __iomem *map_hdm_decoder_regs(struct cxl_port *port, /** * devm_cxl_setup_hdm - map HDM decoder component registers - * @host: devm context for allocations * @port: cxl_port to map */ -struct cxl_hdm *devm_cxl_setup_hdm(struct device *host, struct cxl_port *port) +struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) { struct device *dev = &port->dev; void __iomem *crb, *hdm; struct cxl_hdm *cxlhdm; - cxlhdm = devm_kzalloc(host, sizeof(*cxlhdm), GFP_KERNEL); + cxlhdm = devm_kzalloc(dev, sizeof(*cxlhdm), GFP_KERNEL); if (!cxlhdm) return ERR_PTR(-ENOMEM); cxlhdm->port = port; - crb = devm_cxl_iomap_block(host, port->component_reg_phys, + crb = devm_cxl_iomap_block(dev, port->component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE); if (!crb) { dev_err(dev, "No component registers mapped\n"); @@ -195,10 +194,9 @@ static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, /** * devm_cxl_enumerate_decoders - add decoder objects per HDM register set - * @host: devm allocation context * @cxlhdm: Structure to populate with HDM capabilities */ -int devm_cxl_enumerate_decoders(struct device *host, struct cxl_hdm *cxlhdm) +int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { void __iomem *hdm = cxlhdm->regs.hdm_decoder; struct cxl_port *port = cxlhdm->port; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 8ec5f74da679..c9a494d6976a 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -15,7 +15,6 @@ struct cxl_walk_context { struct pci_bus *bus; - struct device *host; struct cxl_port *port; int type; int error; @@ -47,7 +46,7 @@ static int match_add_dports(struct pci_dev *pdev, void *data) dev_dbg(&port->dev, "failed to find component registers\n"); port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - dport = devm_cxl_add_dport(ctx->host, port, &pdev->dev, port_num, + dport = devm_cxl_add_dport(port, &pdev->dev, port_num, cxl_regmap_to_base(pdev, &map)); if (IS_ERR(dport)) { ctx->error = PTR_ERR(dport); @@ -62,13 +61,12 @@ static int match_add_dports(struct pci_dev *pdev, void *data) /** * devm_cxl_port_enumerate_dports - enumerate downstream ports of the upstream port - * @host: devm context * @port: cxl_port whose ->uport is the upstream of dports to be enumerated * * Returns a positive number of dports enumerated or a negative error * code. */ -int devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port) +int devm_cxl_port_enumerate_dports(struct cxl_port *port) { struct pci_bus *bus = cxl_port_to_pci_bus(port); struct cxl_walk_context ctx; @@ -83,7 +81,6 @@ int devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port) type = PCI_EXP_TYPE_DOWNSTREAM; ctx = (struct cxl_walk_context) { - .host = host, .port = port, .bus = bus, .type = type, diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index a66284b7eb1b..62b9f5dc64b5 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -576,7 +576,6 @@ static void cxl_dport_unlink(void *data) /** * devm_cxl_add_dport - append downstream port data to a cxl_port - * @host: devm context for allocations * @port: the cxl_port that references this dport * @dport_dev: firmware or PCI device representing the dport * @port_id: identifier for this dport in a decoder's target list @@ -586,14 +585,20 @@ static void cxl_dport_unlink(void *data) * either the port's host (for root ports), or the port itself (for * switch ports) */ -struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, +struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t component_reg_phys) { char link_name[CXL_TARGET_STRLEN]; struct cxl_dport *dport; + struct device *host; int rc; + if (is_cxl_root(port)) + host = port->uport; + else + host = &port->dev; + if (!host->driver) { dev_WARN_ONCE(&port->dev, 1, "dport:%s bad devm context\n", dev_name(dport_dev)); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2b24eb56618f..89fbf49ebf98 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -313,7 +313,7 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, struct cxl_port *parent_port); struct cxl_port *find_cxl_root(struct device *dev); -struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, +struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, resource_size_t component_reg_phys); struct cxl_decoder *to_cxl_decoder(struct device *dev); @@ -327,9 +327,9 @@ int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); struct cxl_hdm; -struct cxl_hdm *devm_cxl_setup_hdm(struct device *host, struct cxl_port *port); -int devm_cxl_enumerate_decoders(struct device *host, struct cxl_hdm *cxlhdm); -int devm_cxl_add_passthrough_decoder(struct device *host, struct cxl_port *port); +struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port); +int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm); +int devm_cxl_add_passthrough_decoder(struct cxl_port *port); extern struct bus_type cxl_bus_type; diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 47640f19e899..766de340c4ce 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -58,5 +58,5 @@ static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, return pci_resource_start(pdev, map->barno) + map->block_offset; } -int devm_cxl_port_enumerate_dports(struct device *host, struct cxl_port *port); +int devm_cxl_port_enumerate_dports(struct cxl_port *port); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index daa4c3c33aed..5a1aec28dc46 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -31,18 +31,18 @@ static int cxl_port_probe(struct device *dev) struct cxl_hdm *cxlhdm; int rc; - rc = devm_cxl_port_enumerate_dports(dev, port); + rc = devm_cxl_port_enumerate_dports(port); if (rc < 0) return rc; if (rc == 1) - return devm_cxl_add_passthrough_decoder(dev, port); + return devm_cxl_add_passthrough_decoder(port); - cxlhdm = devm_cxl_setup_hdm(dev, port); + cxlhdm = devm_cxl_setup_hdm(port); if (IS_ERR(cxlhdm)) return PTR_ERR(cxlhdm); - rc = devm_cxl_enumerate_decoders(dev, cxlhdm); + rc = devm_cxl_enumerate_decoders(cxlhdm); if (rc) { dev_err(dev, "Couldn't enumerate decoders (%d)\n", rc); return rc; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index ce6ace286fc7..40ed567952e6 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -399,8 +399,7 @@ static struct acpi_pci_root *mock_acpi_pci_find_root(acpi_handle handle) return &mock_pci_root[host_bridge_index(adev)]; } -static struct cxl_hdm *mock_cxl_setup_hdm(struct device *host, - struct cxl_port *port) +static struct cxl_hdm *mock_cxl_setup_hdm(struct cxl_port *port) { struct cxl_hdm *cxlhdm = devm_kzalloc(&port->dev, sizeof(*cxlhdm), GFP_KERNEL); @@ -411,21 +410,18 @@ static struct cxl_hdm *mock_cxl_setup_hdm(struct device *host, return cxlhdm; } -static int mock_cxl_add_passthrough_decoder(struct device *host, - struct cxl_port *port) +static int mock_cxl_add_passthrough_decoder(struct cxl_port *port) { dev_err(&port->dev, "unexpected passthrough decoder for cxl_test\n"); return -EOPNOTSUPP; } -static int mock_cxl_enumerate_decoders(struct device *host, - struct cxl_hdm *cxlhdm) +static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { return 0; } -static int mock_cxl_port_enumerate_dports(struct device *host, - struct cxl_port *port) +static int mock_cxl_port_enumerate_dports(struct cxl_port *port) { struct device *dev = &port->dev; int i; @@ -437,7 +433,7 @@ static int mock_cxl_port_enumerate_dports(struct device *host, if (pdev->dev.parent != port->uport) continue; - dport = devm_cxl_add_dport(host, port, &pdev->dev, pdev->id, + dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, CXL_RESOURCE_NONE); if (IS_ERR(dport)) { diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 18d3b65e2a9b..6e8c9d63c92d 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -131,66 +131,62 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); -struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct device *host, - struct cxl_port *port) +struct cxl_hdm *__wrap_devm_cxl_setup_hdm(struct cxl_port *port) { int index; struct cxl_hdm *cxlhdm; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport)) - cxlhdm = ops->devm_cxl_setup_hdm(host, port); + cxlhdm = ops->devm_cxl_setup_hdm(port); else - cxlhdm = devm_cxl_setup_hdm(host, port); + cxlhdm = devm_cxl_setup_hdm(port); put_cxl_mock_ops(index); return cxlhdm; } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_setup_hdm, CXL); -int __wrap_devm_cxl_add_passthrough_decoder(struct device *host, - struct cxl_port *port) +int __wrap_devm_cxl_add_passthrough_decoder(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport)) - rc = ops->devm_cxl_add_passthrough_decoder(host, port); + rc = ops->devm_cxl_add_passthrough_decoder(port); else - rc = devm_cxl_add_passthrough_decoder(host, port); + rc = devm_cxl_add_passthrough_decoder(port); put_cxl_mock_ops(index); return rc; } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_passthrough_decoder, CXL); -int __wrap_devm_cxl_enumerate_decoders(struct device *host, - struct cxl_hdm *cxlhdm) +int __wrap_devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { int rc, index; struct cxl_port *port = cxlhdm->port; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport)) - rc = ops->devm_cxl_enumerate_decoders(host, cxlhdm); + rc = ops->devm_cxl_enumerate_decoders(cxlhdm); else - rc = devm_cxl_enumerate_decoders(host, cxlhdm); + rc = devm_cxl_enumerate_decoders(cxlhdm); put_cxl_mock_ops(index); return rc; } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_enumerate_decoders, CXL); -int __wrap_devm_cxl_port_enumerate_dports(struct device *host, - struct cxl_port *port) +int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); if (ops && ops->is_mock_port(port->uport)) - rc = ops->devm_cxl_port_enumerate_dports(host, port); + rc = ops->devm_cxl_port_enumerate_dports(port); else - rc = devm_cxl_port_enumerate_dports(host, port); + rc = devm_cxl_port_enumerate_dports(port); put_cxl_mock_ops(index); return rc; diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 15e48063ea4b..738f24e3988a 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -19,11 +19,10 @@ struct cxl_mock_ops { bool (*is_mock_bus)(struct pci_bus *bus); bool (*is_mock_port)(struct device *dev); bool (*is_mock_dev)(struct device *dev); - int (*devm_cxl_port_enumerate_dports)(struct device *host, - struct cxl_port *port); - struct cxl_hdm *(*devm_cxl_setup_hdm)(struct device *host, struct cxl_port *port); - int (*devm_cxl_add_passthrough_decoder)(struct device *host, struct cxl_port *port); - int (*devm_cxl_enumerate_decoders)(struct device *host, struct cxl_hdm *hdm); + int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port); + struct cxl_hdm *(*devm_cxl_setup_hdm)(struct cxl_port *port); + int (*devm_cxl_add_passthrough_decoder)(struct cxl_port *port); + int (*devm_cxl_enumerate_decoders)(struct cxl_hdm *hdm); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); From 4112a08dd3c5ea0a96029f14061f2320826cfd32 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 1 Feb 2022 13:28:53 -0800 Subject: [PATCH 055/186] cxl/pci: Store component register base in cxlds In preparation for defining a cxl_port object to represent the decoder resources of a memory expander capture the component register base address. The port driver uses the component register base to enumerate the HDM Decoder Capability structure. Unlike other cxl_port objects the endpoint port decodes from upstream SPA to downstream DPA rather than upstream port to downstream port. Signed-off-by: Ben Widawsky Reported-by: kernel test robot Reviewed-by: Jonathan Cameron [djbw: clarify changelog] Link: https://lore.kernel.org/r/164375084181.484304.3919737667590006795.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/cxlmem.h | 3 +++ drivers/cxl/pci.c | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index fca2d1b5f6ff..90d67fff5bed 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -116,6 +116,7 @@ struct cxl_mbox_cmd { * @active_persistent_bytes: sum of hard + soft persistent * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset + * @component_reg_phys: register base of component registers * @mbox_send: @dev specific transport for transmitting mailbox commands * * See section 8.2.9.5.2 Capacity Configuration and Label Storage for @@ -145,6 +146,8 @@ struct cxl_dev_state { u64 next_volatile_bytes; u64 next_persistent_bytes; + resource_size_t component_reg_phys; + int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); }; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 8b435b875b65..bf14c365ea33 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -416,6 +416,17 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; + /* + * If the component registers can't be found, the cxl_pci driver may + * still be useful for management functions so don't return an error. + */ + cxlds->component_reg_phys = CXL_RESOURCE_NONE; + rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + if (rc) + dev_warn(&pdev->dev, "No component registers (%d)\n", rc); + + cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map); + rc = cxl_pci_setup_mailbox(cxlds); if (rc) return rc; From 06e279e5ebe4f32ffe544ec96a199870319a7315 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 1 Feb 2022 14:06:32 -0800 Subject: [PATCH 056/186] cxl/pci: Cache device DVSEC offset The PCIe device DVSEC, defined in the CXL 2.0 spec, 8.1.3 is required to be implemented by CXL 2.0 endpoint devices. In preparation for consuming this information in a new cxl_mem driver, retrieve the CXL DVSEC position and warn about the implications of not finding it. Allow for mailbox operation even if the CXL DVSEC is missing. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164375309615.513620.7874131241128599893.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/pci.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 90d67fff5bed..5cf5329e13a9 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -98,6 +98,7 @@ struct cxl_mbox_cmd { * * @dev: The device associated with this CXL state * @regs: Parsed register blocks + * @cxl_dvsec: Offset to the PCIe device DVSEC * @payload_size: Size of space for payload * (CXL 2.0 8.2.8.4.3 Mailbox Capabilities Register) * @lsa_size: Size of Label Storage Area @@ -126,6 +127,7 @@ struct cxl_dev_state { struct device *dev; struct cxl_regs regs; + int cxl_dvsec; size_t payload_size; size_t lsa_size; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index bf14c365ea33..c94002166084 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -408,6 +408,12 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(cxlds)) return PTR_ERR(cxlds); + cxlds->cxl_dvsec = pci_find_dvsec_capability( + pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); + if (!cxlds->cxl_dvsec) + dev_warn(&pdev->dev, + "Device DVSEC not present, skip CXL.mem init\n"); + rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) return rc; From 560f78559006a4bab20455ae7eca33d8417c38fc Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 1 Feb 2022 15:48:56 -0800 Subject: [PATCH 057/186] cxl/pci: Retrieve CXL DVSEC memory info Before CXL 2.0 HDM Decoder Capability mechanisms can be utilized in a device the driver must determine that the device is ready for CXL.mem operation and that platform firmware, or some other agent, has established an active decode via the legacy CXL 1.1 decoder mechanism. This legacy mechanism is defined in the CXL DVSEC as a set of range registers and status bits that take time to settle after a reset. Validate the CXL memory decode setup via the DVSEC and cache it for later consideration by the cxl_mem driver (to be added). Failure to validate is not fatal to the cxl_pci driver since that is only providing CXL command support over PCI.mmio, and might be needed to rectify CXL DVSEC validation problems. Any potential ranges that the device is already claiming via DVSEC need to be reconciled with the dynamic provisioning ranges provided by platform firmware (like ACPI CEDT.CFMWS). Leave that reconciliation to the cxl_mem driver. [djbw: shorten defines] [djbw: change precise spin wait to generous msleep] Reported-by: kernel test robot Signed-off-by: Ben Widawsky [djbw: clarify changelog] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164375911821.559935.7375160041663453400.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/cxlmem.h | 14 +++++ drivers/cxl/cxlpci.h | 13 +++++ drivers/cxl/pci.c | 119 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 5cf5329e13a9..00f55f4066b9 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -89,6 +89,18 @@ struct cxl_mbox_cmd { */ #define CXL_CAPACITY_MULTIPLIER SZ_256M +/** + * struct cxl_endpoint_dvsec_info - Cached DVSEC info + * @mem_enabled: cached value of mem_enabled in the DVSEC, PCIE_DEVICE + * @ranges: Number of active HDM ranges this device uses. + * @dvsec_range: cached attributes of the ranges in the DVSEC, PCIE_DEVICE + */ +struct cxl_endpoint_dvsec_info { + bool mem_enabled; + int ranges; + struct range dvsec_range[2]; +}; + /** * struct cxl_dev_state - The driver device state * @@ -118,6 +130,7 @@ struct cxl_mbox_cmd { * @next_volatile_bytes: volatile capacity change pending device reset * @next_persistent_bytes: persistent capacity change pending device reset * @component_reg_phys: register base of component registers + * @info: Cached DVSEC information about the device. * @mbox_send: @dev specific transport for transmitting mailbox commands * * See section 8.2.9.5.2 Capacity Configuration and Label Storage for @@ -149,6 +162,7 @@ struct cxl_dev_state { u64 next_persistent_bytes; resource_size_t component_reg_phys; + struct cxl_endpoint_dvsec_info info; int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); }; diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 766de340c4ce..329e7ea3f36a 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -17,6 +17,19 @@ /* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ #define CXL_DVSEC_PCIE_DEVICE 0 +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_MEM_CAPABLE BIT(2) +#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE BIT(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID BIT(0) +#define CXL_DVSEC_MEM_ACTIVE BIT(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) /* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ #define CXL_DVSEC_FUNCTION_MAP 2 diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index c94002166084..6b3270246545 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -386,6 +386,120 @@ static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, return rc; } +static int wait_for_valid(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec, rc; + u32 val; + + /* + * Memory_Info_Valid: When set, indicates that the CXL Range 1 Size high + * and Size Low registers are valid. Must be set within 1 second of + * deassertion of reset to CXL device. Likely it is already set by the + * time this runs, but otherwise give a 1.5 second timeout in case of + * clock skew. + */ + rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); + if (rc) + return rc; + + if (val & CXL_DVSEC_MEM_INFO_VALID) + return 0; + + msleep(1500); + + rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val); + if (rc) + return rc; + + if (val & CXL_DVSEC_MEM_INFO_VALID) + return 0; + + return -ETIMEDOUT; +} + +static int cxl_dvsec_ranges(struct cxl_dev_state *cxlds) +{ + struct cxl_endpoint_dvsec_info *info = &cxlds->info; + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + int hdm_count, rc, i; + u16 cap, ctrl; + + if (!d) + return -ENXIO; + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + if (rc) + return rc; + + rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + if (rc) + return rc; + + if (!(cap & CXL_DVSEC_MEM_CAPABLE)) + return -ENXIO; + + /* + * It is not allowed by spec for MEM.capable to be set and have 0 legacy + * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this + * driver is for a spec defined class code which must be CXL.mem + * capable, there is no point in continuing to enable CXL.mem. + */ + hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + if (!hdm_count || hdm_count > 2) + return -EINVAL; + + rc = wait_for_valid(cxlds); + if (rc) + return rc; + + info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + + for (i = 0; i < hdm_count; i++) { + u64 base, size; + u32 temp; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + if (rc) + return rc; + + size = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + if (rc) + return rc; + + size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + if (rc) + return rc; + + base = (u64)temp << 32; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + if (rc) + return rc; + + base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + + info->dvsec_range[i] = (struct range) { + .start = base, + .end = base + size - 1 + }; + + if (size) + info->ranges++; + } + + return 0; +} + static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct cxl_register_map map; @@ -449,6 +563,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; + rc = cxl_dvsec_ranges(cxlds); + if (rc) + dev_warn(&pdev->dev, + "Failed to get DVSEC range information (%d)\n", rc); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From 523e594d9cc03db962c741ce02c8a58aab58a123 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 23 Jan 2022 16:31:13 -0800 Subject: [PATCH 058/186] cxl/pci: Implement wait for media active CXL 2.0 8.1.3.8.2 states: Memory_Active: When set, indicates that the CXL Range 1 memory is fully initialized and available for software use. Must be set within Range 1. Memory_Active_Timeout of deassertion of reset to CXL device if CXL.mem HwInit Mode=1 Unfortunately, Memory_Active can take quite a long time depending on media size (up to 256s per 2.0 spec). Provide a callback for the eventual establishment of CXL.mem operations via the 'cxl_mem' driver the 'struct cxl_memdev'. The implementation waits for 60s by default for now and can be overridden by the mbox_ready_time module parameter. Signed-off-by: Ben Widawsky [djbw: switch to sleeping wait] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298427373.3018233.9309741847039301834.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/pci.c | 49 +++++++++++++++++++++++++++++++++++- tools/testing/cxl/test/mem.c | 8 ++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 00f55f4066b9..e70838e5dc17 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -132,6 +132,7 @@ struct cxl_endpoint_dvsec_info { * @component_reg_phys: register base of component registers * @info: Cached DVSEC information about the device. * @mbox_send: @dev specific transport for transmitting mailbox commands + * @wait_media_ready: @dev specific method to await media ready * * See section 8.2.9.5.2 Capacity Configuration and Label Storage for * details on capacity parameters. @@ -165,6 +166,7 @@ struct cxl_dev_state { struct cxl_endpoint_dvsec_info info; int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); + int (*wait_media_ready)(struct cxl_dev_state *cxlds); }; enum cxl_opcode { diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 6b3270246545..c01bd44d11c4 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -49,7 +49,7 @@ static unsigned short mbox_ready_timeout = 60; module_param(mbox_ready_timeout, ushort, 0644); MODULE_PARM_DESC(mbox_ready_timeout, - "seconds to wait for mailbox ready status"); + "seconds to wait for mailbox ready / memory active status"); static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds) { @@ -418,6 +418,51 @@ static int wait_for_valid(struct cxl_dev_state *cxlds) return -ETIMEDOUT; } +/* + * Wait up to @mbox_ready_timeout for the device to report memory + * active. + */ +static int wait_for_media_ready(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + int d = cxlds->cxl_dvsec; + bool active = false; + u64 md_status; + int rc, i; + + rc = wait_for_valid(cxlds); + if (rc) + return rc; + + for (i = mbox_ready_timeout; i; i--) { + u32 temp; + int rc; + + rc = pci_read_config_dword( + pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &temp); + if (rc) + return rc; + + active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + if (active) + break; + msleep(1000); + } + + if (!active) { + dev_err(&pdev->dev, + "timeout awaiting memory active after %d seconds\n", + mbox_ready_timeout); + return -ETIMEDOUT; + } + + md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET); + if (!CXLMDEV_READY(md_status)) + return -EIO; + + return 0; +} + static int cxl_dvsec_ranges(struct cxl_dev_state *cxlds) { struct cxl_endpoint_dvsec_info *info = &cxlds->info; @@ -528,6 +573,8 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_warn(&pdev->dev, "Device DVSEC not present, skip CXL.mem init\n"); + cxlds->wait_media_ready = wait_for_media_ready; + rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map); if (rc) return rc; diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8c2086c4caef..3af3f94de0c3 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -236,6 +237,12 @@ static int cxl_mock_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd * return rc; } +static int cxl_mock_wait_media_ready(struct cxl_dev_state *cxlds) +{ + msleep(100); + return 0; +} + static void label_area_release(void *lsa) { vfree(lsa); @@ -262,6 +269,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) return PTR_ERR(cxlds); cxlds->mbox_send = cxl_mock_mbox_send; + cxlds->wait_media_ready = cxl_mock_wait_media_ready; cxlds->payload_size = SZ_4K; rc = cxl_enumerate_cmds(cxlds); From bcc79ea34398845d814170ddc06a457b35ae1975 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 31 Jan 2022 13:56:11 -0800 Subject: [PATCH 059/186] cxl/pci: Emit device serial number Per the CXL specification (8.1.12.2 Memory Device PCIe Capabilities and Extended Capabilities) the Device Serial Number capability is mandatory. Emit it for user tooling to identify devices. It is reasonable to ask whether the attribute should be added to the list of PCI sysfs device attributes. The PCI layer can optionally emit it too, but the CXL subsystem is aiming to preserve its independence and the possibility of CXL topologies with non-PCI devices in it. To date that has only proven useful for the 'cxl_test' model, but as can be seen with seen with ACPI0016 devices, sometimes all that is needed is a platform firmware table to point to CXL Component Registers in MMIO space to define a "CXL" device. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164366608838.196598.16856227191534267098.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 9 +++++++++ drivers/cxl/core/memdev.c | 11 +++++++++++ drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/pci.c | 1 + tools/testing/cxl/test/mem.c | 1 + 5 files changed, 24 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 6d8cbf3355b5..87c0e5e65322 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -25,6 +25,15 @@ Description: identically named field in the Identify Memory Device Output Payload in the CXL-2.0 specification. +What: /sys/bus/cxl/devices/memX/serial +Date: January, 2022 +KernelVersion: v5.18 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) 64-bit serial number per the PCIe Device Serial Number + capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2 + Memory Device PCIe Capabilities and Extended Capabilities. + What: /sys/bus/cxl/devices/*/devtype Date: June, 2021 KernelVersion: v5.14 diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 61029cb7ac62..1e574b052583 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -89,7 +89,18 @@ static ssize_t pmem_size_show(struct device *dev, struct device_attribute *attr, static struct device_attribute dev_attr_pmem_size = __ATTR(size, 0444, pmem_size_show, NULL); +static ssize_t serial_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + + return sysfs_emit(buf, "%#llx\n", cxlds->serial); +} +static DEVICE_ATTR_RO(serial); + static struct attribute *cxl_memdev_attributes[] = { + &dev_attr_serial.attr, &dev_attr_firmware_version.attr, &dev_attr_payload_max.attr, &dev_attr_label_storage_size.attr, diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index e70838e5dc17..0ba0cf8dcdbc 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -131,6 +131,7 @@ struct cxl_endpoint_dvsec_info { * @next_persistent_bytes: persistent capacity change pending device reset * @component_reg_phys: register base of component registers * @info: Cached DVSEC information about the device. + * @serial: PCIe Device Serial Number * @mbox_send: @dev specific transport for transmitting mailbox commands * @wait_media_ready: @dev specific method to await media ready * @@ -164,6 +165,7 @@ struct cxl_dev_state { resource_size_t component_reg_phys; struct cxl_endpoint_dvsec_info info; + u64 serial; int (*mbox_send)(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd); int (*wait_media_ready)(struct cxl_dev_state *cxlds); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index c01bd44d11c4..8a7267d116b7 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -567,6 +567,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (IS_ERR(cxlds)) return PTR_ERR(cxlds); + cxlds->serial = pci_get_dsn(pdev); cxlds->cxl_dvsec = pci_find_dvsec_capability( pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); if (!cxlds->cxl_dvsec) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 3af3f94de0c3..36ef337c775c 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -268,6 +268,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (IS_ERR(cxlds)) return PTR_ERR(cxlds); + cxlds->serial = pdev->id; cxlds->mbox_send = cxl_mock_mbox_send; cxlds->wait_media_ready = cxl_mock_wait_media_ready; cxlds->payload_size = SZ_4K; From cf1f6877b088cd9ddeb5f3db8ade3a61e3a3f9eb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:31:24 -0800 Subject: [PATCH 060/186] cxl/memdev: Add numa_node attribute While CXL memory targets will have their own memory target node, individual memory devices may be affinitized like other PCI devices. Emit that attribute for memdevs. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164298428430.3018233.16409089892707993289.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 9 +++++++++ drivers/cxl/core/memdev.c | 17 +++++++++++++++++ tools/testing/cxl/test/cxl.c | 1 + 3 files changed, 27 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 87c0e5e65322..0b51cfec0c66 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -34,6 +34,15 @@ Description: capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2 Memory Device PCIe Capabilities and Extended Capabilities. +What: /sys/bus/cxl/devices/memX/numa_node +Date: January, 2022 +KernelVersion: v5.18 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) If NUMA is enabled and the platform has affinitized the + host PCI device for this memory device, emit the CPU node + affinity for this device. + What: /sys/bus/cxl/devices/*/devtype Date: June, 2021 KernelVersion: v5.14 diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 1e574b052583..b2773664e407 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -99,11 +99,19 @@ static ssize_t serial_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(serial); +static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + static struct attribute *cxl_memdev_attributes[] = { &dev_attr_serial.attr, &dev_attr_firmware_version.attr, &dev_attr_payload_max.attr, &dev_attr_label_storage_size.attr, + &dev_attr_numa_node.attr, NULL, }; @@ -117,8 +125,17 @@ static struct attribute *cxl_memdev_ram_attributes[] = { NULL, }; +static umode_t cxl_memdev_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA) && a == &dev_attr_numa_node.attr) + return 0; + return a->mode; +} + static struct attribute_group cxl_memdev_attribute_group = { .attrs = cxl_memdev_attributes, + .is_visible = cxl_memdev_visible, }; static struct attribute_group cxl_memdev_ram_attribute_group = { diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 40ed567952e6..cd2f20f2707f 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -583,6 +583,7 @@ static __init int cxl_test_init(void) if (!pdev) goto err_mem; pdev->dev.parent = &port->dev; + set_dev_node(&pdev->dev, i % 2); rc = platform_device_add(pdev); if (rc) { From 2703c16c75aea142c3079ec34ae2262c0557ef7f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 4 Feb 2022 07:08:40 -0800 Subject: [PATCH 061/186] cxl/core/port: Add switch port enumeration So far the platorm level CXL resources have been enumerated by the cxl_acpi driver, and cxl_pci has gathered all the pre-requisite information it needs to fire up a cxl_mem driver. However, the first thing the cxl_mem driver will be tasked to do is validate that all the PCIe Switches in its ancestry also have CXL capabilities and an CXL.mem link established. Provide a common mechanism for a CXL.mem endpoint driver to enumerate all the ancestor CXL ports in the topology and validate CXL.mem connectivity. Multiple endpoints may end up racing to establish a shared port in the topology. This race is resolved via taking the device-lock on a parent CXL Port before establishing a new child. The winner of the race establishes the port, the loser simply registers its interest in the port via 'struct cxl_ep' place-holder reference. At endpoint teardown the same parent port lock is taken as 'struct cxl_ep' references are deleted. Last endpoint to drop its reference unregisters the port. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164398731146.902644.1029761300481366248.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 17 +- drivers/cxl/core/port.c | 431 +++++++++++++++++++++++++++++++++++++++- drivers/cxl/cxl.h | 19 ++ 3 files changed, 440 insertions(+), 27 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 683f2ca32c97..7bd53dc691ec 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -130,21 +130,6 @@ static int cxl_parse_cfmws(union acpi_subtable_headers *header, void *arg, return 0; } -static struct cxl_dport *find_dport_by_dev(struct cxl_port *port, struct device *dev) -{ - struct cxl_dport *dport; - - cxl_device_lock(&port->dev); - list_for_each_entry(dport, &port->dports, list) - if (dport->dport == dev) { - cxl_device_unlock(&port->dev); - return dport; - } - - cxl_device_unlock(&port->dev); - return NULL; -} - __mock struct acpi_device *to_cxl_host_bridge(struct device *host, struct device *dev) { @@ -175,7 +160,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) if (!bridge) return 0; - dport = find_dport_by_dev(root_port, match); + dport = cxl_find_dport_by_dev(root_port, match); if (!dport) { dev_dbg(host, "host bridge expected and not found\n"); return 0; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 62b9f5dc64b5..c5779c982c80 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "core.h" @@ -265,10 +266,24 @@ struct cxl_decoder *to_cxl_decoder(struct device *dev) } EXPORT_SYMBOL_NS_GPL(to_cxl_decoder, CXL); +static void cxl_ep_release(struct cxl_ep *ep) +{ + if (!ep) + return; + list_del(&ep->list); + put_device(ep->ep); + kfree(ep); +} + static void cxl_port_release(struct device *dev) { struct cxl_port *port = to_cxl_port(dev); + struct cxl_ep *ep, *_e; + cxl_device_lock(dev); + list_for_each_entry_safe(ep, _e, &port->endpoints, list) + cxl_ep_release(ep); + cxl_device_unlock(dev); ida_free(&cxl_port_ida, port->id); kfree(port); } @@ -359,6 +374,7 @@ static struct cxl_port *cxl_port_alloc(struct device *uport, port->component_reg_phys = component_reg_phys; ida_init(&port->decoder_ida); INIT_LIST_HEAD(&port->dports); + INIT_LIST_HEAD(&port->endpoints); device_initialize(dev); device_set_pm_not_required(dev); @@ -457,25 +473,36 @@ int devm_cxl_register_pci_bus(struct device *host, struct device *uport, } EXPORT_SYMBOL_NS_GPL(devm_cxl_register_pci_bus, CXL); +static bool dev_is_cxl_root_child(struct device *dev) +{ + struct cxl_port *port, *parent; + + if (!is_cxl_port(dev)) + return false; + + port = to_cxl_port(dev); + if (is_cxl_root(port)) + return false; + + parent = to_cxl_port(port->dev.parent); + if (is_cxl_root(parent)) + return true; + + return false; +} + /* Find a 2nd level CXL port that has a dport that is an ancestor of @match */ static int match_root_child(struct device *dev, const void *match) { const struct device *iter = NULL; - struct cxl_port *port, *parent; struct cxl_dport *dport; + struct cxl_port *port; - if (!is_cxl_port(dev)) + if (!dev_is_cxl_root_child(dev)) return 0; port = to_cxl_port(dev); - if (is_cxl_root(port)) - return 0; - - parent = to_cxl_port(port->dev.parent); - if (!is_cxl_root(parent)) - return 0; - - cxl_device_lock(&port->dev); + cxl_device_lock(dev); list_for_each_entry(dport, &port->dports, list) { iter = match; while (iter) { @@ -485,7 +512,7 @@ static int match_root_child(struct device *dev, const void *match) } } out: - cxl_device_unlock(&port->dev); + cxl_device_unlock(dev); return !!iter; } @@ -642,6 +669,388 @@ struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport, CXL); +static struct cxl_ep *find_ep(struct cxl_port *port, struct device *ep_dev) +{ + struct cxl_ep *ep; + + device_lock_assert(&port->dev); + list_for_each_entry(ep, &port->endpoints, list) + if (ep->ep == ep_dev) + return ep; + return NULL; +} + +static int add_ep(struct cxl_port *port, struct cxl_ep *new) +{ + struct cxl_ep *dup; + + cxl_device_lock(&port->dev); + if (port->dead) { + cxl_device_unlock(&port->dev); + return -ENXIO; + } + dup = find_ep(port, new->ep); + if (!dup) + list_add_tail(&new->list, &port->endpoints); + cxl_device_unlock(&port->dev); + + return dup ? -EEXIST : 0; +} + +/** + * cxl_add_ep - register an endpoint's interest in a port + * @port: a port in the endpoint's topology ancestry + * @ep_dev: device representing the endpoint + * + * Intermediate CXL ports are scanned based on the arrival of endpoints. + * When those endpoints depart the port can be destroyed once all + * endpoints that care about that port have been removed. + */ +static int cxl_add_ep(struct cxl_port *port, struct device *ep_dev) +{ + struct cxl_ep *ep; + int rc; + + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (!ep) + return -ENOMEM; + + INIT_LIST_HEAD(&ep->list); + ep->ep = get_device(ep_dev); + + rc = add_ep(port, ep); + if (rc) + cxl_ep_release(ep); + return rc; +} + +struct cxl_find_port_ctx { + const struct device *dport_dev; + const struct cxl_port *parent_port; +}; + +static int match_port_by_dport(struct device *dev, const void *data) +{ + const struct cxl_find_port_ctx *ctx = data; + struct cxl_port *port; + + if (!is_cxl_port(dev)) + return 0; + if (ctx->parent_port && dev->parent != &ctx->parent_port->dev) + return 0; + + port = to_cxl_port(dev); + return cxl_find_dport_by_dev(port, ctx->dport_dev) != NULL; +} + +static struct cxl_port *__find_cxl_port(struct cxl_find_port_ctx *ctx) +{ + struct device *dev; + + if (!ctx->dport_dev) + return NULL; + + dev = bus_find_device(&cxl_bus_type, NULL, ctx, match_port_by_dport); + if (dev) + return to_cxl_port(dev); + return NULL; +} + +static struct cxl_port *find_cxl_port(struct device *dport_dev) +{ + struct cxl_find_port_ctx ctx = { + .dport_dev = dport_dev, + }; + + return __find_cxl_port(&ctx); +} + +static struct cxl_port *find_cxl_port_at(struct cxl_port *parent_port, + struct device *dport_dev) +{ + struct cxl_find_port_ctx ctx = { + .dport_dev = dport_dev, + .parent_port = parent_port, + }; + + return __find_cxl_port(&ctx); +} + +/* + * All users of grandparent() are using it to walk PCIe-like swich port + * hierarchy. A PCIe switch is comprised of a bridge device representing the + * upstream switch port and N bridges representing downstream switch ports. When + * bridges stack the grand-parent of a downstream switch port is another + * downstream switch port in the immediate ancestor switch. + */ +static struct device *grandparent(struct device *dev) +{ + if (dev && dev->parent) + return dev->parent->parent; + return NULL; +} + +/* + * The natural end of life of a non-root 'cxl_port' is when its parent port goes + * through a ->remove() event ("top-down" unregistration). The unnatural trigger + * for a port to be unregistered is when all memdevs beneath that port have gone + * through ->remove(). This "bottom-up" removal selectively removes individual + * child ports manually. This depends on devm_cxl_add_port() to not change is + * devm action registration order. + */ +static void delete_switch_port(struct cxl_port *port, struct list_head *dports) +{ + struct cxl_dport *dport, *_d; + + list_for_each_entry_safe(dport, _d, dports, list) { + devm_release_action(&port->dev, cxl_dport_unlink, dport); + devm_release_action(&port->dev, cxl_dport_remove, dport); + devm_kfree(&port->dev, dport); + } + devm_release_action(port->dev.parent, cxl_unlink_uport, port); + devm_release_action(port->dev.parent, unregister_port, port); +} + +static void cxl_detach_ep(void *data) +{ + struct cxl_memdev *cxlmd = data; + struct device *iter; + + for (iter = &cxlmd->dev; iter; iter = grandparent(iter)) { + struct device *dport_dev = grandparent(iter); + struct cxl_port *port, *parent_port; + LIST_HEAD(reap_dports); + struct cxl_ep *ep; + + if (!dport_dev) + break; + + port = find_cxl_port(dport_dev); + if (!port || is_cxl_root(port)) { + put_device(&port->dev); + continue; + } + + parent_port = to_cxl_port(port->dev.parent); + cxl_device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + /* + * The bottom-up race to delete the port lost to a + * top-down port disable, give up here, because the + * parent_port ->remove() will have cleaned up all + * descendants. + */ + cxl_device_unlock(&parent_port->dev); + put_device(&port->dev); + continue; + } + + cxl_device_lock(&port->dev); + ep = find_ep(port, &cxlmd->dev); + dev_dbg(&cxlmd->dev, "disconnect %s from %s\n", + ep ? dev_name(ep->ep) : "", dev_name(&port->dev)); + cxl_ep_release(ep); + if (ep && !port->dead && list_empty(&port->endpoints) && + !is_cxl_root(parent_port)) { + /* + * This was the last ep attached to a dynamically + * enumerated port. Block new cxl_add_ep() and garbage + * collect the port. + */ + port->dead = true; + list_splice_init(&port->dports, &reap_dports); + } + cxl_device_unlock(&port->dev); + + if (!list_empty(&reap_dports)) { + dev_dbg(&cxlmd->dev, "delete %s\n", + dev_name(&port->dev)); + delete_switch_port(port, &reap_dports); + } + put_device(&port->dev); + cxl_device_unlock(&parent_port->dev); + } +} + +static resource_size_t find_component_registers(struct device *dev) +{ + struct cxl_register_map map; + struct pci_dev *pdev; + + /* + * Theoretically, CXL component registers can be hosted on a + * non-PCI device, in practice, only cxl_test hits this case. + */ + if (!dev_is_pci(dev)) + return CXL_RESOURCE_NONE; + + pdev = to_pci_dev(dev); + + cxl_find_regblock(pdev, CXL_REGLOC_RBI_COMPONENT, &map); + return cxl_regmap_to_base(pdev, &map); +} + +static int add_port_attach_ep(struct cxl_memdev *cxlmd, + struct device *uport_dev, + struct device *dport_dev) +{ + struct device *dparent = grandparent(dport_dev); + struct cxl_port *port, *parent_port = NULL; + resource_size_t component_reg_phys; + int rc; + + if (!dparent) { + /* + * The iteration reached the topology root without finding the + * CXL-root 'cxl_port' on a previous iteration, fail for now to + * be re-probed after platform driver attaches. + */ + dev_dbg(&cxlmd->dev, "%s is a root dport\n", + dev_name(dport_dev)); + return -ENXIO; + } + + parent_port = find_cxl_port(dparent); + if (!parent_port) { + /* iterate to create this parent_port */ + return -EAGAIN; + } + + cxl_device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + dev_warn(&cxlmd->dev, + "port %s:%s disabled, failed to enumerate CXL.mem\n", + dev_name(&parent_port->dev), dev_name(uport_dev)); + port = ERR_PTR(-ENXIO); + goto out; + } + + port = find_cxl_port_at(parent_port, dport_dev); + if (!port) { + component_reg_phys = find_component_registers(uport_dev); + port = devm_cxl_add_port(&parent_port->dev, uport_dev, + component_reg_phys, parent_port); + if (!IS_ERR(port)) + get_device(&port->dev); + } +out: + cxl_device_unlock(&parent_port->dev); + + if (IS_ERR(port)) + rc = PTR_ERR(port); + else { + dev_dbg(&cxlmd->dev, "add to new port %s:%s\n", + dev_name(&port->dev), dev_name(port->uport)); + rc = cxl_add_ep(port, &cxlmd->dev); + if (rc == -EEXIST) { + /* + * "can't" happen, but this error code means + * something to the caller, so translate it. + */ + rc = -ENXIO; + } + put_device(&port->dev); + } + + put_device(&parent_port->dev); + return rc; +} + +int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd) +{ + struct device *dev = &cxlmd->dev; + struct device *iter; + int rc; + + rc = devm_add_action_or_reset(&cxlmd->dev, cxl_detach_ep, cxlmd); + if (rc) + return rc; + + /* + * Scan for and add all cxl_ports in this device's ancestry. + * Repeat until no more ports are added. Abort if a port add + * attempt fails. + */ +retry: + for (iter = dev; iter; iter = grandparent(iter)) { + struct device *dport_dev = grandparent(iter); + struct device *uport_dev; + struct cxl_port *port; + + if (!dport_dev) + return 0; + + uport_dev = dport_dev->parent; + if (!uport_dev) { + dev_warn(dev, "at %s no parent for dport: %s\n", + dev_name(iter), dev_name(dport_dev)); + return -ENXIO; + } + + dev_dbg(dev, "scan: iter: %s dport_dev: %s parent: %s\n", + dev_name(iter), dev_name(dport_dev), + dev_name(uport_dev)); + port = find_cxl_port(dport_dev); + if (port) { + dev_dbg(&cxlmd->dev, + "found already registered port %s:%s\n", + dev_name(&port->dev), dev_name(port->uport)); + rc = cxl_add_ep(port, &cxlmd->dev); + + /* + * If the endpoint already exists in the port's list, + * that's ok, it was added on a previous pass. + * Otherwise, retry in add_port_attach_ep() after taking + * the parent_port lock as the current port may be being + * reaped. + */ + if (rc && rc != -EEXIST) { + put_device(&port->dev); + return rc; + } + + /* Any more ports to add between this one and the root? */ + if (!dev_is_cxl_root_child(&port->dev)) { + put_device(&port->dev); + continue; + } + + put_device(&port->dev); + return 0; + } + + rc = add_port_attach_ep(cxlmd, uport_dev, dport_dev); + /* port missing, try to add parent */ + if (rc == -EAGAIN) + continue; + /* failed to add ep or port */ + if (rc) + return rc; + /* port added, new descendants possible, start over */ + goto retry; + } + + return 0; +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL); + +struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port, + const struct device *dev) +{ + struct cxl_dport *dport; + + cxl_device_lock(&port->dev); + list_for_each_entry(dport, &port->dports, list) + if (dport->dport == dev) { + cxl_device_unlock(&port->dev); + return dport; + } + + cxl_device_unlock(&port->dev); + return NULL; +} +EXPORT_SYMBOL_NS_GPL(cxl_find_dport_by_dev, CXL); + static int decoder_populate_targets(struct cxl_decoder *cxld, struct cxl_port *port, int *target_map) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 89fbf49ebf98..de4fbe7cbf42 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -262,8 +262,10 @@ struct cxl_nvdimm { * @uport: PCI or platform device implementing the upstream port capability * @id: id for port device-name * @dports: cxl_dport instances referenced by decoders + * @endpoints: cxl_ep instances, endpoints that are a descendant of this port * @decoder_ida: allocator for decoder ids * @component_reg_phys: component register capability base address (optional) + * @dead: last ep has been removed, force port re-creation * @depth: How deep this port is relative to the root. depth 0 is the root. */ struct cxl_port { @@ -271,8 +273,10 @@ struct cxl_port { struct device *uport; int id; struct list_head dports; + struct list_head endpoints; struct ida decoder_ida; resource_size_t component_reg_phys; + bool dead; unsigned int depth; }; @@ -292,6 +296,16 @@ struct cxl_dport { struct list_head list; }; +/** + * struct cxl_ep - track an endpoint's interest in a port + * @ep: device that hosts a generic CXL endpoint (expander or accelerator) + * @list: node on port->endpoints list + */ +struct cxl_ep { + struct device *ep; + struct list_head list; +}; + /* * The platform firmware device hosting the root is also the top of the * CXL port topology. All other CXL ports have another CXL port as their @@ -313,9 +327,14 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, resource_size_t component_reg_phys, struct cxl_port *parent_port); struct cxl_port *find_cxl_root(struct device *dev); +int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); + struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, resource_size_t component_reg_phys); +struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port, + const struct device *dev); + struct cxl_decoder *to_cxl_decoder(struct device *dev); bool is_root_decoder(struct device *dev); bool is_cxl_decoder(struct device *dev); From 8dd2bc0f8e02d39bd80851ca787bcbdb7d495e69 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Fri, 4 Feb 2022 07:18:31 -0800 Subject: [PATCH 062/186] cxl/mem: Add the cxl_mem driver At this point the subsystem can enumerate all CXL ports (CXL.mem decode resources in upstream switch ports and host bridges) in a system. The last mile is connecting those ports to endpoints. The cxl_mem driver connects an endpoint device to the platform CXL.mem protoctol decode-topology. At ->probe() time it walks its device-topology-ancestry and adds a CXL Port object at every Upstream Port hop until it gets to CXL root. The CXL root object is only present after a platform firmware driver registers platform CXL resources. For ACPI based platform this is managed by the ACPI0017 device and the cxl_acpi driver. The ports are registered such that disabling a given port automatically unregisters all descendant ports, and the chain can only be registered after the root is established. Given ACPI device scanning may run asynchronously compared to PCI device scanning the root driver is tasked with rescanning the bus after the root successfully probes. Conversely if any ports in a chain between the root and an endpoint becomes disconnected it subsequently triggers the endpoint to unregister. Given lock depenedencies the endpoint unregistration happens in a workqueue asynchronously. If userspace cares about synchronizing delayed work after port events the /sys/bus/cxl/flush attribute is available for that purpose. Reported-by: Randy Dunlap Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron [djbw: clarify changelog, rework hotplug support] Link: https://lore.kernel.org/r/164398782997.903003.9725273241627693186.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 9 + .../driver-api/cxl/memory-devices.rst | 9 + drivers/cxl/Kconfig | 16 ++ drivers/cxl/Makefile | 2 + drivers/cxl/acpi.c | 3 +- drivers/cxl/core/memdev.c | 16 ++ drivers/cxl/core/port.c | 105 +++++++- drivers/cxl/cxl.h | 6 + drivers/cxl/cxlmem.h | 8 + drivers/cxl/mem.c | 228 ++++++++++++++++++ drivers/cxl/port.c | 12 + tools/testing/cxl/Kbuild | 6 + tools/testing/cxl/mock_mem.c | 10 + 13 files changed, 425 insertions(+), 5 deletions(-) create mode 100644 drivers/cxl/mem.c create mode 100644 tools/testing/cxl/mock_mem.c diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 0b51cfec0c66..7c2b846521f3 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -1,3 +1,12 @@ +What: /sys/bus/cxl/flush +Date: Januarry, 2022 +KernelVersion: v5.18 +Contact: linux-cxl@vger.kernel.org +Description: + (WO) If userspace manually unbinds a port the kernel schedules + all descendant memdevs for unbind. Writing '1' to this attribute + flushes that work. + What: /sys/bus/cxl/devices/memX/firmware_version Date: December, 2020 KernelVersion: v5.12 diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index 3498d38d7cbd..db476bb170b6 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -325,6 +325,9 @@ CXL Memory Device .. kernel-doc:: drivers/cxl/pci.c :internal: +.. kernel-doc:: drivers/cxl/mem.c + :doc: cxl mem + CXL Port -------- .. kernel-doc:: drivers/cxl/port.c @@ -344,6 +347,12 @@ CXL Core .. kernel-doc:: drivers/cxl/core/port.c :identifiers: +.. kernel-doc:: drivers/cxl/core/pci.c + :doc: cxl core pci + +.. kernel-doc:: drivers/cxl/core/pci.c + :identifiers: + .. kernel-doc:: drivers/cxl/core/pmem.c :doc: cxl pmem diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 4f4f7587f6ca..b88ab956bb7c 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -78,6 +78,22 @@ config CXL_PMEM If unsure say 'm'. +config CXL_MEM + tristate "CXL: Memory Expansion" + depends on CXL_PCI + default CXL_BUS + help + The CXL.mem protocol allows a device to act as a provider of "System + RAM" and/or "Persistent Memory" that is fully coherent as if the + memory were attached to the typical CPU memory controller. This is + known as HDM "Host-managed Device Memory". + + Say 'y/m' to enable a driver that will attach to CXL.mem devices for + memory expansion and control of HDM. See Chapter 9.13 in the CXL 2.0 + specification for a detailed description of HDM. + + If unsure say 'm'. + config CXL_PORT default CXL_BUS tristate diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index 56fcac2323cb..ce267ef11d93 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -1,10 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_CXL_BUS) += core/ obj-$(CONFIG_CXL_PCI) += cxl_pci.o +obj-$(CONFIG_CXL_MEM) += cxl_mem.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o obj-$(CONFIG_CXL_PORT) += cxl_port.o +cxl_mem-y := mem.o cxl_pci-y := pci.o cxl_acpi-y := acpi.o cxl_pmem-y := pmem.o diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 7bd53dc691ec..d8295572bde9 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -314,7 +314,8 @@ static int cxl_acpi_probe(struct platform_device *pdev) if (rc < 0) return rc; - return 0; + /* In case PCI is scanned before ACPI re-trigger memdev attach */ + return cxl_bus_rescan(); } static const struct acpi_device_id cxl_acpi_ids[] = { diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index b2773664e407..1f76b28f9826 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -162,6 +162,12 @@ static const struct device_type cxl_memdev_type = { .groups = cxl_memdev_attribute_groups, }; +bool is_cxl_memdev(struct device *dev) +{ + return dev->type == &cxl_memdev_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_memdev, CXL); + /** * set_exclusive_cxl_commands() - atomically disable user cxl commands * @cxlds: The device state to operate on @@ -213,6 +219,15 @@ static void cxl_memdev_unregister(void *_cxlmd) put_device(dev); } +static void detach_memdev(struct work_struct *work) +{ + struct cxl_memdev *cxlmd; + + cxlmd = container_of(work, typeof(*cxlmd), detach_work); + device_release_driver(&cxlmd->dev); + put_device(&cxlmd->dev); +} + static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, const struct file_operations *fops) { @@ -237,6 +252,7 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, dev->devt = MKDEV(cxl_mem_major, cxlmd->id); dev->type = &cxl_memdev_type; device_set_pm_not_required(dev); + INIT_WORK(&cxlmd->detach_work, detach_memdev); cdev = &cxlmd->cdev; cdev_init(cdev, fops); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c5779c982c80..f460460b12b3 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #include +#include #include #include #include @@ -46,6 +47,8 @@ static int cxl_device_id(struct device *dev) return CXL_DEVICE_ROOT; return CXL_DEVICE_PORT; } + if (is_cxl_memdev(dev)) + return CXL_DEVICE_MEMORY_EXPANDER; return 0; } @@ -318,8 +321,10 @@ static void unregister_port(void *_port) { struct cxl_port *port = _port; - if (!is_cxl_root(port)) + if (!is_cxl_root(port)) { device_lock_assert(port->dev.parent); + port->uport = NULL; + } device_unregister(&port->dev); } @@ -410,7 +415,9 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, if (parent_port) port->depth = parent_port->depth + 1; dev = &port->dev; - if (parent_port) + if (is_cxl_memdev(uport)) + rc = dev_set_name(dev, "endpoint%d", port->id); + else if (parent_port) rc = dev_set_name(dev, "port%d", port->id); else rc = dev_set_name(dev, "root%d", port->id); @@ -790,6 +797,38 @@ static struct device *grandparent(struct device *dev) return NULL; } +static void delete_endpoint(void *data) +{ + struct cxl_memdev *cxlmd = data; + struct cxl_port *endpoint = dev_get_drvdata(&cxlmd->dev); + struct cxl_port *parent_port; + struct device *parent; + + parent_port = cxl_mem_find_port(cxlmd); + if (!parent_port) + return; + parent = &parent_port->dev; + + cxl_device_lock(parent); + if (parent->driver && endpoint->uport) { + devm_release_action(parent, cxl_unlink_uport, endpoint); + devm_release_action(parent, unregister_port, endpoint); + } + cxl_device_unlock(parent); + put_device(parent); + put_device(&endpoint->dev); +} + +int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint) +{ + struct device *dev = &cxlmd->dev; + + get_device(&endpoint->dev); + dev_set_drvdata(dev, endpoint); + return devm_add_action_or_reset(dev, delete_endpoint, cxlmd); +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_autoremove, CXL); + /* * The natural end of life of a non-root 'cxl_port' is when its parent port goes * through a ->remove() event ("top-down" unregistration). The unnatural trigger @@ -1034,6 +1073,12 @@ retry: } EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_ports, CXL); +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd) +{ + return find_cxl_port(grandparent(&cxlmd->dev)); +} +EXPORT_SYMBOL_NS_GPL(cxl_mem_find_port, CXL); + struct cxl_dport *cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dev) { @@ -1352,12 +1397,54 @@ static void cxl_bus_remove(struct device *dev) cxl_nested_unlock(dev); } +static struct workqueue_struct *cxl_bus_wq; + +int cxl_bus_rescan(void) +{ + return bus_rescan_devices(&cxl_bus_type); +} +EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, CXL); + +bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) +{ + return queue_work(cxl_bus_wq, &cxlmd->detach_work); +} +EXPORT_SYMBOL_NS_GPL(schedule_cxl_memdev_detach, CXL); + +/* for user tooling to ensure port disable work has completed */ +static ssize_t flush_store(struct bus_type *bus, const char *buf, size_t count) +{ + if (sysfs_streq(buf, "1")) { + flush_workqueue(cxl_bus_wq); + return count; + } + + return -EINVAL; +} + +static BUS_ATTR_WO(flush); + +static struct attribute *cxl_bus_attributes[] = { + &bus_attr_flush.attr, + NULL, +}; + +static struct attribute_group cxl_bus_attribute_group = { + .attrs = cxl_bus_attributes, +}; + +static const struct attribute_group *cxl_bus_attribute_groups[] = { + &cxl_bus_attribute_group, + NULL, +}; + struct bus_type cxl_bus_type = { .name = "cxl", .uevent = cxl_bus_uevent, .match = cxl_bus_match, .probe = cxl_bus_probe, .remove = cxl_bus_remove, + .bus_groups = cxl_bus_attribute_groups, }; EXPORT_SYMBOL_NS_GPL(cxl_bus_type, CXL); @@ -1371,12 +1458,21 @@ static __init int cxl_core_init(void) if (rc) return rc; + cxl_bus_wq = alloc_ordered_workqueue("cxl_port", 0); + if (!cxl_bus_wq) { + rc = -ENOMEM; + goto err_wq; + } + rc = bus_register(&cxl_bus_type); if (rc) - goto err; + goto err_bus; + return 0; -err: +err_bus: + destroy_workqueue(cxl_bus_wq); +err_wq: cxl_memdev_exit(); cxl_mbox_exit(); return rc; @@ -1385,6 +1481,7 @@ err: static void cxl_core_exit(void) { bus_unregister(&cxl_bus_type); + destroy_workqueue(cxl_bus_wq); cxl_memdev_exit(); cxl_mbox_exit(); } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index de4fbe7cbf42..f5e5b4ac8228 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -328,6 +328,9 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, struct cxl_port *parent_port); struct cxl_port *find_cxl_root(struct device *dev); int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); +int cxl_bus_rescan(void); +struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd); +bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd); struct cxl_dport *devm_cxl_add_dport(struct cxl_port *port, struct device *dport, int port_id, @@ -345,6 +348,8 @@ struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); +int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint); + struct cxl_hdm; struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port); int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm); @@ -377,6 +382,7 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); #define CXL_DEVICE_NVDIMM 2 #define CXL_DEVICE_PORT 3 #define CXL_DEVICE_ROOT 4 +#define CXL_DEVICE_MEMORY_EXPANDER 5 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") #define CXL_MODALIAS_FMT "cxl:t%d" diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 0ba0cf8dcdbc..5d33ce24fe09 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,12 +34,14 @@ * @dev: driver core device object * @cdev: char dev core object for ioctl operations * @cxlds: The device state backing this device + * @detach_work: active memdev lost a port in its ancestry * @id: id number of this memdev instance. */ struct cxl_memdev { struct device dev; struct cdev cdev; struct cxl_dev_state *cxlds; + struct work_struct detach_work; int id; }; @@ -48,6 +50,12 @@ static inline struct cxl_memdev *to_cxl_memdev(struct device *dev) return container_of(dev, struct cxl_memdev, dev); } +bool is_cxl_memdev(struct device *dev); +static inline bool is_cxl_endpoint(struct cxl_port *port) +{ + return is_cxl_memdev(port->uport); +} + struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); /** diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c new file mode 100644 index 000000000000..49a4b1c47299 --- /dev/null +++ b/drivers/cxl/mem.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include + +#include "cxlmem.h" +#include "cxlpci.h" + +/** + * DOC: cxl mem + * + * CXL memory endpoint devices and switches are CXL capable devices that are + * participating in CXL.mem protocol. Their functionality builds on top of the + * CXL.io protocol that allows enumerating and configuring components via + * standard PCI mechanisms. + * + * The cxl_mem driver owns kicking off the enumeration of this CXL.mem + * capability. With the detection of a CXL capable endpoint, the driver will + * walk up to find the platform specific port it is connected to, and determine + * if there are intervening switches in the path. If there are switches, a + * secondary action is to enumerate those (implemented in cxl_core). Finally the + * cxl_mem driver adds the device it is bound to as a CXL endpoint-port for use + * in higher level operations. + */ + +static int wait_for_media(struct cxl_memdev *cxlmd) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_endpoint_dvsec_info *info = &cxlds->info; + int rc; + + if (!info->mem_enabled) + return -EBUSY; + + rc = cxlds->wait_media_ready(cxlds); + if (rc) + return rc; + + /* + * We know the device is active, and enabled, if any ranges are non-zero + * we'll need to check later before adding the port since that owns the + * HDM decoder registers. + */ + return 0; +} + +static int create_endpoint(struct cxl_memdev *cxlmd, + struct cxl_port *parent_port) +{ + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_port *endpoint; + + endpoint = devm_cxl_add_port(&parent_port->dev, &cxlmd->dev, + cxlds->component_reg_phys, parent_port); + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + + dev_dbg(&cxlmd->dev, "add: %s\n", dev_name(&endpoint->dev)); + + if (!endpoint->dev.driver) { + dev_err(&cxlmd->dev, "%s failed probe\n", + dev_name(&endpoint->dev)); + return -ENXIO; + } + + return cxl_endpoint_autoremove(cxlmd, endpoint); +} + +/** + * cxl_dvsec_decode_init() - Setup HDM decoding for the endpoint + * @cxlds: Device state + * + * Additionally, enables global HDM decoding. Warning: don't call this outside + * of probe. Once probe is complete, the port driver owns all access to the HDM + * decoder registers. + * + * Returns: false if DVSEC Ranges are being used instead of HDM + * decoders, or if it can not be determined if DVSEC Ranges are in use. + * Otherwise, returns true. + */ +__mock bool cxl_dvsec_decode_init(struct cxl_dev_state *cxlds) +{ + struct cxl_endpoint_dvsec_info *info = &cxlds->info; + struct cxl_register_map map; + struct cxl_component_reg_map *cmap = &map.component_map; + bool global_enable, do_hdm_init = false; + void __iomem *crb; + u32 global_ctrl; + + /* map hdm decoder */ + crb = ioremap(cxlds->component_reg_phys, CXL_COMPONENT_REG_BLOCK_SIZE); + if (!crb) { + dev_dbg(cxlds->dev, "Failed to map component registers\n"); + return false; + } + + cxl_probe_component_regs(cxlds->dev, crb, cmap); + if (!cmap->hdm_decoder.valid) { + dev_dbg(cxlds->dev, "Invalid HDM decoder registers\n"); + goto out; + } + + global_ctrl = readl(crb + cmap->hdm_decoder.offset + + CXL_HDM_DECODER_CTRL_OFFSET); + global_enable = global_ctrl & CXL_HDM_DECODER_ENABLE; + if (!global_enable && info->ranges) { + dev_dbg(cxlds->dev, + "DVSEC ranges already programmed and HDM decoders not enabled.\n"); + goto out; + } + + do_hdm_init = true; + + /* + * Permanently (for this boot at least) opt the device into HDM + * operation. Individual HDM decoders still need to be enabled after + * this point. + */ + if (!global_enable) { + dev_dbg(cxlds->dev, "Enabling HDM decode\n"); + writel(global_ctrl | CXL_HDM_DECODER_ENABLE, + crb + cmap->hdm_decoder.offset + + CXL_HDM_DECODER_CTRL_OFFSET); + } + +out: + iounmap(crb); + return do_hdm_init; +} + +static int cxl_mem_probe(struct device *dev) +{ + struct cxl_memdev *cxlmd = to_cxl_memdev(dev); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct cxl_port *parent_port; + int rc; + + /* + * Someone is trying to reattach this device after it lost its port + * connection (an endpoint port previously registered by this memdev was + * disabled). This racy check is ok because if the port is still gone, + * no harm done, and if the port hierarchy comes back it will re-trigger + * this probe. Port rescan and memdev detach work share the same + * single-threaded workqueue. + */ + if (work_pending(&cxlmd->detach_work)) + return -EBUSY; + + rc = wait_for_media(cxlmd); + if (rc) { + dev_err(dev, "Media not active (%d)\n", rc); + return rc; + } + + /* + * If DVSEC ranges are being used instead of HDM decoder registers there + * is no use in trying to manage those. + */ + if (!cxl_dvsec_decode_init(cxlds)) { + struct cxl_endpoint_dvsec_info *info = &cxlds->info; + int i; + + /* */ + for (i = 0; i < 2; i++) { + u64 base, size; + + /* + * Give a nice warning to the user that BIOS has really + * botched things for them if it didn't place DVSEC + * ranges in the memory map. + */ + base = info->dvsec_range[i].start; + size = range_len(&info->dvsec_range[i]); + if (size && !region_intersects(base, size, + IORESOURCE_SYSTEM_RAM, + IORES_DESC_NONE)) { + dev_err(dev, + "DVSEC range %#llx-%#llx must be reserved by BIOS, but isn't\n", + base, base + size - 1); + } + } + dev_err(dev, + "Active DVSEC range registers in use. Will not bind.\n"); + return -EBUSY; + } + + rc = devm_cxl_enumerate_ports(cxlmd); + if (rc) + return rc; + + parent_port = cxl_mem_find_port(cxlmd); + if (!parent_port) { + dev_err(dev, "CXL port topology not found\n"); + return -ENXIO; + } + + cxl_device_lock(&parent_port->dev); + if (!parent_port->dev.driver) { + dev_err(dev, "CXL port topology %s not enabled\n", + dev_name(&parent_port->dev)); + rc = -ENXIO; + goto out; + } + + rc = create_endpoint(cxlmd, parent_port); +out: + cxl_device_unlock(&parent_port->dev); + put_device(&parent_port->dev); + return rc; +} + +static struct cxl_driver cxl_mem_driver = { + .name = "cxl_mem", + .probe = cxl_mem_probe, + .id = CXL_DEVICE_MEMORY_EXPANDER, +}; + +module_cxl_driver(cxl_mem_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); +/* + * create_endpoint() wants to validate port driver attach immediately after + * endpoint registration. + */ +MODULE_SOFTDEP("pre: cxl_port"); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 5a1aec28dc46..4d4e23b9adff 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -25,12 +25,24 @@ * PCIe topology. */ +static void schedule_detach(void *cxlmd) +{ + schedule_cxl_memdev_detach(cxlmd); +} + static int cxl_port_probe(struct device *dev) { struct cxl_port *port = to_cxl_port(dev); struct cxl_hdm *cxlhdm; int rc; + if (is_cxl_endpoint(port)) { + struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport); + + get_device(&cxlmd->dev); + return devm_add_action_or_reset(dev, schedule_detach, cxlmd); + } + rc = devm_cxl_port_enumerate_dports(port); if (rc < 0) return rc; diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 27ae13e23e79..82e49ab0937d 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -31,6 +31,12 @@ obj-m += cxl_port.o cxl_port-y := $(CXL_SRC)/port.o cxl_port-y += config_check.o +obj-m += cxl_mem.o + +cxl_mem-y := $(CXL_SRC)/mem.o +cxl_mem-y += mock_mem.o +cxl_mem-y += config_check.o + obj-m += cxl_core.o cxl_core-y := $(CXL_CORE_SRC)/port.o diff --git a/tools/testing/cxl/mock_mem.c b/tools/testing/cxl/mock_mem.c new file mode 100644 index 000000000000..d1dec5845139 --- /dev/null +++ b/tools/testing/cxl/mock_mem.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ + +#include + +struct cxl_dev_state; +bool cxl_dvsec_decode_init(struct cxl_dev_state *cxlds) +{ + return true; +} From 8aea0ef19fde030f983aba0e7ec5bcf10880a6fe Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:31:41 -0800 Subject: [PATCH 063/186] cxl/core: Move target_list out of base decoder attributes In preparation for introducing endpoint decoder objects, move the target_list attribute out of the common set since it has no meaning for endpoint decoders. Reviewed-by: Jonathan Cameron Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164298430100.3018233.4715072508880290970.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index f460460b12b3..359d4303de9a 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -184,7 +184,6 @@ static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_locked.attr, - &dev_attr_target_list.attr, NULL, }; @@ -197,6 +196,7 @@ static struct attribute *cxl_decoder_root_attrs[] = { &dev_attr_cap_ram.attr, &dev_attr_cap_type2.attr, &dev_attr_cap_type3.attr, + &dev_attr_target_list.attr, NULL, }; @@ -213,6 +213,7 @@ static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { static struct attribute *cxl_decoder_switch_attrs[] = { &dev_attr_target_type.attr, + &dev_attr_target_list.attr, NULL, }; From 9b71e1c9c3aaae5079f5e267785b6f035c5f23da Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Wed, 2 Feb 2022 20:02:06 -0800 Subject: [PATCH 064/186] cxl/core/port: Add endpoint decoders Recall that a CXL Port is any object that publishes a CXL HDM Decoder Capability structure. That is Host Bridge and Switches that have been enabled so far. Now, add decoder support to the 'endpoint' CXL Ports registered by the cxl_mem driver. They mostly share the same enumeration as Bridges and Switches, but witout a target list. The target of endpoint decode is device-internal DPA space, not another downstream port. Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron [djbw: clarify changelog, hookup enumeration in the port driver] Link: https://lore.kernel.org/r/164386092069.765089.14895687988217608642.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/hdm.c | 8 ++++- drivers/cxl/core/port.c | 65 ++++++++++++++++++++++++++++++++++++----- drivers/cxl/cxl.h | 1 + drivers/cxl/port.c | 17 ++++++----- 4 files changed, 74 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 80280db316c0..05b0b292e72d 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -186,6 +186,9 @@ static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, else cxld->target_type = CXL_DECODER_ACCELERATOR; + if (is_cxl_endpoint(to_cxl_port(cxld->dev.parent))) + return; + target_list.value = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); for (i = 0; i < cxld->interleave_ways; i++) @@ -225,7 +228,10 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) int rc, target_count = cxlhdm->target_count; struct cxl_decoder *cxld; - cxld = cxl_switch_decoder_alloc(port, target_count); + if (is_cxl_endpoint(port)) + cxld = cxl_endpoint_decoder_alloc(port); + else + cxld = cxl_switch_decoder_alloc(port, target_count); if (IS_ERR(cxld)) { dev_warn(&port->dev, "Failed to allocate the decoder\n"); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 359d4303de9a..bc18d339738b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -228,6 +228,22 @@ static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { NULL, }; +static struct attribute *cxl_decoder_endpoint_attrs[] = { + &dev_attr_target_type.attr, + NULL, +}; + +static struct attribute_group cxl_decoder_endpoint_attribute_group = { + .attrs = cxl_decoder_endpoint_attrs, +}; + +static const struct attribute_group *cxl_decoder_endpoint_attribute_groups[] = { + &cxl_decoder_base_attribute_group, + &cxl_decoder_endpoint_attribute_group, + &cxl_base_attribute_group, + NULL, +}; + static void cxl_decoder_release(struct device *dev) { struct cxl_decoder *cxld = to_cxl_decoder(dev); @@ -237,6 +253,12 @@ static void cxl_decoder_release(struct device *dev) kfree(cxld); } +static const struct device_type cxl_decoder_endpoint_type = { + .name = "cxl_decoder_endpoint", + .release = cxl_decoder_release, + .groups = cxl_decoder_endpoint_attribute_groups, +}; + static const struct device_type cxl_decoder_switch_type = { .name = "cxl_decoder_switch", .release = cxl_decoder_release, @@ -249,6 +271,11 @@ static const struct device_type cxl_decoder_root_type = { .groups = cxl_decoder_root_attribute_groups, }; +static bool is_endpoint_decoder(struct device *dev) +{ + return dev->type == &cxl_decoder_endpoint_type; +} + bool is_root_decoder(struct device *dev) { return dev->type == &cxl_decoder_root_type; @@ -1129,7 +1156,9 @@ static int decoder_populate_targets(struct cxl_decoder *cxld, * cxl_decoder_alloc - Allocate a new CXL decoder * @port: owning port of this decoder * @nr_targets: downstream targets accessible by this decoder. All upstream - * ports and root ports must have at least 1 target. + * ports and root ports must have at least 1 target. Endpoint + * devices will have 0 targets. Callers wishing to register an + * endpoint device should specify 0. * * A port should contain one or more decoders. Each of those decoders enable * some address space for CXL.mem utilization. A decoder is expected to be @@ -1145,7 +1174,7 @@ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, struct device *dev; int rc = 0; - if (nr_targets > CXL_DECODER_MAX_INTERLEAVE || nr_targets == 0) + if (nr_targets > CXL_DECODER_MAX_INTERLEAVE) return ERR_PTR(-EINVAL); cxld = kzalloc(struct_size(cxld, target, nr_targets), GFP_KERNEL); @@ -1166,6 +1195,8 @@ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, dev->bus = &cxl_bus_type; if (is_cxl_root(port)) cxld->dev.type = &cxl_decoder_root_type; + else if (is_cxl_endpoint(port)) + cxld->dev.type = &cxl_decoder_endpoint_type; else cxld->dev.type = &cxl_decoder_switch_type; @@ -1215,13 +1246,28 @@ EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets) { - if (is_cxl_root(port)) + if (is_cxl_root(port) || is_cxl_endpoint(port)) return ERR_PTR(-EINVAL); return cxl_decoder_alloc(port, nr_targets); } EXPORT_SYMBOL_NS_GPL(cxl_switch_decoder_alloc, CXL); +/** + * cxl_endpoint_decoder_alloc - Allocate an endpoint decoder + * @port: owning port of this decoder + * + * Return: A new cxl decoder to be registered by cxl_decoder_add() + */ +struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) +{ + if (!is_cxl_endpoint(port)) + return ERR_PTR(-EINVAL); + + return cxl_decoder_alloc(port, 0); +} +EXPORT_SYMBOL_NS_GPL(cxl_endpoint_decoder_alloc, CXL); + /** * cxl_decoder_add_locked - Add a decoder with targets * @cxld: The cxl decoder allocated by cxl_decoder_alloc() @@ -1256,12 +1302,15 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) if (cxld->interleave_ways < 1) return -EINVAL; - port = to_cxl_port(cxld->dev.parent); - rc = decoder_populate_targets(cxld, port, target_map); - if (rc) - return rc; - dev = &cxld->dev; + + port = to_cxl_port(cxld->dev.parent); + if (!is_endpoint_decoder(dev)) { + rc = decoder_populate_targets(cxld, port, target_map); + if (rc) + return rc; + } + rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id); if (rc) return rc; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index f5e5b4ac8228..990b6670222e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -346,6 +346,7 @@ struct cxl_decoder *cxl_root_decoder_alloc(struct cxl_port *port, struct cxl_decoder *cxl_switch_decoder_alloc(struct cxl_port *port, unsigned int nr_targets); int cxl_decoder_add(struct cxl_decoder *cxld, int *target_map); +struct cxl_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port); int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map); int cxl_decoder_autoremove(struct device *host, struct cxl_decoder *cxld); int cxl_endpoint_autoremove(struct cxl_memdev *cxlmd, struct cxl_port *endpoint); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 4d4e23b9adff..d420da5fc39c 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -40,16 +40,17 @@ static int cxl_port_probe(struct device *dev) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport); get_device(&cxlmd->dev); - return devm_add_action_or_reset(dev, schedule_detach, cxlmd); + rc = devm_add_action_or_reset(dev, schedule_detach, cxlmd); + if (rc) + return rc; + } else { + rc = devm_cxl_port_enumerate_dports(port); + if (rc < 0) + return rc; + if (rc == 1) + return devm_cxl_add_passthrough_decoder(port); } - rc = devm_cxl_port_enumerate_dports(port); - if (rc < 0) - return rc; - - if (rc == 1) - return devm_cxl_add_passthrough_decoder(port); - cxlhdm = devm_cxl_setup_hdm(port); if (IS_ERR(cxlhdm)) return PTR_ERR(cxlhdm); From f246abd67ff0dcb471ce2361a913b935d4c8ac11 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:31:51 -0800 Subject: [PATCH 065/186] tools/testing/cxl: Mock dvsec_ranges() For test purposes, pretend that that CXL DVSEC ranges are not in active use and the device is ready CXL.mem operation. Link: https://lore.kernel.org/r/164298431119.3018233.17175518196764977542.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/cxl/test/mem.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 36ef337c775c..b6b726eff3e2 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -248,6 +248,14 @@ static void label_area_release(void *lsa) vfree(lsa); } +static void mock_validate_dvsec_ranges(struct cxl_dev_state *cxlds) +{ + struct cxl_endpoint_dvsec_info *info; + + info = &cxlds->info; + info->mem_enabled = true; +} + static int cxl_mock_mem_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -285,6 +293,8 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; + mock_validate_dvsec_ranges(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From a4a0ce242fcd7022349212c4e2f795762e6ff050 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:31:56 -0800 Subject: [PATCH 066/186] tools/testing/cxl: Fix root port to host bridge assignment Mocked root-ports are meant to be round-robin assigned to host-bridges. Fixes: 67dcdd4d3b83 ("tools/testing/cxl: Introduce a mocked-up CXL port hierarchy") Link: https://lore.kernel.org/r/164298431629.3018233.14004377108116384485.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index cd2f20f2707f..7e4a0b1ee436 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -558,7 +558,7 @@ static __init int cxl_test_init(void) for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) { struct platform_device *bridge = - cxl_host_bridge[i / NR_CXL_ROOT_PORTS]; + cxl_host_bridge[i % ARRAY_SIZE(cxl_host_bridge)]; struct platform_device *pdev; pdev = platform_device_alloc("cxl_root_port", i); From c1915142e8c1168498c8b08cd4e02728d1c33563 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:32:01 -0800 Subject: [PATCH 067/186] tools/testing/cxl: Mock one level of switches The CXL port enumeration process adds intermediate CXL ports that are discovered between "root" CXL ports enumerated by 'cxl_acpi' and endpoints enumerated by 'cxl_pci + cxl_mem'. Test the dynamic discovery of intermediate switch ports in a CXL topology. Link: https://lore.kernel.org/r/164298432189.3018233.13142151550113000967.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 138 ++++++++++++++++++++++++----------- 1 file changed, 97 insertions(+), 41 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 7e4a0b1ee436..ea88fabc3198 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -11,14 +11,21 @@ #include #include "mock.h" -#define NR_CXL_HOST_BRIDGES 4 +#define NR_CXL_HOST_BRIDGES 2 #define NR_CXL_ROOT_PORTS 2 +#define NR_CXL_SWITCH_PORTS 2 static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; static struct platform_device *cxl_root_port[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS]; -struct platform_device *cxl_mem[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS]; +static struct platform_device + *cxl_switch_uport[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS]; +static struct platform_device + *cxl_switch_dport[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS * + NR_CXL_SWITCH_PORTS]; +struct platform_device + *cxl_mem[NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS * NR_CXL_SWITCH_PORTS]; static struct acpi_device acpi0017_mock; static struct acpi_device host_bridge[NR_CXL_HOST_BRIDGES] = { @@ -28,12 +35,6 @@ static struct acpi_device host_bridge[NR_CXL_HOST_BRIDGES] = { [1] = { .handle = &host_bridge[1], }, - [2] = { - .handle = &host_bridge[2], - }, - [3] = { - .handle = &host_bridge[3], - }, }; static bool is_mock_dev(struct device *dev) @@ -71,7 +72,7 @@ static struct { } cfmws0; struct { struct acpi_cedt_cfmws cfmws; - u32 target[4]; + u32 target[2]; } cfmws1; struct { struct acpi_cedt_cfmws cfmws; @@ -79,7 +80,7 @@ static struct { } cfmws2; struct { struct acpi_cedt_cfmws cfmws; - u32 target[4]; + u32 target[2]; } cfmws3; } __packed mock_cedt = { .cedt = { @@ -105,22 +106,6 @@ static struct { .uid = 1, .cxl_version = ACPI_CEDT_CHBS_VERSION_CXL20, }, - .chbs[2] = { - .header = { - .type = ACPI_CEDT_TYPE_CHBS, - .length = sizeof(mock_cedt.chbs[0]), - }, - .uid = 2, - .cxl_version = ACPI_CEDT_CHBS_VERSION_CXL20, - }, - .chbs[3] = { - .header = { - .type = ACPI_CEDT_TYPE_CHBS, - .length = sizeof(mock_cedt.chbs[0]), - }, - .uid = 3, - .cxl_version = ACPI_CEDT_CHBS_VERSION_CXL20, - }, .cfmws0 = { .cfmws = { .header = { @@ -142,14 +127,14 @@ static struct { .type = ACPI_CEDT_TYPE_CFMWS, .length = sizeof(mock_cedt.cfmws1), }, - .interleave_ways = 2, + .interleave_ways = 1, .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_VOLATILE, .qtg_id = 1, - .window_size = SZ_256M * 4, + .window_size = SZ_256M * 2, }, - .target = { 0, 1, 2, 3 }, + .target = { 0, 1, }, }, .cfmws2 = { .cfmws = { @@ -172,14 +157,14 @@ static struct { .type = ACPI_CEDT_TYPE_CFMWS, .length = sizeof(mock_cedt.cfmws3), }, - .interleave_ways = 2, + .interleave_ways = 1, .granularity = 4, .restrictions = ACPI_CEDT_CFMWS_RESTRICT_TYPE3 | ACPI_CEDT_CFMWS_RESTRICT_PMEM, .qtg_id = 3, - .window_size = SZ_256M * 4, + .window_size = SZ_256M * 2, }, - .target = { 0, 1, 2, 3 }, + .target = { 0, 1, }, }, }; @@ -332,6 +317,17 @@ static bool is_mock_port(struct device *dev) if (dev == &cxl_root_port[i]->dev) return true; + for (i = 0; i < ARRAY_SIZE(cxl_switch_uport); i++) + if (dev == &cxl_switch_uport[i]->dev) + return true; + + for (i = 0; i < ARRAY_SIZE(cxl_switch_dport); i++) + if (dev == &cxl_switch_dport[i]->dev) + return true; + + if (is_cxl_memdev(dev)) + return is_mock_dev(dev->parent); + return false; } @@ -372,12 +368,6 @@ static struct acpi_pci_root mock_pci_root[NR_CXL_HOST_BRIDGES] = { [1] = { .bus = &mock_pci_bus[1], }, - [2] = { - .bus = &mock_pci_bus[2], - }, - [3] = { - .bus = &mock_pci_bus[3], - }, }; static bool is_mock_bus(struct pci_bus *bus) @@ -446,6 +436,26 @@ static int mock_cxl_port_enumerate_dports(struct cxl_port *port) dev_name(&pdev->dev)); } + for (i = 0; i < ARRAY_SIZE(cxl_switch_dport); i++) { + struct platform_device *pdev = cxl_switch_dport[i]; + struct cxl_dport *dport; + + if (pdev->dev.parent != port->uport) + continue; + + dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, + CXL_RESOURCE_NONE); + + if (IS_ERR(dport)) { + dev_err(dev, "failed to add dport: %s (%ld)\n", + dev_name(&pdev->dev), PTR_ERR(dport)); + return PTR_ERR(dport); + } + + dev_dbg(dev, "add dport%d: %s\n", pdev->id, + dev_name(&pdev->dev)); + } + return 0; } @@ -574,15 +584,51 @@ static __init int cxl_test_init(void) cxl_root_port[i] = pdev; } - BUILD_BUG_ON(ARRAY_SIZE(cxl_mem) != ARRAY_SIZE(cxl_root_port)); + BUILD_BUG_ON(ARRAY_SIZE(cxl_switch_uport) != ARRAY_SIZE(cxl_root_port)); + for (i = 0; i < ARRAY_SIZE(cxl_switch_uport); i++) { + struct platform_device *root_port = cxl_root_port[i]; + struct platform_device *pdev; + + pdev = platform_device_alloc("cxl_switch_uport", i); + if (!pdev) + goto err_port; + pdev->dev.parent = &root_port->dev; + + rc = platform_device_add(pdev); + if (rc) { + platform_device_put(pdev); + goto err_uport; + } + cxl_switch_uport[i] = pdev; + } + + for (i = 0; i < ARRAY_SIZE(cxl_switch_dport); i++) { + struct platform_device *uport = + cxl_switch_uport[i % ARRAY_SIZE(cxl_switch_uport)]; + struct platform_device *pdev; + + pdev = platform_device_alloc("cxl_switch_dport", i); + if (!pdev) + goto err_port; + pdev->dev.parent = &uport->dev; + + rc = platform_device_add(pdev); + if (rc) { + platform_device_put(pdev); + goto err_dport; + } + cxl_switch_dport[i] = pdev; + } + + BUILD_BUG_ON(ARRAY_SIZE(cxl_mem) != ARRAY_SIZE(cxl_switch_dport)); for (i = 0; i < ARRAY_SIZE(cxl_mem); i++) { - struct platform_device *port = cxl_root_port[i]; + struct platform_device *dport = cxl_switch_dport[i]; struct platform_device *pdev; pdev = alloc_memdev(i); if (!pdev) goto err_mem; - pdev->dev.parent = &port->dev; + pdev->dev.parent = &dport->dev; set_dev_node(&pdev->dev, i % 2); rc = platform_device_add(pdev); @@ -611,6 +657,12 @@ err_add: err_mem: for (i = ARRAY_SIZE(cxl_mem) - 1; i >= 0; i--) platform_device_unregister(cxl_mem[i]); +err_dport: + for (i = ARRAY_SIZE(cxl_switch_dport) - 1; i >= 0; i--) + platform_device_unregister(cxl_switch_dport[i]); +err_uport: + for (i = ARRAY_SIZE(cxl_switch_uport) - 1; i >= 0; i--) + platform_device_unregister(cxl_switch_uport[i]); err_port: for (i = ARRAY_SIZE(cxl_root_port) - 1; i >= 0; i--) platform_device_unregister(cxl_root_port[i]); @@ -633,6 +685,10 @@ static __exit void cxl_test_exit(void) platform_device_unregister(cxl_acpi); for (i = ARRAY_SIZE(cxl_mem) - 1; i >= 0; i--) platform_device_unregister(cxl_mem[i]); + for (i = ARRAY_SIZE(cxl_switch_dport) - 1; i >= 0; i--) + platform_device_unregister(cxl_switch_dport[i]); + for (i = ARRAY_SIZE(cxl_switch_uport) - 1; i >= 0; i--) + platform_device_unregister(cxl_switch_uport[i]); for (i = ARRAY_SIZE(cxl_root_port) - 1; i >= 0; i--) platform_device_unregister(cxl_root_port[i]); for (i = ARRAY_SIZE(cxl_host_bridge) - 1; i >= 0; i--) From 7c7d68db0254e1b50d2d80b47774fc605846d49c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:32:07 -0800 Subject: [PATCH 068/186] tools/testing/cxl: Enumerate mock decoders Enumerate 2-decoders per switch port and endpoint in the cxl_test topology. Link: https://lore.kernel.org/r/164298432699.3018233.12131068635065601541.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 120 +++++++++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 21 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index ea88fabc3198..1b36e67dcd7e 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -14,6 +14,7 @@ #define NR_CXL_HOST_BRIDGES 2 #define NR_CXL_ROOT_PORTS 2 #define NR_CXL_SWITCH_PORTS 2 +#define NR_CXL_PORT_DECODERS 2 static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; @@ -406,38 +407,115 @@ static int mock_cxl_add_passthrough_decoder(struct cxl_port *port) return -EOPNOTSUPP; } + +struct target_map_ctx { + int *target_map; + int index; + int target_count; +}; + +static int map_targets(struct device *dev, void *data) +{ + struct platform_device *pdev = to_platform_device(dev); + struct target_map_ctx *ctx = data; + + ctx->target_map[ctx->index++] = pdev->id; + + if (ctx->index > ctx->target_count) { + dev_WARN_ONCE(dev, 1, "too many targets found?\n"); + return -ENXIO; + } + + return 0; +} + static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { + struct cxl_port *port = cxlhdm->port; + struct cxl_port *parent_port = to_cxl_port(port->dev.parent); + int target_count, i; + + if (is_cxl_endpoint(port)) + target_count = 0; + else if (is_cxl_root(parent_port)) + target_count = NR_CXL_ROOT_PORTS; + else + target_count = NR_CXL_SWITCH_PORTS; + + for (i = 0; i < NR_CXL_PORT_DECODERS; i++) { + int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; + struct target_map_ctx ctx = { + .target_map = target_map, + .target_count = target_count, + }; + struct cxl_decoder *cxld; + int rc; + + if (target_count) + cxld = cxl_switch_decoder_alloc(port, target_count); + else + cxld = cxl_endpoint_decoder_alloc(port); + if (IS_ERR(cxld)) { + dev_warn(&port->dev, + "Failed to allocate the decoder\n"); + return PTR_ERR(cxld); + } + + cxld->decoder_range = (struct range) { + .start = 0, + .end = -1, + }; + + cxld->flags = CXL_DECODER_F_ENABLE; + cxld->interleave_ways = min_not_zero(target_count, 1); + cxld->interleave_granularity = SZ_4K; + cxld->target_type = CXL_DECODER_EXPANDER; + + if (target_count) { + rc = device_for_each_child(port->uport, &ctx, + map_targets); + if (rc) { + put_device(&cxld->dev); + return rc; + } + } + + rc = cxl_decoder_add_locked(cxld, target_map); + if (rc) { + put_device(&cxld->dev); + dev_err(&port->dev, "Failed to add decoder\n"); + return rc; + } + + rc = cxl_decoder_autoremove(&port->dev, cxld); + if (rc) + return rc; + dev_dbg(&cxld->dev, "Added to port %s\n", dev_name(&port->dev)); + } + return 0; } static int mock_cxl_port_enumerate_dports(struct cxl_port *port) { struct device *dev = &port->dev; - int i; + struct platform_device **array; + int i, array_size; - for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) { - struct platform_device *pdev = cxl_root_port[i]; - struct cxl_dport *dport; - - if (pdev->dev.parent != port->uport) - continue; - - dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, - CXL_RESOURCE_NONE); - - if (IS_ERR(dport)) { - dev_err(dev, "failed to add dport: %s (%ld)\n", - dev_name(&pdev->dev), PTR_ERR(dport)); - return PTR_ERR(dport); - } - - dev_dbg(dev, "add dport%d: %s\n", pdev->id, - dev_name(&pdev->dev)); + if (port->depth == 1) { + array_size = ARRAY_SIZE(cxl_root_port); + array = cxl_root_port; + } else if (port->depth == 2) { + array_size = ARRAY_SIZE(cxl_switch_dport); + array = cxl_switch_dport; + } else { + dev_WARN_ONCE(&port->dev, 1, "unexpected depth %d\n", + port->depth); + return -ENXIO; } - for (i = 0; i < ARRAY_SIZE(cxl_switch_dport); i++) { - struct platform_device *pdev = cxl_switch_dport[i]; + for (i = 0; i < array_size; i++) { + struct platform_device *pdev = array[i]; struct cxl_dport *dport; if (pdev->dev.parent != port->uport) From 64cda3ae6bc785960cd1b4f78c19f7ca53e0130b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 23 Jan 2022 16:32:12 -0800 Subject: [PATCH 069/186] tools/testing/cxl: Add a physical_node link Emulate what ACPI does to link a host bridge platform firmware device to device node on the PCI bus. In this case it's just self referencing link, but it otherwise lets the tooling test out its lookup code. Link: https://lore.kernel.org/r/164298433209.3018233.18101085948127163720.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/cxl/test/cxl.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 1b36e67dcd7e..431f2bddf6c8 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -641,7 +641,12 @@ static __init int cxl_test_init(void) platform_device_put(pdev); goto err_bridge; } + cxl_host_bridge[i] = pdev; + rc = sysfs_create_link(&pdev->dev.kobj, &pdev->dev.kobj, + "physical_node"); + if (rc) + goto err_bridge; } for (i = 0; i < ARRAY_SIZE(cxl_root_port); i++) { @@ -745,8 +750,14 @@ err_port: for (i = ARRAY_SIZE(cxl_root_port) - 1; i >= 0; i--) platform_device_unregister(cxl_root_port[i]); err_bridge: - for (i = ARRAY_SIZE(cxl_host_bridge) - 1; i >= 0; i--) + for (i = ARRAY_SIZE(cxl_host_bridge) - 1; i >= 0; i--) { + struct platform_device *pdev = cxl_host_bridge[i]; + + if (!pdev) + continue; + sysfs_remove_link(&pdev->dev.kobj, "physical_node"); platform_device_unregister(cxl_host_bridge[i]); + } err_populate: depopulate_all_mock_resources(); err_gen_pool_add: @@ -769,8 +780,14 @@ static __exit void cxl_test_exit(void) platform_device_unregister(cxl_switch_uport[i]); for (i = ARRAY_SIZE(cxl_root_port) - 1; i >= 0; i--) platform_device_unregister(cxl_root_port[i]); - for (i = ARRAY_SIZE(cxl_host_bridge) - 1; i >= 0; i--) + for (i = ARRAY_SIZE(cxl_host_bridge) - 1; i >= 0; i--) { + struct platform_device *pdev = cxl_host_bridge[i]; + + if (!pdev) + continue; + sysfs_remove_link(&pdev->dev.kobj, "physical_node"); platform_device_unregister(cxl_host_bridge[i]); + } depopulate_all_mock_resources(); gen_pool_destroy(cxl_mock_pool); unregister_cxl_mock_ops(&cxl_mock_ops); From 0909b4e5287bcda34f00da878ac1f37a0921d959 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 25 Jan 2022 21:24:04 -0800 Subject: [PATCH 070/186] cxl/core/port: Fix / relax decoder target enumeration If the decoder is not presently active the target_list may not be accurate. Perform a best effort mapping and assume that it will be fixed up when the decoder is enabled. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164317464406.3438644.6609329492458460242.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/acpi.c | 2 +- drivers/cxl/core/port.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index d8295572bde9..d15a6aec0331 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -15,7 +15,7 @@ static unsigned long cfmws_to_decoder_flags(int restrictions) { - unsigned long flags = 0; + unsigned long flags = CXL_DECODER_F_ENABLE; if (restrictions & ACPI_CEDT_CFMWS_RESTRICT_TYPE2) flags |= CXL_DECODER_F_TYPE2; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index bc18d339738b..c3e83c624700 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1307,8 +1307,11 @@ int cxl_decoder_add_locked(struct cxl_decoder *cxld, int *target_map) port = to_cxl_port(cxld->dev.parent); if (!is_endpoint_decoder(dev)) { rc = decoder_populate_targets(cxld, port, target_map); - if (rc) + if (rc && (cxld->flags & CXL_DECODER_F_ENABLE)) { + dev_err(&port->dev, + "Failed to populate active decoder targets\n"); return rc; + } } rc = dev_set_name(dev, "decoder%d.%d", port->id, cxld->id); From 7004cc9d1585fb7fc9ec7e3e92b70b65e13b11fd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 25 Jan 2022 21:24:09 -0800 Subject: [PATCH 071/186] cxl/core/port: Handle invalid decoders In case init_hdm_decoder() finds invalid settings, skip to the next valid decoder. Only fail port enumeration if zero valid decoders are found. This protects the driver init against broken hardware and / or future interleave capabilities. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/164317464918.3438644.12371149695618136198.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/hdm.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 05b0b292e72d..0e89a7a932d4 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -149,8 +149,8 @@ static int to_interleave_ways(u32 ctrl) } } -static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, - void __iomem *hdm, int which) +static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, + int *target_map, void __iomem *hdm, int which) { u64 size, base; u32 ctrl; @@ -166,6 +166,11 @@ static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, if (!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED)) size = 0; + if (base == U64_MAX || size == U64_MAX) { + dev_warn(&port->dev, "decoder%d.%d: Invalid resource range\n", + port->id, cxld->id); + return -ENXIO; + } cxld->decoder_range = (struct range) { .start = base, @@ -179,6 +184,12 @@ static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, cxld->flags |= CXL_DECODER_F_LOCK; } cxld->interleave_ways = to_interleave_ways(ctrl); + if (!cxld->interleave_ways) { + dev_warn(&port->dev, + "decoder%d.%d: Invalid interleave ways (ctrl: %#x)\n", + port->id, cxld->id, ctrl); + return -ENXIO; + } cxld->interleave_granularity = to_interleave_granularity(ctrl); if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl)) @@ -187,12 +198,14 @@ static void init_hdm_decoder(struct cxl_decoder *cxld, int *target_map, cxld->target_type = CXL_DECODER_ACCELERATOR; if (is_cxl_endpoint(to_cxl_port(cxld->dev.parent))) - return; + return 0; target_list.value = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_TL_LOW(which)); for (i = 0; i < cxld->interleave_ways; i++) target_map[i] = target_list.target_id[i]; + + return 0; } /** @@ -203,7 +216,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { void __iomem *hdm = cxlhdm->regs.hdm_decoder; struct cxl_port *port = cxlhdm->port; - int i, committed; + int i, committed, failed; u32 ctrl; /* @@ -223,7 +236,7 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) if (committed != cxlhdm->decoder_count) msleep(20); - for (i = 0; i < cxlhdm->decoder_count; i++) { + for (i = 0, failed = 0; i < cxlhdm->decoder_count; i++) { int target_map[CXL_DECODER_MAX_INTERLEAVE] = { 0 }; int rc, target_count = cxlhdm->target_count; struct cxl_decoder *cxld; @@ -238,7 +251,13 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) return PTR_ERR(cxld); } - init_hdm_decoder(cxld, target_map, cxlhdm->regs.hdm_decoder, i); + rc = init_hdm_decoder(port, cxld, target_map, + cxlhdm->regs.hdm_decoder, i); + if (rc) { + put_device(&cxld->dev); + failed++; + continue; + } rc = add_hdm_decoder(port, cxld, target_map); if (rc) { dev_warn(&port->dev, @@ -247,6 +266,11 @@ int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) } } + if (failed == cxlhdm->decoder_count) { + dev_err(&port->dev, "No valid decoders found\n"); + return -ENXIO; + } + return 0; } EXPORT_SYMBOL_NS_GPL(devm_cxl_enumerate_decoders, CXL); From 74b0fe80409733055971bbfaf33c80a33fddeeb3 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 1 Feb 2022 15:34:37 +0000 Subject: [PATCH 072/186] cxl/regs: Fix size of CXL Capability Header Register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In CXL 2.0, 8.2.5.1 CXL Capability Header Register: this register is given as 32 bits. 8.2.3 which covers the CXL 2.0 Component registers, including the CXL Capability Header Register states that access restrictions specified in Section 8.2.2 apply. 8.2.2 includes: * A 32 bit register shall be accessed as a 4 Byte quantity. ... If these rules are not followed, the behavior is undefined. Discovered during review of CXL QEMU emulation. Alex Bennée pointed out there was a comment saying that 4 byte registers must be read with a 4 byte read, but 8 byte reads were being emulated. https://lore.kernel.org/qemu-devel/87bkzyd3c7.fsf@linaro.org/ Fixing that, led to this code failing. Whilst a given hardware implementation 'might' work with an 8 byte read, it should not be relied upon. The QEMU emulation v5 will return 0 and log the wrong access width. The code moved, so one fixes tag for where this will directly apply and also a reference to the earlier introduction of the code for backports. Fixes: 0f06157e0135 ("cxl/core: Move register mapping infrastructure") Fixes: 08422378c4ad ("cxl/pci: Add HDM decoder capabilities") Signed-off-by: Jonathan Cameron Cc: Alex Bennée Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/20220201153437.2873-1-Jonathan.Cameron@huawei.com Signed-off-by: Dan Williams --- drivers/cxl/core/regs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 718b6b0ae4b3..39a129c57d40 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -36,7 +36,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, struct cxl_component_reg_map *map) { int cap, cap_count; - u64 cap_array; + u32 cap_array; *map = (struct cxl_component_reg_map) { 0 }; @@ -46,7 +46,7 @@ void cxl_probe_component_regs(struct device *dev, void __iomem *base, */ base += CXL_CM_OFFSET; - cap_array = readq(base + CXL_CM_CAP_HDR_OFFSET); + cap_array = readl(base + CXL_CM_CAP_HDR_OFFSET); if (FIELD_GET(CXL_CM_CAP_HDR_ID_MASK, cap_array) != CM_CAP_HDR_CAP_ID) { dev_err(dev, From 3c8bc3954d771fc4889508ad5d16f084adc4d64d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 10 Feb 2022 14:49:03 -0800 Subject: [PATCH 073/186] RDMA/hfi: Replace cpumask_weight with cpumask_empty where appropriate drivers/infiniband/hw/hfi1/affinity.c code calls cpumask_weight() to check if any bit of a given cpumask is set. We can do it more efficiently with cpumask_empty() because cpumask_empty() stops traversing the cpumask as soon as it finds first set bit, while cpumask_weight() counts all bits unconditionally. Link: https://lore.kernel.org/r/20220210224933.379149-20-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/affinity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 706b3b659713..877f8e84a672 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -666,7 +666,7 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) * engines, use the same CPU cores as general/control * context. */ - if (cpumask_weight(&entry->def_intr.mask) == 0) + if (cpumask_empty(&entry->def_intr.mask)) cpumask_copy(&entry->def_intr.mask, &entry->general_intr_mask); } @@ -686,7 +686,7 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) * vectors, use the same CPU core as the general/control * context. */ - if (cpumask_weight(&entry->comp_vect_mask) == 0) + if (cpumask_empty(&entry->comp_vect_mask)) cpumask_copy(&entry->comp_vect_mask, &entry->general_intr_mask); } From 5c3c067b601bcbcd381214e3a40e666fda5f3d6f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Feb 2022 23:37:28 -0800 Subject: [PATCH 074/186] cxl/core/port: Fix unregister_port() lock assertion The device_lock_assert() in unregister_port() fails to pick the right device leading to splats like the following from: echo "ACPI0017:00" > /sys/bus/platform/drivers/cxl_acpi/unbind WARNING: CPU: 32 PID: 1147 at include/linux/device.h:787 unregister_port+0x49/0x50 [cxl_c [..] RIP: 0010:unregister_port+0x49/0x50 [cxl_core] [..] Call Trace: release_nodes+0x63/0x80 devres_release_all+0x8b/0xc0 __device_release_driver+0x190/0x240 device_driver_detach+0x3e/0xa0 unbind_store+0x113/0x130 Fix it up to assert on the device_lock() for ACPI0017 for root and 1st level ports, and parent ports for all the rest. Fixes: 54cdbf845cf7 ("cxl/port: Add a driver for 'struct cxl_port' objects") Reported-by: Ben Widawsky Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164439224893.2941117.18331456248117887720.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c3e83c624700..9b4bbd51fbaa 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -348,12 +348,28 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_port, CXL); static void unregister_port(void *_port) { struct cxl_port *port = _port; + struct cxl_port *parent; + struct device *lock_dev; - if (!is_cxl_root(port)) { - device_lock_assert(port->dev.parent); - port->uport = NULL; - } + if (is_cxl_root(port)) + parent = NULL; + else + parent = to_cxl_port(port->dev.parent); + /* + * CXL root port's and the first level of ports are unregistered + * under the platform firmware device lock, all other ports are + * unregistered while holding their parent port lock. + */ + if (!parent) + lock_dev = port->uport; + else if (is_cxl_root(parent)) + lock_dev = parent->uport; + else + lock_dev = &parent->dev; + + device_lock_assert(lock_dev); + port->uport = NULL; device_unregister(&port->dev); } From e6e17cc6ed751072513fe16cb595ac09f6821a43 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Feb 2022 23:37:34 -0800 Subject: [PATCH 075/186] cxl/core: Fix cxl_device_lock() class detection If cxl_device_lock() is used on a non-CXL device the expectation is that the lock class will fall back to CXL_ANON_LOCK. Instead it crashes when trying to determine if the device is a 'decoder'. Specifically when the device has a NULL type pointer. Just check for NULL before de-referencing ->release. Fixes: 3c5b90395525 ("cxl: Prove CXL locking") Reported-by: Ben Widawsky Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164439225406.2941117.3927102269866914339.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9b4bbd51fbaa..d29eb2abdbc2 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -284,7 +284,7 @@ EXPORT_SYMBOL_NS_GPL(is_root_decoder, CXL); bool is_cxl_decoder(struct device *dev) { - return dev->type->release == cxl_decoder_release; + return dev->type && dev->type->release == cxl_decoder_release; } EXPORT_SYMBOL_NS_GPL(is_cxl_decoder, CXL); From a099b08599e6ae6b8e9faccee83760dab622c11e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 15 Feb 2022 13:44:49 -0600 Subject: [PATCH 076/186] RDMA/rxe: Revert changes from irqsave to bh locks A previous patch replaced all irqsave locks in rxe with bh locks. This ran into problems because rdmacm has a bad habit of calling rdma verbs APIs while disabling irqs. This is not allowed during spin_unlock_bh() causing programs that use rdmacm to fail. This patch reverts the changes to locks that had this problem or got dragged into the same mess. After this patch blktests/check -q srp now runs correctly. Link: https://lore.kernel.org/r/20220215194448.44369-1-rpearsonhpe@gmail.com Fixes: 21adfa7a3c4e ("RDMA/rxe: Replace irqsave locks with bh locks") Reported-by: Guoqing Jiang Reported-by: Bart Van Assche Signed-off-by: Bob Pearson Tested-by: Bart Van Assche Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 20 +++++++++++------- drivers/infiniband/sw/rxe/rxe_mcast.c | 19 ++++++++++------- drivers/infiniband/sw/rxe/rxe_pool.c | 30 ++++++++++++++++----------- drivers/infiniband/sw/rxe/rxe_queue.c | 10 +++++---- drivers/infiniband/sw/rxe/rxe_resp.c | 11 +++++----- drivers/infiniband/sw/rxe/rxe_verbs.c | 27 ++++++++++++++---------- 6 files changed, 69 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 6baaaa34458e..642b52539ac3 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -42,13 +42,14 @@ err1: static void rxe_send_complete(struct tasklet_struct *t) { struct rxe_cq *cq = from_tasklet(cq, t, comp_task); + unsigned long flags; - spin_lock_bh(&cq->cq_lock); + spin_lock_irqsave(&cq->cq_lock, flags); if (cq->is_dying) { - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); return; } - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } @@ -107,12 +108,13 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) struct ib_event ev; int full; void *addr; + unsigned long flags; - spin_lock_bh(&cq->cq_lock); + spin_lock_irqsave(&cq->cq_lock, flags); full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT); if (unlikely(full)) { - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); if (cq->ibcq.event_handler) { ev.device = cq->ibcq.device; ev.element.cq = &cq->ibcq; @@ -128,7 +130,7 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT); - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); if ((cq->notify == IB_CQ_NEXT_COMP) || (cq->notify == IB_CQ_SOLICITED && solicited)) { @@ -141,9 +143,11 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited) void rxe_cq_disable(struct rxe_cq *cq) { - spin_lock_bh(&cq->cq_lock); + unsigned long flags; + + spin_lock_irqsave(&cq->cq_lock, flags); cq->is_dying = true; - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); } void rxe_cq_cleanup(struct rxe_pool_elem *elem) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 9336295c4ee2..2878a56d9994 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -58,11 +58,12 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, int err; struct rxe_mcg *grp; struct rxe_pool *pool = &rxe->mc_grp_pool; + unsigned long flags; if (rxe->attr.max_mcast_qp_attach == 0) return -EINVAL; - write_lock_bh(&pool->pool_lock); + write_lock_irqsave(&pool->pool_lock, flags); grp = rxe_pool_get_key_locked(pool, mgid); if (grp) @@ -70,13 +71,13 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, grp = create_grp(rxe, pool, mgid); if (IS_ERR(grp)) { - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); err = PTR_ERR(grp); return err; } done: - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); *grp_p = grp; return 0; } @@ -86,9 +87,10 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, { int err; struct rxe_mca *elem; + unsigned long flags; /* check to see of the qp is already a member of the group */ - spin_lock_bh(&grp->mcg_lock); + spin_lock_irqsave(&grp->mcg_lock, flags); list_for_each_entry(elem, &grp->qp_list, qp_list) { if (elem->qp == qp) { err = 0; @@ -118,7 +120,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, err = 0; out: - spin_unlock_bh(&grp->mcg_lock); + spin_unlock_irqrestore(&grp->mcg_lock, flags); return err; } @@ -127,12 +129,13 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, { struct rxe_mcg *grp; struct rxe_mca *elem, *tmp; + unsigned long flags; grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); if (!grp) goto err1; - spin_lock_bh(&grp->mcg_lock); + spin_lock_irqsave(&grp->mcg_lock, flags); list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { if (elem->qp == qp) { @@ -140,7 +143,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, grp->num_qp--; atomic_dec(&qp->mcg_num); - spin_unlock_bh(&grp->mcg_lock); + spin_unlock_irqrestore(&grp->mcg_lock, flags); rxe_drop_ref(elem); rxe_drop_ref(grp); /* ref held by QP */ rxe_drop_ref(grp); /* ref from get_key */ @@ -148,7 +151,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, } } - spin_unlock_bh(&grp->mcg_lock); + spin_unlock_irqrestore(&grp->mcg_lock, flags); rxe_drop_ref(grp); /* ref from get_key */ err1: return -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 63c594173565..a11dab13c192 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -260,11 +260,12 @@ int __rxe_add_key_locked(struct rxe_pool_elem *elem, void *key) int __rxe_add_key(struct rxe_pool_elem *elem, void *key) { struct rxe_pool *pool = elem->pool; + unsigned long flags; int err; - write_lock_bh(&pool->pool_lock); + write_lock_irqsave(&pool->pool_lock, flags); err = __rxe_add_key_locked(elem, key); - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); return err; } @@ -279,10 +280,11 @@ void __rxe_drop_key_locked(struct rxe_pool_elem *elem) void __rxe_drop_key(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; + unsigned long flags; - write_lock_bh(&pool->pool_lock); + write_lock_irqsave(&pool->pool_lock, flags); __rxe_drop_key_locked(elem); - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); } int __rxe_add_index_locked(struct rxe_pool_elem *elem) @@ -299,11 +301,12 @@ int __rxe_add_index_locked(struct rxe_pool_elem *elem) int __rxe_add_index(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; + unsigned long flags; int err; - write_lock_bh(&pool->pool_lock); + write_lock_irqsave(&pool->pool_lock, flags); err = __rxe_add_index_locked(elem); - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); return err; } @@ -319,10 +322,11 @@ void __rxe_drop_index_locked(struct rxe_pool_elem *elem) void __rxe_drop_index(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; + unsigned long flags; - write_lock_bh(&pool->pool_lock); + write_lock_irqsave(&pool->pool_lock, flags); __rxe_drop_index_locked(elem); - write_unlock_bh(&pool->pool_lock); + write_unlock_irqrestore(&pool->pool_lock, flags); } void *rxe_alloc_locked(struct rxe_pool *pool) @@ -440,11 +444,12 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index) void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) { + unsigned long flags; void *obj; - read_lock_bh(&pool->pool_lock); + read_lock_irqsave(&pool->pool_lock, flags); obj = rxe_pool_get_index_locked(pool, index); - read_unlock_bh(&pool->pool_lock); + read_unlock_irqrestore(&pool->pool_lock, flags); return obj; } @@ -484,11 +489,12 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key) void *rxe_pool_get_key(struct rxe_pool *pool, void *key) { + unsigned long flags; void *obj; - read_lock_bh(&pool->pool_lock); + read_lock_irqsave(&pool->pool_lock, flags); obj = rxe_pool_get_key_locked(pool, key); - read_unlock_bh(&pool->pool_lock); + read_unlock_irqrestore(&pool->pool_lock, flags); return obj; } diff --git a/drivers/infiniband/sw/rxe/rxe_queue.c b/drivers/infiniband/sw/rxe/rxe_queue.c index a1b283dd2d4c..dbd4971039c0 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.c +++ b/drivers/infiniband/sw/rxe/rxe_queue.c @@ -151,6 +151,8 @@ int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, struct rxe_queue *new_q; unsigned int num_elem = *num_elem_p; int err; + unsigned long producer_flags; + unsigned long consumer_flags; new_q = rxe_queue_init(q->rxe, &num_elem, elem_size, q->type); if (!new_q) @@ -164,17 +166,17 @@ int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, goto err1; } - spin_lock_bh(consumer_lock); + spin_lock_irqsave(consumer_lock, consumer_flags); if (producer_lock) { - spin_lock_bh(producer_lock); + spin_lock_irqsave(producer_lock, producer_flags); err = resize_finish(q, new_q, num_elem); - spin_unlock_bh(producer_lock); + spin_unlock_irqrestore(producer_lock, producer_flags); } else { err = resize_finish(q, new_q, num_elem); } - spin_unlock_bh(consumer_lock); + spin_unlock_irqrestore(consumer_lock, consumer_flags); rxe_queue_cleanup(new_q); /* new/old dep on err */ if (err) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 380934e38923..c369d78fc8e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -297,21 +297,22 @@ static enum resp_states get_srq_wqe(struct rxe_qp *qp) struct ib_event ev; unsigned int count; size_t size; + unsigned long flags; if (srq->error) return RESPST_ERR_RNR; - spin_lock_bh(&srq->rq.consumer_lock); + spin_lock_irqsave(&srq->rq.consumer_lock, flags); wqe = queue_head(q, QUEUE_TYPE_FROM_CLIENT); if (!wqe) { - spin_unlock_bh(&srq->rq.consumer_lock); + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); return RESPST_ERR_RNR; } /* don't trust user space data */ if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) { - spin_unlock_bh(&srq->rq.consumer_lock); + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); pr_warn("%s: invalid num_sge in SRQ entry\n", __func__); return RESPST_ERR_MALFORMED_WQE; } @@ -327,11 +328,11 @@ static enum resp_states get_srq_wqe(struct rxe_qp *qp) goto event; } - spin_unlock_bh(&srq->rq.consumer_lock); + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); return RESPST_CHK_LENGTH; event: - spin_unlock_bh(&srq->rq.consumer_lock); + spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); ev.device = qp->ibqp.device; ev.element.srq = qp->ibqp.srq; ev.event = IB_EVENT_SRQ_LIMIT_REACHED; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9f0aef4b649d..80df9a8f71a1 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -384,8 +384,9 @@ static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, { int err = 0; struct rxe_srq *srq = to_rsrq(ibsrq); + unsigned long flags; - spin_lock_bh(&srq->rq.producer_lock); + spin_lock_irqsave(&srq->rq.producer_lock, flags); while (wr) { err = post_one_recv(&srq->rq, wr); @@ -394,7 +395,7 @@ static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wr = wr->next; } - spin_unlock_bh(&srq->rq.producer_lock); + spin_unlock_irqrestore(&srq->rq.producer_lock, flags); if (err) *bad_wr = wr; @@ -643,18 +644,19 @@ static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; + unsigned long flags; int full; err = validate_send_wr(qp, ibwr, mask, length); if (err) return err; - spin_lock_bh(&qp->sq.sq_lock); + spin_lock_irqsave(&qp->sq.sq_lock, flags); full = queue_full(sq->queue, QUEUE_TYPE_TO_DRIVER); if (unlikely(full)) { - spin_unlock_bh(&qp->sq.sq_lock); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); return -ENOMEM; } @@ -663,7 +665,7 @@ static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, queue_advance_producer(sq->queue, QUEUE_TYPE_TO_DRIVER); - spin_unlock_bh(&qp->sq.sq_lock); + spin_unlock_irqrestore(&qp->sq.sq_lock, flags); return 0; } @@ -743,6 +745,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int err = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_rq *rq = &qp->rq; + unsigned long flags; if (unlikely((qp_state(qp) < IB_QPS_INIT) || !qp->valid)) { *bad_wr = wr; @@ -756,7 +759,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, goto err1; } - spin_lock_bh(&rq->producer_lock); + spin_lock_irqsave(&rq->producer_lock, flags); while (wr) { err = post_one_recv(rq, wr); @@ -767,7 +770,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, wr = wr->next; } - spin_unlock_bh(&rq->producer_lock); + spin_unlock_irqrestore(&rq->producer_lock, flags); if (qp->resp.state == QP_STATE_ERROR) rxe_run_task(&qp->resp.task, 1); @@ -848,8 +851,9 @@ static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) int i; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_cqe *cqe; + unsigned long flags; - spin_lock_bh(&cq->cq_lock); + spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_FROM_DRIVER); if (!cqe) @@ -858,7 +862,7 @@ static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_FROM_DRIVER); } - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, flags); return i; } @@ -878,8 +882,9 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) struct rxe_cq *cq = to_rcq(ibcq); int ret = 0; int empty; + unsigned long irq_flags; - spin_lock_bh(&cq->cq_lock); + spin_lock_irqsave(&cq->cq_lock, irq_flags); if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = flags & IB_CQ_SOLICITED_MASK; @@ -888,7 +893,7 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; - spin_unlock_bh(&cq->cq_lock); + spin_unlock_irqrestore(&cq->cq_lock, irq_flags); return ret; } From 9fd0eb7c3c73c80a7bbe28dc71ae8ec5698a7e84 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:35 -0600 Subject: [PATCH 077/186] RDMA/rxe: Move mcg_lock to rxe Replace mcg->mcg_lock and mc_grp_pool->pool_lock by rxe->mcg_lock. This is the first step of several intended to decouple the mc_grp and mc_elem objects from the rxe pool code. Link: https://lore.kernel.org/r/20220208211644.123457-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 2 ++ drivers/infiniband/sw/rxe/rxe_mcast.c | 19 +++++++++---------- drivers/infiniband/sw/rxe/rxe_recv.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 3 ++- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index fab291245366..e74c4216b314 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -211,6 +211,8 @@ static int rxe_init(struct rxe_dev *rxe) spin_lock_init(&rxe->pending_lock); INIT_LIST_HEAD(&rxe->pending_mmaps); + spin_lock_init(&rxe->mcg_lock); + mutex_init(&rxe->usdev_lock); return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 2878a56d9994..fae04497cf2b 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -25,7 +25,7 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) return dev_mc_del(rxe->ndev, ll_addr); } -/* caller should hold mc_grp_pool->pool_lock */ +/* caller should hold rxe->mcg_lock */ static struct rxe_mcg *create_grp(struct rxe_dev *rxe, struct rxe_pool *pool, union ib_gid *mgid) @@ -38,7 +38,6 @@ static struct rxe_mcg *create_grp(struct rxe_dev *rxe, return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&grp->qp_list); - spin_lock_init(&grp->mcg_lock); grp->rxe = rxe; rxe_add_key_locked(grp, mgid); @@ -63,7 +62,7 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, if (rxe->attr.max_mcast_qp_attach == 0) return -EINVAL; - write_lock_irqsave(&pool->pool_lock, flags); + spin_lock_irqsave(&rxe->mcg_lock, flags); grp = rxe_pool_get_key_locked(pool, mgid); if (grp) @@ -71,13 +70,13 @@ static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, grp = create_grp(rxe, pool, mgid); if (IS_ERR(grp)) { - write_unlock_irqrestore(&pool->pool_lock, flags); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); err = PTR_ERR(grp); return err; } done: - write_unlock_irqrestore(&pool->pool_lock, flags); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); *grp_p = grp; return 0; } @@ -90,7 +89,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, unsigned long flags; /* check to see of the qp is already a member of the group */ - spin_lock_irqsave(&grp->mcg_lock, flags); + spin_lock_irqsave(&rxe->mcg_lock, flags); list_for_each_entry(elem, &grp->qp_list, qp_list) { if (elem->qp == qp) { err = 0; @@ -120,7 +119,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, err = 0; out: - spin_unlock_irqrestore(&grp->mcg_lock, flags); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); return err; } @@ -135,7 +134,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, if (!grp) goto err1; - spin_lock_irqsave(&grp->mcg_lock, flags); + spin_lock_irqsave(&rxe->mcg_lock, flags); list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { if (elem->qp == qp) { @@ -143,7 +142,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, grp->num_qp--; atomic_dec(&qp->mcg_num); - spin_unlock_irqrestore(&grp->mcg_lock, flags); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); rxe_drop_ref(elem); rxe_drop_ref(grp); /* ref held by QP */ rxe_drop_ref(grp); /* ref from get_key */ @@ -151,7 +150,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, } } - spin_unlock_irqrestore(&grp->mcg_lock, flags); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); rxe_drop_ref(grp); /* ref from get_key */ err1: return -EINVAL; diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 7ff6b53555f4..a084b5d69937 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -250,7 +250,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) if (!mcg) goto drop; /* mcast group not registered */ - spin_lock_bh(&mcg->mcg_lock); + spin_lock_bh(&rxe->mcg_lock); /* this is unreliable datagram service so we let * failures to deliver a multicast packet to a @@ -298,7 +298,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) } } - spin_unlock_bh(&mcg->mcg_lock); + spin_unlock_bh(&rxe->mcg_lock); rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 55f8ed2bc621..9940c69cbb63 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -353,7 +353,6 @@ struct rxe_mw { struct rxe_mcg { struct rxe_pool_elem elem; - spinlock_t mcg_lock; /* guard group */ struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; @@ -399,6 +398,8 @@ struct rxe_dev { struct rxe_pool mc_grp_pool; struct rxe_pool mc_elem_pool; + spinlock_t mcg_lock; + spinlock_t pending_lock; /* guard pending_mmaps */ struct list_head pending_mmaps; From d572405518ffd7c21882c1f2e9a568f2e8548d0b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:36 -0600 Subject: [PATCH 078/186] RDMA/rxe: Use kzmalloc/kfree for mca Remove rxe_mca (was rxe_mc_elem) from rxe pools and use kzmalloc and kfree to allocate and free in rxe_mcast.c. Call kzalloc outside of spinlocks to avoid having to use GFP_ATOMIC. Link: https://lore.kernel.org/r/20220208211644.123457-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 8 - drivers/infiniband/sw/rxe/rxe_mcast.c | 204 ++++++++++++++++---------- drivers/infiniband/sw/rxe/rxe_pool.c | 5 - drivers/infiniband/sw/rxe/rxe_pool.h | 3 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 - 5 files changed, 127 insertions(+), 95 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index e74c4216b314..7386a51b953d 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -29,7 +29,6 @@ void rxe_dealloc(struct ib_device *ib_dev) rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); rxe_pool_cleanup(&rxe->mc_grp_pool); - rxe_pool_cleanup(&rxe->mc_elem_pool); if (rxe->tfm) crypto_free_shash(rxe->tfm); @@ -163,15 +162,8 @@ static int rxe_init_pools(struct rxe_dev *rxe) if (err) goto err9; - err = rxe_pool_init(rxe, &rxe->mc_elem_pool, RXE_TYPE_MC_ELEM, - rxe->attr.max_total_mcast_qp_attach); - if (err) - goto err10; - return 0; -err10: - rxe_pool_cleanup(&rxe->mc_grp_pool); err9: rxe_pool_cleanup(&rxe->mw_pool); err8: diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index fae04497cf2b..2c0a2d3cc93e 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -26,96 +26,104 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) } /* caller should hold rxe->mcg_lock */ -static struct rxe_mcg *create_grp(struct rxe_dev *rxe, - struct rxe_pool *pool, - union ib_gid *mgid) +static struct rxe_mcg *__rxe_create_grp(struct rxe_dev *rxe, + struct rxe_pool *pool, + union ib_gid *mgid) { - int err; struct rxe_mcg *grp; + int err; - grp = rxe_alloc_locked(&rxe->mc_grp_pool); + grp = rxe_alloc_locked(pool); if (!grp) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&grp->qp_list); - grp->rxe = rxe; - rxe_add_key_locked(grp, mgid); - err = rxe_mcast_add(rxe, mgid); if (unlikely(err)) { - rxe_drop_key_locked(grp); rxe_drop_ref(grp); return ERR_PTR(err); } + INIT_LIST_HEAD(&grp->qp_list); + grp->rxe = rxe; + + /* rxe_alloc_locked takes a ref on grp but that will be + * dropped when grp goes out of scope. We need to take a ref + * on the pointer that will be saved in the red-black tree + * by rxe_add_key and used to lookup grp from mgid later. + * Adding key makes object visible to outside so this should + * be done last after the object is ready. + */ + rxe_add_ref(grp); + rxe_add_key_locked(grp, mgid); + return grp; } -static int rxe_mcast_get_grp(struct rxe_dev *rxe, union ib_gid *mgid, - struct rxe_mcg **grp_p) +static struct rxe_mcg *rxe_mcast_get_grp(struct rxe_dev *rxe, + union ib_gid *mgid) { - int err; struct rxe_mcg *grp; struct rxe_pool *pool = &rxe->mc_grp_pool; unsigned long flags; if (rxe->attr.max_mcast_qp_attach == 0) - return -EINVAL; + return ERR_PTR(-EINVAL); spin_lock_irqsave(&rxe->mcg_lock, flags); - grp = rxe_pool_get_key_locked(pool, mgid); - if (grp) - goto done; - - grp = create_grp(rxe, pool, mgid); - if (IS_ERR(grp)) { - spin_unlock_irqrestore(&rxe->mcg_lock, flags); - err = PTR_ERR(grp); - return err; - } - -done: + if (!grp) + grp = __rxe_create_grp(rxe, pool, mgid); spin_unlock_irqrestore(&rxe->mcg_lock, flags); - *grp_p = grp; - return 0; + + return grp; } static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - struct rxe_mcg *grp) + struct rxe_mcg *grp) { - int err; - struct rxe_mca *elem; + struct rxe_mca *mca, *tmp; unsigned long flags; + int err; - /* check to see of the qp is already a member of the group */ + /* check to see if the qp is already a member of the group */ spin_lock_irqsave(&rxe->mcg_lock, flags); - list_for_each_entry(elem, &grp->qp_list, qp_list) { - if (elem->qp == qp) { + list_for_each_entry(mca, &grp->qp_list, qp_list) { + if (mca->qp == qp) { + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + return 0; + } + } + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + + /* speculative alloc new mca without using GFP_ATOMIC */ + mca = kzalloc(sizeof(*mca), GFP_KERNEL); + if (!mca) + return -ENOMEM; + + spin_lock_irqsave(&rxe->mcg_lock, flags); + /* re-check to see if someone else just attached qp */ + list_for_each_entry(tmp, &grp->qp_list, qp_list) { + if (tmp->qp == qp) { + kfree(mca); err = 0; goto out; } } + /* check limits after checking if already attached */ if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) { + kfree(mca); err = -ENOMEM; goto out; } - elem = rxe_alloc_locked(&rxe->mc_elem_pool); - if (!elem) { - err = -ENOMEM; - goto out; - } + /* protect pointer to qp in mca */ + rxe_add_ref(qp); + mca->qp = qp; - /* each qp holds a ref on the grp */ - rxe_add_ref(grp); - - grp->num_qp++; - elem->qp = qp; atomic_inc(&qp->mcg_num); - - list_add(&elem->qp_list, &grp->qp_list); + grp->num_qp++; + list_add(&mca->qp_list, &grp->qp_list); err = 0; out: @@ -123,46 +131,80 @@ out: return err; } -static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - union ib_gid *mgid) +/* caller should be holding rxe->mcg_lock */ +static void __rxe_destroy_grp(struct rxe_mcg *grp) { - struct rxe_mcg *grp; - struct rxe_mca *elem, *tmp; + /* first remove grp from red-black tree then drop ref */ + rxe_drop_key_locked(grp); + rxe_drop_ref(grp); + + rxe_mcast_delete(grp->rxe, &grp->mgid); +} + +static void rxe_destroy_grp(struct rxe_mcg *grp) +{ + struct rxe_dev *rxe = grp->rxe; unsigned long flags; - grp = rxe_pool_get_key(&rxe->mc_grp_pool, mgid); - if (!grp) - goto err1; - spin_lock_irqsave(&rxe->mcg_lock, flags); - - list_for_each_entry_safe(elem, tmp, &grp->qp_list, qp_list) { - if (elem->qp == qp) { - list_del(&elem->qp_list); - grp->num_qp--; - atomic_dec(&qp->mcg_num); - - spin_unlock_irqrestore(&rxe->mcg_lock, flags); - rxe_drop_ref(elem); - rxe_drop_ref(grp); /* ref held by QP */ - rxe_drop_ref(grp); /* ref from get_key */ - return 0; - } - } - + __rxe_destroy_grp(grp); spin_unlock_irqrestore(&rxe->mcg_lock, flags); - rxe_drop_ref(grp); /* ref from get_key */ -err1: - return -EINVAL; } void rxe_mc_cleanup(struct rxe_pool_elem *elem) { - struct rxe_mcg *grp = container_of(elem, typeof(*grp), elem); - struct rxe_dev *rxe = grp->rxe; + /* nothing left to do for now */ +} - rxe_drop_key(grp); - rxe_mcast_delete(rxe, &grp->mgid); +static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, + union ib_gid *mgid) +{ + struct rxe_mcg *grp; + struct rxe_mca *mca, *tmp; + unsigned long flags; + int err; + + spin_lock_irqsave(&rxe->mcg_lock, flags); + grp = rxe_pool_get_key_locked(&rxe->mc_grp_pool, mgid); + if (!grp) { + /* we didn't find the mcast group for mgid */ + err = -EINVAL; + goto out_unlock; + } + + list_for_each_entry_safe(mca, tmp, &grp->qp_list, qp_list) { + if (mca->qp == qp) { + list_del(&mca->qp_list); + + /* if the number of qp's attached to the + * mcast group falls to zero go ahead and + * tear it down. This will not free the + * object since we are still holding a ref + * from the get key above. + */ + grp->num_qp--; + if (grp->num_qp <= 0) + __rxe_destroy_grp(grp); + + atomic_dec(&qp->mcg_num); + + /* drop the ref from get key. This will free the + * object if num_qp is zero. + */ + rxe_drop_ref(grp); + kfree(mca); + err = 0; + goto out_unlock; + } + } + + /* we didn't find the qp on the list */ + rxe_drop_ref(grp); + err = -EINVAL; + +out_unlock: + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + return err; } int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) @@ -173,12 +215,16 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) struct rxe_mcg *grp; /* takes a ref on grp if successful */ - err = rxe_mcast_get_grp(rxe, mgid, &grp); - if (err) - return err; + grp = rxe_mcast_get_grp(rxe, mgid); + if (IS_ERR(grp)) + return PTR_ERR(grp); err = rxe_mcast_add_grp_elem(rxe, qp, grp); + /* if we failed to attach the first qp to grp tear it down */ + if (grp->num_qp == 0) + rxe_destroy_grp(grp); + rxe_drop_ref(grp); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index a11dab13c192..eaf4bfc9b856 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -90,11 +90,6 @@ static const struct rxe_type_info { .key_offset = offsetof(struct rxe_mcg, mgid), .key_size = sizeof(union ib_gid), }, - [RXE_TYPE_MC_ELEM] = { - .name = "rxe-mc_elem", - .size = sizeof(struct rxe_mca), - .elem_offset = offsetof(struct rxe_mca, elem), - }, }; static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 214279310f4d..9201bb6b8b07 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -23,7 +23,6 @@ enum rxe_elem_type { RXE_TYPE_MR, RXE_TYPE_MW, RXE_TYPE_MC_GRP, - RXE_TYPE_MC_ELEM, RXE_NUM_TYPES, /* keep me last */ }; @@ -156,4 +155,6 @@ void rxe_elem_release(struct kref *kref); /* drop a reference on an object */ #define rxe_drop_ref(obj) kref_put(&(obj)->elem.ref_cnt, rxe_elem_release) +#define rxe_read_ref(obj) kref_read(&(obj)->elem.ref_cnt) + #endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 9940c69cbb63..1b0f40881895 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -362,7 +362,6 @@ struct rxe_mcg { }; struct rxe_mca { - struct rxe_pool_elem elem; struct list_head qp_list; struct rxe_qp *qp; }; @@ -396,7 +395,6 @@ struct rxe_dev { struct rxe_pool mr_pool; struct rxe_pool mw_pool; struct rxe_pool mc_grp_pool; - struct rxe_pool mc_elem_pool; spinlock_t mcg_lock; From 5bc15d1f7e3c9b84b40e020983e2cee19a546e72 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:37 -0600 Subject: [PATCH 079/186] RDMA/rxe: Replace grp by mcg, mce by mca Replace 'grp' by 'mcg', 'mce' by 'mca'. Shorten subroutine names in rxe_mcast.c. These name uses are more in line with other object names used. Link: https://lore.kernel.org/r/20220208211644.123457-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 110 +++++++++++++------------- drivers/infiniband/sw/rxe/rxe_recv.c | 8 +- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 2c0a2d3cc93e..ae8103e819d5 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -26,43 +26,43 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) } /* caller should hold rxe->mcg_lock */ -static struct rxe_mcg *__rxe_create_grp(struct rxe_dev *rxe, +static struct rxe_mcg *__rxe_create_mcg(struct rxe_dev *rxe, struct rxe_pool *pool, union ib_gid *mgid) { - struct rxe_mcg *grp; + struct rxe_mcg *mcg; int err; - grp = rxe_alloc_locked(pool); - if (!grp) + mcg = rxe_alloc_locked(pool); + if (!mcg) return ERR_PTR(-ENOMEM); err = rxe_mcast_add(rxe, mgid); if (unlikely(err)) { - rxe_drop_ref(grp); + rxe_drop_ref(mcg); return ERR_PTR(err); } - INIT_LIST_HEAD(&grp->qp_list); - grp->rxe = rxe; + INIT_LIST_HEAD(&mcg->qp_list); + mcg->rxe = rxe; - /* rxe_alloc_locked takes a ref on grp but that will be - * dropped when grp goes out of scope. We need to take a ref + /* rxe_alloc_locked takes a ref on mcg but that will be + * dropped when mcg goes out of scope. We need to take a ref * on the pointer that will be saved in the red-black tree - * by rxe_add_key and used to lookup grp from mgid later. + * by rxe_add_key and used to lookup mcg from mgid later. * Adding key makes object visible to outside so this should * be done last after the object is ready. */ - rxe_add_ref(grp); - rxe_add_key_locked(grp, mgid); + rxe_add_ref(mcg); + rxe_add_key_locked(mcg, mgid); - return grp; + return mcg; } -static struct rxe_mcg *rxe_mcast_get_grp(struct rxe_dev *rxe, +static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) { - struct rxe_mcg *grp; + struct rxe_mcg *mcg; struct rxe_pool *pool = &rxe->mc_grp_pool; unsigned long flags; @@ -70,16 +70,16 @@ static struct rxe_mcg *rxe_mcast_get_grp(struct rxe_dev *rxe, return ERR_PTR(-EINVAL); spin_lock_irqsave(&rxe->mcg_lock, flags); - grp = rxe_pool_get_key_locked(pool, mgid); - if (!grp) - grp = __rxe_create_grp(rxe, pool, mgid); + mcg = rxe_pool_get_key_locked(pool, mgid); + if (!mcg) + mcg = __rxe_create_mcg(rxe, pool, mgid); spin_unlock_irqrestore(&rxe->mcg_lock, flags); - return grp; + return mcg; } -static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, - struct rxe_mcg *grp) +static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, + struct rxe_mcg *mcg) { struct rxe_mca *mca, *tmp; unsigned long flags; @@ -87,7 +87,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, /* check to see if the qp is already a member of the group */ spin_lock_irqsave(&rxe->mcg_lock, flags); - list_for_each_entry(mca, &grp->qp_list, qp_list) { + list_for_each_entry(mca, &mcg->qp_list, qp_list) { if (mca->qp == qp) { spin_unlock_irqrestore(&rxe->mcg_lock, flags); return 0; @@ -102,7 +102,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_irqsave(&rxe->mcg_lock, flags); /* re-check to see if someone else just attached qp */ - list_for_each_entry(tmp, &grp->qp_list, qp_list) { + list_for_each_entry(tmp, &mcg->qp_list, qp_list) { if (tmp->qp == qp) { kfree(mca); err = 0; @@ -111,7 +111,7 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, } /* check limits after checking if already attached */ - if (grp->num_qp >= rxe->attr.max_mcast_qp_attach) { + if (mcg->num_qp >= rxe->attr.max_mcast_qp_attach) { kfree(mca); err = -ENOMEM; goto out; @@ -122,8 +122,8 @@ static int rxe_mcast_add_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, mca->qp = qp; atomic_inc(&qp->mcg_num); - grp->num_qp++; - list_add(&mca->qp_list, &grp->qp_list); + mcg->num_qp++; + list_add(&mca->qp_list, &mcg->qp_list); err = 0; out: @@ -132,22 +132,22 @@ out: } /* caller should be holding rxe->mcg_lock */ -static void __rxe_destroy_grp(struct rxe_mcg *grp) +static void __rxe_destroy_mcg(struct rxe_mcg *mcg) { - /* first remove grp from red-black tree then drop ref */ - rxe_drop_key_locked(grp); - rxe_drop_ref(grp); + /* first remove mcg from red-black tree then drop ref */ + rxe_drop_key_locked(mcg); + rxe_drop_ref(mcg); - rxe_mcast_delete(grp->rxe, &grp->mgid); + rxe_mcast_delete(mcg->rxe, &mcg->mgid); } -static void rxe_destroy_grp(struct rxe_mcg *grp) +static void rxe_destroy_mcg(struct rxe_mcg *mcg) { - struct rxe_dev *rxe = grp->rxe; + struct rxe_dev *rxe = mcg->rxe; unsigned long flags; spin_lock_irqsave(&rxe->mcg_lock, flags); - __rxe_destroy_grp(grp); + __rxe_destroy_mcg(mcg); spin_unlock_irqrestore(&rxe->mcg_lock, flags); } @@ -156,23 +156,23 @@ void rxe_mc_cleanup(struct rxe_pool_elem *elem) /* nothing left to do for now */ } -static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, +static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, union ib_gid *mgid) { - struct rxe_mcg *grp; + struct rxe_mcg *mcg; struct rxe_mca *mca, *tmp; unsigned long flags; int err; spin_lock_irqsave(&rxe->mcg_lock, flags); - grp = rxe_pool_get_key_locked(&rxe->mc_grp_pool, mgid); - if (!grp) { + mcg = rxe_pool_get_key_locked(&rxe->mc_grp_pool, mgid); + if (!mcg) { /* we didn't find the mcast group for mgid */ err = -EINVAL; goto out_unlock; } - list_for_each_entry_safe(mca, tmp, &grp->qp_list, qp_list) { + list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { if (mca->qp == qp) { list_del(&mca->qp_list); @@ -182,16 +182,16 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, * object since we are still holding a ref * from the get key above. */ - grp->num_qp--; - if (grp->num_qp <= 0) - __rxe_destroy_grp(grp); + mcg->num_qp--; + if (mcg->num_qp <= 0) + __rxe_destroy_mcg(mcg); atomic_dec(&qp->mcg_num); /* drop the ref from get key. This will free the * object if num_qp is zero. */ - rxe_drop_ref(grp); + rxe_drop_ref(mcg); kfree(mca); err = 0; goto out_unlock; @@ -199,7 +199,7 @@ static int rxe_mcast_drop_grp_elem(struct rxe_dev *rxe, struct rxe_qp *qp, } /* we didn't find the qp on the list */ - rxe_drop_ref(grp); + rxe_drop_ref(mcg); err = -EINVAL; out_unlock: @@ -212,20 +212,20 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) int err; struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); - struct rxe_mcg *grp; + struct rxe_mcg *mcg; - /* takes a ref on grp if successful */ - grp = rxe_mcast_get_grp(rxe, mgid); - if (IS_ERR(grp)) - return PTR_ERR(grp); + /* takes a ref on mcg if successful */ + mcg = rxe_get_mcg(rxe, mgid); + if (IS_ERR(mcg)) + return PTR_ERR(mcg); - err = rxe_mcast_add_grp_elem(rxe, qp, grp); + err = rxe_attach_mcg(rxe, qp, mcg); - /* if we failed to attach the first qp to grp tear it down */ - if (grp->num_qp == 0) - rxe_destroy_grp(grp); + /* if we failed to attach the first qp to mcg tear it down */ + if (mcg->num_qp == 0) + rxe_destroy_mcg(mcg); - rxe_drop_ref(grp); + rxe_drop_ref(mcg); return err; } @@ -234,5 +234,5 @@ int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); - return rxe_mcast_drop_grp_elem(rxe, qp, mgid); + return rxe_detach_mcg(rxe, qp, mgid); } diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index a084b5d69937..d91c6660e83c 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -234,7 +234,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) { struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); struct rxe_mcg *mcg; - struct rxe_mca *mce; + struct rxe_mca *mca; struct rxe_qp *qp; union ib_gid dgid; int err; @@ -257,8 +257,8 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) * single QP happen and just move on and try * the rest of them on the list */ - list_for_each_entry(mce, &mcg->qp_list, qp_list) { - qp = mce->qp; + list_for_each_entry(mca, &mcg->qp_list, qp_list) { + qp = mca->qp; /* validate qp for incoming packet */ err = check_type_state(rxe, pkt, qp); @@ -273,7 +273,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) * skb and pass to the QP. Pass the original skb to * the last QP in the list. */ - if (mce->qp_list.next != &mcg->qp_list) { + if (mca->qp_list.next != &mcg->qp_list) { struct sk_buff *cskb; struct rxe_pkt_info *cpkt; From 8a99c81f1231786c364963a9f335eab2cca816a4 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:38 -0600 Subject: [PATCH 080/186] RDMA/rxe: Replace int num_qp by atomic_t qp_num Replace int num_qp in struct rxe_mcg by atomic_t qp_num. Link: https://lore.kernel.org/r/20220208211644.123457-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 9 ++++----- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index ae8103e819d5..1995d24caa60 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -111,7 +111,8 @@ static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, } /* check limits after checking if already attached */ - if (mcg->num_qp >= rxe->attr.max_mcast_qp_attach) { + if (atomic_inc_return(&mcg->qp_num) > rxe->attr.max_mcast_qp_attach) { + atomic_dec(&mcg->qp_num); kfree(mca); err = -ENOMEM; goto out; @@ -122,7 +123,6 @@ static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, mca->qp = qp; atomic_inc(&qp->mcg_num); - mcg->num_qp++; list_add(&mca->qp_list, &mcg->qp_list); err = 0; @@ -182,8 +182,7 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, * object since we are still holding a ref * from the get key above. */ - mcg->num_qp--; - if (mcg->num_qp <= 0) + if (atomic_dec_return(&mcg->qp_num) <= 0) __rxe_destroy_mcg(mcg); atomic_dec(&qp->mcg_num); @@ -222,7 +221,7 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) err = rxe_attach_mcg(rxe, qp, mcg); /* if we failed to attach the first qp to mcg tear it down */ - if (mcg->num_qp == 0) + if (atomic_read(&mcg->qp_num) == 0) rxe_destroy_mcg(mcg); rxe_drop_ref(mcg); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 1b0f40881895..3790163bb265 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -356,7 +356,7 @@ struct rxe_mcg { struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; - int num_qp; + atomic_t qp_num; u32 qkey; u16 pkey; }; From 8a0a5fe0c462438a8c423ebaa0fbb7af5055a155 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:39 -0600 Subject: [PATCH 081/186] RDMA/rxe: Replace pool key by rxe->mcg_tree Continuing to decouple mcg from rxe pools. Create red-black tree code in rxe_mcast.c to hold mcg index. Replace pool key calls by calls to local red-black routines. Link: https://lore.kernel.org/r/20220208211644.123457-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 2 + drivers/infiniband/sw/rxe/rxe_loc.h | 3 +- drivers/infiniband/sw/rxe/rxe_mcast.c | 284 +++++++++++++++++++------- drivers/infiniband/sw/rxe/rxe_recv.c | 4 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 6 +- 5 files changed, 224 insertions(+), 75 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 7386a51b953d..dc36148272dd 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -203,7 +203,9 @@ static int rxe_init(struct rxe_dev *rxe) spin_lock_init(&rxe->pending_lock); INIT_LIST_HEAD(&rxe->pending_mmaps); + /* init multicast support */ spin_lock_init(&rxe->mcg_lock); + rxe->mcg_tree = RB_ROOT; mutex_init(&rxe->usdev_lock); diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index af40e3c212fb..d41831878fa6 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -40,9 +40,10 @@ void rxe_cq_disable(struct rxe_cq *cq); void rxe_cq_cleanup(struct rxe_pool_elem *arg); /* rxe_mcast.c */ -void rxe_mc_cleanup(struct rxe_pool_elem *arg); +struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid); int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); +void rxe_mc_cleanup(struct rxe_pool_elem *elem); /* rxe_mmap.c */ struct rxe_mmap_info { diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 1995d24caa60..8585521169b6 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -25,57 +25,229 @@ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) return dev_mc_del(rxe->ndev, ll_addr); } -/* caller should hold rxe->mcg_lock */ -static struct rxe_mcg *__rxe_create_mcg(struct rxe_dev *rxe, - struct rxe_pool *pool, - union ib_gid *mgid) +/** + * __rxe_insert_mcg - insert an mcg into red-black tree (rxe->mcg_tree) + * @mcg: mcg object with an embedded red-black tree node + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock and + * is responsible to avoid adding the same mcg twice to the tree. + */ +static void __rxe_insert_mcg(struct rxe_mcg *mcg) { - struct rxe_mcg *mcg; - int err; + struct rb_root *tree = &mcg->rxe->mcg_tree; + struct rb_node **link = &tree->rb_node; + struct rb_node *node = NULL; + struct rxe_mcg *tmp; + int cmp; - mcg = rxe_alloc_locked(pool); - if (!mcg) - return ERR_PTR(-ENOMEM); + while (*link) { + node = *link; + tmp = rb_entry(node, struct rxe_mcg, node); - err = rxe_mcast_add(rxe, mgid); - if (unlikely(err)) { - rxe_drop_ref(mcg); - return ERR_PTR(err); + cmp = memcmp(&tmp->mgid, &mcg->mgid, sizeof(mcg->mgid)); + if (cmp > 0) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; } - INIT_LIST_HEAD(&mcg->qp_list); - mcg->rxe = rxe; + rb_link_node(&mcg->node, node, link); + rb_insert_color(&mcg->node, tree); +} - /* rxe_alloc_locked takes a ref on mcg but that will be - * dropped when mcg goes out of scope. We need to take a ref - * on the pointer that will be saved in the red-black tree - * by rxe_add_key and used to lookup mcg from mgid later. - * Adding key makes object visible to outside so this should - * be done last after the object is ready. - */ - rxe_add_ref(mcg); - rxe_add_key_locked(mcg, mgid); +/** + * __rxe_remove_mcg - remove an mcg from red-black tree holding lock + * @mcg: mcast group object with an embedded red-black tree node + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock + */ +static void __rxe_remove_mcg(struct rxe_mcg *mcg) +{ + rb_erase(&mcg->node, &mcg->rxe->mcg_tree); +} + +/** + * __rxe_lookup_mcg - lookup mcg in rxe->mcg_tree while holding lock + * @rxe: rxe device object + * @mgid: multicast IP address + * + * Context: caller must hold rxe->mcg_lock + * Returns: mcg on success and takes a ref to mcg else NULL + */ +static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe, + union ib_gid *mgid) +{ + struct rb_root *tree = &rxe->mcg_tree; + struct rxe_mcg *mcg; + struct rb_node *node; + int cmp; + + node = tree->rb_node; + + while (node) { + mcg = rb_entry(node, struct rxe_mcg, node); + + cmp = memcmp(&mcg->mgid, mgid, sizeof(*mgid)); + + if (cmp > 0) + node = node->rb_left; + else if (cmp < 0) + node = node->rb_right; + else + break; + } + + if (node) { + rxe_add_ref(mcg); + return mcg; + } + + return NULL; +} + +/** + * rxe_lookup_mcg - lookup up mcg in red-back tree + * @rxe: rxe device object + * @mgid: multicast IP address + * + * Returns: mcg if found else NULL + */ +struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid) +{ + struct rxe_mcg *mcg; + unsigned long flags; + + spin_lock_irqsave(&rxe->mcg_lock, flags); + mcg = __rxe_lookup_mcg(rxe, mgid); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); return mcg; } -static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, - union ib_gid *mgid) +/** + * __rxe_init_mcg - initialize a new mcg + * @rxe: rxe device + * @mgid: multicast address as a gid + * @mcg: new mcg object + * + * Context: caller should hold rxe->mcg lock + * Returns: 0 on success else an error + */ +static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, + struct rxe_mcg *mcg) { - struct rxe_mcg *mcg; - struct rxe_pool *pool = &rxe->mc_grp_pool; - unsigned long flags; + int err; - if (rxe->attr.max_mcast_qp_attach == 0) + err = rxe_mcast_add(rxe, mgid); + if (unlikely(err)) + return err; + + memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); + INIT_LIST_HEAD(&mcg->qp_list); + mcg->rxe = rxe; + + /* caller holds a ref on mcg but that will be + * dropped when mcg goes out of scope. We need to take a ref + * on the pointer that will be saved in the red-black tree + * by __rxe_insert_mcg and used to lookup mcg from mgid later. + * Inserting mcg makes it visible to outside so this should + * be done last after the object is ready. + */ + rxe_add_ref(mcg); + __rxe_insert_mcg(mcg); + + return 0; +} + +/** + * rxe_get_mcg - lookup or allocate a mcg + * @rxe: rxe device object + * @mgid: multicast IP address as a gid + * + * Returns: mcg on success else ERR_PTR(error) + */ +static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) +{ + struct rxe_pool *pool = &rxe->mc_grp_pool; + struct rxe_mcg *mcg, *tmp; + unsigned long flags; + int err; + + if (rxe->attr.max_mcast_grp == 0) return ERR_PTR(-EINVAL); - spin_lock_irqsave(&rxe->mcg_lock, flags); - mcg = rxe_pool_get_key_locked(pool, mgid); - if (!mcg) - mcg = __rxe_create_mcg(rxe, pool, mgid); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); + /* check to see if mcg already exists */ + mcg = rxe_lookup_mcg(rxe, mgid); + if (mcg) + return mcg; + /* speculative alloc of new mcg */ + mcg = rxe_alloc(pool); + if (!mcg) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&rxe->mcg_lock, flags); + /* re-check to see if someone else just added it */ + tmp = __rxe_lookup_mcg(rxe, mgid); + if (tmp) { + rxe_drop_ref(mcg); + mcg = tmp; + goto out; + } + + if (atomic_inc_return(&rxe->mcg_num) > rxe->attr.max_mcast_grp) { + err = -ENOMEM; + goto err_dec; + } + + err = __rxe_init_mcg(rxe, mgid, mcg); + if (err) + goto err_dec; +out: + spin_unlock_irqrestore(&rxe->mcg_lock, flags); return mcg; + +err_dec: + atomic_dec(&rxe->mcg_num); + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + rxe_drop_ref(mcg); + return ERR_PTR(err); +} + +/** + * __rxe_destroy_mcg - destroy mcg object holding rxe->mcg_lock + * @mcg: the mcg object + * + * Context: caller is holding rxe->mcg_lock + * no qp's are attached to mcg + */ +static void __rxe_destroy_mcg(struct rxe_mcg *mcg) +{ + /* remove mcg from red-black tree then drop ref */ + __rxe_remove_mcg(mcg); + rxe_drop_ref(mcg); + + rxe_mcast_delete(mcg->rxe, &mcg->mgid); +} + +/** + * rxe_destroy_mcg - destroy mcg object + * @mcg: the mcg object + * + * Context: no qp's are attached to mcg + */ +static void rxe_destroy_mcg(struct rxe_mcg *mcg) +{ + unsigned long flags; + + spin_lock_irqsave(&mcg->rxe->mcg_lock, flags); + __rxe_destroy_mcg(mcg); + spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags); +} + +void rxe_mc_cleanup(struct rxe_pool_elem *elem) +{ + /* nothing left to do for now */ } static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, @@ -131,31 +303,6 @@ out: return err; } -/* caller should be holding rxe->mcg_lock */ -static void __rxe_destroy_mcg(struct rxe_mcg *mcg) -{ - /* first remove mcg from red-black tree then drop ref */ - rxe_drop_key_locked(mcg); - rxe_drop_ref(mcg); - - rxe_mcast_delete(mcg->rxe, &mcg->mgid); -} - -static void rxe_destroy_mcg(struct rxe_mcg *mcg) -{ - struct rxe_dev *rxe = mcg->rxe; - unsigned long flags; - - spin_lock_irqsave(&rxe->mcg_lock, flags); - __rxe_destroy_mcg(mcg); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); -} - -void rxe_mc_cleanup(struct rxe_pool_elem *elem) -{ - /* nothing left to do for now */ -} - static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, union ib_gid *mgid) { @@ -164,17 +311,16 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, unsigned long flags; int err; - spin_lock_irqsave(&rxe->mcg_lock, flags); - mcg = rxe_pool_get_key_locked(&rxe->mc_grp_pool, mgid); - if (!mcg) { - /* we didn't find the mcast group for mgid */ - err = -EINVAL; - goto out_unlock; - } + mcg = rxe_lookup_mcg(rxe, mgid); + if (!mcg) + return -EINVAL; + spin_lock_irqsave(&rxe->mcg_lock, flags); list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { if (mca->qp == qp) { list_del(&mca->qp_list); + atomic_dec(&qp->mcg_num); + rxe_drop_ref(qp); /* if the number of qp's attached to the * mcast group falls to zero go ahead and @@ -185,10 +331,8 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, if (atomic_dec_return(&mcg->qp_num) <= 0) __rxe_destroy_mcg(mcg); - atomic_dec(&qp->mcg_num); - /* drop the ref from get key. This will free the - * object if num_qp is zero. + * object if qp_num is zero. */ rxe_drop_ref(mcg); kfree(mca); diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index d91c6660e83c..fb265902f7e3 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -246,7 +246,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) memcpy(&dgid, &ipv6_hdr(skb)->daddr, sizeof(dgid)); /* lookup mcast group corresponding to mgid, takes a ref */ - mcg = rxe_pool_get_key(&rxe->mc_grp_pool, &dgid); + mcg = rxe_lookup_mcg(rxe, &dgid); if (!mcg) goto drop; /* mcast group not registered */ @@ -300,7 +300,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) spin_unlock_bh(&rxe->mcg_lock); - rxe_drop_ref(mcg); /* drop ref from rxe_pool_get_key. */ + rxe_drop_ref(mcg); if (likely(!skb)) return; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 3790163bb265..caa5b1b05019 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -353,6 +353,7 @@ struct rxe_mw { struct rxe_mcg { struct rxe_pool_elem elem; + struct rb_node node; struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; @@ -396,7 +397,10 @@ struct rxe_dev { struct rxe_pool mw_pool; struct rxe_pool mc_grp_pool; + /* multicast support */ spinlock_t mcg_lock; + struct rb_root mcg_tree; + atomic_t mcg_num; spinlock_t pending_lock; /* guard pending_mmaps */ struct list_head pending_mmaps; @@ -477,6 +481,4 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw) int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name); -void rxe_mc_cleanup(struct rxe_pool_elem *elem); - #endif /* RXE_VERBS_H */ From d2ccf0411d253433409278cd517a091a5b7b613e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:40 -0600 Subject: [PATCH 082/186] RDMA/rxe: Remove key'ed object support Now that rxe_mcast.c has it's own red-black tree support there is no longer any requirement for key'ed objects in rxe pools. This patch removes the key APIs and related code. Link: https://lore.kernel.org/r/20220208211644.123457-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 126 --------------------------- drivers/infiniband/sw/rxe/rxe_pool.h | 38 -------- 2 files changed, 164 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index eaf4bfc9b856..940c8eeab8fd 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -16,8 +16,6 @@ static const struct rxe_type_info { enum rxe_pool_flags flags; u32 min_index; u32 max_index; - size_t key_offset; - size_t key_size; } rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_UC] = { .name = "rxe-uc", @@ -147,12 +145,6 @@ int rxe_pool_init( goto out; } - if (pool->flags & RXE_POOL_KEY) { - pool->key.tree = RB_ROOT; - pool->key.key_offset = info->key_offset; - pool->key.key_size = info->key_size; - } - out: return err; } @@ -209,79 +201,6 @@ static int rxe_insert_index(struct rxe_pool *pool, struct rxe_pool_elem *new) return 0; } -static int rxe_insert_key(struct rxe_pool *pool, struct rxe_pool_elem *new) -{ - struct rb_node **link = &pool->key.tree.rb_node; - struct rb_node *parent = NULL; - struct rxe_pool_elem *elem; - int cmp; - - while (*link) { - parent = *link; - elem = rb_entry(parent, struct rxe_pool_elem, key_node); - - cmp = memcmp((u8 *)elem + pool->key.key_offset, - (u8 *)new + pool->key.key_offset, - pool->key.key_size); - - if (cmp == 0) { - pr_warn("key already exists!\n"); - return -EINVAL; - } - - if (cmp > 0) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - rb_link_node(&new->key_node, parent, link); - rb_insert_color(&new->key_node, &pool->key.tree); - - return 0; -} - -int __rxe_add_key_locked(struct rxe_pool_elem *elem, void *key) -{ - struct rxe_pool *pool = elem->pool; - int err; - - memcpy((u8 *)elem + pool->key.key_offset, key, pool->key.key_size); - err = rxe_insert_key(pool, elem); - - return err; -} - -int __rxe_add_key(struct rxe_pool_elem *elem, void *key) -{ - struct rxe_pool *pool = elem->pool; - unsigned long flags; - int err; - - write_lock_irqsave(&pool->pool_lock, flags); - err = __rxe_add_key_locked(elem, key); - write_unlock_irqrestore(&pool->pool_lock, flags); - - return err; -} - -void __rxe_drop_key_locked(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - - rb_erase(&elem->key_node, &pool->key.tree); -} - -void __rxe_drop_key(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - unsigned long flags; - - write_lock_irqsave(&pool->pool_lock, flags); - __rxe_drop_key_locked(elem); - write_unlock_irqrestore(&pool->pool_lock, flags); -} - int __rxe_add_index_locked(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; @@ -448,48 +367,3 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) return obj; } - -void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key) -{ - struct rb_node *node; - struct rxe_pool_elem *elem; - void *obj; - int cmp; - - node = pool->key.tree.rb_node; - - while (node) { - elem = rb_entry(node, struct rxe_pool_elem, key_node); - - cmp = memcmp((u8 *)elem + pool->key.key_offset, - key, pool->key.key_size); - - if (cmp > 0) - node = node->rb_left; - else if (cmp < 0) - node = node->rb_right; - else - break; - } - - if (node) { - kref_get(&elem->ref_cnt); - obj = elem->obj; - } else { - obj = NULL; - } - - return obj; -} - -void *rxe_pool_get_key(struct rxe_pool *pool, void *key) -{ - unsigned long flags; - void *obj; - - read_lock_irqsave(&pool->pool_lock, flags); - obj = rxe_pool_get_key_locked(pool, key); - read_unlock_irqrestore(&pool->pool_lock, flags); - - return obj; -} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 9201bb6b8b07..2db9d9872cd1 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -9,7 +9,6 @@ enum rxe_pool_flags { RXE_POOL_INDEX = BIT(1), - RXE_POOL_KEY = BIT(2), RXE_POOL_NO_ALLOC = BIT(4), }; @@ -32,9 +31,6 @@ struct rxe_pool_elem { struct kref ref_cnt; struct list_head list; - /* only used if keyed */ - struct rb_node key_node; - /* only used if indexed */ struct rb_node index_node; u32 index; @@ -61,13 +57,6 @@ struct rxe_pool { u32 max_index; u32 min_index; } index; - - /* only used if keyed */ - struct { - struct rb_root tree; - size_t key_offset; - size_t key_size; - } key; }; /* initialize a pool of objects with given limit on @@ -112,26 +101,6 @@ void __rxe_drop_index(struct rxe_pool_elem *elem); #define rxe_drop_index(obj) __rxe_drop_index(&(obj)->elem) -/* assign a key to a keyed object and insert object into - * pool's rb tree holding and not holding pool_lock - */ -int __rxe_add_key_locked(struct rxe_pool_elem *elem, void *key); - -#define rxe_add_key_locked(obj, key) __rxe_add_key_locked(&(obj)->elem, key) - -int __rxe_add_key(struct rxe_pool_elem *elem, void *key); - -#define rxe_add_key(obj, key) __rxe_add_key(&(obj)->elem, key) - -/* remove elem from rb tree holding and not holding the pool_lock */ -void __rxe_drop_key_locked(struct rxe_pool_elem *elem); - -#define rxe_drop_key_locked(obj) __rxe_drop_key_locked(&(obj)->elem) - -void __rxe_drop_key(struct rxe_pool_elem *elem); - -#define rxe_drop_key(obj) __rxe_drop_key(&(obj)->elem) - /* lookup an indexed object from index holding and not holding the pool_lock. * takes a reference on object */ @@ -139,13 +108,6 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index); void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); -/* lookup keyed object from key holding and not holding the pool_lock. - * takes a reference on the objecti - */ -void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key); - -void *rxe_pool_get_key(struct rxe_pool *pool, void *key); - /* cleanup an object when all references are dropped */ void rxe_elem_release(struct kref *kref); From 3810c1a1cbe8f3157f644b4e42f6c0157dfd22cb Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 8 Feb 2022 15:16:41 -0600 Subject: [PATCH 083/186] RDMA/rxe: Remove mcg from rxe pools Finish removing mcg from rxe pools. Replace rxe pools ref counting by kref's. Replace rxe_alloc by kzalloc. Link: https://lore.kernel.org/r/20220208211644.123457-8-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 8 ------ drivers/infiniband/sw/rxe/rxe_loc.h | 2 +- drivers/infiniband/sw/rxe/rxe_mcast.c | 39 ++++++++++++++++----------- drivers/infiniband/sw/rxe/rxe_pool.c | 9 ------- drivers/infiniband/sw/rxe/rxe_pool.h | 1 - drivers/infiniband/sw/rxe/rxe_recv.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- 7 files changed, 27 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index dc36148272dd..3520eb2db685 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -28,7 +28,6 @@ void rxe_dealloc(struct ib_device *ib_dev) rxe_pool_cleanup(&rxe->cq_pool); rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); - rxe_pool_cleanup(&rxe->mc_grp_pool); if (rxe->tfm) crypto_free_shash(rxe->tfm); @@ -157,15 +156,8 @@ static int rxe_init_pools(struct rxe_dev *rxe) if (err) goto err8; - err = rxe_pool_init(rxe, &rxe->mc_grp_pool, RXE_TYPE_MC_GRP, - rxe->attr.max_mcast_grp); - if (err) - goto err9; - return 0; -err9: - rxe_pool_cleanup(&rxe->mw_pool); err8: rxe_pool_cleanup(&rxe->mr_pool); err7: diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index d41831878fa6..409efeecd581 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -43,7 +43,7 @@ void rxe_cq_cleanup(struct rxe_pool_elem *arg); struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid); int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid); -void rxe_mc_cleanup(struct rxe_pool_elem *elem); +void rxe_cleanup_mcg(struct kref *kref); /* rxe_mmap.c */ struct rxe_mmap_info { diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 8585521169b6..4935fe5c5868 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -98,7 +98,7 @@ static struct rxe_mcg *__rxe_lookup_mcg(struct rxe_dev *rxe, } if (node) { - rxe_add_ref(mcg); + kref_get(&mcg->ref_cnt); return mcg; } @@ -142,6 +142,7 @@ static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, if (unlikely(err)) return err; + kref_init(&mcg->ref_cnt); memcpy(&mcg->mgid, mgid, sizeof(mcg->mgid)); INIT_LIST_HEAD(&mcg->qp_list); mcg->rxe = rxe; @@ -153,7 +154,7 @@ static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, * Inserting mcg makes it visible to outside so this should * be done last after the object is ready. */ - rxe_add_ref(mcg); + kref_get(&mcg->ref_cnt); __rxe_insert_mcg(mcg); return 0; @@ -168,7 +169,6 @@ static int __rxe_init_mcg(struct rxe_dev *rxe, union ib_gid *mgid, */ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) { - struct rxe_pool *pool = &rxe->mc_grp_pool; struct rxe_mcg *mcg, *tmp; unsigned long flags; int err; @@ -182,7 +182,7 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) return mcg; /* speculative alloc of new mcg */ - mcg = rxe_alloc(pool); + mcg = kzalloc(sizeof(*mcg), GFP_KERNEL); if (!mcg) return ERR_PTR(-ENOMEM); @@ -190,7 +190,7 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) /* re-check to see if someone else just added it */ tmp = __rxe_lookup_mcg(rxe, mgid); if (tmp) { - rxe_drop_ref(mcg); + kfree(mcg); mcg = tmp; goto out; } @@ -210,10 +210,21 @@ out: err_dec: atomic_dec(&rxe->mcg_num); spin_unlock_irqrestore(&rxe->mcg_lock, flags); - rxe_drop_ref(mcg); + kfree(mcg); return ERR_PTR(err); } +/** + * rxe_cleanup_mcg - cleanup mcg for kref_put + * @kref: + */ +void rxe_cleanup_mcg(struct kref *kref) +{ + struct rxe_mcg *mcg = container_of(kref, typeof(*mcg), ref_cnt); + + kfree(mcg); +} + /** * __rxe_destroy_mcg - destroy mcg object holding rxe->mcg_lock * @mcg: the mcg object @@ -223,11 +234,14 @@ err_dec: */ static void __rxe_destroy_mcg(struct rxe_mcg *mcg) { + struct rxe_dev *rxe = mcg->rxe; + /* remove mcg from red-black tree then drop ref */ __rxe_remove_mcg(mcg); - rxe_drop_ref(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); rxe_mcast_delete(mcg->rxe, &mcg->mgid); + atomic_dec(&rxe->mcg_num); } /** @@ -245,11 +259,6 @@ static void rxe_destroy_mcg(struct rxe_mcg *mcg) spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags); } -void rxe_mc_cleanup(struct rxe_pool_elem *elem) -{ - /* nothing left to do for now */ -} - static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_mcg *mcg) { @@ -334,7 +343,7 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, /* drop the ref from get key. This will free the * object if qp_num is zero. */ - rxe_drop_ref(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); kfree(mca); err = 0; goto out_unlock; @@ -342,7 +351,7 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, } /* we didn't find the qp on the list */ - rxe_drop_ref(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); err = -EINVAL; out_unlock: @@ -368,7 +377,7 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) if (atomic_read(&mcg->qp_num) == 0) rxe_destroy_mcg(mcg); - rxe_drop_ref(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 940c8eeab8fd..16056b918ace 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -79,15 +79,6 @@ static const struct rxe_type_info { .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, }, - [RXE_TYPE_MC_GRP] = { - .name = "rxe-mc_grp", - .size = sizeof(struct rxe_mcg), - .elem_offset = offsetof(struct rxe_mcg, elem), - .cleanup = rxe_mc_cleanup, - .flags = RXE_POOL_KEY, - .key_offset = offsetof(struct rxe_mcg, mgid), - .key_size = sizeof(union ib_gid), - }, }; static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 2db9d9872cd1..8fc95c6b7b9b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -21,7 +21,6 @@ enum rxe_elem_type { RXE_TYPE_CQ, RXE_TYPE_MR, RXE_TYPE_MW, - RXE_TYPE_MC_GRP, RXE_NUM_TYPES, /* keep me last */ }; diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index fb265902f7e3..53924453abef 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -300,7 +300,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) spin_unlock_bh(&rxe->mcg_lock); - rxe_drop_ref(mcg); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); if (likely(!skb)) return; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index caa5b1b05019..20fe3ee6589d 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -352,8 +352,8 @@ struct rxe_mw { }; struct rxe_mcg { - struct rxe_pool_elem elem; struct rb_node node; + struct kref ref_cnt; struct rxe_dev *rxe; struct list_head qp_list; union ib_gid mgid; From 748663c8ccf6b2e5a800de19127c2cc1c4423fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Wed, 9 Feb 2022 16:39:35 +0100 Subject: [PATCH 084/186] IB/cma: Allow XRC INI QPs to set their local ACK timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XRC INI QPs should be able to adjust their local ACK timeout. Fixes: 2c1619edef61 ("IB/cma: Define option to set ack timeout and pack tos_set") Link: https://lore.kernel.org/r/1644421175-31943-1-git-send-email-haakon.bugge@oracle.com Signed-off-by: Håkon Bugge Suggested-by: Avneesh Pant Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 27a00ce2e101..288ff0735875 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2640,7 +2640,7 @@ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; - if (id->qp_type != IB_QPT_RC) + if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); From 41ae9105f5e23e98a1734be2eeddddf488e42c2e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 10 Feb 2022 17:04:42 -0800 Subject: [PATCH 085/186] cxl/port: Fix endpoint refcount leak An endpoint can be unregistered via two paths. Either its parent port is unregistered, or the memdev that registered the endpoint is removed. The memdev remove path is responsible for synchronizing against the parent ->remove() event and if the memdev remove path wins, manually trigger unregister_port() via devm_release_action(). Until that race is resolved the memdev remove path holds a reference on the endpoint. If the parent port for the endpoint can not be found that is an indication that the endpoint has already been registered. Be sure to drop the reference in all exit paths from delete_endpoint(). Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver") Link: https://lore.kernel.org/r/164454148209.3429624.12905500880311609053.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index d29eb2abdbc2..0fc1441be014 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -850,7 +850,7 @@ static void delete_endpoint(void *data) parent_port = cxl_mem_find_port(cxlmd); if (!parent_port) - return; + goto out; parent = &parent_port->dev; cxl_device_lock(parent); @@ -860,6 +860,7 @@ static void delete_endpoint(void *data) } cxl_device_unlock(parent); put_device(parent); +out: put_device(&endpoint->dev); } From 74be98774dfbc5b8b795db726bd772e735d2edd4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 16 Feb 2022 16:25:11 -0800 Subject: [PATCH 086/186] cxl/port: Hold port reference until decoder release KASAN + DEBUG_KOBJECT_RELEASE reports a potential use-after-free in cxl_decoder_release() where it goes to reference its parent, a cxl_port, to free its id back to port->decoder_ida. BUG: KASAN: use-after-free in to_cxl_port+0x18/0x90 [cxl_core] Read of size 8 at addr ffff888119270908 by task kworker/35:2/379 CPU: 35 PID: 379 Comm: kworker/35:2 Tainted: G OE 5.17.0-rc2+ #198 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Workqueue: events kobject_delayed_cleanup Call Trace: dump_stack_lvl+0x59/0x73 print_address_description.constprop.0+0x1f/0x150 ? to_cxl_port+0x18/0x90 [cxl_core] kasan_report.cold+0x83/0xdf ? to_cxl_port+0x18/0x90 [cxl_core] to_cxl_port+0x18/0x90 [cxl_core] cxl_decoder_release+0x2a/0x60 [cxl_core] device_release+0x5f/0x100 kobject_cleanup+0x80/0x1c0 The device core only guarantees parent lifetime until all children are unregistered. If a child needs a parent to complete its ->release() callback that child needs to hold a reference to extend the lifetime of the parent. Fixes: 40ba17afdfab ("cxl/acpi: Introduce cxl_decoder objects") Reported-by: Ben Widawsky Tested-by: Ben Widawsky Reviewed-by: Ben Widawsky Link: https://lore.kernel.org/r/164505751190.4175768.13324905271463416712.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 0fc1441be014..c1681248f322 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -251,6 +251,7 @@ static void cxl_decoder_release(struct device *dev) ida_free(&port->decoder_ida, cxld->id); kfree(cxld); + put_device(&port->dev); } static const struct device_type cxl_decoder_endpoint_type = { @@ -1202,7 +1203,10 @@ static struct cxl_decoder *cxl_decoder_alloc(struct cxl_port *port, if (rc < 0) goto err; + /* need parent to stick around to release the id */ + get_device(&port->dev); cxld->id = rc; + cxld->nr_targets = nr_targets; seqlock_init(&cxld->target_lock); dev = &cxld->dev; From c2e8021a535d3e7cc4f5f1418c4acf97589f8eb5 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 27 Jan 2022 20:40:53 +0800 Subject: [PATCH 087/186] fsdax: fix function description The function name has been changed, so the description should be updated too. Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127124058.1172422-5-ruansy.fnst@fujitsu.com Signed-off-by: Dan Williams --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index cd03485867a7..c8d57080c1aa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -390,7 +390,7 @@ static struct page *dax_busy_page(void *entry) } /* - * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page + * dax_lock_page - Lock the DAX entry corresponding to a page * @page: The page whose entry we want to lock * * Context: Process context. From a7e8de822e0b1979f08767c751f6c8a9c1d4ad86 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Fri, 11 Feb 2022 23:11:11 -0800 Subject: [PATCH 088/186] dax: make sure inodes are flushed before destroy cache A bug can be triggered by following command $ modprobe nd_pmem && modprobe -r nd_pmem [ 10.060014] BUG dax_cache (Not tainted): Objects remaining in dax_cache on __kmem_cache_shutdown() [ 10.060938] Slab 0x0000000085b729ac objects=9 used=1 fp=0x000000004f5ae469 flags=0x200000000010200(slab|head|node) [ 10.062433] Call Trace: [ 10.062673] dump_stack_lvl+0x34/0x44 [ 10.062865] slab_err+0x90/0xd0 [ 10.063619] __kmem_cache_shutdown+0x13b/0x2f0 [ 10.063848] kmem_cache_destroy+0x4a/0x110 [ 10.064058] __x64_sys_delete_module+0x265/0x300 This is caused by dax_fs_exit() not flushing inodes before destroy cache. To fix this issue, call rcu_barrier() before destroy cache. Signed-off-by: Tong Zhang Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220212071111.148575-1-ztong0001@gmail.com Fixes: 7b6be8444e0f ("dax: refactor dax-fs into a generic provider of 'struct dax_device' instances") Signed-off-by: Dan Williams --- drivers/dax/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index e3029389d809..6bd565fe2e63 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -476,6 +476,7 @@ static int dax_fs_init(void) static void dax_fs_exit(void) { kern_unmount(dax_mnt); + rcu_barrier(); kmem_cache_destroy(dax_cache); } From 185b9826782a53529b2b57328a8f49b1d0cf8f8f Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 15 Feb 2022 19:55:29 +0200 Subject: [PATCH 089/186] RDMA/mlx5: Remove redundant work in struct mlx5_cache_ent delayed_cache_work_func() and the cache_work_func() are both wrappers of __cache_work_func(). Instead of having a special not delayed work, use the delayed work with delay = 0. Link: https://lore.kernel.org/r/18b6ae205e75f087aa4a2a05c81ea8b66d8d88dc.1644947594.git.leonro@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/infiniband/hw/mlx5/mr.c | 16 +++------------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index af68bce056a7..bacb6900b39f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -788,7 +788,6 @@ struct mlx5_cache_ent { u32 miss; struct mlx5_ib_dev *dev; - struct work_struct work; struct delayed_work dwork; }; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 157d862fb864..cd14d1b9dc1d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -465,14 +465,14 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) return; if (ent->available_mrs < ent->limit) { ent->fill_to_high_water = true; - queue_work(ent->dev->cache.wq, &ent->work); + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } else if (ent->fill_to_high_water && ent->available_mrs + ent->pending < 2 * ent->limit) { /* * Once we start populating due to hitting a low water mark * continue until we pass the high water mark. */ - queue_work(ent->dev->cache.wq, &ent->work); + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } else if (ent->available_mrs == 2 * ent->limit) { ent->fill_to_high_water = false; } else if (ent->available_mrs > 2 * ent->limit) { @@ -482,7 +482,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, msecs_to_jiffies(1000)); else - queue_work(ent->dev->cache.wq, &ent->work); + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } } @@ -558,14 +558,6 @@ static void delayed_cache_work_func(struct work_struct *work) __cache_work_func(ent); } -static void cache_work_func(struct work_struct *work) -{ - struct mlx5_cache_ent *ent; - - ent = container_of(work, struct mlx5_cache_ent, work); - __cache_work_func(ent); -} - /* Allocate a special entry from the cache */ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, unsigned int entry, int access_flags) @@ -726,7 +718,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->dev = dev; ent->limit = 0; - INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); if (i > MR_CACHE_LAST_STD_ENTRY) { @@ -770,7 +761,6 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) spin_lock_irq(&ent->lock); ent->disabled = true; spin_unlock_irq(&ent->lock); - cancel_work_sync(&ent->work); cancel_delayed_work_sync(&ent->dwork); } From 2f0e60d5e9f96341a0c8a01be8878cdb3b29ff20 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 15 Feb 2022 19:55:30 +0200 Subject: [PATCH 090/186] RDMA/mlx5: Fix the flow of a miss in the allocation of a cache ODP MR When an ODP MR cache entry is empty and trying to allocate it, increment the ent->miss counter and call to queue_adjust_cache_locked() to verify the entry is balanced. Fixes: aad719dcf379 ("RDMA/mlx5: Allow MRs to be created in the cache synchronously") Link: https://lore.kernel.org/r/09503e295276dcacc92cb1d8aef1ad0961c99dc1.1644947594.git.leonro@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index cd14d1b9dc1d..bce3cb6af524 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -577,6 +577,8 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, ent = &cache->ent[entry]; spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { + queue_adjust_cache_locked(ent); + ent->miss++; spin_unlock_irq(&ent->lock); mr = create_cache_mr(ent); if (IS_ERR(mr)) From 56561ac6b27d489feb5d1e7e8b2a55a15063fcad Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 15 Feb 2022 19:55:31 +0200 Subject: [PATCH 091/186] RDMA/mlx5: Merge similar flows of allocating MR from the cache When allocating a MR from the cache, the driver calls to get_cache_mr(), and in case of failure, retries with create_cache_mr(). This is the flow of mlx5_mr_cache_alloc(), so use it instead. Link: https://lore.kernel.org/r/53c85fcd4de6ec9de0b8e6cbb1bf5d5fe19900c3.1644947594.git.leonro@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 +- drivers/infiniband/hw/mlx5/mr.c | 47 +++------------------------- drivers/infiniband/hw/mlx5/odp.c | 11 +++++-- 3 files changed, 15 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bacb6900b39f..751d02bc755b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1343,7 +1343,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - unsigned int entry, int access_flags); + struct mlx5_cache_ent *ent, + int access_flags); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index bce3cb6af524..0c1dc13b4c45 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -558,23 +558,16 @@ static void delayed_cache_work_func(struct work_struct *work) __cache_work_func(ent); } -/* Allocate a special entry from the cache */ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, - unsigned int entry, int access_flags) + struct mlx5_cache_ent *ent, + int access_flags) { - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; - if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY || - entry >= ARRAY_SIZE(cache->ent))) - return ERR_PTR(-EINVAL); - /* Matches access in alloc_cache_mr() */ if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) return ERR_PTR(-EOPNOTSUPP); - ent = &cache->ent[entry]; spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { queue_adjust_cache_locked(ent); @@ -592,32 +585,9 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, mlx5_clear_mr(mr); } - mr->access_flags = access_flags; return mr; } -/* Return a MR already available in the cache */ -static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent) -{ - struct mlx5_ib_mr *mr = NULL; - struct mlx5_cache_ent *ent = req_ent; - - spin_lock_irq(&ent->lock); - if (!list_empty(&ent->head)) { - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); - ent->available_mrs--; - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); - mlx5_clear_mr(mr); - return mr; - } - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); - req_ent->miss++; - return NULL; -} - static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_cache_ent *ent = mr->cache_ent; @@ -951,16 +921,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } - mr = get_cache_mr(ent); - if (!mr) { - mr = create_cache_mr(ent); - /* - * The above already tried to do the same stuff as reg_create(), - * no reason to try it again. - */ - if (IS_ERR(mr)) - return mr; - } + mr = mlx5_mr_cache_alloc(dev, ent, access_flags); + if (IS_ERR(mr)) + return mr; mr->ibmr.pd = pd; mr->umem = umem; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 86842cd580ba..f834c9590c51 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -407,6 +407,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, unsigned long idx) { + struct mlx5_ib_dev *dev = mr_to_mdev(imr); struct ib_umem_odp *odp; struct mlx5_ib_mr *mr; struct mlx5_ib_mr *ret; @@ -418,13 +419,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = mlx5_mr_cache_alloc( - mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags); + mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY], + imr->access_flags); if (IS_ERR(mr)) { ib_umem_odp_release(odp); return mr; } + mr->access_flags = imr->access_flags; mr->ibmr.pd = imr->ibmr.pd; mr->ibmr.device = &mr_to_mdev(imr)->ib_dev; mr->umem = &odp->umem; @@ -493,12 +495,15 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); - imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags); + imr = mlx5_mr_cache_alloc(dev, + &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY], + access_flags); if (IS_ERR(imr)) { ib_umem_odp_release(umem_odp); return imr; } + imr->access_flags = access_flags; imr->ibmr.pd = &pd->ibpd; imr->ibmr.iova = 0; imr->umem = &umem_odp->umem; From 9ee2516c43823652da597633aed9646dac51c1f8 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 15 Feb 2022 19:55:32 +0200 Subject: [PATCH 092/186] RDMA/mlx5: Store ndescs instead of the translation table size Currently, ent->xlt stores the translation table size. This data should not be stored in the cache entry but be written directly to the mailbox. Store ndescs instead, and deduce the translation table size from it according to the access mode. Link: https://lore.kernel.org/r/e9dbfaa1f279793a6bd28ee5a31cb4f0f0d70f05.1644947594.git.leonro@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 25 ++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/odp.c | 8 ++------ 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 751d02bc755b..4f04bb55c4c6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -763,9 +763,9 @@ struct mlx5_cache_ent { char name[4]; u32 order; - u32 xlt; u32 access_mode; u32 page; + unsigned int ndescs; u8 disabled:1; u8 fill_to_high_water:1; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 0c1dc13b4c45..eb14ea4bcbba 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -176,6 +176,25 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_unlock_irqrestore(&ent->lock, flags); } +static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) +{ + int ret = 0; + + switch (access_mode) { + case MLX5_MKC_ACCESS_MODE_MTT: + ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / + sizeof(struct mlx5_mtt)); + break; + case MLX5_MKC_ACCESS_MODE_KSM: + ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / + sizeof(struct mlx5_klm)); + break; + default: + WARN_ON(1); + } + return ret; +} + static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) { struct mlx5_ib_mr *mr; @@ -191,7 +210,8 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); - MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, translations_octword_size, + get_mkc_octo_size(ent->access_mode, ent->ndescs)); MLX5_SET(mkc, mkc, log_page_size, ent->page); return mr; } @@ -701,8 +721,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) continue; ent->page = PAGE_SHIFT; - ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / - MLX5_IB_UMR_OCTOWORD; + ent->ndescs = 1 << ent->order; ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && !dev->is_rep && mlx5_core_is_pf(dev->mdev) && diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f834c9590c51..41c964a45f89 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1598,18 +1598,14 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) switch (ent->order - 2) { case MLX5_IMR_MTT_CACHE_ENTRY: ent->page = PAGE_SHIFT; - ent->xlt = MLX5_IMR_MTT_ENTRIES * - sizeof(struct mlx5_mtt) / - MLX5_IB_UMR_OCTOWORD; + ent->ndescs = MLX5_IMR_MTT_ENTRIES; ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; ent->limit = 0; break; case MLX5_IMR_KSM_CACHE_ENTRY: ent->page = MLX5_KSM_PAGE_SHIFT; - ent->xlt = mlx5_imr_ksm_entries * - sizeof(struct mlx5_klm) / - MLX5_IB_UMR_OCTOWORD; + ent->ndescs = mlx5_imr_ksm_entries; ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; ent->limit = 0; break; From 77528e2aed9246cf8017b8a6f1b658a264d6f2b2 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Tue, 15 Feb 2022 19:55:33 +0200 Subject: [PATCH 093/186] RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled() The mkc is the key for the mkey cache, hence, created in each attempt to get a cache mkey, while pcie_relaxed_ordering_enabled() is called during the setting of the mkc, but used only for cases where IB_ACCESS_RELAXED_ORDERING is set. pcie_relaxed_ordering_enabled() is an expensive call (26 us). Reorder the code so the driver will call it only when it is needed. Link: https://lore.kernel.org/r/684be1366cb1d4f05aa3e78986205e4bc410443a.1644947594.git.leonro@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index eb14ea4bcbba..eab7921eb91f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -68,7 +68,6 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev); MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); @@ -76,12 +75,13 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); - if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) - MLX5_SET(mkc, mkc, relaxed_ordering_write, - (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled); - if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) - MLX5_SET(mkc, mkc, relaxed_ordering_read, - (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled); + if ((acc & IB_ACCESS_RELAXED_ORDERING) && + pcie_relaxed_ordering_enabled(dev->mdev->pdev)) { + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) + MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) + MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); + } MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); MLX5_SET(mkc, mkc, qpn, 0xffffff); From 4eaa29b45e09d8565bd8bf596750d9b90ad5c806 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Feb 2022 13:05:10 -0800 Subject: [PATCH 094/186] RDMA/ib_srp: Add more documentation Make it more clear what the different ib_srp data structures represent. Link: https://lore.kernel.org/r/20220215210511.28303-2-bvanassche@acm.org Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index abccddeea1e3..55a575e2cace 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -92,6 +92,9 @@ enum srp_iu_type { }; /* + * RDMA adapter in the initiator system. + * + * @dev_list: List of RDMA ports associated with this RDMA adapter (srp_host). * @mr_page_mask: HCA memory registration page mask. * @mr_page_size: HCA memory registration page size. * @mr_max_size: Maximum size in bytes of a single FR registration request. @@ -109,6 +112,12 @@ struct srp_device { bool use_fast_reg; }; +/* + * One port of an RDMA adapter in the initiator system. + * + * @target_list: List of connected target ports (struct srp_target_port). + * @target_lock: Protects @target_list. + */ struct srp_host { struct srp_device *srp_dev; u8 port; @@ -183,7 +192,7 @@ struct srp_rdma_ch { }; /** - * struct srp_target_port + * struct srp_target_port - RDMA port in the SRP target system * @comp_vector: Completion vector used by the first RDMA channel created for * this target port. */ From 2c4b14ea9507106c0599349fbb8efdeb3b7aa840 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Thu, 17 Feb 2022 09:18:49 -0600 Subject: [PATCH 095/186] RDMA/irdma: Remove enum irdma_status_code Replace use of custom irdma_status_code with linux error codes. Remove enum irdma_status_code and header in which its defined. Link: https://lore.kernel.org/r/20220217151851.1518-2-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/cm.c | 44 +- drivers/infiniband/hw/irdma/ctrl.c | 553 ++++++++++++------------- drivers/infiniband/hw/irdma/defs.h | 8 +- drivers/infiniband/hw/irdma/hmc.c | 105 ++--- drivers/infiniband/hw/irdma/hmc.h | 53 +-- drivers/infiniband/hw/irdma/hw.c | 188 ++++----- drivers/infiniband/hw/irdma/i40iw_hw.c | 1 - drivers/infiniband/hw/irdma/main.c | 6 +- drivers/infiniband/hw/irdma/main.h | 42 +- drivers/infiniband/hw/irdma/osdep.h | 40 +- drivers/infiniband/hw/irdma/pble.c | 77 ++-- drivers/infiniband/hw/irdma/pble.h | 25 +- drivers/infiniband/hw/irdma/protos.h | 90 ++-- drivers/infiniband/hw/irdma/puda.c | 132 +++--- drivers/infiniband/hw/irdma/puda.h | 43 +- drivers/infiniband/hw/irdma/status.h | 71 ---- drivers/infiniband/hw/irdma/type.h | 105 +++-- drivers/infiniband/hw/irdma/uda.c | 35 +- drivers/infiniband/hw/irdma/uda.h | 46 +- drivers/infiniband/hw/irdma/uk.c | 122 +++--- drivers/infiniband/hw/irdma/user.h | 62 ++- drivers/infiniband/hw/irdma/utils.c | 199 ++++----- drivers/infiniband/hw/irdma/verbs.c | 48 +-- drivers/infiniband/hw/irdma/ws.c | 19 +- drivers/infiniband/hw/irdma/ws.h | 2 +- 25 files changed, 933 insertions(+), 1183 deletions(-) delete mode 100644 drivers/infiniband/hw/irdma/status.h diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index abc101b5fd42..dedb3b7edd8d 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1501,15 +1501,14 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, * @cm_info: CM info for parent listen node * @cm_parent_listen_node: The parent listen node */ -static enum irdma_status_code -irdma_del_multiple_qhash(struct irdma_device *iwdev, - struct irdma_cm_info *cm_info, - struct irdma_cm_listener *cm_parent_listen_node) +static int irdma_del_multiple_qhash(struct irdma_device *iwdev, + struct irdma_cm_info *cm_info, + struct irdma_cm_listener *cm_parent_listen_node) { struct irdma_cm_listener *child_listen_node; - enum irdma_status_code ret = IRDMA_ERR_CFG; struct list_head *pos, *tpos; unsigned long flags; + int ret = -EINVAL; spin_lock_irqsave(&iwdev->cm_core.listen_list_lock, flags); list_for_each_safe (pos, tpos, @@ -1618,16 +1617,16 @@ u16 irdma_get_vlan_ipv4(u32 *addr) * Adds a qhash and a child listen node for every IPv6 address * on the adapter and adds the associated qhash filter */ -static enum irdma_status_code -irdma_add_mqh_6(struct irdma_device *iwdev, struct irdma_cm_info *cm_info, - struct irdma_cm_listener *cm_parent_listen_node) +static int irdma_add_mqh_6(struct irdma_device *iwdev, + struct irdma_cm_info *cm_info, + struct irdma_cm_listener *cm_parent_listen_node) { struct net_device *ip_dev; struct inet6_dev *idev; struct inet6_ifaddr *ifp, *tmp; - enum irdma_status_code ret = 0; struct irdma_cm_listener *child_listen_node; unsigned long flags; + int ret = 0; rtnl_lock(); for_each_netdev(&init_net, ip_dev) { @@ -1653,7 +1652,7 @@ irdma_add_mqh_6(struct irdma_device *iwdev, struct irdma_cm_info *cm_info, child_listen_node); if (!child_listen_node) { ibdev_dbg(&iwdev->ibdev, "CM: listener memory allocation\n"); - ret = IRDMA_ERR_NO_MEMORY; + ret = -ENOMEM; goto exit; } @@ -1700,16 +1699,16 @@ exit: * Adds a qhash and a child listen node for every IPv4 address * on the adapter and adds the associated qhash filter */ -static enum irdma_status_code -irdma_add_mqh_4(struct irdma_device *iwdev, struct irdma_cm_info *cm_info, - struct irdma_cm_listener *cm_parent_listen_node) +static int irdma_add_mqh_4(struct irdma_device *iwdev, + struct irdma_cm_info *cm_info, + struct irdma_cm_listener *cm_parent_listen_node) { struct net_device *ip_dev; struct in_device *idev; struct irdma_cm_listener *child_listen_node; - enum irdma_status_code ret = 0; unsigned long flags; const struct in_ifaddr *ifa; + int ret = 0; rtnl_lock(); for_each_netdev(&init_net, ip_dev) { @@ -1734,7 +1733,7 @@ irdma_add_mqh_4(struct irdma_device *iwdev, struct irdma_cm_info *cm_info, if (!child_listen_node) { ibdev_dbg(&iwdev->ibdev, "CM: listener memory allocation\n"); in_dev_put(idev); - ret = IRDMA_ERR_NO_MEMORY; + ret = -ENOMEM; goto exit; } @@ -1781,9 +1780,9 @@ exit: * @cm_info: CM info for parent listen node * @cm_listen_node: The parent listen node */ -static enum irdma_status_code -irdma_add_mqh(struct irdma_device *iwdev, struct irdma_cm_info *cm_info, - struct irdma_cm_listener *cm_listen_node) +static int irdma_add_mqh(struct irdma_device *iwdev, + struct irdma_cm_info *cm_info, + struct irdma_cm_listener *cm_listen_node) { if (cm_info->ipv4) return irdma_add_mqh_4(iwdev, cm_info, cm_listen_node); @@ -3205,8 +3204,7 @@ static void irdma_cm_free_ah_nop(struct irdma_cm_node *cm_node) * @iwdev: iwarp device structure * @rdma_ver: HW version */ -enum irdma_status_code irdma_setup_cm_core(struct irdma_device *iwdev, - u8 rdma_ver) +int irdma_setup_cm_core(struct irdma_device *iwdev, u8 rdma_ver) { struct irdma_cm_core *cm_core = &iwdev->cm_core; @@ -3216,7 +3214,7 @@ enum irdma_status_code irdma_setup_cm_core(struct irdma_device *iwdev, /* Handles CM event work items send to Iwarp core */ cm_core->event_wq = alloc_ordered_workqueue("iwarp-event-wq", 0); if (!cm_core->event_wq) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; INIT_LIST_HEAD(&cm_core->listen_list); @@ -3923,10 +3921,10 @@ int irdma_create_listen(struct iw_cm_id *cm_id, int backlog) struct irdma_device *iwdev; struct irdma_cm_listener *cm_listen_node; struct irdma_cm_info cm_info = {}; - enum irdma_status_code err; struct sockaddr_in *laddr; struct sockaddr_in6 *laddr6; bool wildcard = false; + int err; iwdev = to_iwdev(cm_id->device); if (!iwdev) @@ -4337,11 +4335,11 @@ static void irdma_qhash_ctrl(struct irdma_device *iwdev, struct list_head *child_listen_list = &parent_listen_node->child_listen_list; struct irdma_cm_listener *child_listen_node; struct list_head *pos, *tpos; - enum irdma_status_code err; bool node_allocated = false; enum irdma_quad_hash_manage_type op = ifup ? IRDMA_QHASH_MANAGE_TYPE_ADD : IRDMA_QHASH_MANAGE_TYPE_DELETE; + int err; list_for_each_safe (pos, tpos, child_listen_list) { child_listen_node = list_entry(pos, struct irdma_cm_listener, diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 94a9c26ac83f..01cf75e9fd48 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -3,7 +3,6 @@ #include #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" @@ -180,17 +179,16 @@ void irdma_sc_pd_init(struct irdma_sc_dev *dev, struct irdma_sc_pd *pd, u32 pd_i * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_add_arp_cache_entry(struct irdma_sc_cqp *cqp, - struct irdma_add_arp_cache_entry_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_add_arp_cache_entry(struct irdma_sc_cqp *cqp, + struct irdma_add_arp_cache_entry_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 hdr; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 8, info->reach_max); set_64bit_val(wqe, 16, ether_addr_to_u64(info->mac_addr)); @@ -218,16 +216,15 @@ irdma_sc_add_arp_cache_entry(struct irdma_sc_cqp *cqp, * @arp_index: arp index to delete arp entry * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_del_arp_cache_entry(struct irdma_sc_cqp *cqp, u64 scratch, - u16 arp_index, bool post_sq) +static int irdma_sc_del_arp_cache_entry(struct irdma_sc_cqp *cqp, u64 scratch, + u16 arp_index, bool post_sq) { __le64 *wqe; u64 hdr; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; hdr = arp_index | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_MANAGE_ARP) | @@ -252,17 +249,16 @@ irdma_sc_del_arp_cache_entry(struct irdma_sc_cqp *cqp, u64 scratch, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_manage_apbvt_entry(struct irdma_sc_cqp *cqp, - struct irdma_apbvt_info *info, u64 scratch, - bool post_sq) +static int irdma_sc_manage_apbvt_entry(struct irdma_sc_cqp *cqp, + struct irdma_apbvt_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 hdr; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, info->port); @@ -300,7 +296,7 @@ irdma_sc_manage_apbvt_entry(struct irdma_sc_cqp *cqp, * quad hash entry in the hardware will point to iwarp's qp * number and requires no calls from the driver. */ -static enum irdma_status_code +static int irdma_sc_manage_qhash_table_entry(struct irdma_sc_cqp *cqp, struct irdma_qhash_table_info *info, u64 scratch, bool post_sq) @@ -313,7 +309,7 @@ irdma_sc_manage_qhash_table_entry(struct irdma_sc_cqp *cqp, wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, ether_addr_to_u64(info->mac_addr)); @@ -376,10 +372,9 @@ irdma_sc_manage_qhash_table_entry(struct irdma_sc_cqp *cqp, * @qp: sc qp * @info: initialization qp info */ -enum irdma_status_code irdma_sc_qp_init(struct irdma_sc_qp *qp, - struct irdma_qp_init_info *info) +int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info) { - enum irdma_status_code ret_code; + int ret_code; u32 pble_obj_cnt; u16 wqe_size; @@ -387,7 +382,7 @@ enum irdma_status_code irdma_sc_qp_init(struct irdma_sc_qp *qp, info->pd->dev->hw_attrs.uk_attrs.max_hw_wq_frags || info->qp_uk_init_info.max_rq_frag_cnt > info->pd->dev->hw_attrs.uk_attrs.max_hw_wq_frags) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; qp->dev = info->pd->dev; qp->vsi = info->vsi; @@ -410,7 +405,7 @@ enum irdma_status_code irdma_sc_qp_init(struct irdma_sc_qp *qp, if ((info->virtual_map && info->sq_pa >= pble_obj_cnt) || (info->virtual_map && info->rq_pa >= pble_obj_cnt)) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; qp->llp_stream_handle = (void *)(-1); qp->hw_sq_size = irdma_get_encoded_wqe_size(qp->qp_uk.sq_ring.size, @@ -450,8 +445,8 @@ enum irdma_status_code irdma_sc_qp_init(struct irdma_sc_qp *qp, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_qp_create(struct irdma_sc_qp *qp, struct irdma_create_qp_info *info, - u64 scratch, bool post_sq) +int irdma_sc_qp_create(struct irdma_sc_qp *qp, struct irdma_create_qp_info *info, + u64 scratch, bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; @@ -460,11 +455,11 @@ enum irdma_status_code irdma_sc_qp_create(struct irdma_sc_qp *qp, struct irdma_c cqp = qp->dev->cqp; if (qp->qp_uk.qp_id < cqp->dev->hw_attrs.min_hw_qp_id || qp->qp_uk.qp_id > (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt - 1)) - return IRDMA_ERR_INVALID_QP_ID; + return -EINVAL; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); set_64bit_val(wqe, 40, qp->shadow_area_pa); @@ -501,9 +496,8 @@ enum irdma_status_code irdma_sc_qp_create(struct irdma_sc_qp *qp, struct irdma_c * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_qp_modify(struct irdma_sc_qp *qp, - struct irdma_modify_qp_info *info, - u64 scratch, bool post_sq) +int irdma_sc_qp_modify(struct irdma_sc_qp *qp, struct irdma_modify_qp_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -514,7 +508,7 @@ enum irdma_status_code irdma_sc_qp_modify(struct irdma_sc_qp *qp, cqp = qp->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; if (info->next_iwarp_state == IRDMA_QP_STATE_TERMINATE) { if (info->dont_send_fin) @@ -572,9 +566,8 @@ enum irdma_status_code irdma_sc_qp_modify(struct irdma_sc_qp *qp, * @ignore_mw_bnd: memory window bind flag * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_qp_destroy(struct irdma_sc_qp *qp, u64 scratch, - bool remove_hash_idx, bool ignore_mw_bnd, - bool post_sq) +int irdma_sc_qp_destroy(struct irdma_sc_qp *qp, u64 scratch, + bool remove_hash_idx, bool ignore_mw_bnd, bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -583,7 +576,7 @@ enum irdma_status_code irdma_sc_qp_destroy(struct irdma_sc_qp *qp, u64 scratch, cqp = qp->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); set_64bit_val(wqe, 40, qp->shadow_area_pa); @@ -765,16 +758,15 @@ void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_alloc_local_mac_entry(struct irdma_sc_cqp *cqp, u64 scratch, - bool post_sq) +static int irdma_sc_alloc_local_mac_entry(struct irdma_sc_cqp *cqp, u64 scratch, + bool post_sq) { __le64 *wqe; u64 hdr; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_ALLOCATE_LOC_MAC_TABLE_ENTRY) | @@ -800,17 +792,16 @@ irdma_sc_alloc_local_mac_entry(struct irdma_sc_cqp *cqp, u64 scratch, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_add_local_mac_entry(struct irdma_sc_cqp *cqp, - struct irdma_local_mac_entry_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_add_local_mac_entry(struct irdma_sc_cqp *cqp, + struct irdma_local_mac_entry_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 header; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 32, ether_addr_to_u64(info->mac_addr)); @@ -839,16 +830,16 @@ irdma_sc_add_local_mac_entry(struct irdma_sc_cqp *cqp, * @ignore_ref_count: to force mac adde delete * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_del_local_mac_entry(struct irdma_sc_cqp *cqp, u64 scratch, - u16 entry_idx, u8 ignore_ref_count, bool post_sq) +static int irdma_sc_del_local_mac_entry(struct irdma_sc_cqp *cqp, u64 scratch, + u16 entry_idx, u8 ignore_ref_count, + bool post_sq) { __le64 *wqe; u64 header; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; header = FIELD_PREP(IRDMA_CQPSQ_MLM_TABLEIDX, entry_idx) | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_MANAGE_LOC_MAC_TABLE) | @@ -1061,10 +1052,9 @@ void irdma_sc_qp_setctx(struct irdma_sc_qp *qp, __le64 *qp_ctx, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_alloc_stag(struct irdma_sc_dev *dev, - struct irdma_allocate_stag_info *info, u64 scratch, - bool post_sq) +static int irdma_sc_alloc_stag(struct irdma_sc_dev *dev, + struct irdma_allocate_stag_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -1081,7 +1071,7 @@ irdma_sc_alloc_stag(struct irdma_sc_dev *dev, cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID) | @@ -1123,10 +1113,9 @@ irdma_sc_alloc_stag(struct irdma_sc_dev *dev, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, - struct irdma_reg_ns_stag_info *info, u64 scratch, - bool post_sq) +static int irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, + struct irdma_reg_ns_stag_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 fbo; @@ -1144,7 +1133,7 @@ irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, else if (info->page_size == 0x1000) page_size = IRDMA_PAGE_SIZE_4K; else - return IRDMA_ERR_PARAM; + return -EINVAL; if (info->access_rights & (IRDMA_ACCESS_FLAGS_REMOTEREAD_ONLY | IRDMA_ACCESS_FLAGS_REMOTEWRITE_ONLY)) @@ -1154,12 +1143,12 @@ irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, pble_obj_cnt = dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->chunk_size && info->first_pm_pbl_index >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; fbo = info->va & (info->page_size - 1); set_64bit_val(wqe, 0, @@ -1212,10 +1201,9 @@ irdma_sc_mr_reg_non_shared(struct irdma_sc_dev *dev, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, - struct irdma_dealloc_stag_info *info, u64 scratch, - bool post_sq) +static int irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, + struct irdma_dealloc_stag_info *info, + u64 scratch, bool post_sq) { u64 hdr; __le64 *wqe; @@ -1224,7 +1212,7 @@ irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); @@ -1253,9 +1241,9 @@ irdma_sc_dealloc_stag(struct irdma_sc_dev *dev, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_mw_alloc(struct irdma_sc_dev *dev, struct irdma_mw_alloc_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_mw_alloc(struct irdma_sc_dev *dev, + struct irdma_mw_alloc_info *info, u64 scratch, + bool post_sq) { u64 hdr; struct irdma_sc_cqp *cqp; @@ -1264,7 +1252,7 @@ irdma_sc_mw_alloc(struct irdma_sc_dev *dev, struct irdma_mw_alloc_info *info, cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 8, FLD_LS_64(dev, info->pd_id, IRDMA_CQPSQ_STAG_PDID)); @@ -1294,9 +1282,9 @@ irdma_sc_mw_alloc(struct irdma_sc_dev *dev, struct irdma_mw_alloc_info *info, * @info: fast mr info * @post_sq: flag for cqp db to ring */ -enum irdma_status_code -irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, - struct irdma_fast_reg_stag_info *info, bool post_sq) +int irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, + struct irdma_fast_reg_stag_info *info, + bool post_sq) { u64 temp, hdr; __le64 *wqe; @@ -1318,7 +1306,7 @@ irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, wqe = irdma_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, IRDMA_QP_WQE_MIN_QUANTA, 0, &sq_info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(&qp->qp_uk, wqe_idx); @@ -1847,8 +1835,7 @@ void irdma_terminate_received(struct irdma_sc_qp *qp, } } -static enum irdma_status_code irdma_null_ws_add(struct irdma_sc_vsi *vsi, - u8 user_pri) +static int irdma_null_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) { return 0; } @@ -1933,8 +1920,8 @@ static u8 irdma_get_fcn_id(struct irdma_sc_vsi *vsi) * @vsi: pointer to the vsi structure * @info: The info structure used for initialization */ -enum irdma_status_code irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, - struct irdma_vsi_stats_info *info) +int irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, + struct irdma_vsi_stats_info *info) { u8 fcn_id = info->fcn_id; struct irdma_dma_mem *stats_buff_mem; @@ -1949,7 +1936,7 @@ enum irdma_status_code irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, &stats_buff_mem->pa, GFP_KERNEL); if (!stats_buff_mem->va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; vsi->pestat->gather_info.gather_stats_va = stats_buff_mem->va; vsi->pestat->gather_info.last_gather_stats_va = @@ -1976,7 +1963,7 @@ stats_error: stats_buff_mem->va, stats_buff_mem->pa); stats_buff_mem->va = NULL; - return IRDMA_ERR_CQP_COMPL_ERROR; + return -EIO; } /** @@ -2038,19 +2025,19 @@ u8 irdma_get_encoded_wqe_size(u32 wqsize, enum irdma_queue_type queue_type) * @info: gather stats info structure * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -irdma_sc_gather_stats(struct irdma_sc_cqp *cqp, - struct irdma_stats_gather_info *info, u64 scratch) +static int irdma_sc_gather_stats(struct irdma_sc_cqp *cqp, + struct irdma_stats_gather_info *info, + u64 scratch) { __le64 *wqe; u64 temp; if (info->stats_buff_mem.size < IRDMA_GATHER_STATS_BUF_SIZE) - return IRDMA_ERR_BUF_TOO_SHORT; + return -ENOMEM; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 40, FIELD_PREP(IRDMA_CQPSQ_STATS_HMC_FCN_INDEX, info->hmc_fcn_index)); @@ -2085,17 +2072,16 @@ irdma_sc_gather_stats(struct irdma_sc_cqp *cqp, * @alloc: alloc vs. delete flag * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -irdma_sc_manage_stats_inst(struct irdma_sc_cqp *cqp, - struct irdma_stats_inst_info *info, bool alloc, - u64 scratch) +static int irdma_sc_manage_stats_inst(struct irdma_sc_cqp *cqp, + struct irdma_stats_inst_info *info, + bool alloc, u64 scratch) { __le64 *wqe; u64 temp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 40, FIELD_PREP(IRDMA_CQPSQ_STATS_HMC_FCN_INDEX, info->hmc_fn_id)); @@ -2123,9 +2109,8 @@ irdma_sc_manage_stats_inst(struct irdma_sc_cqp *cqp, * @info: User priority map info * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code irdma_sc_set_up_map(struct irdma_sc_cqp *cqp, - struct irdma_up_info *info, - u64 scratch) +static int irdma_sc_set_up_map(struct irdma_sc_cqp *cqp, + struct irdma_up_info *info, u64 scratch) { __le64 *wqe; u64 temp = 0; @@ -2133,7 +2118,7 @@ static enum irdma_status_code irdma_sc_set_up_map(struct irdma_sc_cqp *cqp, wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) temp |= (u64)info->map[i] << (i * 8); @@ -2166,17 +2151,16 @@ static enum irdma_status_code irdma_sc_set_up_map(struct irdma_sc_cqp *cqp, * @node_op: 0 for add 1 for modify, 2 for delete * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -irdma_sc_manage_ws_node(struct irdma_sc_cqp *cqp, - struct irdma_ws_node_info *info, - enum irdma_ws_node_op node_op, u64 scratch) +static int irdma_sc_manage_ws_node(struct irdma_sc_cqp *cqp, + struct irdma_ws_node_info *info, + enum irdma_ws_node_op node_op, u64 scratch) { __le64 *wqe; u64 temp = 0; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 32, FIELD_PREP(IRDMA_CQPSQ_WS_VSI, info->vsi) | @@ -2209,9 +2193,9 @@ irdma_sc_manage_ws_node(struct irdma_sc_cqp *cqp, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, - struct irdma_qp_flush_info *info, - u64 scratch, bool post_sq) +int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, + struct irdma_qp_flush_info *info, u64 scratch, + bool post_sq) { u64 temp = 0; __le64 *wqe; @@ -2230,13 +2214,13 @@ enum irdma_status_code irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, ibdev_dbg(to_ibdev(qp->dev), "CQP: Additional flush request ignored for qp %x\n", qp->qp_uk.qp_id); - return IRDMA_ERR_FLUSHED_Q; + return -EALREADY; } cqp = qp->pd->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; if (info->userflushcode) { if (flush_rq) @@ -2283,9 +2267,9 @@ enum irdma_status_code irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code irdma_sc_gen_ae(struct irdma_sc_qp *qp, - struct irdma_gen_ae_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_gen_ae(struct irdma_sc_qp *qp, + struct irdma_gen_ae_info *info, u64 scratch, + bool post_sq) { u64 temp; __le64 *wqe; @@ -2295,7 +2279,7 @@ static enum irdma_status_code irdma_sc_gen_ae(struct irdma_sc_qp *qp, cqp = qp->pd->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; temp = info->ae_code | FIELD_PREP(IRDMA_CQPSQ_FWQE_AESOURCE, info->ae_src); @@ -2323,10 +2307,9 @@ static enum irdma_status_code irdma_sc_gen_ae(struct irdma_sc_qp *qp, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_qp_upload_context(struct irdma_sc_dev *dev, - struct irdma_upload_context_info *info, u64 scratch, - bool post_sq) +static int irdma_sc_qp_upload_context(struct irdma_sc_dev *dev, + struct irdma_upload_context_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -2335,7 +2318,7 @@ irdma_sc_qp_upload_context(struct irdma_sc_dev *dev, cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, info->buf_pa); @@ -2364,21 +2347,20 @@ irdma_sc_qp_upload_context(struct irdma_sc_dev *dev, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_manage_push_page(struct irdma_sc_cqp *cqp, - struct irdma_cqp_manage_push_page_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_manage_push_page(struct irdma_sc_cqp *cqp, + struct irdma_cqp_manage_push_page_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 hdr; if (info->free_page && info->push_idx >= cqp->dev->hw_attrs.max_hw_device_pages) - return IRDMA_ERR_INVALID_PUSH_PAGE_INDEX; + return -EINVAL; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, info->qs_handle); hdr = FIELD_PREP(IRDMA_CQPSQ_MPP_PPIDX, info->push_idx) | @@ -2404,16 +2386,15 @@ irdma_sc_manage_push_page(struct irdma_sc_cqp *cqp, * @qp: sc qp struct * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code irdma_sc_suspend_qp(struct irdma_sc_cqp *cqp, - struct irdma_sc_qp *qp, - u64 scratch) +static int irdma_sc_suspend_qp(struct irdma_sc_cqp *cqp, struct irdma_sc_qp *qp, + u64 scratch) { u64 hdr; __le64 *wqe; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; hdr = FIELD_PREP(IRDMA_CQPSQ_SUSPENDQP_QPID, qp->qp_uk.qp_id) | FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_SUSPEND_QP) | @@ -2435,16 +2416,15 @@ static enum irdma_status_code irdma_sc_suspend_qp(struct irdma_sc_cqp *cqp, * @qp: sc qp struct * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code irdma_sc_resume_qp(struct irdma_sc_cqp *cqp, - struct irdma_sc_qp *qp, - u64 scratch) +static int irdma_sc_resume_qp(struct irdma_sc_cqp *cqp, struct irdma_sc_qp *qp, + u64 scratch) { u64 hdr; __le64 *wqe; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_CQPSQ_RESUMEQP_QSHANDLE, qp->qs_handle)); @@ -2477,14 +2457,13 @@ static inline void irdma_sc_cq_ack(struct irdma_sc_cq *cq) * @cq: cq struct * @info: cq initialization info */ -enum irdma_status_code irdma_sc_cq_init(struct irdma_sc_cq *cq, - struct irdma_cq_init_info *info) +int irdma_sc_cq_init(struct irdma_sc_cq *cq, struct irdma_cq_init_info *info) { u32 pble_obj_cnt; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; cq->cq_pa = info->cq_base_pa; cq->dev = info->dev; @@ -2515,23 +2494,21 @@ enum irdma_status_code irdma_sc_cq_init(struct irdma_sc_cq *cq, * @check_overflow: flag for overflow check * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code irdma_sc_cq_create(struct irdma_sc_cq *cq, - u64 scratch, - bool check_overflow, - bool post_sq) +static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, + bool check_overflow, bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; u64 hdr; struct irdma_sc_ceq *ceq; - enum irdma_status_code ret_code = 0; + int ret_code = 0; cqp = cq->dev->cqp; if (cq->cq_uk.cq_id > (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt - 1)) - return IRDMA_ERR_INVALID_CQ_ID; + return -EINVAL; if (cq->ceq_id > (cq->dev->hmc_fpm_misc.max_ceqs - 1)) - return IRDMA_ERR_INVALID_CEQ_ID; + return -EINVAL; ceq = cq->dev->ceq[cq->ceq_id]; if (ceq && ceq->reg_cq) @@ -2544,7 +2521,7 @@ static enum irdma_status_code irdma_sc_cq_create(struct irdma_sc_cq *cq, if (!wqe) { if (ceq && ceq->reg_cq) irdma_sc_remove_cq_ctx(ceq, cq); - return IRDMA_ERR_RING_FULL; + return -ENOMEM; } set_64bit_val(wqe, 0, cq->cq_uk.cq_size); @@ -2590,8 +2567,7 @@ static enum irdma_status_code irdma_sc_cq_create(struct irdma_sc_cq *cq, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, - bool post_sq) +int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; @@ -2601,7 +2577,7 @@ enum irdma_status_code irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, cqp = cq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; ceq = cq->dev->ceq[cq->ceq_id]; if (ceq && ceq->reg_cq) @@ -2657,9 +2633,9 @@ void irdma_sc_cq_resize(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *inf * @scratch: u64 saved to be used during cqp completion * @post_sq: flag to post to sq */ -static enum irdma_status_code -irdma_sc_cq_modify(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_cq_modify(struct irdma_sc_cq *cq, + struct irdma_modify_cq_info *info, u64 scratch, + bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; @@ -2669,12 +2645,12 @@ irdma_sc_cq_modify(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *info, pble_obj_cnt = cq->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->cq_resize && info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; cqp = cq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, info->cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); @@ -2748,8 +2724,8 @@ static inline void irdma_get_cqp_reg_info(struct irdma_sc_cqp *cqp, u32 *val, * @tail: wqtail register value * @count: how many times to try for completion */ -static enum irdma_status_code irdma_cqp_poll_registers(struct irdma_sc_cqp *cqp, - u32 tail, u32 count) +static int irdma_cqp_poll_registers(struct irdma_sc_cqp *cqp, u32 tail, + u32 count) { u32 i = 0; u32 newtail, error, val; @@ -2761,7 +2737,7 @@ static enum irdma_status_code irdma_cqp_poll_registers(struct irdma_sc_cqp *cqp, ibdev_dbg(to_ibdev(cqp->dev), "CQP: CQPERRCODES error_code[x%08X]\n", error); - return IRDMA_ERR_CQP_COMPL_ERROR; + return -EIO; } if (newtail != tail) { /* SUCCESS */ @@ -2772,7 +2748,7 @@ static enum irdma_status_code irdma_cqp_poll_registers(struct irdma_sc_cqp *cqp, udelay(cqp->dev->hw_attrs.max_sleep_count); } - return IRDMA_ERR_TIMEOUT; + return -ETIMEDOUT; } /** @@ -2927,10 +2903,9 @@ static u64 irdma_sc_decode_fpm_query(__le64 *buf, u32 buf_idx, * parses fpm query buffer and copy max_cnt and * size value of hmc objects in hmc_info */ -static enum irdma_status_code -irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, - struct irdma_hmc_info *hmc_info, - struct irdma_hmc_fpm_misc *hmc_fpm_misc) +static int irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, + struct irdma_hmc_info *hmc_info, + struct irdma_hmc_fpm_misc *hmc_fpm_misc) { struct irdma_hmc_obj_info *obj_info; u64 temp; @@ -2969,7 +2944,7 @@ irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, obj_info[IRDMA_HMC_IW_XFFL].size = 4; hmc_fpm_misc->xf_block_size = FIELD_GET(IRDMA_QUERY_FPM_XFBLOCKSIZE, temp); if (!hmc_fpm_misc->xf_block_size) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; irdma_sc_decode_fpm_query(buf, 72, obj_info, IRDMA_HMC_IW_Q1); get_64bit_val(buf, 80, &temp); @@ -2978,7 +2953,7 @@ irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, hmc_fpm_misc->q1_block_size = FIELD_GET(IRDMA_QUERY_FPM_Q1BLOCKSIZE, temp); if (!hmc_fpm_misc->q1_block_size) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; irdma_sc_decode_fpm_query(buf, 88, obj_info, IRDMA_HMC_IW_TIMER); @@ -3002,7 +2977,7 @@ irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, hmc_fpm_misc->rrf_block_size = FIELD_GET(IRDMA_QUERY_FPM_RRFBLOCKSIZE, temp); if (!hmc_fpm_misc->rrf_block_size && obj_info[IRDMA_HMC_IW_RRFFL].max_cnt) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; irdma_sc_decode_fpm_query(buf, 144, obj_info, IRDMA_HMC_IW_HDR); irdma_sc_decode_fpm_query(buf, 152, obj_info, IRDMA_HMC_IW_MD); @@ -3014,7 +2989,7 @@ irdma_sc_parse_fpm_query_buf(struct irdma_sc_dev *dev, __le64 *buf, hmc_fpm_misc->ooiscf_block_size = FIELD_GET(IRDMA_QUERY_FPM_OOISCFBLOCKSIZE, temp); if (!hmc_fpm_misc->ooiscf_block_size && obj_info[IRDMA_HMC_IW_OOISCFFL].max_cnt) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; return 0; } @@ -3042,8 +3017,7 @@ static u32 irdma_sc_find_reg_cq(struct irdma_sc_ceq *ceq, * @ceq: ceq sc structure * @cq: cq sc structure */ -enum irdma_status_code irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, - struct irdma_sc_cq *cq) +int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq) { unsigned long flags; @@ -3051,7 +3025,7 @@ enum irdma_status_code irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, if (ceq->reg_cq_size == ceq->elem_cnt) { spin_unlock_irqrestore(&ceq->req_cq_lock, flags); - return IRDMA_ERR_REG_CQ_FULL; + return -ENOMEM; } ceq->reg_cq[ceq->reg_cq_size++] = cq; @@ -3092,15 +3066,15 @@ exit: * * Initializes the object and context buffers for a control Queue Pair. */ -enum irdma_status_code irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, - struct irdma_cqp_init_info *info) +int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, + struct irdma_cqp_init_info *info) { u8 hw_sq_size; if (info->sq_size > IRDMA_CQP_SW_SQSIZE_2048 || info->sq_size < IRDMA_CQP_SW_SQSIZE_4 || ((info->sq_size & (info->sq_size - 1)))) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; hw_sq_size = irdma_get_encoded_wqe_size(info->sq_size, IRDMA_QUEUE_TYPE_CQP); @@ -3150,13 +3124,12 @@ enum irdma_status_code irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, * @maj_err: If error, major err number * @min_err: If error, minor err number */ -enum irdma_status_code irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, - u16 *min_err) +int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err) { u64 temp; u8 hw_rev; u32 cnt = 0, p1, p2, val = 0, err_code; - enum irdma_status_code ret_code; + int ret_code; hw_rev = cqp->dev->hw_attrs.uk_attrs.hw_rev; cqp->sdbuf.size = ALIGN(IRDMA_UPDATE_SD_BUFF_SIZE * cqp->sq_size, @@ -3165,7 +3138,7 @@ enum irdma_status_code irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_er cqp->sdbuf.size, &cqp->sdbuf.pa, GFP_KERNEL); if (!cqp->sdbuf.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; spin_lock_init(&cqp->dev->cqp_lock); @@ -3220,7 +3193,7 @@ enum irdma_status_code irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_er do { if (cnt++ > cqp->dev->hw_attrs.max_done_count) { - ret_code = IRDMA_ERR_TIMEOUT; + ret_code = -ETIMEDOUT; goto err; } udelay(cqp->dev->hw_attrs.max_sleep_count); @@ -3228,7 +3201,7 @@ enum irdma_status_code irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_er } while (!val); if (FLD_RS_32(cqp->dev, val, IRDMA_CCQPSTATUS_CCQP_ERR)) { - ret_code = IRDMA_ERR_DEVICE_NOT_SUPPORTED; + ret_code = -EOPNOTSUPP; goto err; } @@ -3269,7 +3242,7 @@ __le64 *irdma_sc_cqp_get_next_send_wqe_idx(struct irdma_sc_cqp *cqp, u64 scratch u32 *wqe_idx) { __le64 *wqe = NULL; - enum irdma_status_code ret_code; + int ret_code; if (IRDMA_RING_FULL_ERR(cqp->sq_ring)) { ibdev_dbg(to_ibdev(cqp->dev), @@ -3296,16 +3269,16 @@ __le64 *irdma_sc_cqp_get_next_send_wqe_idx(struct irdma_sc_cqp *cqp, u64 scratch * irdma_sc_cqp_destroy - destroy cqp during close * @cqp: struct for cqp hw */ -enum irdma_status_code irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp) +int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp) { u32 cnt = 0, val; - enum irdma_status_code ret_code = 0; + int ret_code = 0; writel(0, cqp->dev->hw_regs[IRDMA_CCQPHIGH]); writel(0, cqp->dev->hw_regs[IRDMA_CCQPLOW]); do { if (cnt++ > cqp->dev->hw_attrs.max_done_count) { - ret_code = IRDMA_ERR_TIMEOUT; + ret_code = -ETIMEDOUT; break; } udelay(cqp->dev->hw_attrs.max_sleep_count); @@ -3350,8 +3323,8 @@ void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq) * @ccq: ccq sc struct * @info: completion q entry to return */ -enum irdma_status_code irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, - struct irdma_ccq_cqe_info *info) +int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, + struct irdma_ccq_cqe_info *info) { u64 qp_ctx, temp, temp1; __le64 *cqe; @@ -3359,7 +3332,7 @@ enum irdma_status_code irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, u32 wqe_idx; u32 error; u8 polarity; - enum irdma_status_code ret_code = 0; + int ret_code = 0; if (ccq->cq_uk.avoid_mem_cflct) cqe = IRDMA_GET_CURRENT_EXTENDED_CQ_ELEM(&ccq->cq_uk); @@ -3369,7 +3342,7 @@ enum irdma_status_code irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, get_64bit_val(cqe, 24, &temp); polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, temp); if (polarity != ccq->cq_uk.polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; get_64bit_val(cqe, 8, &qp_ctx); cqp = (struct irdma_sc_cqp *)(unsigned long)qp_ctx; @@ -3416,25 +3389,25 @@ enum irdma_status_code irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, * @op_code: cqp opcode for completion * @compl_info: completion q entry to return */ -enum irdma_status_code irdma_sc_poll_for_cqp_op_done(struct irdma_sc_cqp *cqp, u8 op_code, - struct irdma_ccq_cqe_info *compl_info) +int irdma_sc_poll_for_cqp_op_done(struct irdma_sc_cqp *cqp, u8 op_code, + struct irdma_ccq_cqe_info *compl_info) { struct irdma_ccq_cqe_info info = {}; struct irdma_sc_cq *ccq; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u32 cnt = 0; ccq = cqp->dev->ccq; while (1) { if (cnt++ > 100 * cqp->dev->hw_attrs.max_done_count) - return IRDMA_ERR_TIMEOUT; + return -ETIMEDOUT; if (irdma_sc_ccq_get_cqe_info(ccq, &info)) { udelay(cqp->dev->hw_attrs.max_sleep_count); continue; } if (info.error && info.op_code != IRDMA_CQP_OP_QUERY_STAG) { - ret_code = IRDMA_ERR_CQP_COMPL_ERROR; + ret_code = -EIO; break; } /* make sure op code matches*/ @@ -3458,17 +3431,16 @@ enum irdma_status_code irdma_sc_poll_for_cqp_op_done(struct irdma_sc_cqp *cqp, u * @info: info for the manage function table operation * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code -irdma_sc_manage_hmc_pm_func_table(struct irdma_sc_cqp *cqp, - struct irdma_hmc_fcn_info *info, - u64 scratch, bool post_sq) +static int irdma_sc_manage_hmc_pm_func_table(struct irdma_sc_cqp *cqp, + struct irdma_hmc_fcn_info *info, + u64 scratch, bool post_sq) { __le64 *wqe; u64 hdr; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, 0); set_64bit_val(wqe, 8, 0); @@ -3501,8 +3473,7 @@ irdma_sc_manage_hmc_pm_func_table(struct irdma_sc_cqp *cqp, * for fpm commit * @cqp: struct for cqp hw */ -static enum irdma_status_code -irdma_sc_commit_fpm_val_done(struct irdma_sc_cqp *cqp) +static int irdma_sc_commit_fpm_val_done(struct irdma_sc_cqp *cqp) { return irdma_sc_poll_for_cqp_op_done(cqp, IRDMA_CQP_OP_COMMIT_FPM_VAL, NULL); @@ -3517,19 +3488,19 @@ irdma_sc_commit_fpm_val_done(struct irdma_sc_cqp *cqp) * @post_sq: flag for cqp db to ring * @wait_type: poll ccq or cqp registers for cqp completion */ -static enum irdma_status_code -irdma_sc_commit_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, - struct irdma_dma_mem *commit_fpm_mem, bool post_sq, - u8 wait_type) +static int irdma_sc_commit_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, + u8 hmc_fn_id, + struct irdma_dma_mem *commit_fpm_mem, + bool post_sq, u8 wait_type) { __le64 *wqe; u64 hdr; u32 tail, val, error; - enum irdma_status_code ret_code = 0; + int ret_code = 0; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, hmc_fn_id); set_64bit_val(wqe, 32, commit_fpm_mem->pa); @@ -3563,8 +3534,7 @@ irdma_sc_commit_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, * query fpm * @cqp: struct for cqp hw */ -static enum irdma_status_code -irdma_sc_query_fpm_val_done(struct irdma_sc_cqp *cqp) +static int irdma_sc_query_fpm_val_done(struct irdma_sc_cqp *cqp) { return irdma_sc_poll_for_cqp_op_done(cqp, IRDMA_CQP_OP_QUERY_FPM_VAL, NULL); @@ -3579,19 +3549,19 @@ irdma_sc_query_fpm_val_done(struct irdma_sc_cqp *cqp) * @post_sq: flag for cqp db to ring * @wait_type: poll ccq or cqp registers for cqp completion */ -static enum irdma_status_code -irdma_sc_query_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, - struct irdma_dma_mem *query_fpm_mem, bool post_sq, - u8 wait_type) +static int irdma_sc_query_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, + u8 hmc_fn_id, + struct irdma_dma_mem *query_fpm_mem, + bool post_sq, u8 wait_type) { __le64 *wqe; u64 hdr; u32 tail, val, error; - enum irdma_status_code ret_code = 0; + int ret_code = 0; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, hmc_fn_id); set_64bit_val(wqe, 32, query_fpm_mem->pa); @@ -3623,21 +3593,21 @@ irdma_sc_query_fpm_val(struct irdma_sc_cqp *cqp, u64 scratch, u8 hmc_fn_id, * @ceq: ceq sc structure * @info: ceq initialization info */ -enum irdma_status_code irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, - struct irdma_ceq_init_info *info) +int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, + struct irdma_ceq_init_info *info) { u32 pble_obj_cnt; if (info->elem_cnt < info->dev->hw_attrs.min_hw_ceq_size || info->elem_cnt > info->dev->hw_attrs.max_hw_ceq_size) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; if (info->ceq_id > (info->dev->hmc_fpm_misc.max_ceqs - 1)) - return IRDMA_ERR_INVALID_CEQ_ID; + return -EINVAL; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; ceq->size = sizeof(*ceq); ceq->ceqe_base = (struct irdma_ceqe *)info->ceqe_base; @@ -3670,8 +3640,8 @@ enum irdma_status_code irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 scratch, - bool post_sq) +static int irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 scratch, + bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; @@ -3680,7 +3650,7 @@ static enum irdma_status_code irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 cqp = ceq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, ceq->elem_cnt); set_64bit_val(wqe, 32, (ceq->virtual_map ? 0 : ceq->ceq_elem_pa)); @@ -3712,8 +3682,7 @@ static enum irdma_status_code irdma_sc_ceq_create(struct irdma_sc_ceq *ceq, u64 * irdma_sc_cceq_create_done - poll for control ceq wqe to complete * @ceq: ceq sc structure */ -static enum irdma_status_code -irdma_sc_cceq_create_done(struct irdma_sc_ceq *ceq) +static int irdma_sc_cceq_create_done(struct irdma_sc_ceq *ceq) { struct irdma_sc_cqp *cqp; @@ -3726,7 +3695,7 @@ irdma_sc_cceq_create_done(struct irdma_sc_ceq *ceq) * irdma_sc_cceq_destroy_done - poll for destroy cceq to complete * @ceq: ceq sc structure */ -enum irdma_status_code irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) +int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) { struct irdma_sc_cqp *cqp; @@ -3745,9 +3714,9 @@ enum irdma_status_code irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq) * @ceq: ceq sc structure * @scratch: u64 saved to be used during cqp completion */ -enum irdma_status_code irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) +int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch) { - enum irdma_status_code ret_code; + int ret_code; struct irdma_sc_dev *dev = ceq->dev; dev->ccq->vsi = ceq->vsi; @@ -3770,8 +3739,7 @@ enum irdma_status_code irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratc * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, - bool post_sq) +int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; @@ -3780,7 +3748,7 @@ enum irdma_status_code irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratc cqp = ceq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, ceq->elem_cnt); set_64bit_val(wqe, 48, ceq->first_pm_pbl_idx); @@ -3899,19 +3867,19 @@ void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) * @aeq: aeq structure ptr * @info: aeq initialization info */ -enum irdma_status_code irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, - struct irdma_aeq_init_info *info) +int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, + struct irdma_aeq_init_info *info) { u32 pble_obj_cnt; if (info->elem_cnt < info->dev->hw_attrs.min_hw_aeq_size || info->elem_cnt > info->dev->hw_attrs.max_hw_aeq_size) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; aeq->size = sizeof(*aeq); aeq->polarity = 1; @@ -3936,8 +3904,8 @@ enum irdma_status_code irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, - u64 scratch, bool post_sq) +static int irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, u64 scratch, + bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -3946,7 +3914,7 @@ static enum irdma_status_code irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, cqp = aeq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, aeq->elem_cnt); set_64bit_val(wqe, 32, (aeq->virtual_map ? 0 : aeq->aeq_elem_pa)); @@ -3975,8 +3943,8 @@ static enum irdma_status_code irdma_sc_aeq_create(struct irdma_sc_aeq *aeq, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -static enum irdma_status_code irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, - u64 scratch, bool post_sq) +static int irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, u64 scratch, + bool post_sq) { __le64 *wqe; struct irdma_sc_cqp *cqp; @@ -3989,7 +3957,7 @@ static enum irdma_status_code irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, aeq->elem_cnt); set_64bit_val(wqe, 48, aeq->first_pm_pbl_idx); hdr = FIELD_PREP(IRDMA_CQPSQ_OPCODE, IRDMA_CQP_OP_DESTROY_AEQ) | @@ -4012,8 +3980,8 @@ static enum irdma_status_code irdma_sc_aeq_destroy(struct irdma_sc_aeq *aeq, * @aeq: aeq structure ptr * @info: aeqe info to be returned */ -enum irdma_status_code irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, - struct irdma_aeqe_info *info) +int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, + struct irdma_aeqe_info *info) { u64 temp, compl_ctx; __le64 *aeqe; @@ -4027,7 +3995,7 @@ enum irdma_status_code irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, polarity = (u8)FIELD_GET(IRDMA_AEQE_VALID, temp); if (aeq->polarity != polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; print_hex_dump_debug("WQE: AEQ_ENTRY WQE", DUMP_PREFIX_OFFSET, 16, 8, aeqe, 16, false); @@ -4172,22 +4140,21 @@ void irdma_sc_repost_aeq_entries(struct irdma_sc_dev *dev, u32 count) * @cq: sc's cq ctruct * @info: info for control cq initialization */ -enum irdma_status_code irdma_sc_ccq_init(struct irdma_sc_cq *cq, - struct irdma_ccq_init_info *info) +int irdma_sc_ccq_init(struct irdma_sc_cq *cq, struct irdma_ccq_init_info *info) { u32 pble_obj_cnt; if (info->num_elem < info->dev->hw_attrs.uk_attrs.min_hw_cq_size || info->num_elem > info->dev->hw_attrs.uk_attrs.max_hw_cq_size) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; if (info->ceq_id > (info->dev->hmc_fpm_misc.max_ceqs - 1)) - return IRDMA_ERR_INVALID_CEQ_ID; + return -EINVAL; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; if (info->virtual_map && info->first_pm_pbl_idx >= pble_obj_cnt) - return IRDMA_ERR_INVALID_PBLE_INDEX; + return -EINVAL; cq->cq_pa = info->cq_pa; cq->cq_uk.cq_base = info->cq_base; @@ -4224,7 +4191,7 @@ enum irdma_status_code irdma_sc_ccq_init(struct irdma_sc_cq *cq, * irdma_sc_ccq_create_done - poll cqp for ccq create * @ccq: ccq sc struct */ -static inline enum irdma_status_code irdma_sc_ccq_create_done(struct irdma_sc_cq *ccq) +static inline int irdma_sc_ccq_create_done(struct irdma_sc_cq *ccq) { struct irdma_sc_cqp *cqp; @@ -4240,10 +4207,10 @@ static inline enum irdma_status_code irdma_sc_ccq_create_done(struct irdma_sc_cq * @check_overflow: overlow flag for ccq * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_ccq_create(struct irdma_sc_cq *ccq, u64 scratch, - bool check_overflow, bool post_sq) +int irdma_sc_ccq_create(struct irdma_sc_cq *ccq, u64 scratch, + bool check_overflow, bool post_sq) { - enum irdma_status_code ret_code; + int ret_code; ret_code = irdma_sc_cq_create(ccq, scratch, check_overflow, post_sq); if (ret_code) @@ -4265,19 +4232,18 @@ enum irdma_status_code irdma_sc_ccq_create(struct irdma_sc_cq *ccq, u64 scratch, * @scratch: u64 saved to be used during cqp completion * @post_sq: flag for cqp db to ring */ -enum irdma_status_code irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, - bool post_sq) +int irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, bool post_sq) { struct irdma_sc_cqp *cqp; __le64 *wqe; u64 hdr; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u32 tail, val, error; cqp = ccq->dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)ccq >> 1); @@ -4316,13 +4282,12 @@ enum irdma_status_code irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch * @dev : ptr to irdma_dev struct * @hmc_fn_id: hmc function id */ -enum irdma_status_code irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, - u8 hmc_fn_id) +int irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, u8 hmc_fn_id) { struct irdma_hmc_info *hmc_info; struct irdma_hmc_fpm_misc *hmc_fpm_misc; struct irdma_dma_mem query_fpm_mem; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u8 wait_type; hmc_info = dev->hmc_info; @@ -4353,14 +4318,13 @@ enum irdma_status_code irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, * @dev : ptr to irdma_dev struct * @hmc_fn_id: hmc function id */ -static enum irdma_status_code irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, - u8 hmc_fn_id) +static int irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id) { struct irdma_hmc_info *hmc_info; struct irdma_hmc_obj_info *obj_info; __le64 *buf; struct irdma_dma_mem commit_fpm_mem; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u8 wait_type; hmc_info = dev->hmc_info; @@ -4423,9 +4387,8 @@ static enum irdma_status_code irdma_sc_cfg_iw_fpm(struct irdma_sc_dev *dev, * @info: sd info for wqe * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -cqp_sds_wqe_fill(struct irdma_sc_cqp *cqp, struct irdma_update_sds_info *info, - u64 scratch) +static int cqp_sds_wqe_fill(struct irdma_sc_cqp *cqp, + struct irdma_update_sds_info *info, u64 scratch) { u64 data; u64 hdr; @@ -4437,7 +4400,7 @@ cqp_sds_wqe_fill(struct irdma_sc_cqp *cqp, struct irdma_update_sds_info *info, wqe = irdma_sc_cqp_get_next_send_wqe_idx(cqp, scratch, &wqe_idx); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; wqe_entries = (info->cnt > 3) ? 3 : info->cnt; mem_entries = info->cnt - wqe_entries; @@ -4503,12 +4466,11 @@ cqp_sds_wqe_fill(struct irdma_sc_cqp *cqp, struct irdma_update_sds_info *info, * @info: sd info for sd's * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -irdma_update_pe_sds(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info, u64 scratch) +static int irdma_update_pe_sds(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info, u64 scratch) { struct irdma_sc_cqp *cqp = dev->cqp; - enum irdma_status_code ret_code; + int ret_code; ret_code = cqp_sds_wqe_fill(cqp, info, scratch); if (!ret_code) @@ -4522,13 +4484,12 @@ irdma_update_pe_sds(struct irdma_sc_dev *dev, * @dev: sc device struct * @info: sd info for sd's */ -enum irdma_status_code -irdma_update_sds_noccq(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info) +int irdma_update_sds_noccq(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info) { u32 error, val, tail; struct irdma_sc_cqp *cqp = dev->cqp; - enum irdma_status_code ret_code; + int ret_code; ret_code = cqp_sds_wqe_fill(cqp, info, 0); if (ret_code) @@ -4549,10 +4510,9 @@ irdma_update_sds_noccq(struct irdma_sc_dev *dev, * @post_sq: flag for cqp db to ring * @poll_registers: flag to poll register for cqp completion */ -enum irdma_status_code -irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, - u8 hmc_fn_id, bool post_sq, - bool poll_registers) +int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, + u8 hmc_fn_id, bool post_sq, + bool poll_registers) { u64 hdr; __le64 *wqe; @@ -4560,7 +4520,7 @@ irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, FIELD_PREP(IRDMA_SHMC_PAGE_ALLOCATED_HMC_FN_ID, hmc_fn_id)); @@ -4635,8 +4595,7 @@ static u32 irdma_est_sd(struct irdma_sc_dev *dev, * irdma_sc_query_rdma_features_done - poll cqp for query features done * @cqp: struct for cqp hw */ -static enum irdma_status_code -irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp) +static int irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp) { return irdma_sc_poll_for_cqp_op_done(cqp, IRDMA_CQP_OP_QUERY_RDMA_FEATURES, @@ -4649,16 +4608,15 @@ irdma_sc_query_rdma_features_done(struct irdma_sc_cqp *cqp) * @buf: buffer to hold query info * @scratch: u64 saved to be used during cqp completion */ -static enum irdma_status_code -irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp, - struct irdma_dma_mem *buf, u64 scratch) +static int irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp, + struct irdma_dma_mem *buf, u64 scratch) { __le64 *wqe; u64 temp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; temp = buf->pa; set_64bit_val(wqe, 32, temp); @@ -4682,9 +4640,9 @@ irdma_sc_query_rdma_features(struct irdma_sc_cqp *cqp, * irdma_get_rdma_features - get RDMA features * @dev: sc device struct */ -enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev) +int irdma_get_rdma_features(struct irdma_sc_dev *dev) { - enum irdma_status_code ret_code; + int ret_code; struct irdma_dma_mem feat_buf; u64 temp; u16 byte_idx, feat_type, feat_cnt, feat_idx; @@ -4694,7 +4652,7 @@ enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev) feat_buf.va = dma_alloc_coherent(dev->hw->device, feat_buf.size, &feat_buf.pa, GFP_KERNEL); if (!feat_buf.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0); if (!ret_code) @@ -4705,7 +4663,7 @@ enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev) get_64bit_val(feat_buf.va, 0, &temp); feat_cnt = (u16)FIELD_GET(IRDMA_FEATURE_CNT, temp); if (feat_cnt < 2) { - ret_code = IRDMA_ERR_INVALID_FEAT_CNT; + ret_code = -EINVAL; goto exit; } else if (feat_cnt > IRDMA_MAX_FEATURES) { ibdev_dbg(to_ibdev(dev), @@ -4719,7 +4677,7 @@ enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev) feat_buf.size, &feat_buf.pa, GFP_KERNEL); if (!feat_buf.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; ret_code = irdma_sc_query_rdma_features(dev->cqp, &feat_buf, 0); if (!ret_code) @@ -4730,7 +4688,7 @@ enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev) get_64bit_val(feat_buf.va, 0, &temp); feat_cnt = (u16)FIELD_GET(IRDMA_FEATURE_CNT, temp); if (feat_cnt < 2) { - ret_code = IRDMA_ERR_INVALID_FEAT_CNT; + ret_code = -EINVAL; goto exit; } } @@ -4809,7 +4767,7 @@ static void cfg_fpm_value_gen_2(struct irdma_sc_dev *dev, * @dev: sc device struct * @qp_count: desired qp count */ -enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) +int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) { struct irdma_virt_mem virt_mem; u32 i, mem_size; @@ -4820,7 +4778,7 @@ enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) u32 loop_count = 0; struct irdma_hmc_info *hmc_info; struct irdma_hmc_fpm_misc *hmc_fpm_misc; - enum irdma_status_code ret_code = 0; + int ret_code = 0; hmc_info = dev->hmc_info; hmc_fpm_misc = &dev->hmc_fpm_misc; @@ -4947,7 +4905,7 @@ enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) ibdev_dbg(to_ibdev(dev), "HMC: cfg_fpm failed loop_cnt=%d, sd_needed=%d, max sd count %d\n", loop_count, sd_needed, hmc_info->sd_table.sd_cnt); - return IRDMA_ERR_CFG; + return -EINVAL; } if (loop_count > 1 && sd_needed < hmc_fpm_misc->max_sds) { @@ -4983,7 +4941,7 @@ enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) if (!virt_mem.va) { ibdev_dbg(to_ibdev(dev), "HMC: failed to allocate memory for sd_entry buffer\n"); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } hmc_info->sd_table.sd_entry = virt_mem.va; @@ -4995,10 +4953,10 @@ enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) * @dev: rdma device * @pcmdinfo: cqp command info */ -static enum irdma_status_code irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, - struct cqp_cmds_info *pcmdinfo) +static int irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, + struct cqp_cmds_info *pcmdinfo) { - enum irdma_status_code status; + int status; struct irdma_dma_mem val_mem; bool alloc = false; @@ -5260,7 +5218,7 @@ static enum irdma_status_code irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, pcmdinfo->in.u.mc_modify.scratch); break; default: - status = IRDMA_NOT_SUPPORTED; + status = -EOPNOTSUPP; break; } @@ -5272,10 +5230,10 @@ static enum irdma_status_code irdma_exec_cqp_cmd(struct irdma_sc_dev *dev, * @dev: sc device struct * @pcmdinfo: cqp command info */ -enum irdma_status_code irdma_process_cqp_cmd(struct irdma_sc_dev *dev, - struct cqp_cmds_info *pcmdinfo) +int irdma_process_cqp_cmd(struct irdma_sc_dev *dev, + struct cqp_cmds_info *pcmdinfo) { - enum irdma_status_code status = 0; + int status = 0; unsigned long flags; spin_lock_irqsave(&dev->cqp_lock, flags); @@ -5291,9 +5249,9 @@ enum irdma_status_code irdma_process_cqp_cmd(struct irdma_sc_dev *dev, * irdma_process_bh - called from tasklet for cqp list * @dev: sc device struct */ -enum irdma_status_code irdma_process_bh(struct irdma_sc_dev *dev) +int irdma_process_bh(struct irdma_sc_dev *dev) { - enum irdma_status_code status = 0; + int status = 0; struct cqp_cmds_info *pcmdinfo; unsigned long flags; @@ -5381,12 +5339,11 @@ static inline void irdma_sc_init_hw(struct irdma_sc_dev *dev) * @dev: Device pointer * @info: Device init info */ -enum irdma_status_code irdma_sc_dev_init(enum irdma_vers ver, - struct irdma_sc_dev *dev, - struct irdma_device_init_info *info) +int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, + struct irdma_device_init_info *info) { u32 val; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u8 db_size; INIT_LIST_HEAD(&dev->cqp_cmd_head); /* for CQP command backlog */ @@ -5430,7 +5387,7 @@ enum irdma_status_code irdma_sc_dev_init(enum irdma_vers ver, irdma_sc_init_hw(dev); if (irdma_wait_pe_ready(dev)) - return IRDMA_ERR_TIMEOUT; + return -ETIMEDOUT; val = readl(dev->hw_regs[IRDMA_GLPCI_LBARCTRL]); db_size = (u8)FIELD_GET(IRDMA_GLPCI_LBARCTRL_PE_DB_SIZE, val); @@ -5438,7 +5395,7 @@ enum irdma_status_code irdma_sc_dev_init(enum irdma_vers ver, ibdev_dbg(to_ibdev(dev), "DEV: RDMA PE doorbell is not enabled in CSR val 0x%x db_size=%d\n", val, db_size); - return IRDMA_ERR_PE_DOORBELL_NOT_ENA; + return -ENODEV; } dev->db_addr = dev->hw->hw_addr + (uintptr_t)dev->hw_regs[IRDMA_DB_ADDR_OFFSET]; diff --git a/drivers/infiniband/hw/irdma/defs.h b/drivers/infiniband/hw/irdma/defs.h index cc3d9a365b35..e03e03082a5f 100644 --- a/drivers/infiniband/hw/irdma/defs.h +++ b/drivers/infiniband/hw/irdma/defs.h @@ -964,7 +964,7 @@ enum irdma_cqp_op_type { (_ring).head = ((_ring).head + 1) % size; \ (_retcode) = 0; \ } else { \ - (_retcode) = IRDMA_ERR_RING_FULL; \ + (_retcode) = -ENOMEM; \ } \ } #define IRDMA_RING_MOVE_HEAD_BY_COUNT(_ring, _count, _retcode) \ @@ -975,7 +975,7 @@ enum irdma_cqp_op_type { (_ring).head = ((_ring).head + (_count)) % size; \ (_retcode) = 0; \ } else { \ - (_retcode) = IRDMA_ERR_RING_FULL; \ + (_retcode) = -ENOMEM; \ } \ } #define IRDMA_SQ_RING_MOVE_HEAD(_ring, _retcode) \ @@ -986,7 +986,7 @@ enum irdma_cqp_op_type { (_ring).head = ((_ring).head + 1) % size; \ (_retcode) = 0; \ } else { \ - (_retcode) = IRDMA_ERR_RING_FULL; \ + (_retcode) = -ENOMEM; \ } \ } #define IRDMA_SQ_RING_MOVE_HEAD_BY_COUNT(_ring, _count, _retcode) \ @@ -997,7 +997,7 @@ enum irdma_cqp_op_type { (_ring).head = ((_ring).head + (_count)) % size; \ (_retcode) = 0; \ } else { \ - (_retcode) = IRDMA_ERR_RING_FULL; \ + (_retcode) = -ENOMEM; \ } \ } #define IRDMA_RING_MOVE_HEAD_BY_COUNT_NOCHECK(_ring, _count) \ diff --git a/drivers/infiniband/hw/irdma/hmc.c b/drivers/infiniband/hw/irdma/hmc.c index ecffcb93c05a..49307ce8c4da 100644 --- a/drivers/infiniband/hw/irdma/hmc.c +++ b/drivers/infiniband/hw/irdma/hmc.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" @@ -121,10 +120,8 @@ static inline void irdma_invalidate_pf_hmc_pd(struct irdma_sc_dev *dev, u32 sd_i * @type: paged or direct sd * @setsd: flag to set or clear sd */ -enum irdma_status_code irdma_hmc_sd_one(struct irdma_sc_dev *dev, u8 hmc_fn_id, - u64 pa, u32 sd_idx, - enum irdma_sd_entry_type type, - bool setsd) +int irdma_hmc_sd_one(struct irdma_sc_dev *dev, u8 hmc_fn_id, u64 pa, u32 sd_idx, + enum irdma_sd_entry_type type, bool setsd) { struct irdma_update_sds_info sdinfo; @@ -145,16 +142,15 @@ enum irdma_status_code irdma_hmc_sd_one(struct irdma_sc_dev *dev, u8 hmc_fn_id, * @sd_cnt: number of sd entries * @setsd: flag to set or clear sd */ -static enum irdma_status_code irdma_hmc_sd_grp(struct irdma_sc_dev *dev, - struct irdma_hmc_info *hmc_info, - u32 sd_index, u32 sd_cnt, - bool setsd) +static int irdma_hmc_sd_grp(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, u32 sd_index, + u32 sd_cnt, bool setsd) { struct irdma_hmc_sd_entry *sd_entry; struct irdma_update_sds_info sdinfo = {}; u64 pa; u32 i; - enum irdma_status_code ret_code = 0; + int ret_code = 0; sdinfo.hmc_fn_id = hmc_info->hmc_fn_id; for (i = sd_index; i < sd_index + sd_cnt; i++) { @@ -196,16 +192,15 @@ static enum irdma_status_code irdma_hmc_sd_grp(struct irdma_sc_dev *dev, * @dev: pointer to the device structure * @info: create obj info */ -static enum irdma_status_code -irdma_hmc_finish_add_sd_reg(struct irdma_sc_dev *dev, - struct irdma_hmc_create_obj_info *info) +static int irdma_hmc_finish_add_sd_reg(struct irdma_sc_dev *dev, + struct irdma_hmc_create_obj_info *info) { if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) - return IRDMA_ERR_INVALID_HMC_OBJ_INDEX; + return -EINVAL; if ((info->start_idx + info->count) > info->hmc_info->hmc_obj[info->rsrc_type].cnt) - return IRDMA_ERR_INVALID_HMC_OBJ_COUNT; + return -EINVAL; if (!info->add_sd_cnt) return 0; @@ -222,9 +217,8 @@ irdma_hmc_finish_add_sd_reg(struct irdma_sc_dev *dev, * This will allocate memory for PDs and backing pages and populate * the sd and pd entries. */ -enum irdma_status_code -irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, - struct irdma_hmc_create_obj_info *info) +int irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, + struct irdma_hmc_create_obj_info *info) { struct irdma_hmc_sd_entry *sd_entry; u32 sd_idx, sd_lmt; @@ -232,10 +226,10 @@ irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, u32 pd_idx1 = 0, pd_lmt1 = 0; u32 i, j; bool pd_error = false; - enum irdma_status_code ret_code = 0; + int ret_code = 0; if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) - return IRDMA_ERR_INVALID_HMC_OBJ_INDEX; + return -EINVAL; if ((info->start_idx + info->count) > info->hmc_info->hmc_obj[info->rsrc_type].cnt) { @@ -243,7 +237,7 @@ irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, "HMC: error type %u, start = %u, req cnt %u, cnt = %u\n", info->rsrc_type, info->start_idx, info->count, info->hmc_info->hmc_obj[info->rsrc_type].cnt); - return IRDMA_ERR_INVALID_HMC_OBJ_COUNT; + return -EINVAL; } irdma_find_sd_index_limit(info->hmc_info, info->rsrc_type, @@ -251,7 +245,7 @@ irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, &sd_lmt); if (sd_idx >= info->hmc_info->sd_table.sd_cnt || sd_lmt > info->hmc_info->sd_table.sd_cnt) { - return IRDMA_ERR_INVALID_SD_INDEX; + return -EINVAL; } irdma_find_pd_index_limit(info->hmc_info, info->rsrc_type, @@ -312,7 +306,7 @@ exit_sd_error: irdma_prep_remove_pd_page(info->hmc_info, (j - 1)); break; default: - ret_code = IRDMA_ERR_INVALID_SD_TYPE; + ret_code = -EINVAL; break; } j--; @@ -327,12 +321,12 @@ exit_sd_error: * @info: dele obj info * @reset: true if called before reset */ -static enum irdma_status_code -irdma_finish_del_sd_reg(struct irdma_sc_dev *dev, - struct irdma_hmc_del_obj_info *info, bool reset) +static int irdma_finish_del_sd_reg(struct irdma_sc_dev *dev, + struct irdma_hmc_del_obj_info *info, + bool reset) { struct irdma_hmc_sd_entry *sd_entry; - enum irdma_status_code ret_code = 0; + int ret_code = 0; u32 i, sd_idx; struct irdma_dma_mem *mem; @@ -373,22 +367,21 @@ irdma_finish_del_sd_reg(struct irdma_sc_dev *dev, * caller should deallocate memory allocated previously for * book-keeping information about PDs and backing storage. */ -enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, - struct irdma_hmc_del_obj_info *info, - bool reset) +int irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, + struct irdma_hmc_del_obj_info *info, bool reset) { struct irdma_hmc_pd_table *pd_table; u32 sd_idx, sd_lmt; u32 pd_idx, pd_lmt, rel_pd_idx; u32 i, j; - enum irdma_status_code ret_code = 0; + int ret_code = 0; if (info->start_idx >= info->hmc_info->hmc_obj[info->rsrc_type].cnt) { ibdev_dbg(to_ibdev(dev), "HMC: error start_idx[%04d] >= [type %04d].cnt[%04d]\n", info->start_idx, info->rsrc_type, info->hmc_info->hmc_obj[info->rsrc_type].cnt); - return IRDMA_ERR_INVALID_HMC_OBJ_INDEX; + return -EINVAL; } if ((info->start_idx + info->count) > @@ -397,7 +390,7 @@ enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, "HMC: error start_idx[%04d] + count %04d >= [type %04d].cnt[%04d]\n", info->start_idx, info->count, info->rsrc_type, info->hmc_info->hmc_obj[info->rsrc_type].cnt); - return IRDMA_ERR_INVALID_HMC_OBJ_COUNT; + return -EINVAL; } irdma_find_pd_index_limit(info->hmc_info, info->rsrc_type, @@ -433,7 +426,7 @@ enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, if (sd_idx >= info->hmc_info->sd_table.sd_cnt || sd_lmt > info->hmc_info->sd_table.sd_cnt) { ibdev_dbg(to_ibdev(dev), "HMC: invalid sd_idx\n"); - return IRDMA_ERR_INVALID_SD_INDEX; + return -EINVAL; } for (i = sd_idx; i < sd_lmt; i++) { @@ -477,11 +470,9 @@ enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, * @type: what type of segment descriptor we're manipulating * @direct_mode_sz: size to alloc in direct mode */ -enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw, - struct irdma_hmc_info *hmc_info, - u32 sd_index, - enum irdma_sd_entry_type type, - u64 direct_mode_sz) +int irdma_add_sd_table_entry(struct irdma_hw *hw, + struct irdma_hmc_info *hmc_info, u32 sd_index, + enum irdma_sd_entry_type type, u64 direct_mode_sz) { struct irdma_hmc_sd_entry *sd_entry; struct irdma_dma_mem dma_mem; @@ -499,7 +490,7 @@ enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw, dma_mem.va = dma_alloc_coherent(hw->device, dma_mem.size, &dma_mem.pa, GFP_KERNEL); if (!dma_mem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; if (type == IRDMA_SD_TYPE_PAGED) { struct irdma_virt_mem *vmem = &sd_entry->u.pd_table.pd_entry_virt_mem; @@ -510,7 +501,7 @@ enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw, dma_free_coherent(hw->device, dma_mem.size, dma_mem.va, dma_mem.pa); dma_mem.va = NULL; - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } sd_entry->u.pd_table.pd_entry = vmem->va; @@ -549,10 +540,9 @@ enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw, * aligned on 4K boundary and zeroed memory. * 2. It should be 4K in size. */ -enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev, - struct irdma_hmc_info *hmc_info, - u32 pd_index, - struct irdma_dma_mem *rsrc_pg) +int irdma_add_pd_table_entry(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, u32 pd_index, + struct irdma_dma_mem *rsrc_pg) { struct irdma_hmc_pd_table *pd_table; struct irdma_hmc_pd_entry *pd_entry; @@ -563,7 +553,7 @@ enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev, u64 page_desc; if (pd_index / IRDMA_HMC_PD_CNT_IN_SD >= hmc_info->sd_table.sd_cnt) - return IRDMA_ERR_INVALID_PAGE_DESC_INDEX; + return -EINVAL; sd_idx = (pd_index / IRDMA_HMC_PD_CNT_IN_SD); if (hmc_info->sd_table.sd_entry[sd_idx].entry_type != @@ -584,7 +574,7 @@ enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev, page->size, &page->pa, GFP_KERNEL); if (!page->va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; pd_entry->rsrc_pg = false; } @@ -621,9 +611,8 @@ enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev, * 1. Caller can deallocate the memory used by backing storage after this * function returns. */ -enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev, - struct irdma_hmc_info *hmc_info, - u32 idx) +int irdma_remove_pd_bp(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, u32 idx) { struct irdma_hmc_pd_entry *pd_entry; struct irdma_hmc_pd_table *pd_table; @@ -635,11 +624,11 @@ enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev, sd_idx = idx / IRDMA_HMC_PD_CNT_IN_SD; rel_pd_idx = idx % IRDMA_HMC_PD_CNT_IN_SD; if (sd_idx >= hmc_info->sd_table.sd_cnt) - return IRDMA_ERR_INVALID_PAGE_DESC_INDEX; + return -EINVAL; sd_entry = &hmc_info->sd_table.sd_entry[sd_idx]; if (sd_entry->entry_type != IRDMA_SD_TYPE_PAGED) - return IRDMA_ERR_INVALID_SD_TYPE; + return -EINVAL; pd_table = &hmc_info->sd_table.sd_entry[sd_idx].u.pd_table; pd_entry = &pd_table->pd_entry[rel_pd_idx]; @@ -656,7 +645,7 @@ enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev, if (!pd_entry->rsrc_pg) { mem = &pd_entry->bp.addr; if (!mem || !mem->va) - return IRDMA_ERR_PARAM; + return -EINVAL; dma_free_coherent(dev->hw->device, mem->size, mem->va, mem->pa); @@ -673,14 +662,13 @@ enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev, * @hmc_info: pointer to the HMC configuration information structure * @idx: the page index */ -enum irdma_status_code irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info, - u32 idx) +int irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info, u32 idx) { struct irdma_hmc_sd_entry *sd_entry; sd_entry = &hmc_info->sd_table.sd_entry[idx]; if (--sd_entry->u.bp.use_cnt) - return IRDMA_ERR_NOT_READY; + return -EBUSY; hmc_info->sd_table.use_cnt--; sd_entry->valid = false; @@ -693,15 +681,14 @@ enum irdma_status_code irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info, * @hmc_info: pointer to the HMC configuration information structure * @idx: segment descriptor index to find the relevant page descriptor */ -enum irdma_status_code -irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info, u32 idx) +int irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info, u32 idx) { struct irdma_hmc_sd_entry *sd_entry; sd_entry = &hmc_info->sd_table.sd_entry[idx]; if (sd_entry->u.pd_table.use_cnt) - return IRDMA_ERR_NOT_READY; + return -EBUSY; sd_entry->valid = false; hmc_info->sd_table.use_cnt--; diff --git a/drivers/infiniband/hw/irdma/hmc.h b/drivers/infiniband/hw/irdma/hmc.h index e2139c788b1b..f5c5dacc7021 100644 --- a/drivers/infiniband/hw/irdma/hmc.h +++ b/drivers/infiniband/hw/irdma/hmc.h @@ -141,40 +141,29 @@ struct irdma_hmc_del_obj_info { bool privileged; }; -enum irdma_status_code irdma_copy_dma_mem(struct irdma_hw *hw, void *dest_buf, - struct irdma_dma_mem *src_mem, - u64 src_offset, u64 size); -enum irdma_status_code -irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, - struct irdma_hmc_create_obj_info *info); -enum irdma_status_code irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, - struct irdma_hmc_del_obj_info *info, - bool reset); -enum irdma_status_code irdma_hmc_sd_one(struct irdma_sc_dev *dev, u8 hmc_fn_id, - u64 pa, u32 sd_idx, - enum irdma_sd_entry_type type, - bool setsd); -enum irdma_status_code -irdma_update_sds_noccq(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info); +int irdma_copy_dma_mem(struct irdma_hw *hw, void *dest_buf, + struct irdma_dma_mem *src_mem, u64 src_offset, u64 size); +int irdma_sc_create_hmc_obj(struct irdma_sc_dev *dev, + struct irdma_hmc_create_obj_info *info); +int irdma_sc_del_hmc_obj(struct irdma_sc_dev *dev, + struct irdma_hmc_del_obj_info *info, bool reset); +int irdma_hmc_sd_one(struct irdma_sc_dev *dev, u8 hmc_fn_id, u64 pa, u32 sd_idx, + enum irdma_sd_entry_type type, + bool setsd); +int irdma_update_sds_noccq(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info); struct irdma_vfdev *irdma_vfdev_from_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id); struct irdma_hmc_info *irdma_vf_hmcinfo_from_fpm(struct irdma_sc_dev *dev, u8 hmc_fn_id); -enum irdma_status_code irdma_add_sd_table_entry(struct irdma_hw *hw, - struct irdma_hmc_info *hmc_info, - u32 sd_index, - enum irdma_sd_entry_type type, - u64 direct_mode_sz); -enum irdma_status_code irdma_add_pd_table_entry(struct irdma_sc_dev *dev, - struct irdma_hmc_info *hmc_info, - u32 pd_index, - struct irdma_dma_mem *rsrc_pg); -enum irdma_status_code irdma_remove_pd_bp(struct irdma_sc_dev *dev, - struct irdma_hmc_info *hmc_info, - u32 idx); -enum irdma_status_code irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info, - u32 idx); -enum irdma_status_code -irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info, u32 idx); +int irdma_add_sd_table_entry(struct irdma_hw *hw, + struct irdma_hmc_info *hmc_info, u32 sd_index, + enum irdma_sd_entry_type type, u64 direct_mode_sz); +int irdma_add_pd_table_entry(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, u32 pd_index, + struct irdma_dma_mem *rsrc_pg); +int irdma_remove_pd_bp(struct irdma_sc_dev *dev, + struct irdma_hmc_info *hmc_info, u32 idx); +int irdma_prep_remove_sd_bp(struct irdma_hmc_info *hmc_info, u32 idx); +int irdma_prep_remove_pd_page(struct irdma_hmc_info *hmc_info, u32 idx); #endif /* IRDMA_HMC_H */ diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 89234d04cc65..cd0d409844f1 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -75,12 +75,12 @@ static void irdma_puda_ce_handler(struct irdma_pci_f *rf, struct irdma_sc_cq *cq) { struct irdma_sc_dev *dev = &rf->sc_dev; - enum irdma_status_code status; u32 compl_error; + int status; do { status = irdma_puda_poll_cmpl(dev, cq, &compl_error); - if (status == IRDMA_ERR_Q_EMPTY) + if (status == -ENOENT) break; if (status) { ibdev_dbg(to_ibdev(dev), "ERR: puda status = %d\n", status); @@ -456,7 +456,7 @@ static void irdma_ceq_dpc(struct tasklet_struct *t) * Allocate iwdev msix table and copy the msix info to the table * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_save_msix_info(struct irdma_pci_f *rf) +static int irdma_save_msix_info(struct irdma_pci_f *rf) { struct irdma_qvlist_info *iw_qvlist; struct irdma_qv_info *iw_qvinfo; @@ -466,13 +466,13 @@ static enum irdma_status_code irdma_save_msix_info(struct irdma_pci_f *rf) size_t size; if (!rf->msix_count) - return IRDMA_ERR_NO_INTR; + return -EINVAL; size = sizeof(struct irdma_msix_vector) * rf->msix_count; size += struct_size(iw_qvlist, qv_info, rf->msix_count); rf->iw_msixtbl = kzalloc(size, GFP_KERNEL); if (!rf->iw_msixtbl) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; rf->iw_qvlist = (struct irdma_qvlist_info *) (&rf->iw_msixtbl[rf->msix_count]); @@ -564,9 +564,9 @@ static void irdma_destroy_irq(struct irdma_pci_f *rf, */ static void irdma_destroy_cqp(struct irdma_pci_f *rf, bool free_hwcqp) { - enum irdma_status_code status = 0; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_cqp *cqp = &rf->cqp; + int status = 0; if (rf->cqp_cmpl_wq) destroy_workqueue(rf->cqp_cmpl_wq); @@ -606,9 +606,9 @@ static void irdma_destroy_virt_aeq(struct irdma_pci_f *rf) */ static void irdma_destroy_aeq(struct irdma_pci_f *rf) { - enum irdma_status_code status = IRDMA_ERR_NOT_READY; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_aeq *aeq = &rf->aeq; + int status = -EBUSY; if (!rf->msix_shared) { rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, rf->iw_msixtbl->idx, false); @@ -642,8 +642,8 @@ exit: */ static void irdma_destroy_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq) { - enum irdma_status_code status; struct irdma_sc_dev *dev = &rf->sc_dev; + int status; if (rf->reset) goto exit; @@ -733,7 +733,7 @@ static void irdma_destroy_ccq(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_ccq *ccq = &rf->ccq; - enum irdma_status_code status = 0; + int status = 0; if (!rf->reset) status = irdma_sc_ccq_destroy(dev->ccq, 0, true); @@ -796,9 +796,8 @@ static void irdma_del_hmc_objects(struct irdma_sc_dev *dev, * @dev: hardware control device structure * @info: information for the hmc object to create */ -static enum irdma_status_code -irdma_create_hmc_obj_type(struct irdma_sc_dev *dev, - struct irdma_hmc_create_obj_info *info) +static int irdma_create_hmc_obj_type(struct irdma_sc_dev *dev, + struct irdma_hmc_create_obj_info *info) { return irdma_sc_create_hmc_obj(dev, info); } @@ -812,13 +811,12 @@ irdma_create_hmc_obj_type(struct irdma_sc_dev *dev, * Create the device hmc objects and allocate hmc pages * Return 0 if successful, otherwise clean up and return error */ -static enum irdma_status_code -irdma_create_hmc_objs(struct irdma_pci_f *rf, bool privileged, enum irdma_vers vers) +static int irdma_create_hmc_objs(struct irdma_pci_f *rf, bool privileged, + enum irdma_vers vers) { struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_hmc_create_obj_info info = {}; - enum irdma_status_code status = 0; - int i; + int i, status = 0; info.hmc_info = dev->hmc_info; info.privileged = privileged; @@ -868,9 +866,9 @@ irdma_create_hmc_objs(struct irdma_pci_f *rf, bool privileged, enum irdma_vers v * update the memptr to point to the new aligned memory * Return 0 if successful, otherwise return no memory error */ -static enum irdma_status_code -irdma_obj_aligned_mem(struct irdma_pci_f *rf, struct irdma_dma_mem *memptr, - u32 size, u32 mask) +static int irdma_obj_aligned_mem(struct irdma_pci_f *rf, + struct irdma_dma_mem *memptr, u32 size, + u32 mask) { unsigned long va, newva; unsigned long extra; @@ -884,7 +882,7 @@ irdma_obj_aligned_mem(struct irdma_pci_f *rf, struct irdma_dma_mem *memptr, memptr->pa = rf->obj_next.pa + extra; memptr->size = size; if (((u8 *)memptr->va + size) > ((u8 *)rf->obj_mem.va + rf->obj_mem.size)) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; rf->obj_next.va = (u8 *)memptr->va + size; rf->obj_next.pa = memptr->pa + size; @@ -899,25 +897,24 @@ irdma_obj_aligned_mem(struct irdma_pci_f *rf, struct irdma_dma_mem *memptr, * Return 0, if the cqp and all the resources associated with it * are successfully created, otherwise return error */ -static enum irdma_status_code irdma_create_cqp(struct irdma_pci_f *rf) +static int irdma_create_cqp(struct irdma_pci_f *rf) { - enum irdma_status_code status; u32 sqsize = IRDMA_CQP_SW_SQSIZE_2048; struct irdma_dma_mem mem; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_cqp_init_info cqp_init_info = {}; struct irdma_cqp *cqp = &rf->cqp; u16 maj_err, min_err; - int i; + int i, status; cqp->cqp_requests = kcalloc(sqsize, sizeof(*cqp->cqp_requests), GFP_KERNEL); if (!cqp->cqp_requests) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp->scratch_array = kcalloc(sqsize, sizeof(*cqp->scratch_array), GFP_KERNEL); if (!cqp->scratch_array) { kfree(cqp->cqp_requests); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } dev->cqp = &cqp->sc_cqp; @@ -929,7 +926,7 @@ static enum irdma_status_code irdma_create_cqp(struct irdma_pci_f *rf) if (!cqp->sq.va) { kfree(cqp->scratch_array); kfree(cqp->cqp_requests); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } status = irdma_obj_aligned_mem(rf, &mem, sizeof(struct irdma_cqp_ctx), @@ -999,12 +996,12 @@ exit: * Return 0, if the ccq and the resources associated with it * are successfully created, otherwise return error */ -static enum irdma_status_code irdma_create_ccq(struct irdma_pci_f *rf) +static int irdma_create_ccq(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; - enum irdma_status_code status; struct irdma_ccq_init_info info = {}; struct irdma_ccq *ccq = &rf->ccq; + int status; dev->ccq = &ccq->sc_cq; dev->ccq->dev = dev; @@ -1015,7 +1012,7 @@ static enum irdma_status_code irdma_create_ccq(struct irdma_pci_f *rf) ccq->mem_cq.va = dma_alloc_coherent(dev->hw->device, ccq->mem_cq.size, &ccq->mem_cq.pa, GFP_KERNEL); if (!ccq->mem_cq.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; status = irdma_obj_aligned_mem(rf, &ccq->shadow_area, ccq->shadow_area.size, @@ -1054,9 +1051,9 @@ exit: * Allocate a mac ip entry and add it to the hw table Return 0 * if successful, otherwise return error */ -static enum irdma_status_code irdma_alloc_set_mac(struct irdma_device *iwdev) +static int irdma_alloc_set_mac(struct irdma_device *iwdev) { - enum irdma_status_code status; + int status; status = irdma_alloc_local_mac_entry(iwdev->rf, &iwdev->mac_ip_table_idx); @@ -1082,9 +1079,8 @@ static enum irdma_status_code irdma_alloc_set_mac(struct irdma_device *iwdev) * Allocate interrupt resources and enable irq handling * Return 0 if successful, otherwise return error */ -static enum irdma_status_code -irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, - u32 ceq_id, struct irdma_msix_vector *msix_vec) +static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, + u32 ceq_id, struct irdma_msix_vector *msix_vec) { int status; @@ -1103,7 +1099,7 @@ irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, irq_update_affinity_hint(msix_vec->irq, &msix_vec->mask); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: ceq irq config fail\n"); - return IRDMA_ERR_CFG; + return -EINVAL; } msix_vec->ceq_id = ceq_id; @@ -1119,7 +1115,7 @@ irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, * Allocate interrupt resources and enable irq handling * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_cfg_aeq_vector(struct irdma_pci_f *rf) +static int irdma_cfg_aeq_vector(struct irdma_pci_f *rf) { struct irdma_msix_vector *msix_vec = rf->iw_msixtbl; u32 ret = 0; @@ -1131,7 +1127,7 @@ static enum irdma_status_code irdma_cfg_aeq_vector(struct irdma_pci_f *rf) } if (ret) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: aeq irq config fail\n"); - return IRDMA_ERR_CFG; + return -EINVAL; } rf->sc_dev.irq_ops->irdma_cfg_aeq(&rf->sc_dev, msix_vec->idx, true); @@ -1149,12 +1145,10 @@ static enum irdma_status_code irdma_cfg_aeq_vector(struct irdma_pci_f *rf) * Return 0, if the ceq and the resources associated with it * are successfully created, otherwise return error */ -static enum irdma_status_code irdma_create_ceq(struct irdma_pci_f *rf, - struct irdma_ceq *iwceq, - u32 ceq_id, - struct irdma_sc_vsi *vsi) +static int irdma_create_ceq(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, + u32 ceq_id, struct irdma_sc_vsi *vsi) { - enum irdma_status_code status; + int status; struct irdma_ceq_init_info info = {}; struct irdma_sc_dev *dev = &rf->sc_dev; u64 scratch; @@ -1169,7 +1163,7 @@ static enum irdma_status_code irdma_create_ceq(struct irdma_pci_f *rf, iwceq->mem.va = dma_alloc_coherent(dev->hw->device, iwceq->mem.size, &iwceq->mem.pa, GFP_KERNEL); if (!iwceq->mem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; info.ceq_id = ceq_id; info.ceqe_base = iwceq->mem.va; @@ -1205,18 +1199,18 @@ static enum irdma_status_code irdma_create_ceq(struct irdma_pci_f *rf, * Create the ceq 0 and configure it's msix interrupt vector * Return 0, if successfully set up, otherwise return error */ -static enum irdma_status_code irdma_setup_ceq_0(struct irdma_pci_f *rf) +static int irdma_setup_ceq_0(struct irdma_pci_f *rf) { struct irdma_ceq *iwceq; struct irdma_msix_vector *msix_vec; u32 i; - enum irdma_status_code status = 0; + int status = 0; u32 num_ceqs; num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs); rf->ceqlist = kcalloc(num_ceqs, sizeof(*rf->ceqlist), GFP_KERNEL); if (!rf->ceqlist) { - status = IRDMA_ERR_NO_MEMORY; + status = -ENOMEM; goto exit; } @@ -1262,14 +1256,13 @@ exit: * Create the ceq's and configure their msix interrupt vectors * Return 0, if ceqs are successfully set up, otherwise return error */ -static enum irdma_status_code irdma_setup_ceqs(struct irdma_pci_f *rf, - struct irdma_sc_vsi *vsi) +static int irdma_setup_ceqs(struct irdma_pci_f *rf, struct irdma_sc_vsi *vsi) { u32 i; u32 ceq_id; struct irdma_ceq *iwceq; struct irdma_msix_vector *msix_vec; - enum irdma_status_code status; + int status; u32 num_ceqs; num_ceqs = min(rf->msix_count, rf->sc_dev.hmc_fpm_misc.max_ceqs); @@ -1303,16 +1296,15 @@ del_ceqs: return status; } -static enum irdma_status_code irdma_create_virt_aeq(struct irdma_pci_f *rf, - u32 size) +static int irdma_create_virt_aeq(struct irdma_pci_f *rf, u32 size) { - enum irdma_status_code status = IRDMA_ERR_NO_MEMORY; + int status = -ENOMEM; struct irdma_aeq *aeq = &rf->aeq; dma_addr_t *pg_arr; u32 pg_cnt; if (rf->rdma_ver < IRDMA_GEN_2) - return IRDMA_NOT_SUPPORTED; + return -EOPNOTSUPP; aeq->mem.size = sizeof(struct irdma_sc_aeqe) * size; aeq->mem.va = vzalloc(aeq->mem.size); @@ -1345,15 +1337,15 @@ static enum irdma_status_code irdma_create_virt_aeq(struct irdma_pci_f *rf, * Return 0, if the aeq and the resources associated with it * are successfully created, otherwise return error */ -static enum irdma_status_code irdma_create_aeq(struct irdma_pci_f *rf) +static int irdma_create_aeq(struct irdma_pci_f *rf) { - enum irdma_status_code status; struct irdma_aeq_init_info info = {}; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_aeq *aeq = &rf->aeq; struct irdma_hmc_info *hmc_info = rf->sc_dev.hmc_info; u32 aeq_size; u8 multiplier = (rf->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) ? 2 : 1; + int status; aeq_size = multiplier * hmc_info->hmc_obj[IRDMA_HMC_IW_QP].cnt + hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt; @@ -1412,10 +1404,10 @@ err: * Create the aeq and configure its msix interrupt vector * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_setup_aeq(struct irdma_pci_f *rf) +static int irdma_setup_aeq(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; - enum irdma_status_code status; + int status; status = irdma_create_aeq(rf); if (status) @@ -1439,10 +1431,10 @@ static enum irdma_status_code irdma_setup_aeq(struct irdma_pci_f *rf) * * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_initialize_ilq(struct irdma_device *iwdev) +static int irdma_initialize_ilq(struct irdma_device *iwdev) { struct irdma_puda_rsrc_info info = {}; - enum irdma_status_code status; + int status; info.type = IRDMA_PUDA_RSRC_TYPE_ILQ; info.cq_id = 1; @@ -1469,10 +1461,10 @@ static enum irdma_status_code irdma_initialize_ilq(struct irdma_device *iwdev) * * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_initialize_ieq(struct irdma_device *iwdev) +static int irdma_initialize_ieq(struct irdma_device *iwdev) { struct irdma_puda_rsrc_info info = {}; - enum irdma_status_code status; + int status; info.type = IRDMA_PUDA_RSRC_TYPE_IEQ; info.cq_id = 2; @@ -1515,9 +1507,9 @@ void irdma_reinitialize_ieq(struct irdma_sc_vsi *vsi) * the hmc objects and create the objects * Return 0 if successful, otherwise return error */ -static enum irdma_status_code irdma_hmc_setup(struct irdma_pci_f *rf) +static int irdma_hmc_setup(struct irdma_pci_f *rf) { - enum irdma_status_code status; + int status; u32 qpcnt; if (rf->rdma_ver == IRDMA_GEN_1) @@ -1570,9 +1562,9 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) * Return 0 if successful, otherwise clean up the resources * and return error */ -static enum irdma_status_code irdma_initialize_dev(struct irdma_pci_f *rf) +static int irdma_initialize_dev(struct irdma_pci_f *rf) { - enum irdma_status_code status; + int status; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_device_init_info info = {}; struct irdma_dma_mem mem; @@ -1584,7 +1576,7 @@ static enum irdma_status_code irdma_initialize_dev(struct irdma_pci_f *rf) rf->hmc_info_mem = kzalloc(size, GFP_KERNEL); if (!rf->hmc_info_mem) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; rf->pble_rsrc = (struct irdma_hmc_pble_rsrc *)rf->hmc_info_mem; dev->hmc_info = &rf->hw.hmc; @@ -1667,9 +1659,9 @@ void irdma_rt_deinit_hw(struct irdma_device *iwdev) destroy_workqueue(iwdev->cleanup_wq); } -static enum irdma_status_code irdma_setup_init_state(struct irdma_pci_f *rf) +static int irdma_setup_init_state(struct irdma_pci_f *rf) { - enum irdma_status_code status; + int status; status = irdma_save_msix_info(rf); if (status) @@ -1680,7 +1672,7 @@ static enum irdma_status_code irdma_setup_init_state(struct irdma_pci_f *rf) rf->obj_mem.va = dma_alloc_coherent(rf->hw.device, rf->obj_mem.size, &rf->obj_mem.pa, GFP_KERNEL); if (!rf->obj_mem.va) { - status = IRDMA_ERR_NO_MEMORY; + status = -ENOMEM; goto clean_msixtbl; } @@ -1763,14 +1755,14 @@ void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) * Create device queues ILQ, IEQ, CEQs and PBLEs. Setup irdma * device resource objects. */ -enum irdma_status_code irdma_rt_init_hw(struct irdma_device *iwdev, - struct irdma_l2params *l2params) +int irdma_rt_init_hw(struct irdma_device *iwdev, + struct irdma_l2params *l2params) { struct irdma_pci_f *rf = iwdev->rf; struct irdma_sc_dev *dev = &rf->sc_dev; - enum irdma_status_code status; struct irdma_vsi_init_info vsi_info = {}; struct irdma_vsi_stats_info stats_info = {}; + int status; vsi_info.dev = dev; vsi_info.back_vsi = iwdev; @@ -1788,7 +1780,7 @@ enum irdma_status_code irdma_rt_init_hw(struct irdma_device *iwdev, stats_info.pestat = kzalloc(sizeof(*stats_info.pestat), GFP_KERNEL); if (!stats_info.pestat) { irdma_cleanup_cm_core(&iwdev->cm_core); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } stats_info.fcn_id = dev->hmc_fn_id; status = irdma_vsi_stats_init(&iwdev->vsi, &stats_info); @@ -1850,7 +1842,7 @@ enum irdma_status_code irdma_rt_init_hw(struct irdma_device *iwdev, iwdev->cleanup_wq = alloc_workqueue("irdma-cleanup-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!iwdev->cleanup_wq) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; irdma_get_used_rsrc(iwdev); init_waitqueue_head(&iwdev->suspend_wq); @@ -1870,10 +1862,10 @@ enum irdma_status_code irdma_rt_init_hw(struct irdma_device *iwdev, * * Create admin queues, HMC obejcts and RF resource objects */ -enum irdma_status_code irdma_ctrl_init_hw(struct irdma_pci_f *rf) +int irdma_ctrl_init_hw(struct irdma_pci_f *rf) { struct irdma_sc_dev *dev = &rf->sc_dev; - enum irdma_status_code status; + int status; do { status = irdma_setup_init_state(rf); if (status) @@ -1915,7 +1907,7 @@ enum irdma_status_code irdma_ctrl_init_hw(struct irdma_pci_f *rf) rf->cqp_cmpl_wq = alloc_ordered_workqueue("cqp_cmpl_wq", WQ_HIGHPRI | WQ_UNBOUND); if (!rf->cqp_cmpl_wq) { - status = IRDMA_ERR_NO_MEMORY; + status = -ENOMEM; break; } INIT_WORK(&rf->cqp_cmpl_work, cqp_compl_worker); @@ -2202,11 +2194,11 @@ int irdma_add_local_mac_entry(struct irdma_pci_f *rf, const u8 *mac_addr, u16 id struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->post_sq = 1; @@ -2238,11 +2230,11 @@ int irdma_alloc_local_mac_entry(struct irdma_pci_f *rf, u16 *mac_tbl_idx) struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status = 0; + int status = 0; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->cqp_cmd = IRDMA_OP_ALLOC_LOCAL_MAC_ENTRY; @@ -2264,18 +2256,17 @@ int irdma_alloc_local_mac_entry(struct irdma_pci_f *rf, u16 *mac_tbl_idx) * @accel_local_port: port for apbvt * @add_port: add ordelete port */ -static enum irdma_status_code -irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev, u16 accel_local_port, - bool add_port) +static int irdma_cqp_manage_apbvt_cmd(struct irdma_device *iwdev, + u16 accel_local_port, bool add_port) { struct irdma_apbvt_info *info; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&iwdev->rf->cqp, add_port); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; @@ -2429,22 +2420,21 @@ static void irdma_send_syn_cqp_callback(struct irdma_cqp_request *cqp_request) * @cmnode: cmnode associated with connection * @wait: wait for completion */ -enum irdma_status_code -irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, - enum irdma_quad_entry_type etype, - enum irdma_quad_hash_manage_type mtype, void *cmnode, - bool wait) +int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, + enum irdma_quad_entry_type etype, + enum irdma_quad_hash_manage_type mtype, void *cmnode, + bool wait) { struct irdma_qhash_table_info *info; - enum irdma_status_code status; struct irdma_cqp *iwcqp = &iwdev->rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_cm_node *cm_node = cmnode; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_qhash_table_entry.info; @@ -2558,12 +2548,10 @@ static void irdma_hw_flush_wqes_callback(struct irdma_cqp_request *cqp_request) * @info: info for flush * @wait: flag wait for completion */ -enum irdma_status_code irdma_hw_flush_wqes(struct irdma_pci_f *rf, - struct irdma_sc_qp *qp, - struct irdma_qp_flush_info *info, - bool wait) +int irdma_hw_flush_wqes(struct irdma_pci_f *rf, struct irdma_sc_qp *qp, + struct irdma_qp_flush_info *info, bool wait) { - enum irdma_status_code status; + int status; struct irdma_qp_flush_info *hw_info; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; @@ -2571,7 +2559,7 @@ enum irdma_status_code irdma_hw_flush_wqes(struct irdma_pci_f *rf, cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; if (!wait) @@ -2619,7 +2607,7 @@ enum irdma_status_code irdma_hw_flush_wqes(struct irdma_pci_f *rf, info->sq = true; new_req = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!new_req) { - status = IRDMA_ERR_NO_MEMORY; + status = -ENOMEM; goto put_cqp; } cqp_info = &new_req->info; diff --git a/drivers/infiniband/hw/irdma/i40iw_hw.c b/drivers/infiniband/hw/irdma/i40iw_hw.c index 64148ad8a604..e46fc110004d 100644 --- a/drivers/infiniband/hw/irdma/i40iw_hw.c +++ b/drivers/infiniband/hw/irdma/i40iw_hw.c @@ -3,7 +3,6 @@ #include "osdep.h" #include "type.h" #include "i40iw_hw.h" -#include "status.h" #include "protos.h" static u32 i40iw_regs[IRDMA_MAX_REGS] = { diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 97625263899f..6558d5e1ade5 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -162,8 +162,8 @@ static void irdma_request_reset(struct irdma_pci_f *rf) * @vsi: vsi structure * @tc_node: Traffic class node */ -static enum irdma_status_code irdma_lan_register_qset(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node) +static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node) { struct irdma_device *iwdev = vsi->back_vsi; struct ice_pf *pf = iwdev->rf->cdev; @@ -176,7 +176,7 @@ static enum irdma_status_code irdma_lan_register_qset(struct irdma_sc_vsi *vsi, ret = ice_add_rdma_qset(pf, &qset); if (ret) { ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); - return IRDMA_ERR_REG_QSET; + return -EINVAL; } tc_node->l2_sched_node_id = qset.teid; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index cf6732bf70a1..94c41f834c25 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -40,7 +40,6 @@ #include #include #include -#include "status.h" #include "osdep.h" #include "defs.h" #include "hmc.h" @@ -242,8 +241,8 @@ struct irdma_qvlist_info { struct irdma_gen_ops { void (*request_reset)(struct irdma_pci_f *rf); - enum irdma_status_code (*register_qset)(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node); + int (*register_qset)(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node); void (*unregister_qset)(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node); }; @@ -457,10 +456,10 @@ static inline void irdma_free_rsrc(struct irdma_pci_f *rf, spin_unlock_irqrestore(&rf->rsrc_lock, flags); } -enum irdma_status_code irdma_ctrl_init_hw(struct irdma_pci_f *rf); +int irdma_ctrl_init_hw(struct irdma_pci_f *rf); void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf); -enum irdma_status_code irdma_rt_init_hw(struct irdma_device *iwdev, - struct irdma_l2params *l2params); +int irdma_rt_init_hw(struct irdma_device *iwdev, + struct irdma_l2params *l2params); void irdma_rt_deinit_hw(struct irdma_device *iwdev); void irdma_qp_add_ref(struct ib_qp *ibqp); void irdma_qp_rem_ref(struct ib_qp *ibqp); @@ -489,9 +488,8 @@ void irdma_cm_disconn(struct irdma_qp *qp); bool irdma_cqp_crit_err(struct irdma_sc_dev *dev, u8 cqp_cmd, u16 maj_err_code, u16 min_err_code); -enum irdma_status_code -irdma_handle_cqp_op(struct irdma_pci_f *rf, - struct irdma_cqp_request *cqp_request); +int irdma_handle_cqp_op(struct irdma_pci_f *rf, + struct irdma_cqp_request *cqp_request); int irdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); @@ -500,21 +498,17 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, void irdma_cq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_cq *cq); void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); -enum irdma_status_code irdma_hw_modify_qp(struct irdma_device *iwdev, - struct irdma_qp *iwqp, - struct irdma_modify_qp_info *info, - bool wait); -enum irdma_status_code irdma_qp_suspend_resume(struct irdma_sc_qp *qp, - bool suspend); -enum irdma_status_code -irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, - enum irdma_quad_entry_type etype, - enum irdma_quad_hash_manage_type mtype, void *cmnode, - bool wait); +int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, + struct irdma_modify_qp_info *info, bool wait); +int irdma_qp_suspend_resume(struct irdma_sc_qp *qp, bool suspend); +int irdma_manage_qhash(struct irdma_device *iwdev, struct irdma_cm_info *cminfo, + enum irdma_quad_entry_type etype, + enum irdma_quad_hash_manage_type mtype, void *cmnode, + bool wait); void irdma_receive_ilq(struct irdma_sc_vsi *vsi, struct irdma_puda_buf *rbuf); void irdma_free_sqbuf(struct irdma_sc_vsi *vsi, void *bufp); void irdma_free_qp_rsrc(struct irdma_qp *iwqp); -enum irdma_status_code irdma_setup_cm_core(struct irdma_device *iwdev, u8 ver); +int irdma_setup_cm_core(struct irdma_device *iwdev, u8 ver); void irdma_cleanup_cm_core(struct irdma_cm_core *cm_core); void irdma_next_iw_state(struct irdma_qp *iwqp, u8 state, u8 del_hash, u8 term, u8 term_len); @@ -523,10 +517,8 @@ int irdma_send_reset(struct irdma_cm_node *cm_node); struct irdma_cm_node *irdma_find_node(struct irdma_cm_core *cm_core, u16 rem_port, u32 *rem_addr, u16 loc_port, u32 *loc_addr, u16 vlan_id); -enum irdma_status_code irdma_hw_flush_wqes(struct irdma_pci_f *rf, - struct irdma_sc_qp *qp, - struct irdma_qp_flush_info *info, - bool wait); +int irdma_hw_flush_wqes(struct irdma_pci_f *rf, struct irdma_sc_qp *qp, + struct irdma_qp_flush_info *info, bool wait); void irdma_gen_ae(struct irdma_pci_f *rf, struct irdma_sc_qp *qp, struct irdma_gen_ae_info *info, bool wait); void irdma_copy_ip_ntohl(u32 *dst, __be32 *src); diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index 6e28e437b6fb..fc1ba2a3e6fb 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -43,32 +43,28 @@ enum irdma_status_code irdma_vf_wait_vchnl_resp(struct irdma_sc_dev *dev); bool irdma_vf_clear_to_send(struct irdma_sc_dev *dev); void irdma_add_dev_ref(struct irdma_sc_dev *dev); void irdma_put_dev_ref(struct irdma_sc_dev *dev); -enum irdma_status_code irdma_ieq_check_mpacrc(struct shash_desc *desc, - void *addr, u32 len, u32 val); +int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, + u32 val); struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); void irdma_send_ieq_ack(struct irdma_sc_qp *qp); void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); void irdma_free_hash_desc(struct shash_desc *hash_desc); -enum irdma_status_code irdma_init_hash_desc(struct shash_desc **hash_desc); -enum irdma_status_code -irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, - struct irdma_puda_buf *buf); -enum irdma_status_code irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info); -enum irdma_status_code -irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, - struct irdma_hmc_fcn_info *hmcfcninfo, - u16 *pmf_idx); -enum irdma_status_code -irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -enum irdma_status_code -irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -enum irdma_status_code irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, - struct irdma_dma_mem *mem); +int irdma_init_hash_desc(struct shash_desc **hash_desc); +int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, + struct irdma_puda_buf *buf); +int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info); +int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, + struct irdma_hmc_fcn_info *hmcfcninfo, + u16 *pmf_idx); +int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id); +int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id); +int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, + struct irdma_dma_mem *mem); void *irdma_remove_cqp_head(struct irdma_sc_dev *dev); void irdma_term_modify_qp(struct irdma_sc_qp *qp, u8 next_state, u8 term, u8 term_len); @@ -80,7 +76,7 @@ void irdma_hw_stats_stop_timer(struct irdma_sc_vsi *vsi); void wr32(struct irdma_hw *hw, u32 reg, u32 val); u32 rd32(struct irdma_hw *hw, u32 reg); u64 rd64(struct irdma_hw *hw, u32 reg); -enum irdma_status_code irdma_map_vm_page_list(struct irdma_hw *hw, void *va, - dma_addr_t *pg_dma, u32 pg_cnt); +int irdma_map_vm_page_list(struct irdma_hw *hw, void *va, dma_addr_t *pg_dma, + u32 pg_cnt); void irdma_unmap_vm_page_list(struct irdma_hw *hw, dma_addr_t *pg_dma, u32 pg_cnt); #endif /* IRDMA_OSDEP_H */ diff --git a/drivers/infiniband/hw/irdma/pble.c b/drivers/infiniband/hw/irdma/pble.c index fed49da770f3..cdc0b8a6ed48 100644 --- a/drivers/infiniband/hw/irdma/pble.c +++ b/drivers/infiniband/hw/irdma/pble.c @@ -1,15 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" #include "protos.h" #include "pble.h" -static enum irdma_status_code -add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc); +static int add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc); /** * irdma_destroy_pble_prm - destroy prm during module unload @@ -35,13 +33,12 @@ void irdma_destroy_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) * @dev: irdma_sc_dev struct * @pble_rsrc: pble resources */ -enum irdma_status_code -irdma_hmc_init_pble(struct irdma_sc_dev *dev, - struct irdma_hmc_pble_rsrc *pble_rsrc) +int irdma_hmc_init_pble(struct irdma_sc_dev *dev, + struct irdma_hmc_pble_rsrc *pble_rsrc) { struct irdma_hmc_info *hmc_info; u32 fpm_idx = 0; - enum irdma_status_code status = 0; + int status = 0; hmc_info = dev->hmc_info; pble_rsrc->dev = dev; @@ -60,7 +57,7 @@ irdma_hmc_init_pble(struct irdma_sc_dev *dev, INIT_LIST_HEAD(&pble_rsrc->pinfo.clist); if (add_pble_prm(pble_rsrc)) { irdma_destroy_pble_prm(pble_rsrc); - status = IRDMA_ERR_NO_MEMORY; + status = -ENOMEM; } return status; @@ -84,12 +81,11 @@ static void get_sd_pd_idx(struct irdma_hmc_pble_rsrc *pble_rsrc, * @pble_rsrc: pble resource ptr * @info: page info for sd */ -static enum irdma_status_code -add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_add_page_info *info) +static int add_sd_direct(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_add_page_info *info) { struct irdma_sc_dev *dev = pble_rsrc->dev; - enum irdma_status_code ret_code = 0; + int ret_code = 0; struct sd_pd_idx *idx = &info->idx; struct irdma_chunk *chunk = info->chunk; struct irdma_hmc_info *hmc_info = info->hmc_info; @@ -137,9 +133,8 @@ static u32 fpm_to_idx(struct irdma_hmc_pble_rsrc *pble_rsrc, u64 addr) * @pble_rsrc: pble resource management * @info: page info for sd */ -static enum irdma_status_code -add_bp_pages(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_add_page_info *info) +static int add_bp_pages(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_add_page_info *info) { struct irdma_sc_dev *dev = pble_rsrc->dev; u8 *addr; @@ -148,13 +143,13 @@ add_bp_pages(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_hmc_sd_entry *sd_entry = info->sd_entry; struct irdma_hmc_info *hmc_info = info->hmc_info; struct irdma_chunk *chunk = info->chunk; - enum irdma_status_code status = 0; + int status = 0; u32 rel_pd_idx = info->idx.rel_pd_idx; u32 pd_idx = info->idx.pd_idx; u32 i; if (irdma_pble_get_paged_mem(chunk, info->pages)) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; status = irdma_add_sd_table_entry(dev->hw, hmc_info, info->idx.sd_idx, IRDMA_SD_TYPE_PAGED, @@ -207,8 +202,7 @@ static enum irdma_sd_entry_type irdma_get_type(struct irdma_sc_dev *dev, * add_pble_prm - add a sd entry for pble resoure * @pble_rsrc: pble resource management */ -static enum irdma_status_code -add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) +static int add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) { struct irdma_sc_dev *dev = pble_rsrc->dev; struct irdma_hmc_sd_entry *sd_entry; @@ -216,22 +210,22 @@ add_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc) struct irdma_chunk *chunk; struct irdma_add_page_info info; struct sd_pd_idx *idx = &info.idx; - enum irdma_status_code ret_code = 0; + int ret_code = 0; enum irdma_sd_entry_type sd_entry_type; u64 sd_reg_val = 0; struct irdma_virt_mem chunkmem; u32 pages; if (pble_rsrc->unallocated_pble < PBLE_PER_PAGE) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; if (pble_rsrc->next_fpm_addr & 0xfff) - return IRDMA_ERR_INVALID_PAGE_DESC_INDEX; + return -EINVAL; chunkmem.size = sizeof(*chunk); chunkmem.va = kzalloc(chunkmem.size, GFP_KERNEL); if (!chunkmem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; chunk = chunkmem.va; chunk->chunkmem = chunkmem; @@ -337,9 +331,8 @@ static void free_lvl2(struct irdma_hmc_pble_rsrc *pble_rsrc, * @pble_rsrc: pble resource management * @palloc: level 2 pble allocation */ -static enum irdma_status_code -get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_pble_alloc *palloc) +static int get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_pble_alloc *palloc) { u32 lf4k, lflast, total, i; u32 pblcnt = PBLE_PER_PAGE; @@ -347,7 +340,7 @@ get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_pble_level2 *lvl2 = &palloc->level2; struct irdma_pble_info *root = &lvl2->root; struct irdma_pble_info *leaf; - enum irdma_status_code ret_code; + int ret_code; u64 fpm_addr; /* number of full 512 (4K) leafs) */ @@ -359,7 +352,7 @@ get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, lvl2->leafmem.size = (sizeof(*leaf) * total); lvl2->leafmem.va = kzalloc(lvl2->leafmem.size, GFP_KERNEL); if (!lvl2->leafmem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; lvl2->leaf = lvl2->leafmem.va; leaf = lvl2->leaf; @@ -368,7 +361,7 @@ get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, if (ret_code) { kfree(lvl2->leafmem.va); lvl2->leaf = NULL; - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } root->idx = fpm_to_idx(pble_rsrc, fpm_addr); @@ -397,7 +390,7 @@ get_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, error: free_lvl2(pble_rsrc, palloc); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } /** @@ -405,11 +398,10 @@ error: * @pble_rsrc: pble resource management * @palloc: level 1 pble allocation */ -static enum irdma_status_code -get_lvl1_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_pble_alloc *palloc) +static int get_lvl1_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_pble_alloc *palloc) { - enum irdma_status_code ret_code; + int ret_code; u64 fpm_addr; struct irdma_pble_info *lvl1 = &palloc->level1; @@ -417,7 +409,7 @@ get_lvl1_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, palloc->total_cnt << 3, &lvl1->addr, &fpm_addr); if (ret_code) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; palloc->level = PBLE_LEVEL_1; lvl1->idx = fpm_to_idx(pble_rsrc, fpm_addr); @@ -433,11 +425,10 @@ get_lvl1_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, * @palloc: contains all inforamtion regarding pble (idx + pble addr) * @level1_only: flag for a level 1 PBLE */ -static enum irdma_status_code -get_lvl1_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_pble_alloc *palloc, bool level1_only) +static int get_lvl1_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_pble_alloc *palloc, bool level1_only) { - enum irdma_status_code status = 0; + int status = 0; status = get_lvl1_pble(pble_rsrc, palloc); if (!status || level1_only || palloc->total_cnt <= PBLE_PER_PAGE) @@ -455,11 +446,11 @@ get_lvl1_lvl2_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, * @pble_cnt: #of pbles requested * @level1_only: true if only pble level 1 to acquire */ -enum irdma_status_code irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_pble_alloc *palloc, - u32 pble_cnt, bool level1_only) +int irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_pble_alloc *palloc, u32 pble_cnt, + bool level1_only) { - enum irdma_status_code status = 0; + int status = 0; int max_sds = 0; int i; diff --git a/drivers/infiniband/hw/irdma/pble.h b/drivers/infiniband/hw/irdma/pble.h index d0d4f2b77d34..29d295463559 100644 --- a/drivers/infiniband/hw/irdma/pble.h +++ b/drivers/infiniband/hw/irdma/pble.h @@ -108,20 +108,18 @@ struct irdma_hmc_pble_rsrc { }; void irdma_destroy_pble_prm(struct irdma_hmc_pble_rsrc *pble_rsrc); -enum irdma_status_code -irdma_hmc_init_pble(struct irdma_sc_dev *dev, - struct irdma_hmc_pble_rsrc *pble_rsrc); +int irdma_hmc_init_pble(struct irdma_sc_dev *dev, + struct irdma_hmc_pble_rsrc *pble_rsrc); void irdma_free_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, struct irdma_pble_alloc *palloc); -enum irdma_status_code irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, - struct irdma_pble_alloc *palloc, - u32 pble_cnt, bool level1_only); -enum irdma_status_code irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, - struct irdma_chunk *pchunk); -enum irdma_status_code -irdma_prm_get_pbles(struct irdma_pble_prm *pprm, - struct irdma_pble_chunkinfo *chunkinfo, u64 mem_size, - u64 **vaddr, u64 *fpm_addr); +int irdma_get_pble(struct irdma_hmc_pble_rsrc *pble_rsrc, + struct irdma_pble_alloc *palloc, u32 pble_cnt, + bool level1_only); +int irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, + struct irdma_chunk *pchunk); +int irdma_prm_get_pbles(struct irdma_pble_prm *pprm, + struct irdma_pble_chunkinfo *chunkinfo, u64 mem_size, + u64 **vaddr, u64 *fpm_addr); void irdma_prm_return_pbles(struct irdma_pble_prm *pprm, struct irdma_pble_chunkinfo *chunkinfo); void irdma_pble_acquire_lock(struct irdma_hmc_pble_rsrc *pble_rsrc, @@ -129,7 +127,6 @@ void irdma_pble_acquire_lock(struct irdma_hmc_pble_rsrc *pble_rsrc, void irdma_pble_release_lock(struct irdma_hmc_pble_rsrc *pble_rsrc, unsigned long *flags); void irdma_pble_free_paged_mem(struct irdma_chunk *chunk); -enum irdma_status_code irdma_pble_get_paged_mem(struct irdma_chunk *chunk, - u32 pg_cnt); +int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt); void irdma_prm_rem_bitmapmem(struct irdma_hw *hw, struct irdma_chunk *chunk); #endif /* IRDMA_PBLE_H */ diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index a17c0ffb0cc8..9b6e919ae2a9 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -12,58 +12,51 @@ #define CQP_TIMEOUT_THRESHOLD 500 /* init operations */ -enum irdma_status_code irdma_sc_dev_init(enum irdma_vers ver, - struct irdma_sc_dev *dev, - struct irdma_device_init_info *info); +int irdma_sc_dev_init(enum irdma_vers ver, struct irdma_sc_dev *dev, + struct irdma_device_init_info *info); void irdma_sc_rt_init(struct irdma_sc_dev *dev); void irdma_sc_cqp_post_sq(struct irdma_sc_cqp *cqp); __le64 *irdma_sc_cqp_get_next_send_wqe(struct irdma_sc_cqp *cqp, u64 scratch); -enum irdma_status_code -irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, - struct irdma_fast_reg_stag_info *info, bool post_sq); +int irdma_sc_mr_fast_register(struct irdma_sc_qp *qp, + struct irdma_fast_reg_stag_info *info, + bool post_sq); /* HMC/FPM functions */ -enum irdma_status_code irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, - u8 hmc_fn_id); +int irdma_sc_init_iw_hmc(struct irdma_sc_dev *dev, u8 hmc_fn_id); /* stats misc */ -enum irdma_status_code -irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, - struct irdma_vsi_pestat *pestat, bool wait); +int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, + struct irdma_vsi_pestat *pestat, bool wait); void irdma_cqp_gather_stats_gen1(struct irdma_sc_dev *dev, struct irdma_vsi_pestat *pestat); void irdma_hw_stats_read_all(struct irdma_vsi_pestat *stats, struct irdma_dev_hw_stats *stats_values, u64 *hw_stats_regs_32, u64 *hw_stats_regs_64, u8 hw_rev); -enum irdma_status_code -irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, - struct irdma_ws_node_info *node_info); -enum irdma_status_code irdma_cqp_ceq_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_ceq *sc_ceq, u8 op); -enum irdma_status_code irdma_cqp_aeq_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_aeq *sc_aeq, u8 op); -enum irdma_status_code -irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, - struct irdma_stats_inst_info *stats_info); +int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, + struct irdma_ws_node_info *node_info); +int irdma_cqp_ceq_cmd(struct irdma_sc_dev *dev, struct irdma_sc_ceq *sc_ceq, + u8 op); +int irdma_cqp_aeq_cmd(struct irdma_sc_dev *dev, struct irdma_sc_aeq *sc_aeq, + u8 op); +int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, + struct irdma_stats_inst_info *stats_info); u16 irdma_alloc_ws_node_id(struct irdma_sc_dev *dev); void irdma_free_ws_node_id(struct irdma_sc_dev *dev, u16 node_id); void irdma_update_stats(struct irdma_dev_hw_stats *hw_stats, struct irdma_gather_stats *gather_stats, struct irdma_gather_stats *last_gather_stats); /* vsi functions */ -enum irdma_status_code irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, - struct irdma_vsi_stats_info *info); +int irdma_vsi_stats_init(struct irdma_sc_vsi *vsi, + struct irdma_vsi_stats_info *info); void irdma_vsi_stats_free(struct irdma_sc_vsi *vsi); void irdma_sc_vsi_init(struct irdma_sc_vsi *vsi, struct irdma_vsi_init_info *info); -enum irdma_status_code irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, - struct irdma_sc_cq *cq); +int irdma_sc_add_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq); void irdma_sc_remove_cq_ctx(struct irdma_sc_ceq *ceq, struct irdma_sc_cq *cq); /* misc L2 param change functions */ void irdma_change_l2params(struct irdma_sc_vsi *vsi, struct irdma_l2params *l2params); void irdma_sc_suspend_resume_qps(struct irdma_sc_vsi *vsi, u8 suspend); -enum irdma_status_code irdma_cqp_qp_suspend_resume(struct irdma_sc_qp *qp, - u8 cmd); +int irdma_cqp_qp_suspend_resume(struct irdma_sc_qp *qp, u8 cmd); void irdma_qp_add_qos(struct irdma_sc_qp *qp); void irdma_qp_rem_qos(struct irdma_sc_qp *qp); struct irdma_sc_qp *irdma_get_qp_from_list(struct list_head *head, @@ -81,31 +74,26 @@ void irdma_terminate_received(struct irdma_sc_qp *qp, /* misc */ u8 irdma_get_encoded_wqe_size(u32 wqsize, enum irdma_queue_type queue_type); void irdma_modify_qp_to_err(struct irdma_sc_qp *sc_qp); -enum irdma_status_code -irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, - u8 hmc_fn_id, bool post_sq, - bool poll_registers); -enum irdma_status_code irdma_cfg_fpm_val(struct irdma_sc_dev *dev, - u32 qp_count); -enum irdma_status_code irdma_get_rdma_features(struct irdma_sc_dev *dev); +int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, + u8 hmc_fn_id, bool post_sq, + bool poll_registers); +int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count); +int irdma_get_rdma_features(struct irdma_sc_dev *dev); void free_sd_mem(struct irdma_sc_dev *dev); -enum irdma_status_code irdma_process_cqp_cmd(struct irdma_sc_dev *dev, - struct cqp_cmds_info *pcmdinfo); -enum irdma_status_code irdma_process_bh(struct irdma_sc_dev *dev); -enum irdma_status_code irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info); -enum irdma_status_code -irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -enum irdma_status_code -irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -enum irdma_status_code irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, - struct irdma_dma_mem *mem); -enum irdma_status_code -irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, - struct irdma_hmc_fcn_info *hmcfcninfo, - u16 *pmf_idx); +int irdma_process_cqp_cmd(struct irdma_sc_dev *dev, + struct cqp_cmds_info *pcmdinfo); +int irdma_process_bh(struct irdma_sc_dev *dev); +int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info); +int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id); +int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id); +int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, + struct irdma_dma_mem *mem); +int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, + struct irdma_hmc_fcn_info *hmcfcninfo, + u16 *pmf_idx); void irdma_add_dev_ref(struct irdma_sc_dev *dev); void irdma_put_dev_ref(struct irdma_sc_dev *dev); void *irdma_remove_cqp_head(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 58e7d875643b..397f3d070f90 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" @@ -114,8 +113,7 @@ static void irdma_puda_post_recvbuf(struct irdma_puda_rsrc *rsrc, u32 wqe_idx, * @rsrc: resource to use for buffer * @initial: flag if during init time */ -static enum irdma_status_code -irdma_puda_replenish_rq(struct irdma_puda_rsrc *rsrc, bool initial) +static int irdma_puda_replenish_rq(struct irdma_puda_rsrc *rsrc, bool initial) { u32 i; u32 invalid_cnt = rsrc->rxq_invalid_cnt; @@ -124,7 +122,7 @@ irdma_puda_replenish_rq(struct irdma_puda_rsrc *rsrc, bool initial) for (i = 0; i < invalid_cnt; i++) { buf = irdma_puda_get_bufpool(rsrc); if (!buf) - return IRDMA_ERR_list_empty; + return -ENOBUFS; irdma_puda_post_recvbuf(rsrc, rsrc->rx_wqe_idx, buf, initial); rsrc->rx_wqe_idx = ((rsrc->rx_wqe_idx + 1) % rsrc->rq_size); rsrc->rxq_invalid_cnt--; @@ -194,7 +192,7 @@ static __le64 *irdma_puda_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx) { __le64 *wqe = NULL; - enum irdma_status_code ret_code = 0; + int ret_code = 0; *wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); if (!*wqe_idx) @@ -213,8 +211,8 @@ static __le64 *irdma_puda_get_next_send_wqe(struct irdma_qp_uk *qp, * @cq: cq for poll * @info: info return for successful completion */ -static enum irdma_status_code -irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info) +static int irdma_puda_poll_info(struct irdma_sc_cq *cq, + struct irdma_puda_cmpl_info *info) { struct irdma_cq_uk *cq_uk = &cq->cq_uk; u64 qword0, qword2, qword3, qword6; @@ -233,7 +231,7 @@ irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info) get_64bit_val(cqe, 24, &qword3); valid_bit = (bool)FIELD_GET(IRDMA_CQ_VALID, qword3); if (valid_bit != cq_uk->polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; if (cq->dev->hw_attrs.uk_attrs.hw_rev >= IRDMA_GEN_2) ext_valid = (bool)FIELD_GET(IRDMA_CQ_EXTCQE, qword3); @@ -246,7 +244,7 @@ irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info) if (!peek_head) polarity ^= 1; if (polarity != cq_uk->polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; IRDMA_RING_MOVE_HEAD_NOCHECK(cq_uk->cq_ring); if (!IRDMA_RING_CURRENT_HEAD(cq_uk->cq_ring)) @@ -267,7 +265,7 @@ irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info) major_err = (u32)(FIELD_GET(IRDMA_CQ_MAJERR, qword3)); minor_err = (u32)(FIELD_GET(IRDMA_CQ_MINERR, qword3)); info->compl_error = major_err << 16 | minor_err; - return IRDMA_ERR_CQ_COMPL_ERROR; + return -EIO; } get_64bit_val(cqe, 0, &qword0); @@ -319,14 +317,13 @@ irdma_puda_poll_info(struct irdma_sc_cq *cq, struct irdma_puda_cmpl_info *info) * @cq: cq getting interrupt * @compl_err: return any completion err */ -enum irdma_status_code irdma_puda_poll_cmpl(struct irdma_sc_dev *dev, - struct irdma_sc_cq *cq, - u32 *compl_err) +int irdma_puda_poll_cmpl(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq, + u32 *compl_err) { struct irdma_qp_uk *qp; struct irdma_cq_uk *cq_uk = &cq->cq_uk; struct irdma_puda_cmpl_info info = {}; - enum irdma_status_code ret = 0; + int ret = 0; struct irdma_puda_buf *buf; struct irdma_puda_rsrc *rsrc; u8 cq_type = cq->cq_type; @@ -337,24 +334,24 @@ enum irdma_status_code irdma_puda_poll_cmpl(struct irdma_sc_dev *dev, cq->vsi->ieq; } else { ibdev_dbg(to_ibdev(dev), "PUDA: qp_type error\n"); - return IRDMA_ERR_BAD_PTR; + return -EINVAL; } ret = irdma_puda_poll_info(cq, &info); *compl_err = info.compl_error; - if (ret == IRDMA_ERR_Q_EMPTY) + if (ret == -ENOENT) return ret; if (ret) goto done; qp = info.qp; if (!qp || !rsrc) { - ret = IRDMA_ERR_BAD_PTR; + ret = -EFAULT; goto done; } if (qp->qp_id != rsrc->qp_id) { - ret = IRDMA_ERR_BAD_PTR; + ret = -EFAULT; goto done; } @@ -422,8 +419,7 @@ done: * @qp: puda qp for send * @info: buffer information for transmit */ -enum irdma_status_code irdma_puda_send(struct irdma_sc_qp *qp, - struct irdma_puda_send_info *info) +int irdma_puda_send(struct irdma_sc_qp *qp, struct irdma_puda_send_info *info) { __le64 *wqe; u32 iplen, l4len; @@ -443,7 +439,7 @@ enum irdma_status_code irdma_puda_send(struct irdma_sc_qp *qp, wqe = irdma_puda_get_next_send_wqe(&qp->qp_uk, &wqe_idx); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid = (uintptr_t)info->scratch; /* Third line of WQE descriptor */ @@ -503,7 +499,7 @@ void irdma_puda_send_buf(struct irdma_puda_rsrc *rsrc, struct irdma_puda_buf *buf) { struct irdma_puda_send_info info; - enum irdma_status_code ret = 0; + int ret = 0; unsigned long flags; spin_lock_irqsave(&rsrc->bufpool_lock, flags); @@ -603,19 +599,18 @@ static void irdma_puda_qp_setctx(struct irdma_puda_rsrc *rsrc) * @dev: Device * @qp: Resource qp */ -static enum irdma_status_code irdma_puda_qp_wqe(struct irdma_sc_dev *dev, - struct irdma_sc_qp *qp) +static int irdma_puda_qp_wqe(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) { struct irdma_sc_cqp *cqp; __le64 *wqe; u64 hdr; struct irdma_ccq_cqe_info compl_info; - enum irdma_status_code status = 0; + int status = 0; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 16, qp->hw_host_ctx_pa); set_64bit_val(wqe, 40, qp->shadow_area_pa); @@ -643,11 +638,11 @@ static enum irdma_status_code irdma_puda_qp_wqe(struct irdma_sc_dev *dev, * irdma_puda_qp_create - create qp for resource * @rsrc: resource to use for buffer */ -static enum irdma_status_code irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) +static int irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) { struct irdma_sc_qp *qp = &rsrc->qp; struct irdma_qp_uk *ukqp = &qp->qp_uk; - enum irdma_status_code ret = 0; + int ret = 0; u32 sq_size, rq_size; struct irdma_dma_mem *mem; @@ -659,7 +654,7 @@ static enum irdma_status_code irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) rsrc->qpmem.size, &rsrc->qpmem.pa, GFP_KERNEL); if (!rsrc->qpmem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; mem = &rsrc->qpmem; memset(mem->va, 0, rsrc->qpmem.size); @@ -722,19 +717,18 @@ static enum irdma_status_code irdma_puda_qp_create(struct irdma_puda_rsrc *rsrc) * @dev: Device * @cq: resource for cq */ -static enum irdma_status_code irdma_puda_cq_wqe(struct irdma_sc_dev *dev, - struct irdma_sc_cq *cq) +static int irdma_puda_cq_wqe(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) { __le64 *wqe; struct irdma_sc_cqp *cqp; u64 hdr; struct irdma_ccq_cqe_info compl_info; - enum irdma_status_code status = 0; + int status = 0; cqp = dev->cqp; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, 0); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, cq->cq_uk.cq_size); set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); @@ -775,11 +769,11 @@ static enum irdma_status_code irdma_puda_cq_wqe(struct irdma_sc_dev *dev, * irdma_puda_cq_create - create cq for resource * @rsrc: resource for which cq to create */ -static enum irdma_status_code irdma_puda_cq_create(struct irdma_puda_rsrc *rsrc) +static int irdma_puda_cq_create(struct irdma_puda_rsrc *rsrc) { struct irdma_sc_dev *dev = rsrc->dev; struct irdma_sc_cq *cq = &rsrc->cq; - enum irdma_status_code ret = 0; + int ret = 0; u32 cqsize; struct irdma_dma_mem *mem; struct irdma_cq_init_info info = {}; @@ -792,7 +786,7 @@ static enum irdma_status_code irdma_puda_cq_create(struct irdma_puda_rsrc *rsrc) rsrc->cqmem.va = dma_alloc_coherent(dev->hw->device, rsrc->cqmem.size, &rsrc->cqmem.pa, GFP_KERNEL); if (!rsrc->cqmem.va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; mem = &rsrc->cqmem; info.dev = dev; @@ -833,7 +827,7 @@ error: */ static void irdma_puda_free_qp(struct irdma_puda_rsrc *rsrc) { - enum irdma_status_code ret; + int ret; struct irdma_ccq_cqe_info compl_info; struct irdma_sc_dev *dev = rsrc->dev; @@ -865,7 +859,7 @@ static void irdma_puda_free_qp(struct irdma_puda_rsrc *rsrc) */ static void irdma_puda_free_cq(struct irdma_puda_rsrc *rsrc) { - enum irdma_status_code ret; + int ret; struct irdma_ccq_cqe_info compl_info; struct irdma_sc_dev *dev = rsrc->dev; @@ -967,8 +961,7 @@ void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, * @rsrc: resource for buffer allocation * @count: number of buffers to create */ -static enum irdma_status_code irdma_puda_allocbufs(struct irdma_puda_rsrc *rsrc, - u32 count) +static int irdma_puda_allocbufs(struct irdma_puda_rsrc *rsrc, u32 count) { u32 i; struct irdma_puda_buf *buf; @@ -978,7 +971,7 @@ static enum irdma_status_code irdma_puda_allocbufs(struct irdma_puda_rsrc *rsrc, buf = irdma_puda_alloc_buf(rsrc->dev, rsrc->buf_size); if (!buf) { rsrc->stats_buf_alloc_fail++; - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } irdma_puda_ret_bufpool(rsrc, buf); rsrc->alloc_buf_count++; @@ -1001,11 +994,11 @@ static enum irdma_status_code irdma_puda_allocbufs(struct irdma_puda_rsrc *rsrc, * @vsi: sc VSI struct * @info: resource information */ -enum irdma_status_code irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, - struct irdma_puda_rsrc_info *info) +int irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, + struct irdma_puda_rsrc_info *info) { struct irdma_sc_dev *dev = vsi->dev; - enum irdma_status_code ret = 0; + int ret = 0; struct irdma_puda_rsrc *rsrc; u32 pudasize; u32 sqwridsize, rqwridsize; @@ -1023,12 +1016,12 @@ enum irdma_status_code irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, vmem = &vsi->ieq_mem; break; default: - return IRDMA_NOT_SUPPORTED; + return -EOPNOTSUPP; } vmem->size = pudasize + sqwridsize + rqwridsize; vmem->va = kzalloc(vmem->size, GFP_KERNEL); if (!vmem->va) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; rsrc = vmem->va; spin_lock_init(&rsrc->bufpool_lock); @@ -1046,7 +1039,7 @@ enum irdma_status_code irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, rsrc->xmit_complete = irdma_ieq_tx_compl; break; default: - return IRDMA_NOT_SUPPORTED; + return -EOPNOTSUPP; } rsrc->type = info->type; @@ -1323,12 +1316,12 @@ static void irdma_ieq_compl_pfpdu(struct irdma_puda_rsrc *ieq, * @buf: first receive buffer * @fpdu_len: total length of fpdu */ -static enum irdma_status_code -irdma_ieq_create_pbufl(struct irdma_pfpdu *pfpdu, struct list_head *rxlist, - struct list_head *pbufl, struct irdma_puda_buf *buf, - u16 fpdu_len) +static int irdma_ieq_create_pbufl(struct irdma_pfpdu *pfpdu, + struct list_head *rxlist, + struct list_head *pbufl, + struct irdma_puda_buf *buf, u16 fpdu_len) { - enum irdma_status_code status = 0; + int status = 0; struct irdma_puda_buf *nextbuf; u32 nextseqnum; u16 plen = fpdu_len - buf->datalen; @@ -1338,13 +1331,13 @@ irdma_ieq_create_pbufl(struct irdma_pfpdu *pfpdu, struct list_head *rxlist, do { nextbuf = irdma_puda_get_listbuf(rxlist); if (!nextbuf) { - status = IRDMA_ERR_list_empty; + status = -ENOBUFS; break; } list_add_tail(&nextbuf->list, pbufl); if (nextbuf->seqnum != nextseqnum) { pfpdu->bad_seq_num++; - status = IRDMA_ERR_SEQ_NUM; + status = -ERANGE; break; } if (nextbuf->datalen >= plen) { @@ -1366,11 +1359,11 @@ irdma_ieq_create_pbufl(struct irdma_pfpdu *pfpdu, struct list_head *rxlist, * @buf: receive buffer * @fpdu_len: fpdu len in the buffer */ -static enum irdma_status_code -irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, struct irdma_pfpdu *pfpdu, - struct irdma_puda_buf *buf, u16 fpdu_len) +static int irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, + struct irdma_pfpdu *pfpdu, + struct irdma_puda_buf *buf, u16 fpdu_len) { - enum irdma_status_code status = 0; + int status = 0; u8 *crcptr; u32 mpacrc; u32 seqnum = buf->seqnum; @@ -1390,7 +1383,7 @@ irdma_ieq_handle_partial(struct irdma_puda_rsrc *ieq, struct irdma_pfpdu *pfpdu, txbuf = irdma_puda_get_bufpool(ieq); if (!txbuf) { pfpdu->no_tx_bufs++; - status = IRDMA_ERR_NO_TXBUFS; + status = -ENOBUFS; goto error; } @@ -1434,9 +1427,9 @@ error: * @pfpdu: partial management per user qp * @buf: receive buffer */ -static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, - struct irdma_pfpdu *pfpdu, - struct irdma_puda_buf *buf) +static int irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, + struct irdma_pfpdu *pfpdu, + struct irdma_puda_buf *buf) { u16 fpdu_len = 0; u16 datalen = buf->datalen; @@ -1450,7 +1443,7 @@ static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, bool partial = false; struct irdma_puda_buf *txbuf; struct list_head *rxlist = &pfpdu->rxlist; - enum irdma_status_code ret = 0; + int ret = 0; ioffset = (u16)(buf->data - (u8 *)buf->mem.va); while (datalen) { @@ -1459,7 +1452,7 @@ static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, ibdev_dbg(to_ibdev(ieq->dev), "IEQ: error bad fpdu len\n"); list_add(&buf->list, rxlist); - return IRDMA_ERR_MPA_CRC; + return -EINVAL; } if (datalen < fpdu_len) { @@ -1475,7 +1468,7 @@ static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, list_add(&buf->list, rxlist); ibdev_dbg(to_ibdev(ieq->dev), "ERR: IRDMA_ERR_MPA_CRC\n"); - return IRDMA_ERR_MPA_CRC; + return -EINVAL; } full++; pfpdu->fpdu_processed++; @@ -1490,7 +1483,7 @@ static enum irdma_status_code irdma_ieq_process_buf(struct irdma_puda_rsrc *ieq, if (!txbuf) { pfpdu->no_tx_bufs++; list_add(&buf->list, rxlist); - return IRDMA_ERR_NO_TXBUFS; + return -ENOBUFS; } /* modify txbuf's buffer header */ irdma_ieq_setup_tx_buf(buf, txbuf); @@ -1539,7 +1532,7 @@ void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp, struct irdma_pfpdu *pfpdu = &qp->pfpdu; struct list_head *rxlist = &pfpdu->rxlist; struct irdma_puda_buf *buf; - enum irdma_status_code status; + int status; do { if (list_empty(rxlist)) @@ -1557,7 +1550,7 @@ void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp, } /* keep processing buffers from the head of the list */ status = irdma_ieq_process_buf(ieq, pfpdu, buf); - if (status == IRDMA_ERR_MPA_CRC) { + if (status == -EINVAL) { pfpdu->mpa_crc_err = true; while (!list_empty(rxlist)) { buf = irdma_puda_get_listbuf(rxlist); @@ -1576,8 +1569,7 @@ void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp, * @qp: qp pointer * @buf: buf received on IEQ used to create AH */ -static enum irdma_status_code irdma_ieq_create_ah(struct irdma_sc_qp *qp, - struct irdma_puda_buf *buf) +static int irdma_ieq_create_ah(struct irdma_sc_qp *qp, struct irdma_puda_buf *buf) { struct irdma_ah_info ah_info = {}; diff --git a/drivers/infiniband/hw/irdma/puda.h b/drivers/infiniband/hw/irdma/puda.h index db3a51170020..5f5124db6ddf 100644 --- a/drivers/infiniband/hw/irdma/puda.h +++ b/drivers/infiniband/hw/irdma/puda.h @@ -151,42 +151,33 @@ void irdma_puda_ret_bufpool(struct irdma_puda_rsrc *rsrc, struct irdma_puda_buf *buf); void irdma_puda_send_buf(struct irdma_puda_rsrc *rsrc, struct irdma_puda_buf *buf); -enum irdma_status_code irdma_puda_send(struct irdma_sc_qp *qp, - struct irdma_puda_send_info *info); -enum irdma_status_code -irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, - struct irdma_puda_rsrc_info *info); +int irdma_puda_send(struct irdma_sc_qp *qp, struct irdma_puda_send_info *info); +int irdma_puda_create_rsrc(struct irdma_sc_vsi *vsi, + struct irdma_puda_rsrc_info *info); void irdma_puda_dele_rsrc(struct irdma_sc_vsi *vsi, enum puda_rsrc_type type, bool reset); -enum irdma_status_code irdma_puda_poll_cmpl(struct irdma_sc_dev *dev, - struct irdma_sc_cq *cq, - u32 *compl_err); +int irdma_puda_poll_cmpl(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq, + u32 *compl_err); struct irdma_sc_qp *irdma_ieq_get_qp(struct irdma_sc_dev *dev, struct irdma_puda_buf *buf); -enum irdma_status_code -irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, - struct irdma_puda_buf *buf); -enum irdma_status_code irdma_ieq_check_mpacrc(struct shash_desc *desc, - void *addr, u32 len, u32 val); -enum irdma_status_code irdma_init_hash_desc(struct shash_desc **desc); +int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, + struct irdma_puda_buf *buf); +int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, u32 val); +int irdma_init_hash_desc(struct shash_desc **desc); void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); void irdma_free_hash_desc(struct shash_desc *desc); -void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, - u32 seqnum); -enum irdma_status_code irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_qp *qp); -enum irdma_status_code irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_cq *cq); -enum irdma_status_code irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); +void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, u32 seqnum); +int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); +int irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq); +int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp); void irdma_cqp_cq_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq); void irdma_puda_ieq_get_ah_info(struct irdma_sc_qp *qp, struct irdma_ah_info *ah_info); -enum irdma_status_code irdma_puda_create_ah(struct irdma_sc_dev *dev, - struct irdma_ah_info *ah_info, - bool wait, enum puda_rsrc_type type, - void *cb_param, - struct irdma_sc_ah **ah); +int irdma_puda_create_ah(struct irdma_sc_dev *dev, + struct irdma_ah_info *ah_info, bool wait, + enum puda_rsrc_type type, void *cb_param, + struct irdma_sc_ah **ah); void irdma_puda_free_ah(struct irdma_sc_dev *dev, struct irdma_sc_ah *ah); void irdma_ieq_process_fpdus(struct irdma_sc_qp *qp, struct irdma_puda_rsrc *ieq); diff --git a/drivers/infiniband/hw/irdma/status.h b/drivers/infiniband/hw/irdma/status.h deleted file mode 100644 index 22ea3888253a..000000000000 --- a/drivers/infiniband/hw/irdma/status.h +++ /dev/null @@ -1,71 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB */ -/* Copyright (c) 2015 - 2020 Intel Corporation */ -#ifndef IRDMA_STATUS_H -#define IRDMA_STATUS_H - -/* Error Codes */ -enum irdma_status_code { - IRDMA_SUCCESS = 0, - IRDMA_ERR_NVM = -1, - IRDMA_ERR_NVM_CHECKSUM = -2, - IRDMA_ERR_CFG = -4, - IRDMA_ERR_PARAM = -5, - IRDMA_ERR_DEVICE_NOT_SUPPORTED = -6, - IRDMA_ERR_RESET_FAILED = -7, - IRDMA_ERR_SWFW_SYNC = -8, - IRDMA_ERR_NO_MEMORY = -9, - IRDMA_ERR_BAD_PTR = -10, - IRDMA_ERR_INVALID_PD_ID = -11, - IRDMA_ERR_INVALID_QP_ID = -12, - IRDMA_ERR_INVALID_CQ_ID = -13, - IRDMA_ERR_INVALID_CEQ_ID = -14, - IRDMA_ERR_INVALID_AEQ_ID = -15, - IRDMA_ERR_INVALID_SIZE = -16, - IRDMA_ERR_INVALID_ARP_INDEX = -17, - IRDMA_ERR_INVALID_FPM_FUNC_ID = -18, - IRDMA_ERR_QP_INVALID_MSG_SIZE = -19, - IRDMA_ERR_QP_TOOMANY_WRS_POSTED = -20, - IRDMA_ERR_INVALID_FRAG_COUNT = -21, - IRDMA_ERR_Q_EMPTY = -22, - IRDMA_ERR_INVALID_ALIGNMENT = -23, - IRDMA_ERR_FLUSHED_Q = -24, - IRDMA_ERR_INVALID_PUSH_PAGE_INDEX = -25, - IRDMA_ERR_INVALID_INLINE_DATA_SIZE = -26, - IRDMA_ERR_TIMEOUT = -27, - IRDMA_ERR_OPCODE_MISMATCH = -28, - IRDMA_ERR_CQP_COMPL_ERROR = -29, - IRDMA_ERR_INVALID_VF_ID = -30, - IRDMA_ERR_INVALID_HMCFN_ID = -31, - IRDMA_ERR_BACKING_PAGE_ERROR = -32, - IRDMA_ERR_NO_PBLCHUNKS_AVAILABLE = -33, - IRDMA_ERR_INVALID_PBLE_INDEX = -34, - IRDMA_ERR_INVALID_SD_INDEX = -35, - IRDMA_ERR_INVALID_PAGE_DESC_INDEX = -36, - IRDMA_ERR_INVALID_SD_TYPE = -37, - IRDMA_ERR_MEMCPY_FAILED = -38, - IRDMA_ERR_INVALID_HMC_OBJ_INDEX = -39, - IRDMA_ERR_INVALID_HMC_OBJ_COUNT = -40, - IRDMA_ERR_BUF_TOO_SHORT = -43, - IRDMA_ERR_BAD_IWARP_CQE = -44, - IRDMA_ERR_NVM_BLANK_MODE = -45, - IRDMA_ERR_NOT_IMPL = -46, - IRDMA_ERR_PE_DOORBELL_NOT_ENA = -47, - IRDMA_ERR_NOT_READY = -48, - IRDMA_NOT_SUPPORTED = -49, - IRDMA_ERR_FIRMWARE_API_VER = -50, - IRDMA_ERR_RING_FULL = -51, - IRDMA_ERR_MPA_CRC = -61, - IRDMA_ERR_NO_TXBUFS = -62, - IRDMA_ERR_SEQ_NUM = -63, - IRDMA_ERR_list_empty = -64, - IRDMA_ERR_INVALID_MAC_ADDR = -65, - IRDMA_ERR_BAD_STAG = -66, - IRDMA_ERR_CQ_COMPL_ERROR = -67, - IRDMA_ERR_Q_DESTROYED = -68, - IRDMA_ERR_INVALID_FEAT_CNT = -69, - IRDMA_ERR_REG_CQ_FULL = -70, - IRDMA_ERR_VF_MSG_ERROR = -71, - IRDMA_ERR_NO_INTR = -72, - IRDMA_ERR_REG_QSET = -73, -}; -#endif /* IRDMA_STATUS_H */ diff --git a/drivers/infiniband/hw/irdma/type.h b/drivers/infiniband/hw/irdma/type.h index 4290a2ce810a..9e7b8ecb137a 100644 --- a/drivers/infiniband/hw/irdma/type.h +++ b/drivers/infiniband/hw/irdma/type.h @@ -2,7 +2,6 @@ /* Copyright (c) 2015 - 2021 Intel Corporation */ #ifndef IRDMA_TYPE_H #define IRDMA_TYPE_H -#include "status.h" #include "osdep.h" #include "irdma.h" #include "user.h" @@ -402,8 +401,8 @@ struct irdma_sc_cqp { u64 host_ctx_pa; void *back_cqp; struct irdma_sc_dev *dev; - enum irdma_status_code (*process_cqp_sds)(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *info); + int (*process_cqp_sds)(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *info); struct irdma_dma_mem sdbuf; struct irdma_ring sq_ring; struct irdma_cqp_quanta *sq_base; @@ -605,8 +604,8 @@ struct irdma_sc_vsi { struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; struct irdma_vsi_pestat *pestat; atomic_t qp_suspend_reqs; - enum irdma_status_code (*register_qset)(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node); + int (*register_qset)(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node); void (*unregister_qset)(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node); u8 qos_rel_bw; @@ -657,7 +656,7 @@ struct irdma_sc_dev { bool vchnl_up:1; bool ceq_valid:1; u8 pci_rev; - enum irdma_status_code (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri); + int (*ws_add)(struct irdma_sc_vsi *vsi, u8 user_pri); void (*ws_remove)(struct irdma_sc_vsi *vsi, u8 user_pri); void (*ws_reset)(struct irdma_sc_vsi *vsi); }; @@ -754,8 +753,8 @@ struct irdma_vsi_init_info { u16 pf_data_vsi_num; enum irdma_vm_vf_type vm_vf_type; u16 vm_id; - enum irdma_status_code (*register_qset)(struct irdma_sc_vsi *vsi, - struct irdma_ws_node *tc_node); + int (*register_qset)(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *tc_node); void (*unregister_qset)(struct irdma_sc_vsi *vsi, struct irdma_ws_node *tc_node); }; @@ -1202,29 +1201,27 @@ struct irdma_irq_ops { }; void irdma_sc_ccq_arm(struct irdma_sc_cq *ccq); -enum irdma_status_code irdma_sc_ccq_create(struct irdma_sc_cq *ccq, u64 scratch, - bool check_overflow, bool post_sq); -enum irdma_status_code irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, - bool post_sq); -enum irdma_status_code irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, - struct irdma_ccq_cqe_info *info); -enum irdma_status_code irdma_sc_ccq_init(struct irdma_sc_cq *ccq, - struct irdma_ccq_init_info *info); +int irdma_sc_ccq_create(struct irdma_sc_cq *ccq, u64 scratch, + bool check_overflow, bool post_sq); +int irdma_sc_ccq_destroy(struct irdma_sc_cq *ccq, u64 scratch, bool post_sq); +int irdma_sc_ccq_get_cqe_info(struct irdma_sc_cq *ccq, + struct irdma_ccq_cqe_info *info); +int irdma_sc_ccq_init(struct irdma_sc_cq *ccq, + struct irdma_ccq_init_info *info); -enum irdma_status_code irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch); -enum irdma_status_code irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq); +int irdma_sc_cceq_create(struct irdma_sc_ceq *ceq, u64 scratch); +int irdma_sc_cceq_destroy_done(struct irdma_sc_ceq *ceq); -enum irdma_status_code irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, - bool post_sq); -enum irdma_status_code irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, - struct irdma_ceq_init_info *info); +int irdma_sc_ceq_destroy(struct irdma_sc_ceq *ceq, u64 scratch, bool post_sq); +int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, + struct irdma_ceq_init_info *info); void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq); void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq); -enum irdma_status_code irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, - struct irdma_aeq_init_info *info); -enum irdma_status_code irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, - struct irdma_aeqe_info *info); +int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, + struct irdma_aeq_init_info *info); +int irdma_sc_get_next_aeqe(struct irdma_sc_aeq *aeq, + struct irdma_aeqe_info *info); void irdma_sc_repost_aeq_entries(struct irdma_sc_dev *dev, u32 count); void irdma_sc_pd_init(struct irdma_sc_dev *dev, struct irdma_sc_pd *pd, u32 pd_id, @@ -1232,31 +1229,27 @@ void irdma_sc_pd_init(struct irdma_sc_dev *dev, struct irdma_sc_pd *pd, u32 pd_i void irdma_cfg_aeq(struct irdma_sc_dev *dev, u32 idx, bool enable); void irdma_check_cqp_progress(struct irdma_cqp_timeout *cqp_timeout, struct irdma_sc_dev *dev); -enum irdma_status_code irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, - u16 *min_err); -enum irdma_status_code irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp); -enum irdma_status_code irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, - struct irdma_cqp_init_info *info); +int irdma_sc_cqp_create(struct irdma_sc_cqp *cqp, u16 *maj_err, u16 *min_err); +int irdma_sc_cqp_destroy(struct irdma_sc_cqp *cqp); +int irdma_sc_cqp_init(struct irdma_sc_cqp *cqp, + struct irdma_cqp_init_info *info); void irdma_sc_cqp_post_sq(struct irdma_sc_cqp *cqp); -enum irdma_status_code irdma_sc_poll_for_cqp_op_done(struct irdma_sc_cqp *cqp, u8 opcode, - struct irdma_ccq_cqe_info *cmpl_info); -enum irdma_status_code irdma_sc_fast_register(struct irdma_sc_qp *qp, - struct irdma_fast_reg_stag_info *info, - bool post_sq); -enum irdma_status_code irdma_sc_qp_create(struct irdma_sc_qp *qp, - struct irdma_create_qp_info *info, - u64 scratch, bool post_sq); -enum irdma_status_code irdma_sc_qp_destroy(struct irdma_sc_qp *qp, - u64 scratch, bool remove_hash_idx, - bool ignore_mw_bnd, bool post_sq); -enum irdma_status_code irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, - struct irdma_qp_flush_info *info, - u64 scratch, bool post_sq); -enum irdma_status_code irdma_sc_qp_init(struct irdma_sc_qp *qp, - struct irdma_qp_init_info *info); -enum irdma_status_code irdma_sc_qp_modify(struct irdma_sc_qp *qp, - struct irdma_modify_qp_info *info, - u64 scratch, bool post_sq); +int irdma_sc_poll_for_cqp_op_done(struct irdma_sc_cqp *cqp, u8 opcode, + struct irdma_ccq_cqe_info *cmpl_info); +int irdma_sc_fast_register(struct irdma_sc_qp *qp, + struct irdma_fast_reg_stag_info *info, bool post_sq); +int irdma_sc_qp_create(struct irdma_sc_qp *qp, + struct irdma_create_qp_info *info, u64 scratch, + bool post_sq); +int irdma_sc_qp_destroy(struct irdma_sc_qp *qp, u64 scratch, + bool remove_hash_idx, bool ignore_mw_bnd, bool post_sq); +int irdma_sc_qp_flush_wqes(struct irdma_sc_qp *qp, + struct irdma_qp_flush_info *info, u64 scratch, + bool post_sq); +int irdma_sc_qp_init(struct irdma_sc_qp *qp, struct irdma_qp_init_info *info); +int irdma_sc_qp_modify(struct irdma_sc_qp *qp, + struct irdma_modify_qp_info *info, u64 scratch, + bool post_sq); void irdma_sc_send_lsmm(struct irdma_sc_qp *qp, void *lsmm_buf, u32 size, irdma_stag stag); @@ -1265,14 +1258,12 @@ void irdma_sc_qp_setctx(struct irdma_sc_qp *qp, __le64 *qp_ctx, struct irdma_qp_host_ctx_info *info); void irdma_sc_qp_setctx_roce(struct irdma_sc_qp *qp, __le64 *qp_ctx, struct irdma_qp_host_ctx_info *info); -enum irdma_status_code irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, - bool post_sq); -enum irdma_status_code irdma_sc_cq_init(struct irdma_sc_cq *cq, - struct irdma_cq_init_info *info); +int irdma_sc_cq_destroy(struct irdma_sc_cq *cq, u64 scratch, bool post_sq); +int irdma_sc_cq_init(struct irdma_sc_cq *cq, struct irdma_cq_init_info *info); void irdma_sc_cq_resize(struct irdma_sc_cq *cq, struct irdma_modify_cq_info *info); -enum irdma_status_code irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, - u64 scratch, u8 hmc_fn_id, - bool post_sq, bool poll_registers); +int irdma_sc_static_hmc_pages_allocated(struct irdma_sc_cqp *cqp, u64 scratch, + u8 hmc_fn_id, bool post_sq, + bool poll_registers); void sc_vsi_update_stats(struct irdma_sc_vsi *vsi); struct cqp_info { diff --git a/drivers/infiniband/hw/irdma/uda.c b/drivers/infiniband/hw/irdma/uda.c index 7a9988ddbd01..5692093c327d 100644 --- a/drivers/infiniband/hw/irdma/uda.c +++ b/drivers/infiniband/hw/irdma/uda.c @@ -3,7 +3,6 @@ #include #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" @@ -18,16 +17,15 @@ * @op: Operation * @scratch: u64 saved to be used during cqp completion */ -enum irdma_status_code irdma_sc_access_ah(struct irdma_sc_cqp *cqp, - struct irdma_ah_info *info, - u32 op, u64 scratch) +int irdma_sc_access_ah(struct irdma_sc_cqp *cqp, struct irdma_ah_info *info, + u32 op, u64 scratch) { __le64 *wqe; u64 qw1, qw2; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) - return IRDMA_ERR_RING_FULL; + return -ENOMEM; set_64bit_val(wqe, 0, ether_addr_to_u64(info->mac_addr) << 16); qw1 = FIELD_PREP(IRDMA_UDA_CQPSQ_MAV_PDINDEXLO, info->pd_idx) | @@ -86,8 +84,7 @@ enum irdma_status_code irdma_sc_access_ah(struct irdma_sc_cqp *cqp, * irdma_create_mg_ctx() - create a mcg context * @info: multicast group context info */ -static enum irdma_status_code -irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) +static int irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) { struct irdma_mcast_grp_ctx_entry_info *entry_info = NULL; u8 idx = 0; /* index in the array */ @@ -117,22 +114,22 @@ irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) * @op: operation to perform * @scratch: u64 saved to be used during cqp completion */ -enum irdma_status_code irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, - struct irdma_mcast_grp_info *info, - u32 op, u64 scratch) +int irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, + struct irdma_mcast_grp_info *info, u32 op, + u64 scratch) { __le64 *wqe; - enum irdma_status_code ret_code = 0; + int ret_code = 0; if (info->mg_id >= IRDMA_UDA_MAX_FSI_MGS) { ibdev_dbg(to_ibdev(cqp->dev), "WQE: mg_id out of range\n"); - return IRDMA_ERR_PARAM; + return -EINVAL; } wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); if (!wqe) { ibdev_dbg(to_ibdev(cqp->dev), "WQE: ring full\n"); - return IRDMA_ERR_RING_FULL; + return -ENOMEM; } ret_code = irdma_create_mg_ctx(info); @@ -198,8 +195,8 @@ static bool irdma_compare_mgs(struct irdma_mcast_grp_ctx_entry_info *entry1, * @ctx: Multcast group context * @mg: Multcast group info */ -enum irdma_status_code irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, - struct irdma_mcast_grp_ctx_entry_info *mg) +int irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, + struct irdma_mcast_grp_ctx_entry_info *mg) { u32 idx; bool free_entry_found = false; @@ -228,7 +225,7 @@ enum irdma_status_code irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, return 0; } - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } /** @@ -239,8 +236,8 @@ enum irdma_status_code irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, * Finds and removes a specific mulicast group from context, all * parameters must match to remove a multicast group. */ -enum irdma_status_code irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx, - struct irdma_mcast_grp_ctx_entry_info *mg) +int irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx, + struct irdma_mcast_grp_ctx_entry_info *mg) { u32 idx; @@ -269,5 +266,5 @@ enum irdma_status_code irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx, } } - return IRDMA_ERR_PARAM; + return -EINVAL; } diff --git a/drivers/infiniband/hw/irdma/uda.h b/drivers/infiniband/hw/irdma/uda.h index a4ad0367dc96..fe4820ff0cca 100644 --- a/drivers/infiniband/hw/irdma/uda.h +++ b/drivers/infiniband/hw/irdma/uda.h @@ -32,56 +32,54 @@ struct irdma_sc_ah { struct irdma_ah_info ah_info; }; -enum irdma_status_code irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, - struct irdma_mcast_grp_ctx_entry_info *mg); -enum irdma_status_code irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx, - struct irdma_mcast_grp_ctx_entry_info *mg); -enum irdma_status_code irdma_sc_access_ah(struct irdma_sc_cqp *cqp, struct irdma_ah_info *info, - u32 op, u64 scratch); -enum irdma_status_code irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, - struct irdma_mcast_grp_info *info, - u32 op, u64 scratch); +int irdma_sc_add_mcast_grp(struct irdma_mcast_grp_info *ctx, + struct irdma_mcast_grp_ctx_entry_info *mg); +int irdma_sc_del_mcast_grp(struct irdma_mcast_grp_info *ctx, + struct irdma_mcast_grp_ctx_entry_info *mg); +int irdma_sc_access_ah(struct irdma_sc_cqp *cqp, struct irdma_ah_info *info, + u32 op, u64 scratch); +int irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, + struct irdma_mcast_grp_info *info, u32 op, + u64 scratch); static inline void irdma_sc_init_ah(struct irdma_sc_dev *dev, struct irdma_sc_ah *ah) { ah->dev = dev; } -static inline enum irdma_status_code irdma_sc_create_ah(struct irdma_sc_cqp *cqp, - struct irdma_ah_info *info, - u64 scratch) +static inline int irdma_sc_create_ah(struct irdma_sc_cqp *cqp, + struct irdma_ah_info *info, u64 scratch) { return irdma_sc_access_ah(cqp, info, IRDMA_CQP_OP_CREATE_ADDR_HANDLE, scratch); } -static inline enum irdma_status_code irdma_sc_destroy_ah(struct irdma_sc_cqp *cqp, - struct irdma_ah_info *info, - u64 scratch) +static inline int irdma_sc_destroy_ah(struct irdma_sc_cqp *cqp, + struct irdma_ah_info *info, u64 scratch) { return irdma_sc_access_ah(cqp, info, IRDMA_CQP_OP_DESTROY_ADDR_HANDLE, scratch); } -static inline enum irdma_status_code irdma_sc_create_mcast_grp(struct irdma_sc_cqp *cqp, - struct irdma_mcast_grp_info *info, - u64 scratch) +static inline int irdma_sc_create_mcast_grp(struct irdma_sc_cqp *cqp, + struct irdma_mcast_grp_info *info, + u64 scratch) { return irdma_access_mcast_grp(cqp, info, IRDMA_CQP_OP_CREATE_MCAST_GRP, scratch); } -static inline enum irdma_status_code irdma_sc_modify_mcast_grp(struct irdma_sc_cqp *cqp, - struct irdma_mcast_grp_info *info, - u64 scratch) +static inline int irdma_sc_modify_mcast_grp(struct irdma_sc_cqp *cqp, + struct irdma_mcast_grp_info *info, + u64 scratch) { return irdma_access_mcast_grp(cqp, info, IRDMA_CQP_OP_MODIFY_MCAST_GRP, scratch); } -static inline enum irdma_status_code irdma_sc_destroy_mcast_grp(struct irdma_sc_cqp *cqp, - struct irdma_mcast_grp_info *info, - u64 scratch) +static inline int irdma_sc_destroy_mcast_grp(struct irdma_sc_cqp *cqp, + struct irdma_mcast_grp_info *info, + u64 scratch) { return irdma_access_mcast_grp(cqp, info, IRDMA_CQP_OP_DESTROY_MCAST_GRP, scratch); diff --git a/drivers/infiniband/hw/irdma/uk.c b/drivers/infiniband/hw/irdma/uk.c index 57a9444e9ea7..daeab5daed5b 100644 --- a/drivers/infiniband/hw/irdma/uk.c +++ b/drivers/infiniband/hw/irdma/uk.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ #include "osdep.h" -#include "status.h" #include "defs.h" #include "user.h" #include "irdma.h" @@ -56,7 +55,7 @@ static void irdma_set_fragment_gen_1(__le64 *wqe, u32 offset, * irdma_nop_1 - insert a NOP wqe * @qp: hw qp ptr */ -static enum irdma_status_code irdma_nop_1(struct irdma_qp_uk *qp) +static int irdma_nop_1(struct irdma_qp_uk *qp) { u64 hdr; __le64 *wqe; @@ -64,7 +63,7 @@ static enum irdma_status_code irdma_nop_1(struct irdma_qp_uk *qp) bool signaled = false; if (!qp->sq_ring.head) - return IRDMA_ERR_PARAM; + return -EINVAL; wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); wqe = qp->sq_base[wqe_idx].elem; @@ -245,7 +244,7 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx) { __le64 *wqe; - enum irdma_status_code ret_code; + int ret_code; if (IRDMA_RING_FULL_ERR(qp->rq_ring)) return NULL; @@ -268,16 +267,15 @@ __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx) * @info: post sq information * @post_sq: flag to post sq */ -enum irdma_status_code irdma_uk_rdma_write(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq) +int irdma_uk_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool post_sq) { u64 hdr; __le64 *wqe; struct irdma_rdma_write *op_info; u32 i, wqe_idx; u32 total_size = 0, byte_off; - enum irdma_status_code ret_code; + int ret_code; u32 frag_cnt, addl_frag_cnt; bool read_fence = false; u16 quanta; @@ -286,7 +284,7 @@ enum irdma_status_code irdma_uk_rdma_write(struct irdma_qp_uk *qp, op_info = &info->op.rdma_write; if (op_info->num_lo_sges > qp->max_sq_frag_cnt) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; for (i = 0; i < op_info->num_lo_sges; i++) total_size += op_info->lo_sg_list[i].length; @@ -305,7 +303,7 @@ enum irdma_status_code irdma_uk_rdma_write(struct irdma_qp_uk *qp, wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -370,12 +368,11 @@ enum irdma_status_code irdma_uk_rdma_write(struct irdma_qp_uk *qp, * @inv_stag: flag for inv_stag * @post_sq: flag to post sq */ -enum irdma_status_code irdma_uk_rdma_read(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool inv_stag, bool post_sq) +int irdma_uk_rdma_read(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool inv_stag, bool post_sq) { struct irdma_rdma_read *op_info; - enum irdma_status_code ret_code; + int ret_code; u32 i, byte_off, total_size = 0; bool local_fence = false; u32 addl_frag_cnt; @@ -388,7 +385,7 @@ enum irdma_status_code irdma_uk_rdma_read(struct irdma_qp_uk *qp, op_info = &info->op.rdma_read; if (qp->max_sq_frag_cnt < op_info->num_lo_sges) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; for (i = 0; i < op_info->num_lo_sges; i++) total_size += op_info->lo_sg_list[i].length; @@ -400,7 +397,7 @@ enum irdma_status_code irdma_uk_rdma_read(struct irdma_qp_uk *qp, wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -457,15 +454,14 @@ enum irdma_status_code irdma_uk_rdma_read(struct irdma_qp_uk *qp, * @info: post sq information * @post_sq: flag to post sq */ -enum irdma_status_code irdma_uk_send(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq) +int irdma_uk_send(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool post_sq) { __le64 *wqe; struct irdma_post_send *op_info; u64 hdr; u32 i, wqe_idx, total_size = 0, byte_off; - enum irdma_status_code ret_code; + int ret_code; u32 frag_cnt, addl_frag_cnt; bool read_fence = false; u16 quanta; @@ -474,7 +470,7 @@ enum irdma_status_code irdma_uk_send(struct irdma_qp_uk *qp, op_info = &info->op.send; if (qp->max_sq_frag_cnt < op_info->num_sges) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; for (i = 0; i < op_info->num_sges; i++) total_size += op_info->sg_list[i].length; @@ -490,7 +486,7 @@ enum irdma_status_code irdma_uk_send(struct irdma_qp_uk *qp, wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, total_size, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -678,9 +674,8 @@ static u16 irdma_inline_data_size_to_quanta(u32 data_size) * @info: post sq information * @post_sq: flag to post sq */ -enum irdma_status_code -irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, - bool post_sq) +int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; struct irdma_inline_rdma_write *op_info; @@ -693,13 +688,13 @@ irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *in op_info = &info->op.inline_rdma_write; if (op_info->len > qp->max_inline_data) - return IRDMA_ERR_INVALID_INLINE_DATA_SIZE; + return -EINVAL; quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -745,9 +740,8 @@ irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *in * @info: post sq information * @post_sq: flag to post sq */ -enum irdma_status_code irdma_uk_inline_send(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq) +int irdma_uk_inline_send(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq) { __le64 *wqe; struct irdma_post_inline_send *op_info; @@ -760,13 +754,13 @@ enum irdma_status_code irdma_uk_inline_send(struct irdma_qp_uk *qp, op_info = &info->op.inline_send; if (op_info->len > qp->max_inline_data) - return IRDMA_ERR_INVALID_INLINE_DATA_SIZE; + return -EINVAL; quanta = qp->wqe_ops.iw_inline_data_size_to_quanta(op_info->len); wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, quanta, op_info->len, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -817,9 +811,9 @@ enum irdma_status_code irdma_uk_inline_send(struct irdma_qp_uk *qp, * @info: post sq information * @post_sq: flag to post sq */ -enum irdma_status_code -irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, bool post_sq) +int irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, + bool post_sq) { __le64 *wqe; struct irdma_inv_local_stag *op_info; @@ -835,7 +829,7 @@ irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, IRDMA_QP_WQE_MIN_QUANTA, 0, info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -871,8 +865,8 @@ irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, * @qp: hw qp ptr * @info: post rq information */ -enum irdma_status_code irdma_uk_post_receive(struct irdma_qp_uk *qp, - struct irdma_post_rq_info *info) +int irdma_uk_post_receive(struct irdma_qp_uk *qp, + struct irdma_post_rq_info *info) { u32 wqe_idx, i, byte_off; u32 addl_frag_cnt; @@ -880,11 +874,11 @@ enum irdma_status_code irdma_uk_post_receive(struct irdma_qp_uk *qp, u64 hdr; if (qp->max_rq_frag_cnt < info->num_sges) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; wqe = irdma_qp_get_next_recv_wqe(qp, &wqe_idx); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; qp->rq_wrid_array[wqe_idx] = info->wr_id; addl_frag_cnt = info->num_sges > 1 ? (info->num_sges - 1) : 0; @@ -1000,15 +994,15 @@ void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, * @cq: hw cq * @info: cq poll information returned */ -enum irdma_status_code -irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) +int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, + struct irdma_cq_poll_info *info) { u64 comp_ctx, qword0, qword2, qword3; __le64 *cqe; struct irdma_qp_uk *qp; struct irdma_ring *pring = NULL; u32 wqe_idx, q_type; - enum irdma_status_code ret_code; + int ret_code; bool move_cq_head = true; u8 polarity; bool ext_valid; @@ -1022,7 +1016,7 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) get_64bit_val(cqe, 24, &qword3); polarity = (u8)FIELD_GET(IRDMA_CQ_VALID, qword3); if (polarity != cq->polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; /* Ensure CQE contents are read after valid bit is checked */ dma_rmb(); @@ -1045,7 +1039,7 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) polarity ^= 1; } if (polarity != cq->polarity) - return IRDMA_ERR_Q_EMPTY; + return -ENOENT; /* Ensure ext CQE contents are read after ext valid bit is checked */ dma_rmb(); @@ -1112,7 +1106,7 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) info->solicited_event = (bool)FIELD_GET(IRDMACQ_SOEVENT, qword3); qp = (struct irdma_qp_uk *)(unsigned long)comp_ctx; if (!qp || qp->destroy_pending) { - ret_code = IRDMA_ERR_Q_DESTROYED; + ret_code = -EFAULT; goto exit; } wqe_idx = (u32)FIELD_GET(IRDMA_CQ_WQEIDX, qword3); @@ -1126,7 +1120,7 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) if (info->comp_status == IRDMA_COMPL_STATUS_FLUSHED || info->comp_status == IRDMA_COMPL_STATUS_UNKNOWN) { if (!IRDMA_RING_MORE_WORK(qp->rq_ring)) { - ret_code = IRDMA_ERR_Q_EMPTY; + ret_code = -ENOENT; goto exit; } @@ -1186,7 +1180,7 @@ irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, struct irdma_cq_poll_info *info) wqe_idx + qp->sq_wrtrk_array[wqe_idx].quanta); } else { if (!IRDMA_RING_MORE_WORK(qp->sq_ring)) { - ret_code = IRDMA_ERR_Q_EMPTY; + ret_code = -ENOENT; goto exit; } @@ -1303,15 +1297,15 @@ void irdma_get_wqe_shift(struct irdma_uk_attrs *uk_attrs, u32 sge, * @sqdepth: depth of SQ * */ -enum irdma_status_code irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, - u32 sq_size, u8 shift, u32 *sqdepth) +int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, + u32 *sqdepth) { *sqdepth = irdma_qp_round_up((sq_size << shift) + IRDMA_SQ_RSVD); if (*sqdepth < (IRDMA_QP_SW_MIN_WQSIZE << shift)) *sqdepth = IRDMA_QP_SW_MIN_WQSIZE << shift; else if (*sqdepth > uk_attrs->max_hw_wq_quanta) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; return 0; } @@ -1323,15 +1317,15 @@ enum irdma_status_code irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, * @shift: shift which determines size of WQE * @rqdepth: depth of RQ */ -enum irdma_status_code irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, - u32 rq_size, u8 shift, u32 *rqdepth) +int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, + u32 *rqdepth) { *rqdepth = irdma_qp_round_up((rq_size << shift) + IRDMA_RQ_RSVD); if (*rqdepth < (IRDMA_QP_SW_MIN_WQSIZE << shift)) *rqdepth = IRDMA_QP_SW_MIN_WQSIZE << shift; else if (*rqdepth > uk_attrs->max_hw_rq_quanta) - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; return 0; } @@ -1381,17 +1375,16 @@ static void irdma_setup_connection_wqes(struct irdma_qp_uk *qp, * allowed. Then size of wqe * the number of wqes should be the * amount of memory allocated for sq and rq. */ -enum irdma_status_code irdma_uk_qp_init(struct irdma_qp_uk *qp, - struct irdma_qp_uk_init_info *info) +int irdma_uk_qp_init(struct irdma_qp_uk *qp, struct irdma_qp_uk_init_info *info) { - enum irdma_status_code ret_code = 0; + int ret_code = 0; u32 sq_ring_size; u8 sqshift, rqshift; qp->uk_attrs = info->uk_attrs; if (info->max_sq_frag_cnt > qp->uk_attrs->max_hw_wq_frags || info->max_rq_frag_cnt > qp->uk_attrs->max_hw_wq_frags) - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; irdma_get_wqe_shift(qp->uk_attrs, info->max_rq_frag_cnt, 0, &rqshift); if (qp->uk_attrs->hw_rev == IRDMA_GEN_1) { @@ -1502,8 +1495,7 @@ void irdma_uk_clean_cq(void *q, struct irdma_cq_uk *cq) * @signaled: signaled for completion * @post_sq: ring doorbell */ -enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, - bool signaled, bool post_sq) +int irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq) { __le64 *wqe; u64 hdr; @@ -1515,7 +1507,7 @@ enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, wqe = irdma_qp_get_next_send_wqe(qp, &wqe_idx, IRDMA_QP_WQE_MIN_QUANTA, 0, &info); if (!wqe) - return IRDMA_ERR_QP_TOOMANY_WRS_POSTED; + return -ENOMEM; irdma_clr_wqes(qp, wqe_idx); @@ -1541,7 +1533,7 @@ enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, * @frag_cnt: number of fragments * @quanta: quanta for frag_cnt */ -enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta) +int irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta) { switch (frag_cnt) { case 0: @@ -1577,7 +1569,7 @@ enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta) *quanta = 8; break; default: - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; } return 0; @@ -1588,7 +1580,7 @@ enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta) * @frag_cnt: number of fragments * @wqe_size: size in bytes given frag_cnt */ -enum irdma_status_code irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size) +int irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size) { switch (frag_cnt) { case 0: @@ -1615,7 +1607,7 @@ enum irdma_status_code irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size) *wqe_size = 256; break; default: - return IRDMA_ERR_INVALID_FRAG_COUNT; + return -EINVAL; } return 0; diff --git a/drivers/infiniband/hw/irdma/user.h b/drivers/infiniband/hw/irdma/user.h index 3c811fb88404..ddd0ebbdd7d5 100644 --- a/drivers/infiniband/hw/irdma/user.h +++ b/drivers/infiniband/hw/irdma/user.h @@ -270,29 +270,24 @@ struct irdma_cq_poll_info { bool imm_valid:1; }; -enum irdma_status_code irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq); -enum irdma_status_code irdma_uk_inline_send(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq); - -enum irdma_status_code irdma_uk_post_nop(struct irdma_qp_uk *qp, u64 wr_id, - bool signaled, bool post_sq); -enum irdma_status_code irdma_uk_post_receive(struct irdma_qp_uk *qp, - struct irdma_post_rq_info *info); +int irdma_uk_inline_rdma_write(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); +int irdma_uk_inline_send(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, bool post_sq); +int irdma_uk_post_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, + bool post_sq); +int irdma_uk_post_receive(struct irdma_qp_uk *qp, + struct irdma_post_rq_info *info); void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp); -enum irdma_status_code irdma_uk_rdma_read(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool inv_stag, bool post_sq); -enum irdma_status_code irdma_uk_rdma_write(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq); -enum irdma_status_code irdma_uk_send(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, bool post_sq); -enum irdma_status_code irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, - struct irdma_post_sq_info *info, - bool post_sq); +int irdma_uk_rdma_read(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool inv_stag, bool post_sq); +int irdma_uk_rdma_write(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool post_sq); +int irdma_uk_send(struct irdma_qp_uk *qp, struct irdma_post_sq_info *info, + bool post_sq); +int irdma_uk_stag_local_invalidate(struct irdma_qp_uk *qp, + struct irdma_post_sq_info *info, + bool post_sq); struct irdma_wqe_uk_ops { void (*iw_copy_inline_data)(u8 *dest, u8 *src, u32 len, u8 polarity); @@ -303,16 +298,16 @@ struct irdma_wqe_uk_ops { struct irdma_bind_window *op_info); }; -enum irdma_status_code irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, - struct irdma_cq_poll_info *info); +int irdma_uk_cq_poll_cmpl(struct irdma_cq_uk *cq, + struct irdma_cq_poll_info *info); void irdma_uk_cq_request_notification(struct irdma_cq_uk *cq, enum irdma_cmpl_notify cq_notify); void irdma_uk_cq_resize(struct irdma_cq_uk *cq, void *cq_base, int size); void irdma_uk_cq_set_resized_cnt(struct irdma_cq_uk *qp, u16 cnt); void irdma_uk_cq_init(struct irdma_cq_uk *cq, struct irdma_cq_uk_init_info *info); -enum irdma_status_code irdma_uk_qp_init(struct irdma_qp_uk *qp, - struct irdma_qp_uk_init_info *info); +int irdma_uk_qp_init(struct irdma_qp_uk *qp, + struct irdma_qp_uk_init_info *info); struct irdma_sq_uk_wr_trk_info { u64 wrid; u32 wr_len; @@ -413,16 +408,15 @@ __le64 *irdma_qp_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx, struct irdma_post_sq_info *info); __le64 *irdma_qp_get_next_recv_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx); void irdma_uk_clean_cq(void *q, struct irdma_cq_uk *cq); -enum irdma_status_code irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, - bool signaled, bool post_sq); -enum irdma_status_code irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta); -enum irdma_status_code irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size); +int irdma_nop(struct irdma_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); +int irdma_fragcnt_to_quanta_sq(u32 frag_cnt, u16 *quanta); +int irdma_fragcnt_to_wqesize_rq(u32 frag_cnt, u16 *wqe_size); void irdma_get_wqe_shift(struct irdma_uk_attrs *uk_attrs, u32 sge, u32 inline_data, u8 *shift); -enum irdma_status_code irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, - u32 sq_size, u8 shift, u32 *wqdepth); -enum irdma_status_code irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, - u32 rq_size, u8 shift, u32 *wqdepth); +int irdma_get_sqdepth(struct irdma_uk_attrs *uk_attrs, u32 sq_size, u8 shift, + u32 *wqdepth); +int irdma_get_rqdepth(struct irdma_uk_attrs *uk_attrs, u32 rq_size, u8 shift, + u32 *wqdepth); void irdma_qp_push_wqe(struct irdma_qp_uk *qp, __le64 *wqe, u16 quanta, u32 wqe_idx, bool post_sq); void irdma_clr_wqes(struct irdma_qp_uk *qp, u32 qp_wqe_idx); diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 398736d8c78a..eae4400f1dde 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -551,12 +551,12 @@ void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf) * @rf: RDMA PCI function * @cqp_request: cqp request to wait */ -static enum irdma_status_code irdma_wait_event(struct irdma_pci_f *rf, - struct irdma_cqp_request *cqp_request) +static int irdma_wait_event(struct irdma_pci_f *rf, + struct irdma_cqp_request *cqp_request) { struct irdma_cqp_timeout cqp_timeout = {}; bool cqp_error = false; - enum irdma_status_code err_code = 0; + int err_code = 0; cqp_timeout.compl_cqp_cmds = rf->sc_dev.cqp_cmd_stats[IRDMA_OP_CMPL_CMDS]; do { @@ -575,12 +575,12 @@ static enum irdma_status_code irdma_wait_event(struct irdma_pci_f *rf, rf->reset = true; rf->gen_ops.request_reset(rf); } - return IRDMA_ERR_TIMEOUT; + return -ETIMEDOUT; } while (1); cqp_error = cqp_request->compl_info.error; if (cqp_error) { - err_code = IRDMA_ERR_CQP_COMPL_ERROR; + err_code = -EIO; if (cqp_request->compl_info.maj_err_code == 0xFFFF && cqp_request->compl_info.min_err_code == 0x8029) { if (!rf->reset) { @@ -680,16 +680,16 @@ bool irdma_cqp_crit_err(struct irdma_sc_dev *dev, u8 cqp_cmd, * @rf: RDMA PCI function * @cqp_request: cqp request to process */ -enum irdma_status_code irdma_handle_cqp_op(struct irdma_pci_f *rf, - struct irdma_cqp_request *cqp_request) +int irdma_handle_cqp_op(struct irdma_pci_f *rf, + struct irdma_cqp_request *cqp_request) { struct irdma_sc_dev *dev = &rf->sc_dev; struct cqp_cmds_info *info = &cqp_request->info; - enum irdma_status_code status; + int status; bool put_cqp_request = true; if (rf->reset) - return IRDMA_ERR_NOT_READY; + return -EBUSY; irdma_get_cqp_request(cqp_request); status = irdma_process_cqp_cmd(dev, info); @@ -791,17 +791,17 @@ void *irdma_remove_cqp_head(struct irdma_sc_dev *dev) * @sdinfo: information for sd cqp * */ -enum irdma_status_code irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, - struct irdma_update_sds_info *sdinfo) +int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, + struct irdma_update_sds_info *sdinfo) { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; memcpy(&cqp_info->in.u.update_pe_sds.info, sdinfo, @@ -822,19 +822,18 @@ enum irdma_status_code irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, * @qp: hardware control qp * @op: suspend or resume */ -enum irdma_status_code irdma_cqp_qp_suspend_resume(struct irdma_sc_qp *qp, - u8 op) +int irdma_cqp_qp_suspend_resume(struct irdma_sc_qp *qp, u8 op) { struct irdma_sc_dev *dev = qp->dev; struct irdma_cqp_request *cqp_request; struct irdma_sc_cqp *cqp = dev->cqp; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, false); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->cqp_cmd = op; @@ -940,18 +939,17 @@ void irdma_terminate_del_timer(struct irdma_sc_qp *qp) * @val_mem: buffer for fpm * @hmc_fn_id: function id for fpm */ -enum irdma_status_code -irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) +int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id) { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_request->param = NULL; @@ -975,18 +973,17 @@ irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, * @val_mem: buffer with fpm values * @hmc_fn_id: function id for fpm */ -enum irdma_status_code -irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) +int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, + struct irdma_dma_mem *val_mem, u8 hmc_fn_id) { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_request->param = NULL; @@ -1009,18 +1006,17 @@ irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, * @dev: device pointer * @cq: pointer to created cq */ -enum irdma_status_code irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_cq *cq) +int irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) { struct irdma_pci_f *rf = dev_to_rf(dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->cqp_cmd = IRDMA_OP_CQ_CREATE; @@ -1039,19 +1035,18 @@ enum irdma_status_code irdma_cqp_cq_create_cmd(struct irdma_sc_dev *dev, * @dev: device pointer * @qp: pointer to created qp */ -enum irdma_status_code irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_qp *qp) +int irdma_cqp_qp_create_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) { struct irdma_pci_f *rf = dev_to_rf(dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_create_qp_info *qp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; qp_info = &cqp_request->info.in.u.qp_create.info; @@ -1079,7 +1074,7 @@ static void irdma_dealloc_push_page(struct irdma_pci_f *rf, { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; if (qp->push_idx == IRDMA_INVALID_PUSH_PAGE_INDEX) return; @@ -1179,12 +1174,10 @@ static void irdma_hw_modify_qp_callback(struct irdma_cqp_request *cqp_request) * @info: info for modify qp * @wait: flag to wait or not for modify qp completion */ -enum irdma_status_code irdma_hw_modify_qp(struct irdma_device *iwdev, - struct irdma_qp *iwqp, - struct irdma_modify_qp_info *info, - bool wait) +int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, + struct irdma_modify_qp_info *info, bool wait) { - enum irdma_status_code status; + int status; struct irdma_pci_f *rf = iwdev->rf; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; @@ -1192,7 +1185,7 @@ enum irdma_status_code irdma_hw_modify_qp(struct irdma_device *iwdev, cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; if (!wait) { cqp_request->callback_fcn = irdma_hw_modify_qp_callback; @@ -1230,7 +1223,7 @@ enum irdma_status_code irdma_hw_modify_qp(struct irdma_device *iwdev, cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; m_info = &cqp_info->in.u.qp_modify.info; @@ -1271,17 +1264,17 @@ void irdma_cqp_cq_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_cq *cq) * @dev: device pointer * @qp: pointer to qp */ -enum irdma_status_code irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) +int irdma_cqp_qp_destroy_cmd(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) { struct irdma_pci_f *rf = dev_to_rf(dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; memset(cqp_info, 0, sizeof(*cqp_info)); @@ -1317,20 +1310,20 @@ void irdma_ieq_mpa_crc_ae(struct irdma_sc_dev *dev, struct irdma_sc_qp *qp) * irdma_init_hash_desc - initialize hash for crc calculation * @desc: cryption type */ -enum irdma_status_code irdma_init_hash_desc(struct shash_desc **desc) +int irdma_init_hash_desc(struct shash_desc **desc) { struct crypto_shash *tfm; struct shash_desc *tdesc; tfm = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(tfm)) - return IRDMA_ERR_MPA_CRC; + return -EINVAL; tdesc = kzalloc(sizeof(*tdesc) + crypto_shash_descsize(tfm), GFP_KERNEL); if (!tdesc) { crypto_free_shash(tfm); - return IRDMA_ERR_MPA_CRC; + return -EINVAL; } tdesc->tfm = tfm; @@ -1358,19 +1351,19 @@ void irdma_free_hash_desc(struct shash_desc *desc) * @len: length of buffer * @val: value to be compared */ -enum irdma_status_code irdma_ieq_check_mpacrc(struct shash_desc *desc, - void *addr, u32 len, u32 val) +int irdma_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 len, + u32 val) { u32 crc = 0; int ret; - enum irdma_status_code ret_code = 0; + int ret_code = 0; crypto_shash_init(desc); ret = crypto_shash_update(desc, addr, len); if (!ret) crypto_shash_final(desc, (u8 *)&crc); if (crc != val) - ret_code = IRDMA_ERR_MPA_CRC; + ret_code = -EINVAL; return ret_code; } @@ -1524,9 +1517,8 @@ void irdma_ieq_update_tcpip_info(struct irdma_puda_buf *buf, u16 len, * @info: to get information * @buf: puda buffer */ -static enum irdma_status_code -irdma_gen1_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, - struct irdma_puda_buf *buf) +static int irdma_gen1_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, + struct irdma_puda_buf *buf) { struct iphdr *iph; struct ipv6hdr *ip6h; @@ -1563,7 +1555,7 @@ irdma_gen1_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, ibdev_dbg(to_ibdev(buf->vsi->dev), "ERR: payload_len = 0x%x totallen expected0x%x\n", info->payload_len, buf->totallen); - return IRDMA_ERR_INVALID_SIZE; + return -EINVAL; } buf->tcphlen = tcph->doff << 2; @@ -1580,9 +1572,8 @@ irdma_gen1_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, * @info: to get information * @buf: puda buffer */ -enum irdma_status_code -irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, - struct irdma_puda_buf *buf) +int irdma_puda_get_tcpip_info(struct irdma_puda_cmpl_info *info, + struct irdma_puda_buf *buf) { struct tcphdr *tcph; u32 pkt_len; @@ -1861,20 +1852,19 @@ static void irdma_process_cqp_stats(struct irdma_cqp_request *cqp_request) * @pestat: pointer to stats info * @wait: flag to wait or not wait for stats */ -enum irdma_status_code -irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, - struct irdma_vsi_pestat *pestat, bool wait) +int irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, + struct irdma_vsi_pestat *pestat, bool wait) { struct irdma_pci_f *rf = dev_to_rf(dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; memset(cqp_info, 0, sizeof(*cqp_info)); @@ -1900,22 +1890,21 @@ irdma_cqp_gather_stats_cmd(struct irdma_sc_dev *dev, * @cmd: command to allocate or free * @stats_info: pointer to allocate stats info */ -enum irdma_status_code -irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, - struct irdma_stats_inst_info *stats_info) +int irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, + struct irdma_stats_inst_info *stats_info) { struct irdma_pci_f *rf = dev_to_rf(vsi->dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; bool wait = false; if (cmd == IRDMA_OP_STATS_ALLOCATE) wait = true; cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, wait); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; memset(cqp_info, 0, sizeof(*cqp_info)); @@ -1938,17 +1927,17 @@ irdma_cqp_stats_inst_cmd(struct irdma_sc_vsi *vsi, u8 cmd, * @sc_ceq: pointer to ceq structure * @op: Create or Destroy */ -enum irdma_status_code irdma_cqp_ceq_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_ceq *sc_ceq, u8 op) +int irdma_cqp_ceq_cmd(struct irdma_sc_dev *dev, struct irdma_sc_ceq *sc_ceq, + u8 op) { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->post_sq = 1; @@ -1968,17 +1957,17 @@ enum irdma_status_code irdma_cqp_ceq_cmd(struct irdma_sc_dev *dev, * @sc_aeq: pointer to aeq structure * @op: Create or Destroy */ -enum irdma_status_code irdma_cqp_aeq_cmd(struct irdma_sc_dev *dev, - struct irdma_sc_aeq *sc_aeq, u8 op) +int irdma_cqp_aeq_cmd(struct irdma_sc_dev *dev, struct irdma_sc_aeq *sc_aeq, + u8 op) { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_pci_f *rf = dev_to_rf(dev); - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; cqp_info->post_sq = 1; @@ -1998,16 +1987,15 @@ enum irdma_status_code irdma_cqp_aeq_cmd(struct irdma_sc_dev *dev, * @cmd: Add, modify or delete * @node_info: pointer to ws node info */ -enum irdma_status_code -irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, - struct irdma_ws_node_info *node_info) +int irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, + struct irdma_ws_node_info *node_info) { struct irdma_pci_f *rf = dev_to_rf(dev); struct irdma_cqp *iwcqp = &rf->cqp; struct irdma_sc_cqp *cqp = &iwcqp->sc_cqp; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; bool poll; if (!rf->sc_dev.ceq_valid) @@ -2017,7 +2005,7 @@ irdma_cqp_ws_node_cmd(struct irdma_sc_dev *dev, u8 cmd, cqp_request = irdma_alloc_and_get_cqp_request(iwcqp, !poll); if (!cqp_request) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; cqp_info = &cqp_request->info; memset(cqp_info, 0, sizeof(*cqp_info)); @@ -2066,7 +2054,7 @@ int irdma_ah_cqp_op(struct irdma_pci_f *rf, struct irdma_sc_ah *sc_ah, u8 cmd, { struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; if (cmd != IRDMA_OP_AH_CREATE && cmd != IRDMA_OP_AH_DESTROY) return -EINVAL; @@ -2148,11 +2136,10 @@ static void irdma_ilq_ah_cb(struct irdma_cqp_request *cqp_request) * @ah_ret: Returned pointer to address handle if created * */ -enum irdma_status_code irdma_puda_create_ah(struct irdma_sc_dev *dev, - struct irdma_ah_info *ah_info, - bool wait, enum puda_rsrc_type type, - void *cb_param, - struct irdma_sc_ah **ah_ret) +int irdma_puda_create_ah(struct irdma_sc_dev *dev, + struct irdma_ah_info *ah_info, bool wait, + enum puda_rsrc_type type, void *cb_param, + struct irdma_sc_ah **ah_ret) { struct irdma_sc_ah *ah; struct irdma_pci_f *rf = dev_to_rf(dev); @@ -2161,7 +2148,7 @@ enum irdma_status_code irdma_puda_create_ah(struct irdma_sc_dev *dev, ah = kzalloc(sizeof(*ah), GFP_ATOMIC); *ah_ret = ah; if (!ah) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; err = irdma_alloc_rsrc(rf, rf->allocated_ahs, rf->max_ah, &ah_info->ah_idx, &rf->next_ah); @@ -2187,7 +2174,7 @@ error: err_free: kfree(ah); *ah_ret = NULL; - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } /** @@ -2229,19 +2216,19 @@ void irdma_gsi_ud_qp_ah_cb(struct irdma_cqp_request *cqp_request) * @pprm: pble resource manager * @pchunk: chunk of memory to add */ -enum irdma_status_code irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, - struct irdma_chunk *pchunk) +int irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, + struct irdma_chunk *pchunk) { u64 sizeofbitmap; if (pchunk->size & 0xfff) - return IRDMA_ERR_PARAM; + return -EINVAL; sizeofbitmap = (u64)pchunk->size >> pprm->pble_shift; pchunk->bitmapbuf = bitmap_zalloc(sizeofbitmap, GFP_KERNEL); if (!pchunk->bitmapbuf) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; pchunk->sizeofbitmap = sizeofbitmap; /* each pble is 8 bytes hence shift by 3 */ @@ -2259,10 +2246,9 @@ enum irdma_status_code irdma_prm_add_pble_mem(struct irdma_pble_prm *pprm, * @vaddr: returns virtual address of pble memory * @fpm_addr: returns fpm address of pble memory */ -enum irdma_status_code -irdma_prm_get_pbles(struct irdma_pble_prm *pprm, - struct irdma_pble_chunkinfo *chunkinfo, u64 mem_size, - u64 **vaddr, u64 *fpm_addr) +int irdma_prm_get_pbles(struct irdma_pble_prm *pprm, + struct irdma_pble_chunkinfo *chunkinfo, u64 mem_size, + u64 **vaddr, u64 *fpm_addr) { u64 bits_needed; u64 bit_idx = PBLE_INVALID_IDX; @@ -2290,7 +2276,7 @@ irdma_prm_get_pbles(struct irdma_pble_prm *pprm, if (!pchunk || bit_idx >= pchunk->sizeofbitmap) { spin_unlock_irqrestore(&pprm->prm_lock, flags); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } bitmap_set(pchunk->bitmapbuf, bit_idx, bits_needed); @@ -2325,8 +2311,8 @@ void irdma_prm_return_pbles(struct irdma_pble_prm *pprm, spin_unlock_irqrestore(&pprm->prm_lock, flags); } -enum irdma_status_code irdma_map_vm_page_list(struct irdma_hw *hw, void *va, - dma_addr_t *pg_dma, u32 pg_cnt) +int irdma_map_vm_page_list(struct irdma_hw *hw, void *va, dma_addr_t *pg_dma, + u32 pg_cnt) { struct page *vm_page; int i; @@ -2350,7 +2336,7 @@ enum irdma_status_code irdma_map_vm_page_list(struct irdma_hw *hw, void *va, err: irdma_unmap_vm_page_list(hw, pg_dma, i); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } void irdma_unmap_vm_page_list(struct irdma_hw *hw, dma_addr_t *pg_dma, u32 pg_cnt) @@ -2386,15 +2372,14 @@ done: * @chunk: chunk to add for paged memory * @pg_cnt: number of pages needed */ -enum irdma_status_code irdma_pble_get_paged_mem(struct irdma_chunk *chunk, - u32 pg_cnt) +int irdma_pble_get_paged_mem(struct irdma_chunk *chunk, u32 pg_cnt) { u32 size; void *va; chunk->dmainfo.dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL); if (!chunk->dmainfo.dmaaddrs) - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; size = PAGE_SIZE * pg_cnt; va = vmalloc(size); @@ -2416,7 +2401,7 @@ err: kfree(chunk->dmainfo.dmaaddrs); chunk->dmainfo.dmaaddrs = NULL; - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } /** diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 7b144e57578e..6ab56a2f6a23 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -256,7 +256,7 @@ static void irdma_alloc_push_page(struct irdma_qp *iwqp) struct cqp_cmds_info *cqp_info; struct irdma_device *iwdev = iwqp->iwdev; struct irdma_sc_qp *qp = &iwqp->sc_qp; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&iwdev->rf->cqp, true); if (!cqp_request) @@ -592,7 +592,7 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, u32 sqdepth, rqdepth; u8 sqshift, rqshift; u32 size; - enum irdma_status_code status; + int status; struct irdma_qp_uk_init_info *ukinfo = &info->qp_uk_init_info; struct irdma_uk_attrs *uk_attrs = &iwdev->rf->sc_dev.hw_attrs.uk_attrs; @@ -668,7 +668,7 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp) struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_create_qp_info *qp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); if (!cqp_request) @@ -806,7 +806,7 @@ static int irdma_create_qp(struct ib_qp *ibqp, struct irdma_create_qp_req req; struct irdma_create_qp_resp uresp = {}; u32 qp_num = 0; - enum irdma_status_code ret; + int ret; int err_code; int sq_size; int rq_size; @@ -1792,7 +1792,7 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, struct irdma_device *iwdev; struct irdma_pci_f *rf; struct irdma_cq_buf *cq_buf = NULL; - enum irdma_status_code status = 0; + int status = 0; unsigned long flags; int ret; @@ -1945,7 +1945,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, struct irdma_sc_cq *cq; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_cq_init_info info = {}; - enum irdma_status_code status; + int status; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_cq_uk_init_info *ukinfo = &info.cq_uk_init_info; @@ -2309,7 +2309,7 @@ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; struct irdma_pble_info *pinfo; u64 *pbl; - enum irdma_status_code status; + int status; enum irdma_pble_level level = PBLE_LEVEL_1; if (use_pbles) { @@ -2434,7 +2434,7 @@ static int irdma_hw_alloc_mw(struct irdma_device *iwdev, struct irdma_mr *iwmr) struct irdma_pd *iwpd = to_iwpd(iwmr->ibmr.pd); struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&iwdev->rf->cqp, true); if (!cqp_request) @@ -2533,7 +2533,7 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, { struct irdma_allocate_stag_info *info; struct irdma_pd *iwpd = to_iwpd(iwmr->ibmr.pd); - enum irdma_status_code status; + int status; int err = 0; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; @@ -2575,7 +2575,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, struct irdma_pble_alloc *palloc; struct irdma_pbl *iwpbl; struct irdma_mr *iwmr; - enum irdma_status_code status; + int status; u32 stag; int err_code = -ENOMEM; @@ -2672,7 +2672,7 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, struct irdma_reg_ns_stag_info *stag_info; struct irdma_pd *iwpd = to_iwpd(iwmr->ibmr.pd); struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; - enum irdma_status_code status; + int status; int err = 0; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; @@ -2897,7 +2897,7 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access struct irdma_device *iwdev = to_iwdev(pd->device); struct irdma_pbl *iwpbl; struct irdma_mr *iwmr; - enum irdma_status_code status; + int status; u32 stag; int ret; @@ -3057,7 +3057,7 @@ static int irdma_post_send(struct ib_qp *ibqp, struct irdma_qp_uk *ukqp; struct irdma_sc_dev *dev; struct irdma_post_sq_info info; - enum irdma_status_code ret; + int ret; int err = 0; unsigned long flags; bool inv_stag; @@ -3131,7 +3131,7 @@ static int irdma_post_send(struct ib_qp *ibqp, } if (ret) { - if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + if (ret == -ENOMEM) err = -ENOMEM; else err = -EINVAL; @@ -3170,7 +3170,7 @@ static int irdma_post_send(struct ib_qp *ibqp, } if (ret) { - if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + if (ret == -ENOMEM) err = -ENOMEM; else err = -EINVAL; @@ -3193,7 +3193,7 @@ static int irdma_post_send(struct ib_qp *ibqp, ret = irdma_uk_rdma_read(ukqp, &info, inv_stag, false); if (ret) { - if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + if (ret == -ENOMEM) err = -ENOMEM; else err = -EINVAL; @@ -3274,7 +3274,7 @@ static int irdma_post_recv(struct ib_qp *ibqp, struct irdma_qp *iwqp; struct irdma_qp_uk *ukqp; struct irdma_post_rq_info post_recv = {}; - enum irdma_status_code ret = 0; + int ret = 0; unsigned long flags; int err = 0; bool reflush = false; @@ -3293,7 +3293,7 @@ static int irdma_post_recv(struct ib_qp *ibqp, if (ret) { ibdev_dbg(&iwqp->iwdev->ibdev, "VERBS: post_recv err %d\n", ret); - if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + if (ret == -ENOMEM) err = -ENOMEM; else err = -EINVAL; @@ -3483,7 +3483,7 @@ static int __irdma_poll_cq(struct irdma_cq *iwcq, int num_entries, struct ib_wc struct irdma_cq_buf *last_buf = NULL; struct irdma_cq_poll_info *cur_cqe = &iwcq->cur_cqe; struct irdma_cq_buf *cq_buf; - enum irdma_status_code ret; + int ret; struct irdma_device *iwdev; struct irdma_cq_uk *ukcq; bool cq_new_cqe = false; @@ -3503,10 +3503,10 @@ static int __irdma_poll_cq(struct irdma_cq *iwcq, int num_entries, struct ib_wc cq_new_cqe = true; continue; } - if (ret == IRDMA_ERR_Q_EMPTY) + if (ret == -ENOENT) break; /* QP using the CQ is destroyed. Skip reporting this CQE */ - if (ret == IRDMA_ERR_Q_DESTROYED) { + if (ret == -EFAULT) { cq_new_cqe = true; continue; } @@ -3528,10 +3528,10 @@ static int __irdma_poll_cq(struct irdma_cq *iwcq, int num_entries, struct ib_wc continue; } - if (ret == IRDMA_ERR_Q_EMPTY) + if (ret == -ENOENT) break; /* QP using the CQ is destroyed. Skip reporting this CQE */ - if (ret == IRDMA_ERR_Q_DESTROYED) { + if (ret == -EFAULT) { cq_new_cqe = true; continue; } @@ -3859,7 +3859,7 @@ static int irdma_mcast_cqp_op(struct irdma_device *iwdev, { struct cqp_cmds_info *cqp_info; struct irdma_cqp_request *cqp_request; - enum irdma_status_code status; + int status; cqp_request = irdma_alloc_and_get_cqp_request(&iwdev->rf->cqp, true); if (!cqp_request) diff --git a/drivers/infiniband/hw/irdma/ws.c b/drivers/infiniband/hw/irdma/ws.c index b0d6ee0739f5..20bc8d0d7f1f 100644 --- a/drivers/infiniband/hw/irdma/ws.c +++ b/drivers/infiniband/hw/irdma/ws.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2017 - 2021 Intel Corporation */ #include "osdep.h" -#include "status.h" #include "hmc.h" #include "defs.h" #include "type.h" @@ -87,8 +86,8 @@ static void irdma_free_node(struct irdma_sc_vsi *vsi, * @node: pointer to node * @cmd: add, remove or modify */ -static enum irdma_status_code -irdma_ws_cqp_cmd(struct irdma_sc_vsi *vsi, struct irdma_ws_node *node, u8 cmd) +static int irdma_ws_cqp_cmd(struct irdma_sc_vsi *vsi, + struct irdma_ws_node *node, u8 cmd) { struct irdma_ws_node_info node_info = {}; @@ -106,7 +105,7 @@ irdma_ws_cqp_cmd(struct irdma_sc_vsi *vsi, struct irdma_ws_node *node, u8 cmd) node_info.enable = node->enable; if (irdma_cqp_ws_node_cmd(vsi->dev, cmd, &node_info)) { ibdev_dbg(to_ibdev(vsi->dev), "WS: CQP WS CMD failed\n"); - return IRDMA_ERR_NO_MEMORY; + return -ENOMEM; } if (node->type_leaf && cmd == IRDMA_OP_WS_ADD_NODE) { @@ -234,18 +233,18 @@ static void irdma_remove_leaf(struct irdma_sc_vsi *vsi, u8 user_pri) * @vsi: vsi pointer * @user_pri: user priority */ -enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) +int irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) { struct irdma_ws_node *ws_tree_root; struct irdma_ws_node *vsi_node; struct irdma_ws_node *tc_node; u16 traffic_class; - enum irdma_status_code ret = 0; + int ret = 0; int i; mutex_lock(&vsi->dev->ws_mutex); if (vsi->tc_change_pending) { - ret = IRDMA_ERR_NOT_READY; + ret = -EBUSY; goto exit; } @@ -258,7 +257,7 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) ws_tree_root = irdma_alloc_node(vsi, user_pri, WS_NODE_TYPE_PARENT, NULL); if (!ws_tree_root) { - ret = IRDMA_ERR_NO_MEMORY; + ret = -ENOMEM; goto exit; } @@ -283,7 +282,7 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) vsi_node = irdma_alloc_node(vsi, user_pri, WS_NODE_TYPE_PARENT, ws_tree_root); if (!vsi_node) { - ret = IRDMA_ERR_NO_MEMORY; + ret = -ENOMEM; goto vsi_add_err; } @@ -310,7 +309,7 @@ enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri) tc_node = irdma_alloc_node(vsi, user_pri, WS_NODE_TYPE_LEAF, vsi_node); if (!tc_node) { - ret = IRDMA_ERR_NO_MEMORY; + ret = -ENOMEM; goto leaf_add_err; } diff --git a/drivers/infiniband/hw/irdma/ws.h b/drivers/infiniband/hw/irdma/ws.h index f0e16f630701..d431e3327d26 100644 --- a/drivers/infiniband/hw/irdma/ws.h +++ b/drivers/infiniband/hw/irdma/ws.h @@ -34,7 +34,7 @@ struct irdma_ws_node { }; struct irdma_sc_vsi; -enum irdma_status_code irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri); +int irdma_ws_add(struct irdma_sc_vsi *vsi, u8 user_pri); void irdma_ws_remove(struct irdma_sc_vsi *vsi, u8 user_pri); void irdma_ws_reset(struct irdma_sc_vsi *vsi); From 45225a93ccc01a93f7fbffef4fcb2d9c649fe71e Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Thu, 17 Feb 2022 09:18:50 -0600 Subject: [PATCH 096/186] RDMA/irdma: Propagate error codes All functions now return linux error codes. Propagate the return from these functions as opposed to converting them to generic values. Link: https://lore.kernel.org/r/20220217151851.1518-3-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 2 +- drivers/infiniband/hw/irdma/main.c | 12 +++++------- drivers/infiniband/hw/irdma/verbs.c | 16 +++++++--------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index cd0d409844f1..93b71357ba98 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1099,7 +1099,7 @@ static int irdma_cfg_ceq_vector(struct irdma_pci_f *rf, struct irdma_ceq *iwceq, irq_update_affinity_hint(msix_vec->irq, &msix_vec->mask); if (status) { ibdev_dbg(&rf->iwdev->ibdev, "ERR: ceq irq config fail\n"); - return -EINVAL; + return status; } msix_vec->ceq_id = ceq_id; diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 6558d5e1ade5..babbe8ad5afa 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -176,7 +176,7 @@ static int irdma_lan_register_qset(struct irdma_sc_vsi *vsi, ret = ice_add_rdma_qset(pf, &qset); if (ret) { ibdev_dbg(&iwdev->ibdev, "WS: LAN alloc_res for rdma qset failed.\n"); - return -EINVAL; + return ret; } tc_node->l2_sched_node_id = qset.teid; @@ -280,10 +280,9 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ irdma_fill_device_info(iwdev, pf, vsi); rf = iwdev->rf; - if (irdma_ctrl_init_hw(rf)) { - err = -EIO; + err = irdma_ctrl_init_hw(rf); + if (err) goto err_ctrl_init; - } l2params.mtu = iwdev->netdev->mtu; ice_get_qos_params(pf, &qos_info); @@ -291,10 +290,9 @@ static int irdma_probe(struct auxiliary_device *aux_dev, const struct auxiliary_ if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; - if (irdma_rt_init_hw(iwdev, &l2params)) { - err = -EIO; + err = irdma_rt_init_hw(iwdev, &l2params); + if (err) goto err_rt_init; - } err = irdma_ib_register_device(iwdev); if (err) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 6ab56a2f6a23..3a5f41b4fd5b 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -603,7 +603,7 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, status = irdma_get_sqdepth(uk_attrs, ukinfo->sq_size, sqshift, &sqdepth); if (status) - return -ENOMEM; + return status; if (uk_attrs->hw_rev == IRDMA_GEN_1) rqshift = IRDMA_MAX_RQ_WQE_SHIFT_GEN1; @@ -614,7 +614,7 @@ static int irdma_setup_kmode_qp(struct irdma_device *iwdev, status = irdma_get_rqdepth(uk_attrs, ukinfo->rq_size, rqshift, &rqdepth); if (status) - return -ENOMEM; + return status; iwqp->kqp.sq_wrid_mem = kcalloc(sqdepth, sizeof(*iwqp->kqp.sq_wrid_mem), GFP_KERNEL); @@ -688,7 +688,7 @@ static int irdma_cqp_create_qp_cmd(struct irdma_qp *iwqp) status = irdma_handle_cqp_op(rf, cqp_request); irdma_put_cqp_request(&rf->cqp, cqp_request); - return status ? -ENOMEM : 0; + return status; } static void irdma_roce_fill_and_set_qpctx_info(struct irdma_qp *iwqp, @@ -2316,7 +2316,7 @@ static int irdma_setup_pbles(struct irdma_pci_f *rf, struct irdma_mr *iwmr, status = irdma_get_pble(rf->pble_rsrc, palloc, iwmr->page_cnt, false); if (status) - return -ENOMEM; + return status; iwpbl->pbl_allocated = true; level = palloc->level; @@ -2457,7 +2457,7 @@ static int irdma_hw_alloc_mw(struct irdma_device *iwdev, struct irdma_mr *iwmr) status = irdma_handle_cqp_op(iwdev->rf, cqp_request); irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); - return status ? -ENOMEM : 0; + return status; } /** @@ -3553,7 +3553,7 @@ error: ibdev_dbg(&iwdev->ibdev, "%s: Error polling CQ, irdma_err: %d\n", __func__, ret); - return -EINVAL; + return ret; } /** @@ -3873,10 +3873,8 @@ static int irdma_mcast_cqp_op(struct irdma_device *iwdev, cqp_info->in.u.mc_create.cqp = &iwdev->rf->cqp.sc_cqp; status = irdma_handle_cqp_op(iwdev->rf, cqp_request); irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); - if (status) - return -ENOMEM; - return 0; + return status; } /** From 2322d17abf0a6c90251e8c0e419620c537e2875f Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Thu, 17 Feb 2022 09:18:51 -0600 Subject: [PATCH 097/186] RDMA/irdma: Remove excess error variables As irdma_status_code is replaced with an int, there is no need for two variables to hold error codes. Remove the excess variable in functions where this occurs. Also, remove any redundant initializations which are no longer needed. Link: https://lore.kernel.org/r/20220217151851.1518-4-shiraz.saleem@intel.com Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 4 +- drivers/infiniband/hw/irdma/verbs.c | 100 ++++++++-------------------- 2 files changed, 28 insertions(+), 76 deletions(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 93b71357ba98..58cee86b4e35 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1298,10 +1298,10 @@ del_ceqs: static int irdma_create_virt_aeq(struct irdma_pci_f *rf, u32 size) { - int status = -ENOMEM; struct irdma_aeq *aeq = &rf->aeq; dma_addr_t *pg_arr; u32 pg_cnt; + int status; if (rf->rdma_ver < IRDMA_GEN_2) return -EOPNOTSUPP; @@ -1310,7 +1310,7 @@ static int irdma_create_virt_aeq(struct irdma_pci_f *rf, u32 size) aeq->mem.va = vzalloc(aeq->mem.size); if (!aeq->mem.va) - return status; + return -ENOMEM; pg_cnt = DIV_ROUND_UP(aeq->mem.size, PAGE_SIZE); status = irdma_get_pble(rf->pble_rsrc, &aeq->palloc, pg_cnt, true); diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 3a5f41b4fd5b..df0a011ff920 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -806,7 +806,6 @@ static int irdma_create_qp(struct ib_qp *ibqp, struct irdma_create_qp_req req; struct irdma_create_qp_resp uresp = {}; u32 qp_num = 0; - int ret; int err_code; int sq_size; int rq_size; @@ -936,9 +935,8 @@ static int irdma_create_qp(struct ib_qp *ibqp, if (dev->hw_attrs.uk_attrs.hw_rev > IRDMA_GEN_1) init_info.qp_uk_init_info.qp_caps |= IRDMA_PUSH_MODE; - ret = irdma_sc_qp_init(qp, &init_info); - if (ret) { - err_code = -EPROTO; + err_code = irdma_sc_qp_init(qp, &init_info); + if (err_code) { ibdev_dbg(&iwdev->ibdev, "VERBS: qp_init fail\n"); goto error; } @@ -1792,7 +1790,6 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, struct irdma_device *iwdev; struct irdma_pci_f *rf; struct irdma_cq_buf *cq_buf = NULL; - int status = 0; unsigned long flags; int ret; @@ -1885,12 +1882,10 @@ static int irdma_resize_cq(struct ib_cq *ibcq, int entries, cqp_info->in.u.cq_modify.cq = &iwcq->sc_cq; cqp_info->in.u.cq_modify.scratch = (uintptr_t)cqp_request; cqp_info->post_sq = 1; - status = irdma_handle_cqp_op(rf, cqp_request); + ret = irdma_handle_cqp_op(rf, cqp_request); irdma_put_cqp_request(&rf->cqp, cqp_request); - if (status) { - ret = -EPROTO; + if (ret) goto error; - } spin_lock_irqsave(&iwcq->lock, flags); if (cq_buf) { @@ -1945,7 +1940,6 @@ static int irdma_create_cq(struct ib_cq *ibcq, struct irdma_sc_cq *cq; struct irdma_sc_dev *dev = &rf->sc_dev; struct irdma_cq_init_info info = {}; - int status; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; struct irdma_cq_uk_init_info *ukinfo = &info.cq_uk_init_info; @@ -2095,12 +2089,10 @@ static int irdma_create_cq(struct ib_cq *ibcq, cqp_info->in.u.cq_create.cq = cq; cqp_info->in.u.cq_create.check_overflow = true; cqp_info->in.u.cq_create.scratch = (uintptr_t)cqp_request; - status = irdma_handle_cqp_op(rf, cqp_request); + err_code = irdma_handle_cqp_op(rf, cqp_request); irdma_put_cqp_request(&rf->cqp, cqp_request); - if (status) { - err_code = -ENOMEM; + if (err_code) goto cq_free_rsrc; - } if (udata) { struct irdma_create_cq_resp resp = {}; @@ -2534,7 +2526,6 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, struct irdma_allocate_stag_info *info; struct irdma_pd *iwpd = to_iwpd(iwmr->ibmr.pd); int status; - int err = 0; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; @@ -2556,10 +2547,8 @@ static int irdma_hw_alloc_stag(struct irdma_device *iwdev, cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request; status = irdma_handle_cqp_op(iwdev->rf, cqp_request); irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); - if (status) - err = -ENOMEM; - return err; + return status; } /** @@ -2575,9 +2564,8 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, struct irdma_pble_alloc *palloc; struct irdma_pbl *iwpbl; struct irdma_mr *iwmr; - int status; u32 stag; - int err_code = -ENOMEM; + int err_code; iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); if (!iwmr) @@ -2599,9 +2587,9 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, iwmr->type = IRDMA_MEMREG_TYPE_MEM; palloc = &iwpbl->pble_alloc; iwmr->page_cnt = max_num_sg; - status = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, - true); - if (status) + err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, + true); + if (err_code) goto err_get_pble; err_code = irdma_hw_alloc_stag(iwdev, iwmr); @@ -2672,10 +2660,9 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, struct irdma_reg_ns_stag_info *stag_info; struct irdma_pd *iwpd = to_iwpd(iwmr->ibmr.pd); struct irdma_pble_alloc *palloc = &iwpbl->pble_alloc; - int status; - int err = 0; struct irdma_cqp_request *cqp_request; struct cqp_cmds_info *cqp_info; + int ret; cqp_request = irdma_alloc_and_get_cqp_request(&iwdev->rf->cqp, true); if (!cqp_request) @@ -2712,12 +2699,10 @@ static int irdma_hwreg_mr(struct irdma_device *iwdev, struct irdma_mr *iwmr, cqp_info->post_sq = 1; cqp_info->in.u.mr_reg_non_shared.dev = &iwdev->rf->sc_dev; cqp_info->in.u.mr_reg_non_shared.scratch = (uintptr_t)cqp_request; - status = irdma_handle_cqp_op(iwdev->rf, cqp_request); + ret = irdma_handle_cqp_op(iwdev->rf, cqp_request); irdma_put_cqp_request(&iwdev->rf->cqp, cqp_request); - if (status) - err = -ENOMEM; - return err; + return ret; } /** @@ -2897,7 +2882,6 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access struct irdma_device *iwdev = to_iwdev(pd->device); struct irdma_pbl *iwpbl; struct irdma_mr *iwmr; - int status; u32 stag; int ret; @@ -2925,10 +2909,9 @@ struct ib_mr *irdma_reg_phys_mr(struct ib_pd *pd, u64 addr, u64 size, int access iwmr->pgaddrmem[0] = addr; iwmr->len = size; iwmr->page_size = SZ_4K; - status = irdma_hwreg_mr(iwdev, iwmr, access); - if (status) { + ret = irdma_hwreg_mr(iwdev, iwmr, access); + if (ret) { irdma_free_stag(iwdev, stag); - ret = -ENOMEM; goto err; } @@ -3057,7 +3040,6 @@ static int irdma_post_send(struct ib_qp *ibqp, struct irdma_qp_uk *ukqp; struct irdma_sc_dev *dev; struct irdma_post_sq_info info; - int ret; int err = 0; unsigned long flags; bool inv_stag; @@ -3116,7 +3098,7 @@ static int irdma_post_send(struct ib_qp *ibqp, info.op.inline_send.qkey = ud_wr(ib_wr)->remote_qkey; info.op.inline_send.dest_qp = ud_wr(ib_wr)->remote_qpn; } - ret = irdma_uk_inline_send(ukqp, &info, false); + err = irdma_uk_inline_send(ukqp, &info, false); } else { info.op.send.num_sges = ib_wr->num_sge; info.op.send.sg_list = ib_wr->sg_list; @@ -3127,14 +3109,7 @@ static int irdma_post_send(struct ib_qp *ibqp, info.op.send.qkey = ud_wr(ib_wr)->remote_qkey; info.op.send.dest_qp = ud_wr(ib_wr)->remote_qpn; } - ret = irdma_uk_send(ukqp, &info, false); - } - - if (ret) { - if (ret == -ENOMEM) - err = -ENOMEM; - else - err = -EINVAL; + err = irdma_uk_send(ukqp, &info, false); } break; case IB_WR_RDMA_WRITE_WITH_IMM: @@ -3160,20 +3135,13 @@ static int irdma_post_send(struct ib_qp *ibqp, rdma_wr(ib_wr)->remote_addr; info.op.inline_rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; - ret = irdma_uk_inline_rdma_write(ukqp, &info, false); + err = irdma_uk_inline_rdma_write(ukqp, &info, false); } else { info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; info.op.rdma_write.num_lo_sges = ib_wr->num_sge; info.op.rdma_write.rem_addr.addr = rdma_wr(ib_wr)->remote_addr; info.op.rdma_write.rem_addr.lkey = rdma_wr(ib_wr)->rkey; - ret = irdma_uk_rdma_write(ukqp, &info, false); - } - - if (ret) { - if (ret == -ENOMEM) - err = -ENOMEM; - else - err = -EINVAL; + err = irdma_uk_rdma_write(ukqp, &info, false); } break; case IB_WR_RDMA_READ_WITH_INV: @@ -3190,21 +3158,12 @@ static int irdma_post_send(struct ib_qp *ibqp, info.op.rdma_read.rem_addr.lkey = rdma_wr(ib_wr)->rkey; info.op.rdma_read.lo_sg_list = (void *)ib_wr->sg_list; info.op.rdma_read.num_lo_sges = ib_wr->num_sge; - - ret = irdma_uk_rdma_read(ukqp, &info, inv_stag, false); - if (ret) { - if (ret == -ENOMEM) - err = -ENOMEM; - else - err = -EINVAL; - } + err = irdma_uk_rdma_read(ukqp, &info, inv_stag, false); break; case IB_WR_LOCAL_INV: info.op_type = IRDMA_OP_TYPE_INV_STAG; info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; - ret = irdma_uk_stag_local_invalidate(ukqp, &info, true); - if (ret) - err = -ENOMEM; + err = irdma_uk_stag_local_invalidate(ukqp, &info, true); break; case IB_WR_REG_MR: { struct irdma_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr); @@ -3226,10 +3185,8 @@ static int irdma_post_send(struct ib_qp *ibqp, stag_info.local_fence = ib_wr->send_flags & IB_SEND_FENCE; if (iwmr->npages > IRDMA_MIN_PAGES_PER_FMR) stag_info.chunk_size = 1; - ret = irdma_sc_mr_fast_register(&iwqp->sc_qp, &stag_info, + err = irdma_sc_mr_fast_register(&iwqp->sc_qp, &stag_info, true); - if (ret) - err = -ENOMEM; break; } default: @@ -3274,7 +3231,6 @@ static int irdma_post_recv(struct ib_qp *ibqp, struct irdma_qp *iwqp; struct irdma_qp_uk *ukqp; struct irdma_post_rq_info post_recv = {}; - int ret = 0; unsigned long flags; int err = 0; bool reflush = false; @@ -3289,14 +3245,10 @@ static int irdma_post_recv(struct ib_qp *ibqp, post_recv.num_sges = ib_wr->num_sge; post_recv.wr_id = ib_wr->wr_id; post_recv.sg_list = ib_wr->sg_list; - ret = irdma_uk_post_receive(ukqp, &post_recv); - if (ret) { + err = irdma_uk_post_receive(ukqp, &post_recv); + if (err) { ibdev_dbg(&iwqp->iwdev->ibdev, - "VERBS: post_recv err %d\n", ret); - if (ret == -ENOMEM) - err = -ENOMEM; - else - err = -EINVAL; + "VERBS: post_recv err %d\n", err); goto out; } From 6a8a2e473b986191f4113c485905ea8462724d58 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 23 Feb 2022 17:07:03 -0600 Subject: [PATCH 098/186] RDMA/rxe: Warn if mcast memory is not freed Print a warning if memory allocated by mcast is not cleared when the rxe driver is unloaded. Link: https://lore.kernel.org/r/20220223230706.50332-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 3520eb2db685..fce3994d8f7a 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -29,6 +29,8 @@ void rxe_dealloc(struct ib_device *ib_dev) rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); + WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); + if (rxe->tfm) crypto_free_shash(rxe->tfm); } From 4a4f1073475796bcb343998bb1eddf6844b77963 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 23 Feb 2022 17:07:04 -0600 Subject: [PATCH 099/186] RDMA/rxe: Collect mca init code in a subroutine Collect initialization code for struct rxe_mca into a subroutine, __rxe_init_mca(), to cleanup rxe_attach_mcg() in rxe_mcast.c. Check limit on total number of attached qp's. Link: https://lore.kernel.org/r/20220223230706.50332-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 58 ++++++++++++++++++++------- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 4935fe5c5868..a0a7f8720f95 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -259,6 +259,46 @@ static void rxe_destroy_mcg(struct rxe_mcg *mcg) spin_unlock_irqrestore(&mcg->rxe->mcg_lock, flags); } +/** + * __rxe_init_mca - initialize a new mca holding lock + * @qp: qp object + * @mcg: mcg object + * @mca: empty space for new mca + * + * Context: caller must hold references on qp and mcg, rxe->mcg_lock + * and pass memory for new mca + * + * Returns: 0 on success else an error + */ +static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg, + struct rxe_mca *mca) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int n; + + n = atomic_inc_return(&rxe->mcg_attach); + if (n > rxe->attr.max_total_mcast_qp_attach) { + atomic_dec(&rxe->mcg_attach); + return -ENOMEM; + } + + n = atomic_inc_return(&mcg->qp_num); + if (n > rxe->attr.max_mcast_qp_attach) { + atomic_dec(&mcg->qp_num); + atomic_dec(&rxe->mcg_attach); + return -ENOMEM; + } + + atomic_inc(&qp->mcg_num); + + rxe_add_ref(qp); + mca->qp = qp; + + list_add_tail(&mca->qp_list, &mcg->qp_list); + + return 0; +} + static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_mcg *mcg) { @@ -291,22 +331,9 @@ static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, } } - /* check limits after checking if already attached */ - if (atomic_inc_return(&mcg->qp_num) > rxe->attr.max_mcast_qp_attach) { - atomic_dec(&mcg->qp_num); + err = __rxe_init_mca(qp, mcg, mca); + if (err) kfree(mca); - err = -ENOMEM; - goto out; - } - - /* protect pointer to qp in mca */ - rxe_add_ref(qp); - mca->qp = qp; - - atomic_inc(&qp->mcg_num); - list_add(&mca->qp_list, &mcg->qp_list); - - err = 0; out: spin_unlock_irqrestore(&rxe->mcg_lock, flags); return err; @@ -329,6 +356,7 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, if (mca->qp == qp) { list_del(&mca->qp_list); atomic_dec(&qp->mcg_num); + atomic_dec(&rxe->mcg_attach); rxe_drop_ref(qp); /* if the number of qp's attached to the diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 20fe3ee6589d..6b15251ff67a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -401,6 +401,7 @@ struct rxe_dev { spinlock_t mcg_lock; struct rb_root mcg_tree; atomic_t mcg_num; + atomic_t mcg_attach; spinlock_t pending_lock; /* guard pending_mmaps */ struct list_head pending_mmaps; From a181c4c81a7104370c6144df5daf914780f8e89e Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 23 Feb 2022 17:07:05 -0600 Subject: [PATCH 100/186] RDMA/rxe: Collect cleanup mca code in a subroutine Collect cleanup code for struct rxe_mca into a subroutine, __rxe_cleanup_mca() called in rxe_detach_mcg() in rxe_mcast.c. Link: https://lore.kernel.org/r/20220223230706.50332-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 41 +++++++++++++++++---------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index a0a7f8720f95..66c1ae703976 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -339,13 +339,31 @@ out: return err; } +/** + * __rxe_cleanup_mca - cleanup mca object holding lock + * @mca: mca object + * @mcg: mcg object + * + * Context: caller must hold a reference to mcg and rxe->mcg_lock + */ +static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg) +{ + list_del(&mca->qp_list); + + atomic_dec(&mcg->qp_num); + atomic_dec(&mcg->rxe->mcg_attach); + atomic_dec(&mca->qp->mcg_num); + rxe_drop_ref(mca->qp); + + kfree(mca); +} + static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, union ib_gid *mgid) { struct rxe_mcg *mcg; struct rxe_mca *mca, *tmp; unsigned long flags; - int err; mcg = rxe_lookup_mcg(rxe, mgid); if (!mcg) @@ -354,37 +372,30 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, spin_lock_irqsave(&rxe->mcg_lock, flags); list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { if (mca->qp == qp) { - list_del(&mca->qp_list); - atomic_dec(&qp->mcg_num); - atomic_dec(&rxe->mcg_attach); - rxe_drop_ref(qp); + __rxe_cleanup_mca(mca, mcg); /* if the number of qp's attached to the * mcast group falls to zero go ahead and * tear it down. This will not free the * object since we are still holding a ref - * from the get key above. + * from the get key above */ - if (atomic_dec_return(&mcg->qp_num) <= 0) + if (atomic_read(&mcg->qp_num) <= 0) __rxe_destroy_mcg(mcg); /* drop the ref from get key. This will free the * object if qp_num is zero. */ kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); - kfree(mca); - err = 0; - goto out_unlock; + + spin_unlock_irqrestore(&rxe->mcg_lock, flags); + return 0; } } /* we didn't find the qp on the list */ - kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); - err = -EINVAL; - -out_unlock: spin_unlock_irqrestore(&rxe->mcg_lock, flags); - return err; + return -EINVAL; } int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) From 6090a0c4c7c6156f267ee217f6577eecd610a652 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Wed, 23 Feb 2022 17:07:06 -0600 Subject: [PATCH 101/186] RDMA/rxe: Cleanup rxe_mcast.c Finish adding subroutine comment headers to subroutines in rxe_mcast.c. Make minor api change cleanups. Link: https://lore.kernel.org/r/20220223230706.50332-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_mcast.c | 99 +++++++++++++++++++++------ 1 file changed, 79 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 66c1ae703976..c399a29b648b 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -1,12 +1,33 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* + * Copyright (c) 2022 Hewlett Packard Enterprise, Inc. All rights reserved. * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ -#include "rxe.h" -#include "rxe_loc.h" +/* + * rxe_mcast.c implements driver support for multicast transport. + * It is based on two data structures struct rxe_mcg ('mcg') and + * struct rxe_mca ('mca'). An mcg is allocated each time a qp is + * attached to a new mgid for the first time. These are indexed by + * a red-black tree using the mgid. This data structure is searched + * for the mcg when a multicast packet is received and when another + * qp is attached to the same mgid. It is cleaned up when the last qp + * is detached from the mcg. Each time a qp is attached to an mcg an + * mca is created. It holds a pointer to the qp and is added to a list + * of qp's that are attached to the mcg. The qp_list is used to replicate + * mcast packets in the rxe receive path. + */ +#include "rxe.h" + +/** + * rxe_mcast_add - add multicast address to rxe device + * @rxe: rxe device object + * @mgid: multicast address as a gid + * + * Returns 0 on success else an error + */ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; @@ -16,6 +37,13 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) return dev_mc_add(rxe->ndev, ll_addr); } +/** + * rxe_mcast_delete - delete multicast address from rxe device + * @rxe: rxe device object + * @mgid: multicast address as a gid + * + * Returns 0 on success else an error + */ static int rxe_mcast_delete(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; @@ -216,7 +244,7 @@ err_dec: /** * rxe_cleanup_mcg - cleanup mcg for kref_put - * @kref: + * @kref: struct kref embnedded in mcg */ void rxe_cleanup_mcg(struct kref *kref) { @@ -299,9 +327,17 @@ static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg, return 0; } -static int rxe_attach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, - struct rxe_mcg *mcg) +/** + * rxe_attach_mcg - attach qp to mcg if not already attached + * @qp: qp object + * @mcg: mcg object + * + * Context: caller must hold reference on qp and mcg. + * Returns: 0 on success else an error + */ +static int rxe_attach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) { + struct rxe_dev *rxe = mcg->rxe; struct rxe_mca *mca, *tmp; unsigned long flags; int err; @@ -358,17 +394,19 @@ static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg) kfree(mca); } -static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, - union ib_gid *mgid) +/** + * rxe_detach_mcg - detach qp from mcg + * @mcg: mcg object + * @qp: qp object + * + * Returns: 0 on success else an error if qp is not attached. + */ +static int rxe_detach_mcg(struct rxe_mcg *mcg, struct rxe_qp *qp) { - struct rxe_mcg *mcg; + struct rxe_dev *rxe = mcg->rxe; struct rxe_mca *mca, *tmp; unsigned long flags; - mcg = rxe_lookup_mcg(rxe, mgid); - if (!mcg) - return -EINVAL; - spin_lock_irqsave(&rxe->mcg_lock, flags); list_for_each_entry_safe(mca, tmp, &mcg->qp_list, qp_list) { if (mca->qp == qp) { @@ -378,16 +416,11 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, * mcast group falls to zero go ahead and * tear it down. This will not free the * object since we are still holding a ref - * from the get key above + * from the caller */ if (atomic_read(&mcg->qp_num) <= 0) __rxe_destroy_mcg(mcg); - /* drop the ref from get key. This will free the - * object if qp_num is zero. - */ - kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); - spin_unlock_irqrestore(&rxe->mcg_lock, flags); return 0; } @@ -398,6 +431,14 @@ static int rxe_detach_mcg(struct rxe_dev *rxe, struct rxe_qp *qp, return -EINVAL; } +/** + * rxe_attach_mcast - attach qp to multicast group (see IBA-11.3.1) + * @ibqp: (IB) qp object + * @mgid: multicast IP address + * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6) + * + * Returns: 0 on success else an errno + */ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) { int err; @@ -410,20 +451,38 @@ int rxe_attach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) if (IS_ERR(mcg)) return PTR_ERR(mcg); - err = rxe_attach_mcg(rxe, qp, mcg); + err = rxe_attach_mcg(mcg, qp); /* if we failed to attach the first qp to mcg tear it down */ if (atomic_read(&mcg->qp_num) == 0) rxe_destroy_mcg(mcg); kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + return err; } +/** + * rxe_detach_mcast - detach qp from multicast group (see IBA-11.3.2) + * @ibqp: address of (IB) qp object + * @mgid: multicast IP address + * @mlid: multicast LID, ignored for RoCEv2 (see IBA-A17.5.6) + * + * Returns: 0 on success else an errno + */ int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); + struct rxe_mcg *mcg; + int err; - return rxe_detach_mcg(rxe, qp, mgid); + mcg = rxe_lookup_mcg(rxe, mgid); + if (!mcg) + return -EINVAL; + + err = rxe_detach_mcg(mcg, qp); + kref_put(&mcg->ref_cnt, rxe_cleanup_mcg); + + return err; } From 80005c43d4c8f486fe594d89e640b3d24d6b3f4c Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 22 Feb 2022 21:42:50 -0500 Subject: [PATCH 102/186] RDMA/irdma: Use net_type to check network type The member variable net_type is to check the type of network. Link: https://lore.kernel.org/r/20220223024252.3873736-2-yanjun.zhu@linux.dev Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index df0a011ff920..0e160d6a73b4 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1200,7 +1200,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, av->attrs = attr->ah_attr; rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); rdma_gid2ip((struct sockaddr *)&av->dgid_addr, &attr->ah_attr.grh.dgid); - if (av->sgid_addr.saddr.sa_family == AF_INET6) { + if (av->net_type == RDMA_NETWORK_IPV6) { __be32 *daddr = av->dgid_addr.saddr_in6.sin6_addr.in6_u.u6_addr32; __be32 *saddr = @@ -1216,7 +1216,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, &local_ip[0], false, NULL, IRDMA_ARP_RESOLVE); - } else { + } else if (av->net_type == RDMA_NETWORK_IPV4) { __be32 saddr = av->sgid_addr.saddr_in.sin_addr.s_addr; __be32 daddr = av->dgid_addr.saddr_in.sin_addr.s_addr; @@ -4129,8 +4129,6 @@ static int irdma_create_ah(struct ib_ah *ibah, rdma_gid2ip((struct sockaddr *)&dgid_addr, &ah_attr->grh.dgid); ah->av.attrs = *ah_attr; ah->av.net_type = rdma_gid_attr_network_type(sgid_attr); - ah->av.sgid_addr.saddr = sgid_addr.saddr; - ah->av.dgid_addr.saddr = dgid_addr.saddr; ah_info = &sc_ah->ah_info; ah_info->ah_idx = ah_id; ah_info->pd_idx = pd->sc_pd.pd_id; @@ -4141,7 +4139,7 @@ static int irdma_create_ah(struct ib_ah *ibah, } ether_addr_copy(dmac, ah_attr->roce.dmac); - if (rdma_gid_attr_network_type(sgid_attr) == RDMA_NETWORK_IPV4) { + if (ah->av.net_type == RDMA_NETWORK_IPV4) { ah_info->ipv4_valid = true; ah_info->dest_ip_addr[0] = ntohl(dgid_addr.saddr_in.sin_addr.s_addr); From 8627da62cc3b9de4d299f2558a9f16b4c3c13a5d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 22 Feb 2022 21:42:51 -0500 Subject: [PATCH 103/186] RDMA/irdma: Remove the unnecessary variable saddr Firstly the variable saddr was to check the type of a network. Now the variable net_type is used to do the same work. So it is removed. Link: https://lore.kernel.org/r/20220223024252.3873736-3-yanjun.zhu@linux.dev Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 1 - drivers/infiniband/hw/irdma/verbs.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 0e160d6a73b4..84be23b44e28 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4105,7 +4105,6 @@ static int irdma_create_ah(struct ib_ah *ibah, struct irdma_ah_info *ah_info; struct irdma_create_ah_resp uresp; union { - struct sockaddr saddr; struct sockaddr_in saddr_in; struct sockaddr_in6 saddr_in6; } sgid_addr, dgid_addr; diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index d0fdef8d09ea..d2d4a7e5f954 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -29,7 +29,6 @@ struct irdma_av { u8 macaddr[16]; struct rdma_ah_attr attrs; union { - struct sockaddr saddr; struct sockaddr_in saddr_in; struct sockaddr_in6 saddr_in6; } sgid_addr, dgid_addr; From 884194ef264e140a6d22f7a5de2b76765d17734a Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 22 Feb 2022 21:42:52 -0500 Subject: [PATCH 104/186] RDMA/irdma: Move union irdma_sockaddr to header file The union irdma_sockaddr is used frequently. So move it to the header file. Link: https://lore.kernel.org/r/20220223024252.3873736-4-yanjun.zhu@linux.dev Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 17 +++-------------- drivers/infiniband/hw/irdma/verbs.h | 11 +++++++---- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 84be23b44e28..ad57c53f2aed 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3882,11 +3882,7 @@ static int irdma_attach_mcast(struct ib_qp *ibqp, union ib_gid *ibgid, u16 lid) int ret = 0; bool ipv4; u16 vlan_id; - union { - struct sockaddr saddr; - struct sockaddr_in saddr_in; - struct sockaddr_in6 saddr_in6; - } sgid_addr; + union irdma_sockaddr sgid_addr; unsigned char dmac[ETH_ALEN]; rdma_gid2ip((struct sockaddr *)&sgid_addr, ibgid); @@ -4022,11 +4018,7 @@ static int irdma_detach_mcast(struct ib_qp *ibqp, union ib_gid *ibgid, u16 lid) struct irdma_mcast_grp_ctx_entry_info mcg_info = {}; int ret; unsigned long flags; - union { - struct sockaddr saddr; - struct sockaddr_in saddr_in; - struct sockaddr_in6 saddr_in6; - } sgid_addr; + union irdma_sockaddr sgid_addr; rdma_gid2ip((struct sockaddr *)&sgid_addr, ibgid); if (!ipv6_addr_v4mapped((struct in6_addr *)ibgid)) @@ -4104,10 +4096,7 @@ static int irdma_create_ah(struct ib_ah *ibah, u32 ah_id = 0; struct irdma_ah_info *ah_info; struct irdma_create_ah_resp uresp; - union { - struct sockaddr_in saddr_in; - struct sockaddr_in6 saddr_in6; - } sgid_addr, dgid_addr; + union irdma_sockaddr sgid_addr, dgid_addr; int err; u8 dmac[ETH_ALEN]; diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index d2d4a7e5f954..541105b728e3 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -25,13 +25,16 @@ struct irdma_pd { struct irdma_sc_pd sc_pd; }; +union irdma_sockaddr { + struct sockaddr_in saddr_in; + struct sockaddr_in6 saddr_in6; +}; + struct irdma_av { u8 macaddr[16]; struct rdma_ah_attr attrs; - union { - struct sockaddr_in saddr_in; - struct sockaddr_in6 saddr_in6; - } sgid_addr, dgid_addr; + union irdma_sockaddr sgid_addr; + union irdma_sockaddr dgid_addr; u8 net_type; }; From ea7596c1e5a738a6979edf1e3539e0c85db15e90 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 24 Feb 2022 13:28:32 -0500 Subject: [PATCH 105/186] RDMA/irdma: Make irdma_create_mg_ctx return a void The function irdma_create_mg_ctx always returns 0, so make it void and delete the return value check. Link: https://lore.kernel.org/r/20220224182832.3896686-1-yanjun.zhu@linux.dev Signed-off-by: Zhu Yanjun Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/uda.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/irdma/uda.c b/drivers/infiniband/hw/irdma/uda.c index 5692093c327d..284cec2a74de 100644 --- a/drivers/infiniband/hw/irdma/uda.c +++ b/drivers/infiniband/hw/irdma/uda.c @@ -84,7 +84,7 @@ int irdma_sc_access_ah(struct irdma_sc_cqp *cqp, struct irdma_ah_info *info, * irdma_create_mg_ctx() - create a mcg context * @info: multicast group context info */ -static int irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) +static void irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) { struct irdma_mcast_grp_ctx_entry_info *entry_info = NULL; u8 idx = 0; /* index in the array */ @@ -103,8 +103,6 @@ static int irdma_create_mg_ctx(struct irdma_mcast_grp_info *info) ctx_idx++; } } - - return 0; } /** @@ -119,7 +117,6 @@ int irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, u64 scratch) { __le64 *wqe; - int ret_code = 0; if (info->mg_id >= IRDMA_UDA_MAX_FSI_MGS) { ibdev_dbg(to_ibdev(cqp->dev), "WQE: mg_id out of range\n"); @@ -132,9 +129,7 @@ int irdma_access_mcast_grp(struct irdma_sc_cqp *cqp, return -ENOMEM; } - ret_code = irdma_create_mg_ctx(info); - if (ret_code) - return ret_code; + irdma_create_mg_ctx(info); set_64bit_val(wqe, 32, info->dma_mem_mc.pa); set_64bit_val(wqe, 16, From 6702bc14744847842a87fed21a795b6e8bab6965 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Fri, 25 Feb 2022 10:32:09 -0600 Subject: [PATCH 106/186] RDMA/irdma: Fix netdev notifications for vlan's Currently, events on vlan netdevs are being ignored. Fix this by finding the real netdev and processing the notifications for vlan netdevs. Fixes: 915cc7ac0f8e ("RDMA/irdma: Add miscellaneous utility definitions") Link: https://lore.kernel.org/r/20220225163211.127-2-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/utils.c | 48 ++++++++++++++++++----------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index eae4400f1dde..346c2c5dabdf 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -150,31 +150,35 @@ int irdma_inetaddr_event(struct notifier_block *notifier, unsigned long event, void *ptr) { struct in_ifaddr *ifa = ptr; - struct net_device *netdev = ifa->ifa_dev->dev; + struct net_device *real_dev, *netdev = ifa->ifa_dev->dev; struct irdma_device *iwdev; struct ib_device *ibdev; u32 local_ipaddr; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_IRDMA); + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + + ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA); if (!ibdev) return NOTIFY_DONE; iwdev = to_iwdev(ibdev); local_ipaddr = ntohl(ifa->ifa_address); ibdev_dbg(&iwdev->ibdev, - "DEV: netdev %p event %lu local_ip=%pI4 MAC=%pM\n", netdev, - event, &local_ipaddr, netdev->dev_addr); + "DEV: netdev %p event %lu local_ip=%pI4 MAC=%pM\n", real_dev, + event, &local_ipaddr, real_dev->dev_addr); switch (event) { case NETDEV_DOWN: - irdma_manage_arp_cache(iwdev->rf, netdev->dev_addr, + irdma_manage_arp_cache(iwdev->rf, real_dev->dev_addr, &local_ipaddr, true, IRDMA_ARP_DELETE); - irdma_if_notify(iwdev, netdev, &local_ipaddr, true, false); + irdma_if_notify(iwdev, real_dev, &local_ipaddr, true, false); irdma_gid_change_event(&iwdev->ibdev); break; case NETDEV_UP: case NETDEV_CHANGEADDR: - irdma_add_arp(iwdev->rf, &local_ipaddr, true, netdev->dev_addr); - irdma_if_notify(iwdev, netdev, &local_ipaddr, true, true); + irdma_add_arp(iwdev->rf, &local_ipaddr, true, real_dev->dev_addr); + irdma_if_notify(iwdev, real_dev, &local_ipaddr, true, true); irdma_gid_change_event(&iwdev->ibdev); break; default: @@ -196,32 +200,36 @@ int irdma_inet6addr_event(struct notifier_block *notifier, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; - struct net_device *netdev = ifa->idev->dev; + struct net_device *real_dev, *netdev = ifa->idev->dev; struct irdma_device *iwdev; struct ib_device *ibdev; u32 local_ipaddr6[4]; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_IRDMA); + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + + ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA); if (!ibdev) return NOTIFY_DONE; iwdev = to_iwdev(ibdev); irdma_copy_ip_ntohl(local_ipaddr6, ifa->addr.in6_u.u6_addr32); ibdev_dbg(&iwdev->ibdev, - "DEV: netdev %p event %lu local_ip=%pI6 MAC=%pM\n", netdev, - event, local_ipaddr6, netdev->dev_addr); + "DEV: netdev %p event %lu local_ip=%pI6 MAC=%pM\n", real_dev, + event, local_ipaddr6, real_dev->dev_addr); switch (event) { case NETDEV_DOWN: - irdma_manage_arp_cache(iwdev->rf, netdev->dev_addr, + irdma_manage_arp_cache(iwdev->rf, real_dev->dev_addr, local_ipaddr6, false, IRDMA_ARP_DELETE); - irdma_if_notify(iwdev, netdev, local_ipaddr6, false, false); + irdma_if_notify(iwdev, real_dev, local_ipaddr6, false, false); irdma_gid_change_event(&iwdev->ibdev); break; case NETDEV_UP: case NETDEV_CHANGEADDR: irdma_add_arp(iwdev->rf, local_ipaddr6, false, - netdev->dev_addr); - irdma_if_notify(iwdev, netdev, local_ipaddr6, false, true); + real_dev->dev_addr); + irdma_if_notify(iwdev, real_dev, local_ipaddr6, false, true); irdma_gid_change_event(&iwdev->ibdev); break; default: @@ -243,14 +251,18 @@ int irdma_net_event(struct notifier_block *notifier, unsigned long event, void *ptr) { struct neighbour *neigh = ptr; + struct net_device *real_dev, *netdev = (struct net_device *)neigh->dev; struct irdma_device *iwdev; struct ib_device *ibdev; __be32 *p; u32 local_ipaddr[4] = {}; bool ipv4 = true; - ibdev = ib_device_get_by_netdev((struct net_device *)neigh->dev, - RDMA_DRIVER_IRDMA); + real_dev = rdma_vlan_dev_real_dev(netdev); + if (!real_dev) + real_dev = netdev; + + ibdev = ib_device_get_by_netdev(real_dev, RDMA_DRIVER_IRDMA); if (!ibdev) return NOTIFY_DONE; From b200189626b5cefbaf8be9cadd7a28215e065bb9 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Fri, 25 Feb 2022 10:32:10 -0600 Subject: [PATCH 107/186] RDMA/irdma: Fix Passthrough mode in VM Using PCI_FUNC macro in a VM, when the device is in passthrough mode does not provide the real function instance. This means that currently, devices will not probe unless the instance in the VM matches the instance in the host. Fix this by getting the pf_id from the LAN during the probe. Fixes: 8498a30e1b94 ("RDMA/irdma: Register auxiliary driver and implement private channel OPs") Link: https://lore.kernel.org/r/20220225163211.127-3-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/hw.c | 2 +- drivers/infiniband/hw/irdma/i40iw_if.c | 1 + drivers/infiniband/hw/irdma/main.c | 1 + drivers/infiniband/hw/irdma/main.h | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 58cee86b4e35..3dc9b5801da1 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1600,7 +1600,7 @@ static int irdma_initialize_dev(struct irdma_pci_f *rf) info.fpm_commit_buf = mem.va; info.bar0 = rf->hw.hw_addr; - info.hmc_fn_id = PCI_FUNC(rf->pcidev->devfn); + info.hmc_fn_id = rf->pf_id; info.hw = &rf->hw; status = irdma_sc_dev_init(rf->rdma_ver, &rf->sc_dev, &info); if (status) diff --git a/drivers/infiniband/hw/irdma/i40iw_if.c b/drivers/infiniband/hw/irdma/i40iw_if.c index 8b5bd773da5a..4053ead32416 100644 --- a/drivers/infiniband/hw/irdma/i40iw_if.c +++ b/drivers/infiniband/hw/irdma/i40iw_if.c @@ -77,6 +77,7 @@ static void i40iw_fill_device_info(struct irdma_device *iwdev, struct i40e_info rf->rdma_ver = IRDMA_GEN_1; rf->gen_ops.request_reset = i40iw_request_reset; rf->pcidev = cdev_info->pcidev; + rf->pf_id = cdev_info->fid; rf->hw.hw_addr = cdev_info->hw_addr; rf->cdev = cdev_info; rf->msix_count = cdev_info->msix_count; diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index babbe8ad5afa..4f4c3454bc43 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -231,6 +231,7 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf rf->hw.hw_addr = pf->hw.hw_addr; rf->pcidev = pf->pdev; rf->msix_count = pf->num_rdma_msix; + rf->pf_id = pf->hw.pf_id; rf->msix_entries = &pf->msix_entries[pf->rdma_base_vector]; rf->default_vsi.vsi_idx = vsi->vsi_num; rf->protocol_used = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 94c41f834c25..44365d1dc7b2 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -256,6 +256,7 @@ struct irdma_pci_f { u8 *mem_rsrc; u8 rdma_ver; u8 rst_to; + u8 pf_id; enum irdma_protocol_used protocol_used; u32 sd_type; u32 msix_count; From 17850f2b0b4b806e47cc44df94186bfc2cdd490b Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Fri, 25 Feb 2022 10:32:11 -0600 Subject: [PATCH 108/186] RDMA/irdma: Remove incorrect masking of PD The PD id is masked with 0x7fff, while PD can be 18 bits for GEN2 HW. Remove the masking as it should not be needed and can cause incorrect PD id to be used. Fixes: b48c24c2d710 ("RDMA/irdma: Implement device supported verb APIs") Link: https://lore.kernel.org/r/20220225163211.127-4-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index ad57c53f2aed..e731768c2b2a 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -2501,7 +2501,7 @@ static int irdma_dealloc_mw(struct ib_mw *ibmw) cqp_info = &cqp_request->info; info = &cqp_info->in.u.dealloc_stag.info; memset(info, 0, sizeof(*info)); - info->pd_id = iwpd->sc_pd.pd_id & 0x00007fff; + info->pd_id = iwpd->sc_pd.pd_id; info->stag_idx = ibmw->rkey >> IRDMA_CQPSQ_STAG_IDX_S; info->mr = false; cqp_info->cqp_cmd = IRDMA_OP_DEALLOC_STAG; @@ -3004,7 +3004,7 @@ static int irdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) cqp_info = &cqp_request->info; info = &cqp_info->in.u.dealloc_stag.info; memset(info, 0, sizeof(*info)); - info->pd_id = iwpd->sc_pd.pd_id & 0x00007fff; + info->pd_id = iwpd->sc_pd.pd_id; info->stag_idx = ib_mr->rkey >> IRDMA_CQPSQ_STAG_IDX_S; info->mr = true; if (iwpbl->pbl_allocated) From a80501b89152adb29adc7ab943d75c7345f9a3fb Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 23 Feb 2022 15:49:01 +0800 Subject: [PATCH 109/186] RDMA/core: Remove unnecessary statements The rdma_zalloc_drv_obj() in __ib_alloc_pd() would zero pd, it unnecessary add NULL to the object in struct pd. The uverbs_free_pd() already return busy if pd->usecnt is true, there is no need to add a warning. Link: https://lore.kernel.org/r/20220223074901.201506-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e821dc94a43e..a9819c40a140 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -268,9 +268,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, return ERR_PTR(-ENOMEM); pd->device = device; - pd->uobject = NULL; - pd->__internal_mr = NULL; - atomic_set(&pd->usecnt, 0); pd->flags = flags; rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); @@ -341,11 +338,6 @@ int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) pd->__internal_mr = NULL; } - /* uverbs manipulates usecnt with proper locking, while the kabi - * requires the caller to guarantee we can't race here. - */ - WARN_ON(atomic_read(&pd->usecnt)); - ret = pd->device->ops.dealloc_pd(pd, udata); if (ret) return ret; From a25cedb4313d35e1f2968105678a47ca28e84d3b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 17 Jan 2022 14:32:12 -0500 Subject: [PATCH 110/186] ceph: switch netfs read ops to use rreq->inode instead of rreq->mapping->host One fewer pointer dereference, and in the future we may not be able to count on the mapping pointer being populated (e.g. in the DIO case). Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c98e5238a1b6..d3969fb11c59 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -185,7 +185,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp) static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) { - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_layout *lo = &ci->i_layout; u32 blockoff; @@ -202,7 +202,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) { - struct inode *inode = subreq->rreq->mapping->host; + struct inode *inode = subreq->rreq->inode; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 objno, objoff; @@ -248,7 +248,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) { struct netfs_read_request *rreq = subreq->rreq; - struct inode *inode = rreq->mapping->host; + struct inode *inode = rreq->inode; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; From 5b19f1eba45902bc75b92e9962374fc98a40e174 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Dec 2021 16:14:15 +0000 Subject: [PATCH 111/186] ceph: make ceph_netfs_issue_op() handle inlined data Make ceph_netfs_issue_op() handle inlined data on page 0. Signed-off-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 77 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index d3969fb11c59..a58b851eca4a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -245,6 +245,59 @@ static void finish_netfs_read(struct ceph_osd_request *req) iput(req->r_inode); } +static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq) +{ + struct netfs_read_request *rreq = subreq->rreq; + struct inode *inode = rreq->inode; + struct ceph_mds_reply_info_parsed *rinfo; + struct ceph_mds_reply_info_in *iinfo; + struct ceph_mds_request *req; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); + struct ceph_inode_info *ci = ceph_inode(inode); + struct iov_iter iter; + ssize_t err = 0; + size_t len; + + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); + + if (subreq->start >= inode->i_size) + goto out; + + /* We need to fetch the inline data. */ + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + req->r_ino1 = ci->i_vino; + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); + req->r_num_caps = 2; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto out; + + rinfo = &req->r_reply_info; + iinfo = &rinfo->targeti; + if (iinfo->inline_version == CEPH_INLINE_NONE) { + /* The data got uninlined */ + ceph_mdsc_put_request(req); + return false; + } + + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); + if (err == 0) + err = -EFAULT; + + ceph_mdsc_put_request(req); +out: + netfs_subreq_terminated(subreq, err, false); + return true; +} + static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) { struct netfs_read_request *rreq = subreq->rreq; @@ -259,6 +312,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) int err = 0; u64 len = subreq->len; + if (ci->i_inline_version != CEPH_INLINE_NONE && + ceph_netfs_issue_op_inline(subreq)) + return; + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, @@ -327,23 +384,9 @@ static int ceph_readpage(struct file *file, struct page *subpage) size_t len = folio_size(folio); u64 off = folio_file_pos(folio); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) { - folio_unlock(folio); - return -EINVAL; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - folio_unlock(folio); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); + dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d", + vino.ino, vino.snap, file, off, len, folio, folio_index(folio), + ci->i_inline_version != CEPH_INLINE_NONE); return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); } From 083db6fd3e73fc4f9ee74d2da1012a94216d6a60 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Dec 2021 23:48:33 +0000 Subject: [PATCH 112/186] ceph: uninline the data on a file opened for writing If a ceph file is made up of inline data, uninline that in the ceph_open() rather than in ceph_page_mkwrite(), ceph_write_iter(), ceph_fallocate() or ceph_write_begin(). This makes it easier to convert to using the netfs library for VM write hooks. Should this also take the inode lock for the duration on uninlining to prevent a race with truncation? [ jlayton: fix up folio locking, update i_inline_version after write ] Signed-off-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 154 +++++++++++++++--------------------------------- fs/ceph/file.c | 32 +++++----- fs/ceph/super.h | 2 +- 3 files changed, 64 insertions(+), 124 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a58b851eca4a..46e0881ae8b2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1317,45 +1317,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); struct folio *folio = NULL; - pgoff_t index = pos >> PAGE_SHIFT; int r; - /* - * Uninlining should have already been done and everything updated, EXCEPT - * for inline_version sent to the MDS. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) { - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (aop_flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; - folio = __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); - if (!folio) - return -ENOMEM; - - /* - * The inline_version on a new inode is set to 1. If that's the - * case, then the folio is brand new and isn't yet Uptodate. - */ - r = 0; - if (index == 0 && ci->i_inline_version != 1) { - if (!folio_test_uptodate(folio)) { - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", - ci->i_inline_version); - r = -EINVAL; - } - goto out; - } - zero_user_segment(&folio->page, 0, folio_size(folio)); - folio_mark_uptodate(folio); - goto out; - } - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, &ceph_netfs_read_ops, NULL); -out: if (r == 0) folio_wait_fscache(folio); if (r < 0) { @@ -1551,19 +1517,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (ci->i_inline_version != CEPH_INLINE_NONE) { - struct page *locked_page = NULL; - if (off == 0) { - lock_page(page); - locked_page = page; - } - err = ceph_uninline_data(vma->vm_file, locked_page); - if (locked_page) - unlock_page(locked_page); - if (err < 0) - goto out_free; - } - if (off + thp_size(page) <= size) len = thp_size(page); else @@ -1620,11 +1573,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_snap_context(snapc); } while (err == 0); - if (ret == VM_FAULT_LOCKED || - ci->i_inline_version != CEPH_INLINE_NONE) { + if (ret == VM_FAULT_LOCKED) { int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1688,16 +1639,29 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, } } -int ceph_uninline_data(struct file *filp, struct page *locked_page) +int ceph_uninline_data(struct file *file) { - struct inode *inode = file_inode(filp); + struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - struct page *page = NULL; + struct ceph_cap_flush *prealloc_cf; + struct folio *folio = NULL; + struct page *pages[1]; u64 len, inline_version; int err = 0; - bool from_pagecache = false; + + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + + folio = read_mapping_folio(inode->i_mapping, 0, file); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); + goto out; + } + + folio_lock(folio); spin_lock(&ci->i_ceph_lock); inline_version = ci->i_inline_version; @@ -1708,45 +1672,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (inline_version == 1 || /* initial version, no data */ inline_version == CEPH_INLINE_NONE) - goto out; + goto out_unlock; - if (locked_page) { - page = locked_page; - WARN_ON(!PageUptodate(page)); - } else if (ceph_caps_issued(ci) & - (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { - page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - from_pagecache = true; - lock_page(page); - } else { - put_page(page); - page = NULL; - } - } - } - - if (page) { - len = i_size_read(inode); - if (len > PAGE_SIZE) - len = PAGE_SIZE; - } else { - page = __page_cache_alloc(GFP_NOFS); - if (!page) { - err = -ENOMEM; - goto out; - } - err = __ceph_do_getattr(inode, page, - CEPH_STAT_CAP_INLINE_DATA, true); - if (err < 0) { - /* no inline data */ - if (err == -ENODATA) - err = 0; - goto out; - } - len = err; - } + len = i_size_read(inode); + if (len > folio_size(folio)) + len = folio_size(folio); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 0, 1, @@ -1754,7 +1684,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) NULL, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } req->r_mtime = inode->i_mtime; @@ -1763,7 +1693,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); if (err < 0) - goto out; + goto out_unlock; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), 0, &len, 1, 3, @@ -1772,10 +1702,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ci->i_truncate_size, false); if (IS_ERR(req)) { err = PTR_ERR(req); - goto out; + goto out_unlock; } - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + pages[0] = folio_page(folio, 0); + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); { __le64 xattr_buf = cpu_to_le64(inline_version); @@ -1785,7 +1716,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) CEPH_OSD_CMPXATTR_OP_GT, CEPH_OSD_CMPXATTR_MODE_U64); if (err) - goto out_put; + goto out_put_req; } { @@ -1796,7 +1727,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) "inline_version", xattr_buf, xattr_len, 0, 0); if (err) - goto out_put; + goto out_put_req; } req->r_mtime = inode->i_mtime; @@ -1807,19 +1738,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); -out_put: + if (!err) { + int dirty; + + /* Set to CAP_INLINE_NONE and dirty the caps */ + down_read(&fsc->mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + up_read(&fsc->mdsc->snap_rwsem); + if (dirty) + __mark_inode_dirty(inode, dirty); + } +out_put_req: ceph_osdc_put_request(req); if (err == -ECANCELED) err = 0; +out_unlock: + folio_unlock(folio); + folio_put(folio); out: - if (page && page != locked_page) { - if (from_pagecache) { - unlock_page(page); - put_page(page); - } else - __free_pages(page, 0); - } - + ceph_free_cap_flush(prealloc_cf); dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", inode, ceph_vinop(inode), inline_version, err); return err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bbed3224ad68..22ca724aef36 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_mount_options *opt = ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; + int ret; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, inode->i_mode, isdir ? "dir" : "regular"); @@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, INIT_LIST_HEAD(&fi->rw_contexts); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); + if ((file->f_mode & FMODE_WRITE) && + ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file); + if (ret < 0) + goto error; + } + return 0; + +error: + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); + /* wake up anyone waiting for caps on this inode */ + wake_up_all(&ci->i_cap_wq); + return ret; } /* @@ -1041,7 +1057,6 @@ static void ceph_aio_complete(struct inode *inode, } spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &aio_req->prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -1778,12 +1793,6 @@ retry_snap: if (err) goto out; - if (ci->i_inline_version != CEPH_INLINE_NONE) { - err = ceph_uninline_data(file, NULL); - if (err < 0) - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) @@ -1855,7 +1864,6 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2109,12 +2117,6 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ci->i_inline_version != CEPH_INLINE_NONE) { - ret = ceph_uninline_data(file, NULL); - if (ret < 0) - goto unlock; - } - size = i_size_read(inode); /* Are we punching a hole beyond EOF? */ @@ -2139,7 +2141,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); - ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&ci->i_ceph_lock); @@ -2532,7 +2533,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); - dst_ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); spin_unlock(&dst_ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 67f145e1ae7a..a35c27489901 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1213,7 +1213,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci, /* addr.c */ extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); -extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_uninline_data(struct file *file); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); From 9eaa7b79979f2cb1ac0508d413dc7f7664feb430 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 3 Feb 2022 09:04:24 -0500 Subject: [PATCH 113/186] ceph: eliminate req->r_wait_for_completion from ceph_mds_request ...and instead just pass the wait function on the stack. Make ceph_mdsc_wait_request non-static, and add an argument for wait for completion. Then have ceph_lock_message call ceph_mdsc_submit_request, and ceph_mdsc_wait_request and pass in the pointer to ceph_lock_wait_for_completion. While we're in there, rearrange some fields in ceph_mds_request, so we save a total of 24 bytes per. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 8 ++++---- fs/ceph/mds_client.c | 11 ++++++----- fs/ceph/mds_client.h | 9 +++++---- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d1f154aec249..3e2843e86e27 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; - if (wait) - req->r_wait_for_completion = ceph_lock_wait_for_completion; - - err = ceph_mdsc_do_request(mdsc, inode, req); + err = ceph_mdsc_submit_request(mdsc, inode, req); + if (!err) + err = ceph_mdsc_wait_request(mdsc, req, wait ? + ceph_lock_wait_for_completion : NULL); if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c30eefc0ac19..f26ed1d76be2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2946,15 +2946,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, return err; } -static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *req) +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func) { int err; /* wait */ dout("do_request waiting\n"); - if (!req->r_timeout && req->r_wait_for_completion) { - err = req->r_wait_for_completion(mdsc, req); + if (wait_func) { + err = wait_func(mdsc, req); } else { long timeleft = wait_for_completion_killable_timeout( &req->r_completion, @@ -3011,7 +3012,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* issue */ err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) - err = ceph_mdsc_wait_request(mdsc, req); + err = ceph_mdsc_wait_request(mdsc, req, NULL); dout("do_request %p done, result %d\n", req, err); return err; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 97c7f7bfa55f..ab12f3ce81a3 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -274,8 +274,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ - const struct cred *r_cred; int r_request_release_offset; + const struct cred *r_cred; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -296,12 +296,11 @@ struct ceph_mds_request { struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; - + u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; - u32 r_readdir_offset; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ @@ -329,7 +328,6 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; - ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; @@ -507,6 +505,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req, + ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); From fbed7045f552db33f5f9e99ec40d55e6e0128b51 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 5 Feb 2022 08:39:33 -0500 Subject: [PATCH 114/186] ceph: wait for async create reply before sending any cap messages If we haven't received a reply to an async create request, then we don't want to send any cap messages to the MDS for that inode yet. Just have ceph_check_caps and __kick_flushing_caps return without doing anything, and have ceph_write_inode wait for the reply if we were asked to wait on the inode writeback. URL: https://tracker.ceph.com/issues/54107 Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: Patrick Donnelly Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b472cd066d1c..991c6c8625b2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ceph_get_mds_session(session); spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + /* Don't send messages until we get async create reply */ + spin_unlock(&ci->i_ceph_lock); + ceph_put_mds_session(session); + return; + } + if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: @@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) dout("write_inode %p wait=%d\n", inode, wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { + err = ceph_wait_on_async_create(inode); + if (err) + return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, @@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, u64 first_tid = 0; u64 last_snap_flush = 0; + /* Don't do anything until create reply comes in */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) + return; + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { From 4d9513cf6d20f2977c791087cf4fd9cadb9d28e3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Feb 2022 10:27:50 -0500 Subject: [PATCH 115/186] ceph: wake waiters after failed async create Currently, we only wake the waiters if we got a req->r_target_inode out of the reply. In the case where the create fails though, we may not have one. If there is a non-zero result from the create, then ensure that we wake anything waiting on the inode after we shut it down. URL: https://tracker.ceph.com/issues/54067 Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 51 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 22ca724aef36..feb75eb1cd82 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -532,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino) } } +static void wake_async_create_waiters(struct inode *inode, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); +} + static void ceph_async_create_cb(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { + struct dentry *dentry = req->r_dentry; + struct inode *dinode = d_inode(dentry); + struct inode *tinode = req->r_target_inode; int result = req->r_err ? req->r_err : le32_to_cpu(req->r_reply_info.head->result); + WARN_ON_ONCE(dinode && tinode && dinode != tinode); + + /* MDS changed -- caller must resubmit */ if (result == -EJUKEBOX) goto out; mapping_set_error(req->r_parent->i_mapping, result); if (result) { - struct dentry *dentry = req->r_dentry; - struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &base, 0); + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<>" : path, result); + ceph_mdsc_free_path(path, pathlen); + ceph_dir_clear_complete(req->r_parent); if (!d_unhashed(dentry)) d_drop(dentry); - ceph_inode_shutdown(inode); - - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", - base, IS_ERR(path) ? "<>" : path, result); - ceph_mdsc_free_path(path, pathlen); + if (dinode) { + mapping_set_error(dinode->i_mapping, result); + ceph_inode_shutdown(dinode); + wake_async_create_waiters(dinode, req->r_session); + } } - if (req->r_target_inode) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - u64 ino = ceph_vino(req->r_target_inode).ino; + if (tinode) { + u64 ino = ceph_vino(tinode).ino; if (req->r_deleg_ino != ino) pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", __func__, req->r_err, req->r_deleg_ino, ino); - mapping_set_error(req->r_target_inode->i_mapping, result); - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); - } - ceph_kick_flushing_inode_caps(req->r_session, ci); - spin_unlock(&ci->i_ceph_lock); + mapping_set_error(tinode->i_mapping, result); + wake_async_create_waiters(tinode, req->r_session); } else if (!result) { pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, req->r_deleg_ino); From 370f0acf2c707a3973daf528a725f046c9528f6b Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 7 Feb 2022 13:03:40 +0800 Subject: [PATCH 116/186] ceph: fail the request directly if handle_reply gets an ESTALE If MDS return ESTALE, that means the MDS has already iterated all the possible active MDSes including the auth MDS or the inode is under purging. No need to retry in auth MDS and will just return ESTALE directly. Retrying in this situation will cause an infinite loop. Also, retrying like this would prevent the kernel VFS layer ESTALE handling from working properly. An ESTALE error is usually an indication that the dcache is wrong, so we want to allow the VFS to redo the lookup and revalidate it properly. URL: https://tracker.ceph.com/issues/53504 Signed-off-by: Xiubo Li Acked-by: Greg Farnum Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f26ed1d76be2..f3abbf83ea5b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3098,35 +3098,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) result = le32_to_cpu(head->result); - /* - * Handle an ESTALE - * if we're not talking to the authority, send to them - * if the authority has changed while we weren't looking, - * send to new authority - * Otherwise we just have to return an ESTALE - */ - if (result == -ESTALE) { - dout("got ESTALE on request %llu\n", req->r_tid); - req->r_resend_mds = -1; - if (req->r_direct_mode != USE_AUTH_MDS) { - dout("not using auth, setting for that now\n"); - req->r_direct_mode = USE_AUTH_MDS; - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } else { - int mds = __choose_mds(mdsc, req, NULL); - if (mds >= 0 && mds != req->r_session->s_mds) { - dout("but auth changed, so resending\n"); - __do_request(mdsc, req); - mutex_unlock(&mdsc->mutex); - goto out; - } - } - dout("have to return ESTALE on request %llu\n", req->r_tid); - } - - if (head->safe) { set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); __unregister_request(mdsc, req); From 810313c5f3f5cff36c342d3e26c13f170c3af044 Mon Sep 17 00:00:00 2001 From: hongnanli Date: Fri, 11 Feb 2022 11:33:25 +0800 Subject: [PATCH 117/186] ceph: fix comments mentioning i_mutex inode->i_mutex has been replaced with inode->i_rwsem long ago. Fix comments still mentioning i_mutex. Signed-off-by: hongnanli Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 6 +++--- fs/ceph/inode.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 133dbd9338e7..0cf6afe283e9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_mutex, no need to use page lock */ + i_rwsem, no need to use page lock */ unlock_page(cache_ctl->page); cache_ctl->dentries = kmap(cache_ctl->page); } @@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, rcu_read_lock(); spin_lock(&parent->d_lock); /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex. */ + * marked as complete while not holding the i_rwsem. */ if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) dentry = cache_ctl->dentries[cache_ctl->index]; else @@ -671,7 +671,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ /* .snap dir? */ if (ceph_snap(parent) == CEPH_NOSNAP && diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4a980a7bf3..22adddb3d051 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1201,7 +1201,7 @@ out_unlock: /* * splice a dentry to an inode. - * caller must hold directory i_mutex for this to be safe. + * caller must hold directory i_rwsem for this to be safe. */ static int splice_dentry(struct dentry **pdn, struct inode *in) { @@ -1598,7 +1598,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, return idx == 0 ? -ENOMEM : 0; } /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ + * i_rwsem, no need to use page lock */ unlock_page(ctl->page); ctl->dentries = kmap(ctl->page); if (idx == 0) From 27884f4bce63ebb8821992787d3e687cc24d95a3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Feb 2022 08:19:45 -0500 Subject: [PATCH 118/186] libceph: drop else branches in prepare_read_data{,_cont} Just call set_in_bvec in the non-conditional part. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/messenger_v2.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c81379f93ad5..c6e5bfc717d5 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1773,10 +1773,8 @@ static int prepare_read_data(struct ceph_connection *con) bv.bv_page = con->bounce_page; bv.bv_offset = 0; - set_in_bvec(con, &bv); - } else { - set_in_bvec(con, &bv); } + set_in_bvec(con, &bv); con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; return 0; } @@ -1807,10 +1805,8 @@ static void prepare_read_data_cont(struct ceph_connection *con) if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { bv.bv_page = con->bounce_page; bv.bv_offset = 0; - set_in_bvec(con, &bv); - } else { - set_in_bvec(con, &bv); } + set_in_bvec(con, &bv); WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); return; } From 6ddf5f165f13ab623d04aee2a473d35818255199 Mon Sep 17 00:00:00 2001 From: Milind Changire Date: Mon, 14 Feb 2022 05:01:01 +0000 Subject: [PATCH 119/186] ceph: add getvxattr op Problem: Some directory vxattrs (e.g. ceph.dir.pin.random) are governed by information that isn't necessarily shared with the client. Add support for the new GETVXATTR operation, which allows the client to query the MDS directly for vxattrs. When the client is queried for a vxattr that doesn't have a special handler, have it issue a GETVXATTR to the MDS directly. Solution: Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and ceph.file.layout* vxattrs. If the entire layout for a dir or a file is being set, then it is expected that the layout be set in standard JSON format. Individual field value retrieval is not wrapped in JSON. The JSON format also applies while setting the vxattr if the entire layout is being set in one go. As a temporary measure, setting a vxattr can also be done in the old format. The old format will be deprecated in the future. URL: https://tracker.ceph.com/issues/51062 Signed-off-by: Milind Changire Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 51 ++++++++++++++++++++++++++++++++++++ fs/ceph/mds_client.c | 24 +++++++++++++++++ fs/ceph/mds_client.h | 6 +++++ fs/ceph/strings.c | 1 + fs/ceph/super.h | 1 + fs/ceph/xattr.c | 13 +++++++-- include/linux/ceph/ceph_fs.h | 1 + 7 files changed, 95 insertions(+), 2 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 22adddb3d051..7b1e93c8a0d2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2301,6 +2301,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, return err; } +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, + size_t size) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_request *req; + int mode = USE_AUTH_MDS; + int err; + char *xattr_value; + size_t xattr_value_len; + + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); + if (IS_ERR(req)) { + err = -ENOMEM; + goto out; + } + + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + err = -ENOMEM; + goto put; + } + + ihold(inode); + req->r_inode = inode; + err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err < 0) + goto put; + + xattr_value = req->r_reply_info.xattr_info.xattr_value; + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; + + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); + + err = (int)xattr_value_len; + if (size == 0) + goto put; + + if (xattr_value_len > size) { + err = -ERANGE; + goto put; + } + + memcpy(value, xattr_value, xattr_value_len); +put: + ceph_mdsc_put_request(req); +out: + dout("do_getvxattr result=%d\n", err); + return err; +} + /* * Check inode permissions. We verify we have a valid value for diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f3abbf83ea5b..0b7bde73bf03 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -555,6 +555,28 @@ bad: return -EIO; } +static int parse_reply_info_getvxattr(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + u64 features) +{ + u32 value_len; + + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ + ceph_decode_skip_32(p, end, bad); /* skip payload length */ + + ceph_decode_32_safe(p, end, value_len, bad); + + if (value_len == end - *p) { + info->xattr_info.xattr_value = *p; + info->xattr_info.xattr_value_len = value_len; + *p = end; + return value_len; + } +bad: + return -EIO; +} + /* * parse extra results */ @@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end, return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); + else if (op == CEPH_MDS_OP_GETVXATTR) + return parse_reply_info_getvxattr(p, end, info, features); else return -EIO; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ab12f3ce81a3..33497846e47e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry { loff_t offset; }; +struct ceph_mds_reply_xattr { + char *xattr_value; + size_t xattr_value_len; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed { char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; + struct ceph_mds_reply_xattr xattr_info; /* extra */ union { diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 573bb9556fb5..e36e8948e728 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; case CEPH_MDS_OP_GETATTR: return "getattr"; + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr"; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a35c27489901..4569f802ddbb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1048,6 +1048,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode) /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index fcf7dfdecf96..afec84088471 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_xattr *xattr; - struct ceph_vxattr *vxattr = NULL; + struct ceph_vxattr *vxattr; int req_mask; ssize_t err; + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) + goto handle_non_vxattrs; + /* let's see if a virtual xattr was requested */ vxattr = ceph_match_vxattr(inode, name); if (vxattr) { @@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, err = -ERANGE; } return err; + } else { + err = ceph_do_getvxattr(inode, name, value, size); + /* this would happen with a new client and old server combo */ + if (err == -EOPNOTSUPP) + err = -ENODATA; + return err; } - +handle_non_vxattrs: req_mask = __get_request_mask(inode); spin_lock(&ci->i_ceph_lock); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 7ad6c3d0db7d..66db21ac5f0c 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -328,6 +328,7 @@ enum { CEPH_MDS_OP_LOOKUPPARENT = 0x00103, CEPH_MDS_OP_LOOKUPINO = 0x00104, CEPH_MDS_OP_LOOKUPNAME = 0x00105, + CEPH_MDS_OP_GETVXATTR = 0x00106, CEPH_MDS_OP_SETXATTR = 0x01105, CEPH_MDS_OP_RMXATTR = 0x01106, From ab58a5a1c0487b67f7409f39d3c8593d416d4e7f Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 15 Feb 2022 20:23:14 +0800 Subject: [PATCH 120/186] ceph: move to a dedicated slabcache for ceph_cap_snap There could be huge number of capsnaps around at any given time. On x86_64 the structure is 248 bytes, which will be rounded up to 256 bytes by kzalloc. Move this to a dedicated slabcache to save 8 bytes for each. [ jlayton: use kmem_cache_zalloc ] Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 5 +++-- fs/ceph/super.c | 7 +++++++ fs/ceph/super.h | 2 +- include/linux/ceph/libceph.h | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b41e6724c591..bc5ec72d958c 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -482,7 +482,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); if (!capsnap) { pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); return; @@ -603,7 +603,8 @@ update_snapc: spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); - kfree(capsnap); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); ceph_put_snap_context(old_snapc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bf79f369aec6..978463fa822c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -864,6 +864,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_snap_cachep; struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -892,6 +893,9 @@ static int __init init_caches(void) ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); if (!ceph_cap_cachep) goto bad_cap; + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); + if (!ceph_cap_snap_cachep) + goto bad_cap_snap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (!ceph_cap_flush_cachep) @@ -931,6 +935,8 @@ bad_file: bad_dentry: kmem_cache_destroy(ceph_cap_flush_cachep); bad_cap_flush: + kmem_cache_destroy(ceph_cap_snap_cachep); +bad_cap_snap: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -947,6 +953,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_snap_cachep); kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 4569f802ddbb..a2caa7beca4f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -230,7 +230,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); - kfree(capsnap); + kmem_cache_free(ceph_cap_snap_cachep, capsnap); } } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index edf62eaa6285..00af2c98da75 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -284,6 +284,7 @@ DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_cap_snap_cachep; extern struct kmem_cache *ceph_cap_flush_cachep; extern struct kmem_cache *ceph_dentry_cachep; extern struct kmem_cache *ceph_file_cachep; From 2941bf53f59c5fb4def7e5883cb452ae7b2ee304 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 17 Feb 2022 16:15:42 +0800 Subject: [PATCH 121/186] ceph: zero the dir_entries memory when allocating it This potentially will cause a bug in future if using an old ceph version that sends a smaller inode struct, which can cause some members to be skipped in handle_reply. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0b7bde73bf03..ef9145477aae 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2202,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, + __GFP_NOWARN | + __GFP_ZERO, order); if (rinfo->dir_entries) break; From 2e586641c950e7f3e7e008404bd783a466b9b590 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sat, 19 Feb 2022 14:28:33 +0800 Subject: [PATCH 122/186] ceph: do not update snapshot context when there is no new snapshot We will only track the uppest parent snapshot realm from which we need to rebuild the snapshot contexts _downward_ in hierarchy. For all the others having no new snapshot we will do nothing. This fix will avoid calling ceph_queue_cap_snap() on some inodes inappropriately. For example, with the code in mainline, suppose there are 2 directory hierarchies (with 6 directories total), like this: /dir_X1/dir_X2/dir_X3/ /dir_Y1/dir_Y2/dir_Y3/ Firstly, make a snapshot under /dir_X1/dir_X2/.snap/snap_X2, then make a root snapshot under /.snap/root_snap. Every time we make snapshots under /dir_Y1/..., the kclient will always try to rebuild the snap context for snap_X2 realm and finally will always try to queue cap snaps for dir_Y2 and dir_Y3, which makes no sense. That's because the snap_X2's seq is 2 and root_snap's seq is 3. So when creating a new snapshot under /dir_Y1/... the new seq will be 4, and the mds will send the kclient a snapshot backtrace in _downward_ order: seqs 4, 3. When ceph_update_snap_trace() is called, it will always rebuild the from the last realm, that's the root_snap. So later when rebuilding the snap context, the current logic will always cause it to rebuild the snap_X2 realm and then try to queue cap snaps for all the inodes related in that realm, even though it's not necessary. This is accompanied by a lot of these sorts of dout messages: "ceph: queue_cap_snap 00000000a42b796b nothing dirty|writing" Fix the logic to avoid this situation. Also, the 'invalidate' word is not precise here. In actuality, it will cause a rebuild of the existing snapshot contexts or just build non-existent ones. Rename it to 'rebuild_snapcs'. URL: https://tracker.ceph.com/issues/44100 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index bc5ec72d958c..5fd79ae96f34 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -708,7 +708,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, __le64 *prior_parent_snaps; /* encoded */ struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; - int invalidate = 0; + struct ceph_snap_realm *realm_to_rebuild = NULL; + int rebuild_snapcs; int err = -ENOMEM; LIST_HEAD(dirty_realms); @@ -716,6 +717,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, dout("update_snap_trace deletion=%d\n", deletion); more: + rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); ri = p; p += sizeof(*ri); @@ -739,7 +741,7 @@ more: err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); if (err < 0) goto fail; - invalidate += err; + rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { dout("update_snap_trace updating %llx %p %lld -> %lld\n", @@ -764,22 +766,30 @@ more: if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; - invalidate = 1; + rebuild_snapcs = 1; } else if (!realm->cached_context) { dout("update_snap_trace %llx %p seq %lld new\n", realm->ino, realm, realm->seq); - invalidate = 1; + rebuild_snapcs = 1; } else { dout("update_snap_trace %llx %p seq %lld unchanged\n", realm->ino, realm, realm->seq); } - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, - realm, invalidate, p, e); + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, + realm, rebuild_snapcs, p, e); - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate && p >= e) - rebuild_snap_realms(realm, &dirty_realms); + /* + * this will always track the uppest parent realm from which + * we need to rebuild the snapshot contexts _downward_ in + * hierarchy. + */ + if (rebuild_snapcs) + realm_to_rebuild = realm; + + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ + if (realm_to_rebuild && p >= e) + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); if (!first_realm) first_realm = realm; From 74a31df4f1f16807a93e414a322baf6eceddb60d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sat, 19 Feb 2022 14:56:17 +0800 Subject: [PATCH 123/186] ceph: eliminate the recursion when rebuilding the snap context Use a list instead of recursion to avoid possible stack overflow. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 57 +++++++++++++++++++++++++++++++++++++++++-------- fs/ceph/super.h | 2 ++ 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 5fd79ae96f34..66a1a92cf579 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -127,6 +127,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->child_item); INIT_LIST_HEAD(&realm->empty_item); INIT_LIST_HEAD(&realm->dirty_item); + INIT_LIST_HEAD(&realm->rebuild_item); INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); @@ -320,7 +321,8 @@ static int cmpu64_rev(const void *a, const void *b) * build the snap context for a given realm. */ static int build_snap_context(struct ceph_snap_realm *realm, - struct list_head* dirty_realms) + struct list_head *realm_queue, + struct list_head *dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -334,9 +336,9 @@ static int build_snap_context(struct ceph_snap_realm *realm, */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent, dirty_realms); - if (err) - goto fail; + /* add to the queue head */ + list_add(&parent->rebuild_item, realm_queue); + return 1; } num += parent->cached_context->num_snaps; } @@ -420,13 +422,50 @@ fail: static void rebuild_snap_realms(struct ceph_snap_realm *realm, struct list_head *dirty_realms) { - struct ceph_snap_realm *child; + LIST_HEAD(realm_queue); + int last = 0; + bool skip = false; - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm, dirty_realms); + list_add_tail(&realm->rebuild_item, &realm_queue); - list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child, dirty_realms); + while (!list_empty(&realm_queue)) { + struct ceph_snap_realm *_realm, *child; + + _realm = list_first_entry(&realm_queue, + struct ceph_snap_realm, + rebuild_item); + + /* + * If the last building failed dues to memory + * issue, just empty the realm_queue and return + * to avoid infinite loop. + */ + if (last < 0) { + list_del_init(&_realm->rebuild_item); + continue; + } + + last = build_snap_context(_realm, &realm_queue, dirty_realms); + dout("rebuild_snap_realms %llx %p, %s\n", _realm->ino, _realm, + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); + + /* is any child in the list ? */ + list_for_each_entry(child, &_realm->children, child_item) { + if (!list_empty(&child->rebuild_item)) { + skip = true; + break; + } + } + + if (!skip) { + list_for_each_entry(child, &_realm->children, child_item) + list_add_tail(&child->rebuild_item, &realm_queue); + } + + /* last == 1 means need to build parent first */ + if (last <= 0) + list_del_init(&_realm->rebuild_item); + } } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a2caa7beca4f..ef9f32ec905e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -883,6 +883,8 @@ struct ceph_snap_realm { struct list_head dirty_item; /* if realm needs new context */ + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ + /* the current set of snaps for this realm */ struct ceph_snap_context *cached_context; From 7e7d67a2044a2af919382e84f82ae4f9b141e134 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 21 Feb 2022 07:12:08 -0500 Subject: [PATCH 124/186] MAINTAINERS: add Xiubo Li as cephfs co-maintainer Xiubo has been doing stellar kernel work lately, and has graciously volunteered to help with maintainer duties. Add him on as co-maintainer in for ceph.ko and libceph.ko. Signed-off-by: Jeff Layton Acked-by: Xiubo Li Signed-off-by: Ilya Dryomov --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1ba1e4af2cbc..812166aca39f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4446,6 +4446,7 @@ F: drivers/power/supply/cw2015_battery.c CEPH COMMON CODE (LIBCEPH) M: Ilya Dryomov M: Jeff Layton +M: Xiubo Li L: ceph-devel@vger.kernel.org S: Supported W: http://ceph.com/ @@ -4456,6 +4457,7 @@ F: net/ceph/ CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) M: Jeff Layton +M: Xiubo Li M: Ilya Dryomov L: ceph-devel@vger.kernel.org S: Supported From 1753629ea0f34900467185b7d8b0db11a64f4728 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 23 Feb 2022 09:04:55 +0800 Subject: [PATCH 125/186] ceph: remove incorrect and unused CEPH_INO_DOTDOT macro Ceph have removed this macro and the 0x3 will be use for global dummy snaprealm. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 66db21ac5f0c..f14f9bc290e6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -29,7 +29,6 @@ #define CEPH_INO_ROOT 1 #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ -#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 From 5ed91587e201c77b35a5555c8c082655bb5834fe Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 23 Feb 2022 09:04:56 +0800 Subject: [PATCH 126/186] ceph: do not release the global snaprealm until unmounting The global snaprealm would be created and then destroyed immediately every time when updating it. URL: https://tracker.ceph.com/issues/54362 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- fs/ceph/snap.c | 13 +++++++++++-- fs/ceph/super.h | 2 +- include/linux/ceph/ceph_fs.h | 3 ++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ef9145477aae..fa38c013126d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4838,7 +4838,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); ceph_cleanup_snapid_map(mdsc); - ceph_cleanup_empty_realms(mdsc); + ceph_cleanup_global_and_empty_realms(mdsc); cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 66a1a92cf579..cc9097c27052 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -121,7 +121,11 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 1); /* for caller */ + /* Do not release the global dummy snaprealm until unmouting */ + if (ino == CEPH_INO_GLOBAL_SNAPREALM) + atomic_set(&realm->nref, 2); + else + atomic_set(&realm->nref, 1); realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); @@ -261,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_empty_lock); } -void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) +void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) { + struct ceph_snap_realm *global_realm; + down_write(&mdsc->snap_rwsem); + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); + if (global_realm) + ceph_put_snap_realm(mdsc, global_realm); __cleanup_empty_realms(mdsc); up_write(&mdsc->snap_rwsem); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ef9f32ec905e..0b4b519682f1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -940,7 +940,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); -extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, u64 snap); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index f14f9bc290e6..86bf82dbd8b8 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -28,7 +28,8 @@ #define CEPH_INO_ROOT 1 -#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ +#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ #define CEPH_MAX_MON 31 From 1ab36c9dfa0116a522410117eda159e6bc97c407 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 23 Feb 2022 09:20:42 +0800 Subject: [PATCH 127/186] ceph: allocate capsnap memory outside of ceph_queue_cap_snap() This will reduce very possible but unnecessary frequently memory allocate/free in this loop. URL: https://tracker.ceph.com/issues/44100 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index cc9097c27052..b75dcc9a36b6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -522,23 +522,15 @@ static bool has_new_snaps(struct ceph_snap_context *o, * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -static void ceph_queue_cap_snap(struct ceph_inode_info *ci) +static void ceph_queue_cap_snap(struct ceph_inode_info *ci, + struct ceph_cap_snap **pcapsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_cap_snap *capsnap; struct ceph_snap_context *old_snapc, *new_snapc; + struct ceph_cap_snap *capsnap = *pcapsnap; struct ceph_buffer *old_blob = NULL; int used, dirty; - capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); - if (!capsnap) { - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); - return; - } - capsnap->cap_flush.is_capsnap = true; - INIT_LIST_HEAD(&capsnap->cap_flush.i_list); - INIT_LIST_HEAD(&capsnap->cap_flush.g_list); - spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); @@ -595,9 +587,6 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->need_flush ? "" : "no_flush"); ihold(inode); - refcount_set(&capsnap->nref, 1); - INIT_LIST_HEAD(&capsnap->ci_item); - capsnap->follows = old_snapc->seq; capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = dirty; @@ -635,7 +624,7 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) /* note mtime, size NOW. */ __ceph_finish_cap_snap(ci, capsnap); } - capsnap = NULL; + *pcapsnap = NULL; old_snapc = NULL; update_snapc: @@ -651,8 +640,6 @@ update_snapc: spin_unlock(&ci->i_ceph_lock); ceph_buffer_put(old_blob); - if (capsnap) - kmem_cache_free(ceph_cap_snap_cachep, capsnap); ceph_put_snap_context(old_snapc); } @@ -720,6 +707,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; + struct ceph_cap_snap *capsnap = NULL; dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); @@ -731,12 +719,34 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci); + + /* + * Allocate the capsnap memory outside of ceph_queue_cap_snap() + * to reduce very possible but unnecessary frequently memory + * allocate/free in this loop. + */ + if (!capsnap) { + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); + if (!capsnap) { + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", + inode); + return; + } + } + capsnap->cap_flush.is_capsnap = true; + refcount_set(&capsnap->nref, 1); + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); + INIT_LIST_HEAD(&capsnap->ci_item); + + ceph_queue_cap_snap(ci, &capsnap); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); + if (capsnap) + kmem_cache_free(ceph_cap_snap_cachep, capsnap); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } From ad5255c1ea9c64b350efe732c90e63063b2bbbe0 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 23 Feb 2022 09:27:45 +0800 Subject: [PATCH 128/186] ceph: misc fix for code style and logs To make the logs more readable such as for log likes: ceph: will move 00000000a42b796b to split realm 100000003ed 000000007146df45 With this it will always show the inode numbers instead the inode addresses. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 122 +++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b75dcc9a36b6..322ee5add942 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -137,7 +137,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( __insert_snap_realm(&mdsc->snap_realms, realm); mdsc->num_snap_realms++; - dout("create_snap_realm %llx %p\n", realm->ino, realm); + dout("%s %llx %p\n", __func__, realm->ino, realm); return realm; } @@ -161,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, else if (ino > r->ino) n = n->rb_right; else { - dout("lookup_snap_realm %llx %p\n", r->ino, r); + dout("%s %llx %p\n", __func__, r->ino, r); return r; } } @@ -189,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); + dout("%s %p %llx\n", __func__, realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); mdsc->num_snap_realms--; @@ -302,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, if (IS_ERR(parent)) return PTR_ERR(parent); } - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", - realm->ino, realm, realm->parent_ino, realm->parent, - parentino, parent); + dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, + realm, realm->parent_ino, realm->parent, parentino, parent); if (realm->parent) { list_del_init(&realm->child_item); ceph_put_snap_realm(mdsc, realm->parent); @@ -360,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->cached_context->seq == realm->seq && (!parent || realm->cached_context->seq >= parent->cached_context->seq)) { - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" - " (unchanged)\n", - realm->ino, realm, realm->cached_context, + dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", + __func__, realm->ino, realm, realm->cached_context, realm->cached_context->seq, (unsigned int)realm->cached_context->num_snaps); return 0; @@ -401,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm, sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); snapc->num_snaps = num; - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", - realm->ino, realm, snapc, snapc->seq, - (unsigned int) snapc->num_snaps); + dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, + realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; @@ -420,8 +417,7 @@ fail: ceph_put_snap_context(realm->cached_context); realm->cached_context = NULL; } - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, - realm, err); + pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); return err; } @@ -455,7 +451,7 @@ static void rebuild_snap_realms(struct ceph_snap_realm *realm, } last = build_snap_context(_realm, &realm_queue, dirty_realms); - dout("rebuild_snap_realms %llx %p, %s\n", _realm->ino, _realm, + dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); /* is any child in the list ? */ @@ -551,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p already pending\n", inode); + dout("%s %p %llx.%llx already pending\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } if (ci->i_wrbuffer_ref_head == 0 && !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); + dout("%s %p %llx.%llx nothing dirty|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } @@ -576,15 +574,15 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, } else { if (!(used & CEPH_CAP_FILE_WR) && ci->i_wrbuffer_ref_head == 0) { - dout("queue_cap_snap %p " - "no new_snap|dirty_page|writing\n", inode); + dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", + __func__, inode, ceph_vinop(inode)); goto update_snapc; } } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", - inode, capsnap, old_snapc, ceph_cap_string(dirty), - capsnap->need_flush ? "" : "no_flush"); + dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", + __func__, inode, ceph_vinop(inode), capsnap, old_snapc, + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); ihold(inode); capsnap->follows = old_snapc->seq; @@ -616,8 +614,8 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, + dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," + " now pending\n", __func__, inode, ceph_vinop(inode), capsnap, old_snapc, old_snapc->seq); capsnap->writing = 1; } else { @@ -628,12 +626,12 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci, old_snapc = NULL; update_snapc: - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ci->i_head_snapc = NULL; - } else { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } @@ -668,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "still has %d dirty pages\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size, - capsnap->dirty_pages); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "still has %d dirty pages\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size, capsnap->dirty_pages); return 0; } /* Fb cap still in use, delay it */ if (ci->i_wb_ref) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " - "used WRBUFFER, delaying\n", inode, capsnap, - capsnap->context, capsnap->context->seq, - ceph_cap_string(capsnap->dirty), capsnap->size); + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " + "used WRBUFFER, delaying\n", __func__, inode, + ceph_vinop(inode), capsnap, capsnap->context, + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); capsnap->writing = 1; return 0; } ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", - inode, capsnap, capsnap->context, + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", + __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, capsnap->context->seq, ceph_cap_string(capsnap->dirty), capsnap->size); @@ -709,7 +708,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) struct inode *lastinode = NULL; struct ceph_cap_snap *capsnap = NULL; - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + dout("%s %p %llx inode\n", __func__, realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { @@ -747,7 +746,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (capsnap) kmem_cache_free(ceph_cap_snap_cachep, capsnap); - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); + dout("%s %p %llx done\n", __func__, realm, realm->ino); } /* @@ -773,7 +772,7 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("update_snap_trace deletion=%d\n", deletion); + dout("%s deletion=%d\n", __func__, deletion); more: rebuild_snapcs = 0; ceph_decode_need(&p, e, sizeof(*ri), bad); @@ -802,7 +801,7 @@ more: rebuild_snapcs += err; if (le64_to_cpu(ri->seq) > realm->seq) { - dout("update_snap_trace updating %llx %p %lld -> %lld\n", + dout("%s updating %llx %p %lld -> %lld\n", __func__, realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); /* update realm parameters, snap lists */ realm->seq = le64_to_cpu(ri->seq); @@ -826,11 +825,11 @@ more: rebuild_snapcs = 1; } else if (!realm->cached_context) { - dout("update_snap_trace %llx %p seq %lld new\n", + dout("%s %llx %p seq %lld new\n", __func__, realm->ino, realm, realm->seq); rebuild_snapcs = 1; } else { - dout("update_snap_trace %llx %p seq %lld unchanged\n", + dout("%s %llx %p seq %lld unchanged\n", __func__, realm->ino, realm, realm->seq); } @@ -883,7 +882,7 @@ fail: ceph_put_snap_realm(mdsc, realm); if (first_realm) ceph_put_snap_realm(mdsc, first_realm); - pr_err("update_snap_trace error %d\n", err); + pr_err("%s error %d\n", __func__, err); return err; } @@ -900,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) struct inode *inode; struct ceph_mds_session *session = NULL; - dout("flush_snaps\n"); + dout("%s\n", __func__); spin_lock(&mdsc->snap_flush_lock); while (!list_empty(&mdsc->snap_flush_list)) { ci = list_first_entry(&mdsc->snap_flush_list, @@ -915,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->snap_flush_lock); ceph_put_mds_session(session); - dout("flush_snaps done\n"); + dout("%s done\n", __func__); } /** @@ -997,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, trace_len = le32_to_cpu(h->trace_len); p += sizeof(*h); - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, - ceph_snap_op_name(op), split, trace_len); + dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, + mds, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); inc_session_sequence(session); @@ -1058,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, */ if (ci->i_snap_realm->created > le64_to_cpu(ri->created)) { - dout(" leaving %p in newer realm %llx %p\n", - inode, ci->i_snap_realm->ino, + dout(" leaving %p %llx.%llx in newer realm %llx %p\n", + inode, ceph_vinop(inode), ci->i_snap_realm->ino, ci->i_snap_realm); goto skip_inode; } - dout(" will move %p to split realm %llx %p\n", - inode, realm->ino, realm); + dout(" will move %p %llx.%llx to split realm %llx %p\n", + inode, ceph_vinop(inode), realm->ino, realm); ceph_get_snap_realm(mdsc, realm); ceph_change_snap_realm(inode, realm); @@ -1107,7 +1106,7 @@ skip_inode: return; bad: - pr_err("corrupt snap message from mds%d\n", mds); + pr_err("%s corrupt snap message from mds%d\n", __func__, mds); ceph_msg_dump(msg); out: if (locked_rwsem) @@ -1140,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, } spin_unlock(&mdsc->snapid_map_lock); if (exist) { - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } @@ -1184,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, if (exist) { free_anon_bdev(sm->dev); kfree(sm); - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + dout("%s found snapid map %llx -> %x\n", __func__, + exist->snap, exist->dev); return exist; } - dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + dout("%s create snapid map %llx -> %x\n", __func__, + sm->snap, sm->dev); return sm; } From b135e324d7a2e7fa0a7ef925076136e799b79f44 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 8 Feb 2022 14:25:09 -0500 Subject: [PATCH 129/186] IB/hfi1: Allow larger MTU without AIP The AIP code signals the phys_mtu in the following query_port() fragment: props->phys_mtu = HFI1_CAP_IS_KSET(AIP) ? hfi1_max_mtu : ib_mtu_enum_to_int(props->max_mtu); Using the largest MTU possible should not depend on AIP. Fix by unconditionally using the hfi1_max_mtu value. Fixes: 6d72344cf6c4 ("IB/ipoib: Increase ipoib Datagram mode MTU's upper limit") Link: https://lore.kernel.org/r/1644348309-174874-1-git-send-email-mike.marciniszyn@cornelisnetworks.com Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index dc9211f3a009..99d0743133ca 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1397,8 +1397,7 @@ static int query_port(struct rvt_dev_info *rdi, u32 port_num, 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_4096); - props->phys_mtu = HFI1_CAP_IS_KSET(AIP) ? hfi1_max_mtu : - ib_mtu_enum_to_int(props->max_mtu); + props->phys_mtu = hfi1_max_mtu; return 0; } From 7c4a539ec38f4ce400a0f3fcb5ff6c940fcd67bb Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 3 Mar 2022 10:42:32 +0800 Subject: [PATCH 130/186] RDMA/core: Fix ib_qp_usecnt_dec() called when error ib_destroy_qp() would called by ib_create_qp_user() if error, the former contains ib_qp_usecnt_dec(), but ib_qp_usecnt_inc() was not called before. So move ib_qp_usecnt_inc() into create_qp(). Fixes: d2b10794fc13 ("RDMA/core: Create clean QP creations interface for uverbs") Link: https://lore.kernel.org/r/20220303024232.2847388-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 - drivers/infiniband/core/uverbs_std_types_qp.c | 1 - drivers/infiniband/core/verbs.c | 3 +-- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6b6393176b3c..4437f834c0a7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1437,7 +1437,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ret = PTR_ERR(qp); goto err_put; } - ib_qp_usecnt_inc(qp); obj->uevent.uobject.object = qp; obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index dd1075466f61..75353e09c6fe 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -254,7 +254,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( ret = PTR_ERR(qp); goto err_put; } - ib_qp_usecnt_inc(qp); if (attr.qp_type == IB_QPT_XRC_TGT) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a9819c40a140..bc9a83f1ca2d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1245,6 +1245,7 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, if (ret) goto err_security; + ib_qp_usecnt_inc(qp); rdma_restrack_add(&qp->res); return qp; @@ -1345,8 +1346,6 @@ struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, if (IS_ERR(qp)) return qp; - ib_qp_usecnt_inc(qp); - if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); if (ret) From 5a32949d81ccc80fa6614894fb0a45e2fffb1862 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 2 Mar 2022 14:48:22 +0800 Subject: [PATCH 131/186] RDMA/hns: Remove the unused parameter "op_modifier" in mailbox The parameter "op_modifier" is only used for HIP06. It is useless for HIP08 and later versions. After removing HIP06, this parameter is no longer used, so remove it. Link: https://lore.kernel.org/r/20220302064830.61706-2-liangwenpeng@huawei.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 36 ++++++++----------- drivers/infiniband/hw/hns/hns_roce_cmd.h | 3 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 4 +-- drivers/infiniband/hw/hns/hns_roce_device.h | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 +++++++------- .../infiniband/hw/hns/hns_roce_hw_v2_dfx.c | 2 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 ++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 4 +-- 8 files changed, 37 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 4b693d542ace..ab89e70b6f04 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -39,25 +39,22 @@ #define CMD_MAX_NUM 32 static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, - u8 op_modifier, u16 op, u16 token, - int event) + u64 out_param, u32 in_modifier, u16 op, + u16 token, int event) { return hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier, - op_modifier, op, token, event); + op, token, event); } /* this should be called with "poll_sem" */ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u8 op_modifier, u16 op, - unsigned int timeout) + u16 op, unsigned int timeout) { int ret; ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, - in_modifier, op_modifier, op, - CMD_POLL_TOKEN, 0); + in_modifier, op, CMD_POLL_TOKEN, 0); if (ret) { dev_err_ratelimited(hr_dev->dev, "failed to post mailbox 0x%x in poll mode, ret = %d.\n", @@ -70,13 +67,13 @@ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u8 op_modifier, u16 op, unsigned int timeout) + u16 op, unsigned int timeout) { int ret; down(&hr_dev->cmd.poll_sem); ret = __hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, in_modifier, - op_modifier, op, timeout); + op, timeout); up(&hr_dev->cmd.poll_sem); return ret; @@ -102,8 +99,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u8 op_modifier, u16 op, - unsigned int timeout) + u16 op, unsigned int timeout) { struct hns_roce_cmdq *cmd = &hr_dev->cmd; struct hns_roce_cmd_context *context; @@ -125,8 +121,7 @@ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, reinit_completion(&context->done); ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, - in_modifier, op_modifier, op, - context->token, 1); + in_modifier, op, context->token, 1); if (ret) { dev_err_ratelimited(dev, "failed to post mailbox 0x%x in event mode, ret = %d.\n", @@ -154,21 +149,20 @@ out: static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u8 op_modifier, u16 op, unsigned int timeout) + u16 op, unsigned int timeout) { int ret; down(&hr_dev->cmd.event_sem); ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, - op_modifier, op, timeout); + op, timeout); up(&hr_dev->cmd.event_sem); return ret; } int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u8 op_modifier, u16 op, - unsigned int timeout) + unsigned long in_modifier, u16 op, unsigned int timeout) { bool is_busy; @@ -178,12 +172,10 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, if (hr_dev->cmd.use_events) return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, - in_modifier, op_modifier, op, - timeout); + in_modifier, op, timeout); else return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, - in_modifier, op_modifier, op, - timeout); + in_modifier, op, timeout); } int hns_roce_cmd_init(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 8025e7f657fa..3055996935d5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -140,8 +140,7 @@ enum { }; int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u8 op_modifier, u16 op, - unsigned int timeout); + unsigned long in_modifier, u16 op, unsigned int timeout); struct hns_roce_cmd_mailbox * hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 55057dcbb2dc..6fbfa262e6c7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -140,7 +140,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle); /* Send mailbox to hw */ - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 0, + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, HNS_ROCE_CMD_CREATE_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { @@ -174,7 +174,7 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) struct device *dev = hr_dev->dev; int ret; - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, hr_cq->cqn, 1, + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, hr_cq->cqn, HNS_ROCE_CMD_DESTROY_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 1e0bae136997..6da996f46cf3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -852,7 +852,7 @@ struct hns_roce_hw { int (*hw_init)(struct hns_roce_dev *hr_dev); void (*hw_exit)(struct hns_roce_dev *hr_dev); int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, + u64 out_param, u32 in_modifier, u16 op, u16 token, int event); int (*poll_mbox_done)(struct hns_roce_dev *hr_dev, unsigned int timeout); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b33e948fd060..c86cf75c4caa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1353,7 +1353,7 @@ static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj, if (IS_ERR(mbox)) return PTR_ERR(mbox); - ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, 0, op, + ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, op, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mbox); return ret; @@ -2781,7 +2781,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) } static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u8 op_modifier, + u64 out_param, u32 in_modifier, u16 op, u16 token, int event) { struct hns_roce_cmq_desc desc; @@ -2848,7 +2848,7 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout, } static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u8 op_modifier, + u64 out_param, u32 in_modifier, u16 op, u16 token, int event) { u8 status = 0; @@ -2866,7 +2866,7 @@ static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, /* Post new message to mbox */ ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier, - op_modifier, op, token, event); + op, token, event); if (ret) dev_err_ratelimited(hr_dev->dev, "failed to post mailbox, ret = %d.\n", ret); @@ -3992,7 +3992,7 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, return PTR_ERR(mailbox); /* configure the tag and op */ - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, 0, op, + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, op, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mailbox); @@ -4017,7 +4017,7 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, memcpy(mailbox->buf, context, qpc_size); memcpy(mailbox->buf + qpc_size, qpc_mask, qpc_size); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, 0, + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, HNS_ROCE_CMD_MODIFY_QPC, HNS_ROCE_CMD_TIMEOUT_MSECS); @@ -5092,7 +5092,7 @@ static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0, + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, HNS_ROCE_CMD_QUERY_QPC, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) @@ -5460,7 +5460,7 @@ static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, hr_reg_write(srq_context, SRQC_LIMIT_WL, srq_attr->srq_limit); hr_reg_clear(srqc_mask, SRQC_LIMIT_WL); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0, + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, HNS_ROCE_CMD_MODIFY_SRQC, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mailbox); @@ -5488,7 +5488,7 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) return PTR_ERR(mailbox); srq_context = mailbox->buf; - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0, + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, HNS_ROCE_CMD_QUERY_SRQC, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) { @@ -5540,7 +5540,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) hr_reg_write(cq_context, CQC_CQ_PERIOD, cq_period); hr_reg_clear(cqc_mask, CQC_CQ_PERIOD); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 1, + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, HNS_ROCE_CMD_MODIFY_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mailbox); @@ -5872,11 +5872,11 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) if (eqn < hr_dev->caps.num_comp_vectors) ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - 0, HNS_ROCE_CMD_DESTROY_CEQC, + HNS_ROCE_CMD_DESTROY_CEQC, HNS_ROCE_CMD_TIMEOUT_MSECS); else ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - 0, HNS_ROCE_CMD_DESTROY_AEQC, + HNS_ROCE_CMD_DESTROY_AEQC, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); @@ -6002,7 +6002,7 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, if (ret) goto err_cmd_mbox; - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0, + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) { dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c index 5a97b5a0b7be..bce3a67b0b2d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c @@ -18,7 +18,7 @@ int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn, return PTR_ERR(mailbox); cq_context = mailbox->buf; - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cqn, 0, + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cqn, HNS_ROCE_CMD_QUERY_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 2ee06b906b60..e0ec839f2f6f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -51,7 +51,7 @@ static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox, unsigned long mpt_index) { - return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, 0, + return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, HNS_ROCE_CMD_CREATE_MPT, HNS_ROCE_CMD_TIMEOUT_MSECS); } @@ -61,7 +61,7 @@ int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, unsigned long mpt_index) { return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? mailbox->dma : 0, - mpt_index, !mailbox, HNS_ROCE_CMD_DESTROY_MPT, + mpt_index, HNS_ROCE_CMD_DESTROY_MPT, HNS_ROCE_CMD_TIMEOUT_MSECS); } @@ -303,7 +303,7 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, return ERR_CAST(mailbox); mtpt_idx = key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1); - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, 0, + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, HNS_ROCE_CMD_QUERY_MPT, HNS_ROCE_CMD_TIMEOUT_MSECS); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index e64ef6903fb4..525e1eba263a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -63,7 +63,7 @@ static int hns_roce_hw_create_srq(struct hns_roce_dev *dev, struct hns_roce_cmd_mailbox *mailbox, unsigned long srq_num) { - return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, 0, + return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, HNS_ROCE_CMD_CREATE_SRQ, HNS_ROCE_CMD_TIMEOUT_MSECS); } @@ -73,7 +73,7 @@ static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, unsigned long srq_num) { return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num, - mailbox ? 0 : 1, HNS_ROCE_CMD_DESTROY_SRQ, + HNS_ROCE_CMD_DESTROY_SRQ, HNS_ROCE_CMD_TIMEOUT_MSECS); } From 0018ed4bb07feb8b1b2bcb125650db63941c3160 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 2 Mar 2022 14:48:23 +0800 Subject: [PATCH 132/186] =?UTF-8?q?RDMA/hns:=20Remove=20fixed=20parameter?= =?UTF-8?q?=20=E2=80=9Ctimeout=E2=80=9D=20in=20the=20mailbox?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of the function parameter “timeout” is unique. Therefore, it is unnecessary to specify the parameter “timeout” value each time. So remove it. Link: https://lore.kernel.org/r/20220302064830.61706-3-liangwenpeng@huawei.com Signed-off-by: Chengchang Tang Signed-off-by: Haoyue Xu Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 22 ++++++------ drivers/infiniband/hw/hns/hns_roce_cmd.h | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 5 ++- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 35 +++++++------------ .../infiniband/hw/hns/hns_roce_hw_v2_dfx.c | 3 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 9 ++--- drivers/infiniband/hw/hns/hns_roce_srq.c | 6 ++-- 8 files changed, 34 insertions(+), 51 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index ab89e70b6f04..3642e9282b42 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -49,7 +49,7 @@ static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, /* this should be called with "poll_sem" */ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op, unsigned int timeout) + u16 op) { int ret; @@ -62,18 +62,18 @@ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, return ret; } - return hr_dev->hw->poll_mbox_done(hr_dev, timeout); + return hr_dev->hw->poll_mbox_done(hr_dev); } static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op, unsigned int timeout) + u16 op) { int ret; down(&hr_dev->cmd.poll_sem); ret = __hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, in_modifier, - op, timeout); + op); up(&hr_dev->cmd.poll_sem); return ret; @@ -99,7 +99,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op, unsigned int timeout) + u16 op) { struct hns_roce_cmdq *cmd = &hr_dev->cmd; struct hns_roce_cmd_context *context; @@ -130,7 +130,7 @@ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, } if (!wait_for_completion_timeout(&context->done, - msecs_to_jiffies(timeout))) { + msecs_to_jiffies(HNS_ROCE_CMD_TIMEOUT_MSECS))) { dev_err_ratelimited(dev, "[cmd] token 0x%x mailbox 0x%x timeout.\n", context->token, op); ret = -EBUSY; @@ -149,20 +149,20 @@ out: static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op, unsigned int timeout) + u16 op) { int ret; down(&hr_dev->cmd.event_sem); ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, - op, timeout); + op); up(&hr_dev->cmd.event_sem); return ret; } int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u16 op, unsigned int timeout) + unsigned long in_modifier, u16 op) { bool is_busy; @@ -172,10 +172,10 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, if (hr_dev->cmd.use_events) return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, - in_modifier, op, timeout); + in_modifier, op); else return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, - in_modifier, op, timeout); + in_modifier, op); } int hns_roce_cmd_init(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 3055996935d5..23937b106aa5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -140,7 +140,7 @@ enum { }; int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u16 op, unsigned int timeout); + unsigned long in_modifier, u16 op); struct hns_roce_cmd_mailbox * hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6fbfa262e6c7..22bd9e066a38 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -141,7 +141,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) /* Send mailbox to hw */ ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, - HNS_ROCE_CMD_CREATE_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_CREATE_CQC); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { ibdev_err(ibdev, @@ -175,8 +175,7 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) int ret; ret = hns_roce_cmd_mbox(hr_dev, 0, 0, hr_cq->cqn, - HNS_ROCE_CMD_DESTROY_CQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_DESTROY_CQC); if (ret) dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 6da996f46cf3..2657f4c513ee 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -854,8 +854,7 @@ struct hns_roce_hw { int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, u32 in_modifier, u16 op, u16 token, int event); - int (*poll_mbox_done)(struct hns_roce_dev *hr_dev, - unsigned int timeout); + int (*poll_mbox_done)(struct hns_roce_dev *hr_dev); bool (*chk_mbox_avail)(struct hns_roce_dev *hr_dev, bool *is_busy); int (*set_gid)(struct hns_roce_dev *hr_dev, int gid_index, const union ib_gid *gid, const struct ib_gid_attr *attr); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c86cf75c4caa..a79ca9d3c62f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1353,8 +1353,7 @@ static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj, if (IS_ERR(mbox)) return PTR_ERR(mbox); - ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, op, - HNS_ROCE_CMD_TIMEOUT_MSECS); + ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, op); hns_roce_free_cmd_mailbox(hr_dev, mbox); return ret; } @@ -2874,12 +2873,13 @@ static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, return ret; } -static int v2_poll_mbox_done(struct hns_roce_dev *hr_dev, unsigned int timeout) +static int v2_poll_mbox_done(struct hns_roce_dev *hr_dev) { u8 status = 0; int ret; - ret = v2_wait_mbox_complete(hr_dev, timeout, &status); + ret = v2_wait_mbox_complete(hr_dev, HNS_ROCE_CMD_TIMEOUT_MSECS, + &status); if (!ret) { if (status != MB_ST_COMPLETE_SUCC) return -EBUSY; @@ -3992,8 +3992,7 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, return PTR_ERR(mailbox); /* configure the tag and op */ - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, op, - HNS_ROCE_CMD_TIMEOUT_MSECS); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, op); hns_roce_free_cmd_mailbox(hr_dev, mailbox); return ret; @@ -4018,8 +4017,7 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, memcpy(mailbox->buf + qpc_size, qpc_mask, qpc_size); ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, - HNS_ROCE_CMD_MODIFY_QPC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_MODIFY_QPC); hns_roce_free_cmd_mailbox(hr_dev, mailbox); @@ -5093,8 +5091,7 @@ static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev, return PTR_ERR(mailbox); ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, - HNS_ROCE_CMD_QUERY_QPC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_QUERY_QPC); if (ret) goto out; @@ -5461,8 +5458,7 @@ static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, hr_reg_clear(srqc_mask, SRQC_LIMIT_WL); ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, - HNS_ROCE_CMD_MODIFY_SRQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_MODIFY_SRQC); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { ibdev_err(&hr_dev->ib_dev, @@ -5489,8 +5485,7 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) srq_context = mailbox->buf; ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, - HNS_ROCE_CMD_QUERY_SRQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_QUERY_SRQC); if (ret) { ibdev_err(&hr_dev->ib_dev, "failed to process cmd of querying SRQ, ret = %d.\n", @@ -5541,8 +5536,7 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) hr_reg_clear(cqc_mask, CQC_CQ_PERIOD); ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, - HNS_ROCE_CMD_MODIFY_CQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_MODIFY_CQC); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) ibdev_err(&hr_dev->ib_dev, @@ -5872,12 +5866,10 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) if (eqn < hr_dev->caps.num_comp_vectors) ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - HNS_ROCE_CMD_DESTROY_CEQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_DESTROY_CEQC); else ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - HNS_ROCE_CMD_DESTROY_AEQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_DESTROY_AEQC); if (ret) dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); } @@ -6002,8 +5994,7 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, if (ret) goto err_cmd_mbox; - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, - eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, eq_cmd); if (ret) { dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n"); goto err_cmd_mbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c index bce3a67b0b2d..107288150e3f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c @@ -19,8 +19,7 @@ int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn, cq_context = mailbox->buf; ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cqn, - HNS_ROCE_CMD_QUERY_CQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_QUERY_CQC); if (ret) { dev_err(hr_dev->dev, "QUERY cqc cmd process error\n"); goto err_mailbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index e0ec839f2f6f..bf4ea6bfff84 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -52,8 +52,7 @@ static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev, unsigned long mpt_index) { return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, - HNS_ROCE_CMD_CREATE_MPT, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_CREATE_MPT); } int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, @@ -61,8 +60,7 @@ int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, unsigned long mpt_index) { return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? mailbox->dma : 0, - mpt_index, HNS_ROCE_CMD_DESTROY_MPT, - HNS_ROCE_CMD_TIMEOUT_MSECS); + mpt_index, HNS_ROCE_CMD_DESTROY_MPT); } static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) @@ -304,8 +302,7 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, mtpt_idx = key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1); ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, - HNS_ROCE_CMD_QUERY_MPT, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_QUERY_MPT); if (ret) goto free_cmd_mbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 525e1eba263a..5bb8ccea95a6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -64,8 +64,7 @@ static int hns_roce_hw_create_srq(struct hns_roce_dev *dev, unsigned long srq_num) { return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, - HNS_ROCE_CMD_CREATE_SRQ, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_CREATE_SRQ); } static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, @@ -73,8 +72,7 @@ static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, unsigned long srq_num) { return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num, - HNS_ROCE_CMD_DESTROY_SRQ, - HNS_ROCE_CMD_TIMEOUT_MSECS); + HNS_ROCE_CMD_DESTROY_SRQ); } static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) From 479dc93ba75da134e84c1993c9b62caa6f6ccdc6 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Wed, 2 Mar 2022 14:48:24 +0800 Subject: [PATCH 133/186] RDMA/hns: Remove redundant parameter "mailbox" in the mailbox The parameter "out_param" of the mailbox is always null when the context is destroyed. So remove the function parameter "mailbox". Link: https://lore.kernel.org/r/20220302064830.61706-4-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 - drivers/infiniband/hw/hns/hns_roce_mr.c | 11 +++++------ drivers/infiniband/hw/hns/hns_roce_srq.c | 6 ++---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2657f4c513ee..f21c7aa43324 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1145,7 +1145,6 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, - struct hns_roce_cmd_mailbox *mailbox, unsigned long mpt_index); unsigned long key_to_hw_index(u32 key); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index bf4ea6bfff84..62a57cae800c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -56,11 +56,10 @@ static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev, } int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, - struct hns_roce_cmd_mailbox *mailbox, unsigned long mpt_index) { - return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? mailbox->dma : 0, - mpt_index, HNS_ROCE_CMD_DESTROY_MPT); + return hns_roce_cmd_mbox(hr_dev, 0, 0, mpt_index, + HNS_ROCE_CMD_DESTROY_MPT); } static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) @@ -142,7 +141,7 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, int ret; if (mr->enabled) { - ret = hns_roce_hw_destroy_mpt(hr_dev, NULL, + ret = hns_roce_hw_destroy_mpt(hr_dev, key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) @@ -306,7 +305,7 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, if (ret) goto free_cmd_mbox; - ret = hns_roce_hw_destroy_mpt(hr_dev, NULL, mtpt_idx); + ret = hns_roce_hw_destroy_mpt(hr_dev, mtpt_idx); if (ret) ibdev_warn(ib_dev, "failed to destroy MPT, ret = %d.\n", ret); @@ -477,7 +476,7 @@ static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, int ret; if (mw->enabled) { - ret = hns_roce_hw_destroy_mpt(hr_dev, NULL, + ret = hns_roce_hw_destroy_mpt(hr_dev, key_to_hw_index(mw->rkey) & (hr_dev->caps.num_mtpts - 1)); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 5bb8ccea95a6..1fef5d630485 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -68,11 +68,9 @@ static int hns_roce_hw_create_srq(struct hns_roce_dev *dev, } static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, - struct hns_roce_cmd_mailbox *mailbox, unsigned long srq_num) { - return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num, - HNS_ROCE_CMD_DESTROY_SRQ); + return hns_roce_cmd_mbox(dev, 0, 0, srq_num, HNS_ROCE_CMD_DESTROY_SRQ); } static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) @@ -144,7 +142,7 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; int ret; - ret = hns_roce_hw_destroy_srq(hr_dev, NULL, srq->srqn); + ret = hns_roce_hw_destroy_srq(hr_dev, srq->srqn); if (ret) dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", ret, srq->srqn); From e50cda2b9f840e7a11951421e65a6a779c0aa6f1 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Wed, 2 Mar 2022 14:48:25 +0800 Subject: [PATCH 134/186] RDMA/hns: Fix the wrong type of parameter "op" of the mailbox The "op" field of the mailbox occupies 8 bits, so the parameter "op" should be of type u8. Link: https://lore.kernel.org/r/20220302064830.61706-5-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 12 ++++---- drivers/infiniband/hw/hns/hns_roce_cmd.h | 2 +- drivers/infiniband/hw/hns/hns_roce_device.h | 6 ++-- drivers/infiniband/hw/hns/hns_roce_hem.c | 4 +-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 33 ++++++++++----------- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 3642e9282b42..df11acd8030e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -39,7 +39,7 @@ #define CMD_MAX_NUM 32 static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u16 op, + u64 out_param, u32 in_modifier, u8 op, u16 token, int event) { return hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier, @@ -49,7 +49,7 @@ static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, /* this should be called with "poll_sem" */ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op) + u8 op) { int ret; @@ -67,7 +67,7 @@ static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op) + u8 op) { int ret; @@ -99,7 +99,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op) + u8 op) { struct hns_roce_cmdq *cmd = &hr_dev->cmd; struct hns_roce_cmd_context *context; @@ -149,7 +149,7 @@ out: static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, - u16 op) + u8 op) { int ret; @@ -162,7 +162,7 @@ static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, } int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u16 op) + unsigned long in_modifier, u8 op) { bool is_busy; diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 23937b106aa5..7928790061b8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -140,7 +140,7 @@ enum { }; int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u16 op); + unsigned long in_modifier, u8 op); struct hns_roce_cmd_mailbox * hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f21c7aa43324..8dd7919f8698 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -852,7 +852,7 @@ struct hns_roce_hw { int (*hw_init)(struct hns_roce_dev *hr_dev); void (*hw_exit)(struct hns_roce_dev *hr_dev); int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u16 op, + u64 out_param, u32 in_modifier, u8 op, u16 token, int event); int (*poll_mbox_done)(struct hns_roce_dev *hr_dev); bool (*chk_mbox_avail)(struct hns_roce_dev *hr_dev, bool *is_busy); @@ -872,10 +872,10 @@ struct hns_roce_hw { struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, dma_addr_t dma_handle); int (*set_hem)(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, int obj, int step_idx); + struct hns_roce_hem_table *table, int obj, u32 step_idx); int (*clear_hem)(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, - int step_idx); + u32 step_idx); int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 8917365cc6b8..ce1a0d2792a3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -488,7 +488,7 @@ static int set_mhop_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_index *index) { struct ib_device *ibdev = &hr_dev->ib_dev; - int step_idx; + u32 step_idx; int ret = 0; if (index->inited & HEM_INDEX_L0) { @@ -618,7 +618,7 @@ static void clear_mhop_hem(struct hns_roce_dev *hr_dev, struct ib_device *ibdev = &hr_dev->ib_dev; u32 hop_num = mhop->hop_num; u32 chunk_ba_num; - int step_idx; + u32 step_idx; index->inited = HEM_INDEX_BUF; chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a79ca9d3c62f..63571abfc019 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1345,7 +1345,7 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, } static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj, - dma_addr_t base_addr, u16 op) + dma_addr_t base_addr, u8 op) { struct hns_roce_cmd_mailbox *mbox = hns_roce_alloc_cmd_mailbox(hr_dev); int ret; @@ -2781,7 +2781,7 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, u32 in_modifier, - u16 op, u16 token, int event) + u8 op, u16 token, int event) { struct hns_roce_cmq_desc desc; struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data; @@ -2848,7 +2848,7 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout, static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, u32 in_modifier, - u16 op, u16 token, int event) + u8 op, u16 token, int event) { u8 status = 0; int ret; @@ -3818,9 +3818,9 @@ out: } static int get_op_for_set_hem(struct hns_roce_dev *hr_dev, u32 type, - int step_idx, u16 *mbox_op) + u32 step_idx, u8 *mbox_op) { - u16 op; + u8 op; switch (type) { case HEM_TYPE_QPC: @@ -3872,10 +3872,10 @@ static int config_gmv_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj, } static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, - dma_addr_t base_addr, u32 hem_type, int step_idx) + dma_addr_t base_addr, u32 hem_type, u32 step_idx) { int ret; - u16 op; + u8 op; if (unlikely(hem_type == HEM_TYPE_GMV)) return config_gmv_ba_to_hw(hr_dev, obj, base_addr); @@ -3892,7 +3892,7 @@ static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, - int step_idx) + u32 step_idx) { struct hns_roce_hem_iter iter; struct hns_roce_hem_mhop mhop; @@ -3951,12 +3951,12 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, - int step_idx) + u32 step_idx) { - struct device *dev = hr_dev->dev; struct hns_roce_cmd_mailbox *mailbox; + struct device *dev = hr_dev->dev; + u8 op = 0xff; int ret; - u16 op = 0xff; if (!hns_roce_check_whether_mhop(hr_dev, table->type)) return 0; @@ -5975,8 +5975,7 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) } static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq, - unsigned int eq_cmd) + struct hns_roce_eq *eq, u8 eq_cmd) { struct hns_roce_cmd_mailbox *mailbox; int ret; @@ -6105,14 +6104,14 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; struct device *dev = hr_dev->dev; struct hns_roce_eq *eq; - unsigned int eq_cmd; - int irq_num; - int eq_num; int other_num; int comp_num; int aeq_num; - int i; + int irq_num; + int eq_num; + u8 eq_cmd; int ret; + int i; other_num = hr_dev->caps.num_other_vectors; comp_num = hr_dev->caps.num_comp_vectors; From 162e29feabba9232a07af5fb07b6b9ae4717d512 Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 2 Mar 2022 14:48:26 +0800 Subject: [PATCH 135/186] RDMA/hns: Refactor mailbox functions The current mailbox functions have too many parameters, making the code difficult to maintain. So construct a new structure mbox_msg to pass the information needed by mailbox. Link: https://lore.kernel.org/r/20220302064830.61706-6-liangwenpeng@huawei.com Signed-off-by: Chengchang Tang Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 73 ++++++------ drivers/infiniband/hw/hns/hns_roce_cmd.h | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 9 +- drivers/infiniband/hw/hns/hns_roce_device.h | 14 ++- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 111 +++++++++--------- .../infiniband/hw/hns/hns_roce_hw_v2_dfx.c | 4 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 13 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 6 +- 8 files changed, 120 insertions(+), 112 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index df11acd8030e..7e37066b272d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -38,42 +38,36 @@ #define CMD_POLL_TOKEN 0xffff #define CMD_MAX_NUM 32 -static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u8 op, - u16 token, int event) +static int hns_roce_cmd_mbox_post_hw(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { - return hr_dev->hw->post_mbox(hr_dev, in_param, out_param, in_modifier, - op, token, event); + return hr_dev->hw->post_mbox(hr_dev, mbox_msg); } /* this should be called with "poll_sem" */ -static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, unsigned long in_modifier, - u8 op) +static int __hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { int ret; - ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, - in_modifier, op, CMD_POLL_TOKEN, 0); + ret = hns_roce_cmd_mbox_post_hw(hr_dev, mbox_msg); if (ret) { dev_err_ratelimited(hr_dev->dev, "failed to post mailbox 0x%x in poll mode, ret = %d.\n", - op, ret); + mbox_msg->cmd, ret); return ret; } return hr_dev->hw->poll_mbox_done(hr_dev); } -static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, unsigned long in_modifier, - u8 op) +static int hns_roce_cmd_mbox_poll(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { int ret; down(&hr_dev->cmd.poll_sem); - ret = __hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, in_modifier, - op); + ret = __hns_roce_cmd_mbox_poll(hr_dev, mbox_msg); up(&hr_dev->cmd.poll_sem); return ret; @@ -97,9 +91,8 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, complete(&context->done); } -static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, unsigned long in_modifier, - u8 op) +static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { struct hns_roce_cmdq *cmd = &hr_dev->cmd; struct hns_roce_cmd_context *context; @@ -120,19 +113,19 @@ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, reinit_completion(&context->done); - ret = hns_roce_cmd_mbox_post_hw(hr_dev, in_param, out_param, - in_modifier, op, context->token, 1); + mbox_msg->token = context->token; + ret = hns_roce_cmd_mbox_post_hw(hr_dev, mbox_msg); if (ret) { dev_err_ratelimited(dev, "failed to post mailbox 0x%x in event mode, ret = %d.\n", - op, ret); + mbox_msg->cmd, ret); goto out; } if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(HNS_ROCE_CMD_TIMEOUT_MSECS))) { dev_err_ratelimited(dev, "[cmd] token 0x%x mailbox 0x%x timeout.\n", - context->token, op); + context->token, mbox_msg->cmd); ret = -EBUSY; goto out; } @@ -140,42 +133,50 @@ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, ret = context->result; if (ret) dev_err_ratelimited(dev, "[cmd] token 0x%x mailbox 0x%x error %d.\n", - context->token, op, ret); + context->token, mbox_msg->cmd, ret); out: context->busy = 0; return ret; } -static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, unsigned long in_modifier, - u8 op) +static int hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { int ret; down(&hr_dev->cmd.event_sem); - ret = __hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, - op); + ret = __hns_roce_cmd_mbox_wait(hr_dev, mbox_msg); up(&hr_dev->cmd.event_sem); return ret; } int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u8 op) + u8 cmd, unsigned long tag) { + struct hns_roce_mbox_msg mbox_msg = {}; bool is_busy; if (hr_dev->hw->chk_mbox_avail) if (!hr_dev->hw->chk_mbox_avail(hr_dev, &is_busy)) return is_busy ? -EBUSY : 0; - if (hr_dev->cmd.use_events) - return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, - in_modifier, op); - else - return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param, - in_modifier, op); + mbox_msg.in_param = in_param; + mbox_msg.out_param = out_param; + mbox_msg.cmd = cmd; + mbox_msg.tag = tag; + + if (hr_dev->cmd.use_events) { + mbox_msg.event_en = 1; + + return hns_roce_cmd_mbox_wait(hr_dev, &mbox_msg); + } else { + mbox_msg.event_en = 0; + mbox_msg.token = CMD_POLL_TOKEN; + + return hns_roce_cmd_mbox_poll(hr_dev, &mbox_msg); + } } int hns_roce_cmd_init(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 7928790061b8..759da8981c71 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -140,7 +140,7 @@ enum { }; int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, - unsigned long in_modifier, u8 op); + u8 cmd, unsigned long tag); struct hns_roce_cmd_mailbox * hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 22bd9e066a38..a335fa8481a5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -139,9 +139,8 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle); - /* Send mailbox to hw */ - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, - HNS_ROCE_CMD_CREATE_CQC); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, + HNS_ROCE_CMD_CREATE_CQC, hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { ibdev_err(ibdev, @@ -174,8 +173,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) struct device *dev = hr_dev->dev; int ret; - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, hr_cq->cqn, - HNS_ROCE_CMD_DESTROY_CQC); + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, HNS_ROCE_CMD_DESTROY_CQC, + hr_cq->cqn); if (ret) dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 8dd7919f8698..5e4a3536c41b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -561,6 +561,15 @@ struct hns_roce_cmd_mailbox { dma_addr_t dma; }; +struct hns_roce_mbox_msg { + u64 in_param; + u64 out_param; + u8 cmd; + u32 tag; + u16 token; + u8 event_en; +}; + struct hns_roce_dev; struct hns_roce_rinl_sge { @@ -851,9 +860,8 @@ struct hns_roce_hw { int (*hw_profile)(struct hns_roce_dev *hr_dev); int (*hw_init)(struct hns_roce_dev *hr_dev); void (*hw_exit)(struct hns_roce_dev *hr_dev); - int (*post_mbox)(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, u8 op, - u16 token, int event); + int (*post_mbox)(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg); int (*poll_mbox_done)(struct hns_roce_dev *hr_dev); bool (*chk_mbox_avail)(struct hns_roce_dev *hr_dev, bool *is_busy); int (*set_gid)(struct hns_roce_dev *hr_dev, int gid_index, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 63571abfc019..55c49d358f76 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1344,16 +1344,17 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, return ret; } -static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, unsigned long obj, - dma_addr_t base_addr, u8 op) +static int config_hem_ba_to_hw(struct hns_roce_dev *hr_dev, + dma_addr_t base_addr, u8 cmd, unsigned long tag) { - struct hns_roce_cmd_mailbox *mbox = hns_roce_alloc_cmd_mailbox(hr_dev); + struct hns_roce_cmd_mailbox *mbox; int ret; + mbox = hns_roce_alloc_cmd_mailbox(hr_dev); if (IS_ERR(mbox)) return PTR_ERR(mbox); - ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, obj, op); + ret = hns_roce_cmd_mbox(hr_dev, base_addr, mbox->dma, cmd, tag); hns_roce_free_cmd_mailbox(hr_dev, mbox); return ret; } @@ -2779,21 +2780,21 @@ static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) free_dip_list(hr_dev); } -static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, - u8 op, u16 token, int event) +static int hns_roce_mbox_post(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { struct hns_roce_cmq_desc desc; struct hns_roce_post_mbox *mb = (struct hns_roce_post_mbox *)desc.data; hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_POST_MB, false); - mb->in_param_l = cpu_to_le32(in_param); - mb->in_param_h = cpu_to_le32(in_param >> 32); - mb->out_param_l = cpu_to_le32(out_param); - mb->out_param_h = cpu_to_le32(out_param >> 32); - mb->cmd_tag = cpu_to_le32(in_modifier << 8 | op); - mb->token_event_en = cpu_to_le32(event << 16 | token); + mb->in_param_l = cpu_to_le32(mbox_msg->in_param); + mb->in_param_h = cpu_to_le32(mbox_msg->in_param >> 32); + mb->out_param_l = cpu_to_le32(mbox_msg->out_param); + mb->out_param_h = cpu_to_le32(mbox_msg->out_param >> 32); + mb->cmd_tag = cpu_to_le32(mbox_msg->tag << 8 | mbox_msg->cmd); + mb->token_event_en = cpu_to_le32(mbox_msg->event_en << 16 | + mbox_msg->token); return hns_roce_cmq_send(hr_dev, &desc, 1); } @@ -2846,9 +2847,8 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout, return ret; } -static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, - u64 out_param, u32 in_modifier, - u8 op, u16 token, int event) +static int v2_post_mbox(struct hns_roce_dev *hr_dev, + struct hns_roce_mbox_msg *mbox_msg) { u8 status = 0; int ret; @@ -2864,8 +2864,7 @@ static int v2_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, } /* Post new message to mbox */ - ret = hns_roce_mbox_post(hr_dev, in_param, out_param, in_modifier, - op, token, event); + ret = hns_roce_mbox_post(hr_dev, mbox_msg); if (ret) dev_err_ratelimited(hr_dev->dev, "failed to post mailbox, ret = %d.\n", ret); @@ -3818,38 +3817,38 @@ out: } static int get_op_for_set_hem(struct hns_roce_dev *hr_dev, u32 type, - u32 step_idx, u8 *mbox_op) + u32 step_idx, u8 *mbox_cmd) { - u8 op; + u8 cmd; switch (type) { case HEM_TYPE_QPC: - op = HNS_ROCE_CMD_WRITE_QPC_BT0; + cmd = HNS_ROCE_CMD_WRITE_QPC_BT0; break; case HEM_TYPE_MTPT: - op = HNS_ROCE_CMD_WRITE_MPT_BT0; + cmd = HNS_ROCE_CMD_WRITE_MPT_BT0; break; case HEM_TYPE_CQC: - op = HNS_ROCE_CMD_WRITE_CQC_BT0; + cmd = HNS_ROCE_CMD_WRITE_CQC_BT0; break; case HEM_TYPE_SRQC: - op = HNS_ROCE_CMD_WRITE_SRQC_BT0; + cmd = HNS_ROCE_CMD_WRITE_SRQC_BT0; break; case HEM_TYPE_SCCC: - op = HNS_ROCE_CMD_WRITE_SCCC_BT0; + cmd = HNS_ROCE_CMD_WRITE_SCCC_BT0; break; case HEM_TYPE_QPC_TIMER: - op = HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0; + cmd = HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0; break; case HEM_TYPE_CQC_TIMER: - op = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0; + cmd = HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0; break; default: dev_warn(hr_dev->dev, "failed to check hem type %u.\n", type); return -EINVAL; } - *mbox_op = op + step_idx; + *mbox_cmd = cmd + step_idx; return 0; } @@ -3875,7 +3874,7 @@ static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, dma_addr_t base_addr, u32 hem_type, u32 step_idx) { int ret; - u8 op; + u8 cmd; if (unlikely(hem_type == HEM_TYPE_GMV)) return config_gmv_ba_to_hw(hr_dev, obj, base_addr); @@ -3883,11 +3882,11 @@ static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, if (unlikely(hem_type == HEM_TYPE_SCCC && step_idx)) return 0; - ret = get_op_for_set_hem(hr_dev, hem_type, step_idx, &op); + ret = get_op_for_set_hem(hr_dev, hem_type, step_idx, &cmd); if (ret < 0) return ret; - return config_hem_ba_to_hw(hr_dev, obj, base_addr, op); + return config_hem_ba_to_hw(hr_dev, base_addr, cmd, obj); } static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, @@ -3950,12 +3949,12 @@ static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev, } static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, int obj, - u32 step_idx) + struct hns_roce_hem_table *table, + int tag, u32 step_idx) { struct hns_roce_cmd_mailbox *mailbox; struct device *dev = hr_dev->dev; - u8 op = 0xff; + u8 cmd = 0xff; int ret; if (!hns_roce_check_whether_mhop(hr_dev, table->type)) @@ -3963,16 +3962,16 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, switch (table->type) { case HEM_TYPE_QPC: - op = HNS_ROCE_CMD_DESTROY_QPC_BT0; + cmd = HNS_ROCE_CMD_DESTROY_QPC_BT0; break; case HEM_TYPE_MTPT: - op = HNS_ROCE_CMD_DESTROY_MPT_BT0; + cmd = HNS_ROCE_CMD_DESTROY_MPT_BT0; break; case HEM_TYPE_CQC: - op = HNS_ROCE_CMD_DESTROY_CQC_BT0; + cmd = HNS_ROCE_CMD_DESTROY_CQC_BT0; break; case HEM_TYPE_SRQC: - op = HNS_ROCE_CMD_DESTROY_SRQC_BT0; + cmd = HNS_ROCE_CMD_DESTROY_SRQC_BT0; break; case HEM_TYPE_SCCC: case HEM_TYPE_QPC_TIMER: @@ -3985,14 +3984,13 @@ static int hns_roce_v2_clear_hem(struct hns_roce_dev *hr_dev, return 0; } - op += step_idx; + cmd += step_idx; mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - /* configure the tag and op */ - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, obj, op); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cmd, tag); hns_roce_free_cmd_mailbox(hr_dev, mailbox); return ret; @@ -4016,8 +4014,8 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, memcpy(mailbox->buf, context, qpc_size); memcpy(mailbox->buf + qpc_size, qpc_mask, qpc_size); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, - HNS_ROCE_CMD_MODIFY_QPC); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, + HNS_ROCE_CMD_MODIFY_QPC, hr_qp->qpn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); @@ -5090,8 +5088,8 @@ static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev, if (IS_ERR(mailbox)) return PTR_ERR(mailbox); - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, - HNS_ROCE_CMD_QUERY_QPC); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_QPC, + hr_qp->qpn); if (ret) goto out; @@ -5457,8 +5455,8 @@ static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, hr_reg_write(srq_context, SRQC_LIMIT_WL, srq_attr->srq_limit); hr_reg_clear(srqc_mask, SRQC_LIMIT_WL); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, - HNS_ROCE_CMD_MODIFY_SRQC); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, + HNS_ROCE_CMD_MODIFY_SRQC, srq->srqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { ibdev_err(&hr_dev->ib_dev, @@ -5484,8 +5482,8 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) return PTR_ERR(mailbox); srq_context = mailbox->buf; - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, - HNS_ROCE_CMD_QUERY_SRQC); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, + HNS_ROCE_CMD_QUERY_SRQC, srq->srqn); if (ret) { ibdev_err(&hr_dev->ib_dev, "failed to process cmd of querying SRQ, ret = %d.\n", @@ -5535,8 +5533,8 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) hr_reg_write(cq_context, CQC_CQ_PERIOD, cq_period); hr_reg_clear(cqc_mask, CQC_CQ_PERIOD); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, - HNS_ROCE_CMD_MODIFY_CQC); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, + HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) ibdev_err(&hr_dev->ib_dev, @@ -5863,13 +5861,14 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) { struct device *dev = hr_dev->dev; int ret; + u8 cmd; if (eqn < hr_dev->caps.num_comp_vectors) - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - HNS_ROCE_CMD_DESTROY_CEQC); + cmd = HNS_ROCE_CMD_DESTROY_CEQC; else - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, - HNS_ROCE_CMD_DESTROY_AEQC); + cmd = HNS_ROCE_CMD_DESTROY_AEQC; + + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, cmd, eqn & HNS_ROCE_V2_EQN_M); if (ret) dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); } @@ -5993,7 +5992,7 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, if (ret) goto err_cmd_mbox; - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, eq_cmd); + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq_cmd, eq->eqn); if (ret) { dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n"); goto err_cmd_mbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c index 107288150e3f..f7a75a7cda74 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c @@ -18,8 +18,8 @@ int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn, return PTR_ERR(mailbox); cq_context = mailbox->buf; - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, cqn, - HNS_ROCE_CMD_QUERY_CQC); + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_CQC, + cqn); if (ret) { dev_err(hr_dev->dev, "QUERY cqc cmd process error\n"); goto err_mailbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 62a57cae800c..22ff78a5a1a7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -51,15 +51,15 @@ static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox, unsigned long mpt_index) { - return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, - HNS_ROCE_CMD_CREATE_MPT); + return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, + HNS_ROCE_CMD_CREATE_MPT, mpt_index); } int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, unsigned long mpt_index) { - return hns_roce_cmd_mbox(hr_dev, 0, 0, mpt_index, - HNS_ROCE_CMD_DESTROY_MPT); + return hns_roce_cmd_mbox(hr_dev, 0, 0, HNS_ROCE_CMD_DESTROY_MPT, + mpt_index); } static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) @@ -300,8 +300,9 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, return ERR_CAST(mailbox); mtpt_idx = key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1); - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, - HNS_ROCE_CMD_QUERY_MPT); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, HNS_ROCE_CMD_QUERY_MPT, + mtpt_idx); if (ret) goto free_cmd_mbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 1fef5d630485..f270563aca97 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -63,14 +63,14 @@ static int hns_roce_hw_create_srq(struct hns_roce_dev *dev, struct hns_roce_cmd_mailbox *mailbox, unsigned long srq_num) { - return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, - HNS_ROCE_CMD_CREATE_SRQ); + return hns_roce_cmd_mbox(dev, mailbox->dma, 0, HNS_ROCE_CMD_CREATE_SRQ, + srq_num); } static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, unsigned long srq_num) { - return hns_roce_cmd_mbox(dev, 0, 0, srq_num, HNS_ROCE_CMD_DESTROY_SRQ); + return hns_roce_cmd_mbox(dev, 0, 0, HNS_ROCE_CMD_DESTROY_SRQ, srq_num); } static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) From cf7f8f5c1c541ece6ec265432c8cf22a6edf067b Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 2 Mar 2022 14:48:27 +0800 Subject: [PATCH 136/186] RDMA/hns: Remove similar code that configures the hardware contexts Remove duplicate code for creating and destroying hardware contexts via mailbox. Link: https://lore.kernel.org/r/20220302064830.61706-7-liangwenpeng@huawei.com Signed-off-by: Chengchang Tang Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 12 +++++++++ drivers/infiniband/hw/hns/hns_roce_cmd.h | 5 ++++ drivers/infiniband/hw/hns/hns_roce_cq.c | 8 +++--- drivers/infiniband/hw/hns/hns_roce_device.h | 2 -- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 +-- drivers/infiniband/hw/hns/hns_roce_mr.c | 29 ++++++--------------- drivers/infiniband/hw/hns/hns_roce_srq.c | 20 +++----------- 7 files changed, 35 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 7e37066b272d..864413607571 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -262,3 +262,15 @@ void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, dma_pool_free(hr_dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } + +int hns_roce_create_hw_ctx(struct hns_roce_dev *dev, + struct hns_roce_cmd_mailbox *mailbox, + u8 cmd, unsigned long idx) +{ + return hns_roce_cmd_mbox(dev, mailbox->dma, 0, cmd, idx); +} + +int hns_roce_destroy_hw_ctx(struct hns_roce_dev *dev, u8 cmd, unsigned long idx) +{ + return hns_roce_cmd_mbox(dev, 0, 0, cmd, idx); +} diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index 759da8981c71..052a3d60905a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -146,5 +146,10 @@ struct hns_roce_cmd_mailbox * hns_roce_alloc_cmd_mailbox(struct hns_roce_dev *hr_dev); void hns_roce_free_cmd_mailbox(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox); +int hns_roce_create_hw_ctx(struct hns_roce_dev *dev, + struct hns_roce_cmd_mailbox *mailbox, + u8 cmd, unsigned long idx); +int hns_roce_destroy_hw_ctx(struct hns_roce_dev *dev, u8 cmd, + unsigned long idx); #endif /* _HNS_ROCE_CMD_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index a335fa8481a5..3d10300cab85 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -139,8 +139,8 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, - HNS_ROCE_CMD_CREATE_CQC, hr_cq->cqn); + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_CQC, + hr_cq->cqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { ibdev_err(ibdev, @@ -173,8 +173,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) struct device *dev = hr_dev->dev; int ret; - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, HNS_ROCE_CMD_DESTROY_CQC, - hr_cq->cqn); + ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC, + hr_cq->cqn); if (ret) dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 5e4a3536c41b..21182ec56f18 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1152,8 +1152,6 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); -int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, - unsigned long mpt_index); unsigned long key_to_hw_index(u32 key); int hns_roce_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 55c49d358f76..631f6e233492 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5868,7 +5868,7 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, u32 eqn) else cmd = HNS_ROCE_CMD_DESTROY_AEQC; - ret = hns_roce_cmd_mbox(hr_dev, 0, 0, cmd, eqn & HNS_ROCE_V2_EQN_M); + ret = hns_roce_destroy_hw_ctx(hr_dev, cmd, eqn & HNS_ROCE_V2_EQN_M); if (ret) dev_err(dev, "[mailbox cmd] destroy eqc(%u) failed.\n", eqn); } @@ -5992,7 +5992,7 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, if (ret) goto err_cmd_mbox; - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq_cmd, eq->eqn); + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, eq_cmd, eq->eqn); if (ret) { dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n"); goto err_cmd_mbox; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 22ff78a5a1a7..39de862666d7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -47,21 +47,6 @@ unsigned long key_to_hw_index(u32 key) return (key << 24) | (key >> 8); } -static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev, - struct hns_roce_cmd_mailbox *mailbox, - unsigned long mpt_index) -{ - return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, - HNS_ROCE_CMD_CREATE_MPT, mpt_index); -} - -int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, - unsigned long mpt_index) -{ - return hns_roce_cmd_mbox(hr_dev, 0, 0, HNS_ROCE_CMD_DESTROY_MPT, - mpt_index); -} - static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { struct hns_roce_ida *mtpt_ida = &hr_dev->mr_table.mtpt_ida; @@ -141,7 +126,7 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, int ret; if (mr->enabled) { - ret = hns_roce_hw_destroy_mpt(hr_dev, + ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_MPT, key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) @@ -177,7 +162,7 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, goto err_page; } - ret = hns_roce_hw_create_mpt(hr_dev, mailbox, + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_MPT, mtpt_idx & (hr_dev->caps.num_mtpts - 1)); if (ret) { dev_err(dev, "failed to create mpt, ret = %d.\n", ret); @@ -306,7 +291,8 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, if (ret) goto free_cmd_mbox; - ret = hns_roce_hw_destroy_mpt(hr_dev, mtpt_idx); + ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_MPT, + mtpt_idx); if (ret) ibdev_warn(ib_dev, "failed to destroy MPT, ret = %d.\n", ret); @@ -336,7 +322,8 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, goto free_cmd_mbox; } - ret = hns_roce_hw_create_mpt(hr_dev, mailbox, mtpt_idx); + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_MPT, + mtpt_idx); if (ret) { ibdev_err(ib_dev, "failed to create MPT, ret = %d.\n", ret); goto free_cmd_mbox; @@ -477,7 +464,7 @@ static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, int ret; if (mw->enabled) { - ret = hns_roce_hw_destroy_mpt(hr_dev, + ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_MPT, key_to_hw_index(mw->rkey) & (hr_dev->caps.num_mtpts - 1)); if (ret) @@ -517,7 +504,7 @@ static int hns_roce_mw_enable(struct hns_roce_dev *hr_dev, goto err_page; } - ret = hns_roce_hw_create_mpt(hr_dev, mailbox, + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_MPT, mtpt_idx & (hr_dev->caps.num_mtpts - 1)); if (ret) { dev_err(dev, "MW CREATE_MPT failed (%d)\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index f270563aca97..e316276e18c2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -59,20 +59,6 @@ static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, } } -static int hns_roce_hw_create_srq(struct hns_roce_dev *dev, - struct hns_roce_cmd_mailbox *mailbox, - unsigned long srq_num) -{ - return hns_roce_cmd_mbox(dev, mailbox->dma, 0, HNS_ROCE_CMD_CREATE_SRQ, - srq_num); -} - -static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, - unsigned long srq_num) -{ - return hns_roce_cmd_mbox(dev, 0, 0, HNS_ROCE_CMD_DESTROY_SRQ, srq_num); -} - static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) { struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; @@ -115,7 +101,8 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) goto err_mbox; } - ret = hns_roce_hw_create_srq(hr_dev, mailbox, srq->srqn); + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_SRQ, + srq->srqn); if (ret) { ibdev_err(ibdev, "failed to config SRQC, ret = %d.\n", ret); goto err_mbox; @@ -142,7 +129,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; int ret; - ret = hns_roce_hw_destroy_srq(hr_dev, srq->srqn); + ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ, + srq->srqn); if (ret) dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n", ret, srq->srqn); From 904de76c42b7d758e75197f8d532a1ffafc3caad Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Wed, 2 Mar 2022 14:48:28 +0800 Subject: [PATCH 137/186] RDMA/hns: Clean up the return value check of hns_roce_alloc_cmd_mailbox() hns_roce_alloc_cmd_mailbox() never returns NULL, so the check should be IS_ERR(). And the error code should be converted as the function's return value. Link: https://lore.kernel.org/r/20220302064830.61706-8-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 ++---- drivers/infiniband/hw/hns/hns_roce_srq.c | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 631f6e233492..06eb4f00428c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5981,8 +5981,8 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, /* Allocate mailbox memory */ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR_OR_NULL(mailbox)) - return -ENOMEM; + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); ret = alloc_eq_buf(hr_dev, eq); if (ret) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 39de862666d7..b58b869339cc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -148,10 +148,8 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, /* Allocate mailbox memory */ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) { - ret = PTR_ERR(mailbox); - return ret; - } + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); if (mr->type != MR_TYPE_FRMR) ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index e316276e18c2..97032a357b00 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -89,9 +89,9 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) } mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR_OR_NULL(mailbox)) { + if (IS_ERR(mailbox)) { ibdev_err(ibdev, "failed to alloc mailbox for SRQC.\n"); - ret = -ENOMEM; + ret = PTR_ERR(mailbox); goto err_xa; } From b65afbd2a05cdc7cff1c0db742b2ed4510d4826f Mon Sep 17 00:00:00 2001 From: Chengchang Tang Date: Wed, 2 Mar 2022 14:48:29 +0800 Subject: [PATCH 138/186] RDMA/hns: Refactor the alloc_srqc() Abstract the alloc_srqc() into several parts and separate the alloc_srqn() from the alloc_srqc(). Link: https://lore.kernel.org/r/20220302064830.61706-9-liangwenpeng@huawei.com Signed-off-by: Chengchang Tang Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_srq.c | 84 +++++++++++++++--------- 1 file changed, 54 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 97032a357b00..8dae98f827eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -59,40 +59,39 @@ static void hns_roce_ib_srq_event(struct hns_roce_srq *srq, } } -static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +static int alloc_srqn(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) { - struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; struct hns_roce_ida *srq_ida = &hr_dev->srq_table.srq_ida; - struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_cmd_mailbox *mailbox; - int ret; int id; id = ida_alloc_range(&srq_ida->ida, srq_ida->min, srq_ida->max, GFP_KERNEL); if (id < 0) { - ibdev_err(ibdev, "failed to alloc srq(%d).\n", id); + ibdev_err(&hr_dev->ib_dev, "failed to alloc srq(%d).\n", id); return -ENOMEM; } - srq->srqn = (unsigned long)id; - ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn); - if (ret) { - ibdev_err(ibdev, "failed to get SRQC table, ret = %d.\n", ret); - goto err_out; - } + srq->srqn = id; - ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); - if (ret) { - ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret); - goto err_put; - } + return 0; +} + +static void free_srqn(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +{ + ida_free(&hr_dev->srq_table.srq_ida.ida, (int)srq->srqn); +} + +static int hns_roce_create_srqc(struct hns_roce_dev *hr_dev, + struct hns_roce_srq *srq) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_cmd_mailbox *mailbox; + int ret; mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); if (IS_ERR(mailbox)) { ibdev_err(ibdev, "failed to alloc mailbox for SRQC.\n"); - ret = PTR_ERR(mailbox); - goto err_xa; + return PTR_ERR(mailbox); } ret = hr_dev->hw->write_srqc(srq, mailbox->buf); @@ -103,23 +102,42 @@ static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_SRQ, srq->srqn); - if (ret) { + if (ret) ibdev_err(ibdev, "failed to config SRQC, ret = %d.\n", ret); - goto err_mbox; - } - - hns_roce_free_cmd_mailbox(hr_dev, mailbox); - - return 0; err_mbox: hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +{ + struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; + struct ib_device *ibdev = &hr_dev->ib_dev; + int ret; + + ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn); + if (ret) { + ibdev_err(ibdev, "failed to get SRQC table, ret = %d.\n", ret); + return ret; + } + + ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); + if (ret) { + ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret); + goto err_put; + } + + ret = hns_roce_create_srqc(hr_dev, srq); + if (ret) + goto err_xa; + + return 0; + err_xa: xa_erase(&srq_table->xa, srq->srqn); err_put: hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn); -err_out: - ida_free(&srq_ida->ida, id); return ret; } @@ -142,7 +160,6 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) wait_for_completion(&srq->free); hns_roce_table_put(hr_dev, &srq_table->table, srq->srqn); - ida_free(&srq_table->srq_ida.ida, (int)srq->srqn); } static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, @@ -390,10 +407,14 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, if (ret) return ret; - ret = alloc_srqc(hr_dev, srq); + ret = alloc_srqn(hr_dev, srq); if (ret) goto err_srq_buf; + ret = alloc_srqc(hr_dev, srq); + if (ret) + goto err_srqn; + if (udata) { resp.srqn = srq->srqn; if (ib_copy_to_udata(udata, &resp, @@ -412,6 +433,8 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, err_srqc: free_srqc(hr_dev, srq); +err_srqn: + free_srqn(hr_dev, srq); err_srq_buf: free_srq_buf(hr_dev, srq); @@ -424,6 +447,7 @@ int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) struct hns_roce_srq *srq = to_hr_srq(ibsrq); free_srqc(hr_dev, srq); + free_srqn(hr_dev, srq); free_srq_buf(hr_dev, srq); return 0; } From 73f7e05609ece4030f2745c4c0c01e0be6889590 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Wed, 2 Mar 2022 14:48:30 +0800 Subject: [PATCH 139/186] RDMA/hns: Refactor the alloc_cqc() Abstract the alloc_cqc() into several parts and separate the process unrelated to allocating CQC. Link: https://lore.kernel.org/r/20220302064830.61706-10-liangwenpeng@huawei.com Signed-off-by: Wenpeng Liang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 65 ++++++++++++++----------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 3d10300cab85..8acd599ffac1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -100,12 +100,39 @@ static void free_cqn(struct hns_roce_dev *hr_dev, unsigned long cqn) mutex_unlock(&cq_table->bank_mutex); } +static int hns_roce_create_cqc(struct hns_roce_dev *hr_dev, + struct hns_roce_cq *hr_cq, + u64 *mtts, dma_addr_t dma_handle) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) { + ibdev_err(ibdev, "failed to alloc mailbox for CQC.\n"); + return PTR_ERR(mailbox); + } + + hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle); + + ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_CQC, + hr_cq->cqn); + if (ret) + ibdev_err(ibdev, + "failed to send create cmd for CQ(0x%lx), ret = %d.\n", + hr_cq->cqn, ret); + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) { struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_cmd_mailbox *mailbox; - u64 mtts[MTT_MIN_COUNT] = { 0 }; + u64 mtts[MTT_MIN_COUNT] = {}; dma_addr_t dma_handle; int ret; @@ -121,7 +148,7 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) if (ret) { ibdev_err(ibdev, "failed to get CQ(0x%lx) context, ret = %d.\n", hr_cq->cqn, ret); - goto err_out; + return ret; } ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); @@ -130,40 +157,17 @@ static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) goto err_put; } - /* Allocate mailbox memory */ - mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) { - ret = PTR_ERR(mailbox); + ret = hns_roce_create_cqc(hr_dev, hr_cq, mtts, dma_handle); + if (ret) goto err_xa; - } - - hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle); - - ret = hns_roce_create_hw_ctx(hr_dev, mailbox, HNS_ROCE_CMD_CREATE_CQC, - hr_cq->cqn); - hns_roce_free_cmd_mailbox(hr_dev, mailbox); - if (ret) { - ibdev_err(ibdev, - "failed to send create cmd for CQ(0x%lx), ret = %d.\n", - hr_cq->cqn, ret); - goto err_xa; - } - - hr_cq->cons_index = 0; - hr_cq->arm_sn = 1; - - refcount_set(&hr_cq->refcount, 1); - init_completion(&hr_cq->free); return 0; err_xa: xa_erase(&cq_table->array, hr_cq->cqn); - err_put: hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn); -err_out: return ret; } @@ -411,6 +415,11 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, goto err_cqc; } + hr_cq->cons_index = 0; + hr_cq->arm_sn = 1; + refcount_set(&hr_cq->refcount, 1); + init_completion(&hr_cq->free); + return 0; err_cqc: From 1a39ae415c1be1e46f5b3f97d438c7c4adc22b63 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 25 Feb 2022 16:18:30 -0800 Subject: [PATCH 140/186] xfs: add missing cmap->br_state = XFS_EXT_NORM update COW extents are already converted into written real extents after xfs_reflink_convert_cow_locked(), therefore cmap->br_state should reflect it. Otherwise, there is another necessary unwritten convertion triggered in xfs_dio_write_end_io() for direct I/O cases. Signed-off-by: Gao Xiang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index db70060e7bf6..54e68e5693fd 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -425,7 +425,10 @@ convert: if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, cmap); - return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + return error; out_trans_cancel: xfs_trans_cancel(tp); From eba0549bc7d100691c13384b774346b8aa9cf9a9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 25 Feb 2022 16:18:30 -0800 Subject: [PATCH 141/186] xfs: don't generate selinux audit messages for capability testing There are a few places where we test the current process' capability set to decide if we're going to be more or less generous with resource acquisition for a system call. If the process doesn't have the capability, we can continue the call, albeit in a degraded mode. These are /not/ the actual security decisions, so it's not proper to use capable(), which (in certain selinux setups) causes audit messages to get logged. Switch them to has_capability_noaudit. Fixes: 7317a03df703f ("xfs: refactor inode ownership change transaction/inode/quota allocation idiom") Fixes: ea9a46e1c4925 ("xfs: only return detailed fsmap info if the caller has CAP_SYS_ADMIN") Signed-off-by: Darrick J. Wong Cc: Dave Chinner Reviewed-by: Ondrej Mosnacek Acked-by: Serge Hallyn Reviewed-by: Eric Sandeen --- fs/xfs/xfs_fsmap.c | 4 ++-- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iops.c | 2 +- kernel/capability.c | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 48287caad28b..10e1cb71439e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -864,8 +864,8 @@ xfs_getfsmap( !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) return -EINVAL; - use_rmap = capable(CAP_SYS_ADMIN) && - xfs_has_rmapbt(mp); + use_rmap = xfs_has_rmapbt(mp) && + has_capability_noaudit(current, CAP_SYS_ADMIN); head->fmh_entries = 0; /* Set up our device handlers. */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2515fe8299e1..83481005317a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1189,7 +1189,7 @@ xfs_ioctl_setattr_get_trans( goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_error; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index b79b3846e71b..a65217f787cf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -723,7 +723,7 @@ xfs_setattr_nonsize( } error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL, - capable(CAP_FOWNER), &tp); + has_capability_noaudit(current, CAP_FOWNER), &tp); if (error) goto out_dqrele; diff --git a/kernel/capability.c b/kernel/capability.c index 46a361dde042..765194f5d678 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, From db8cd5efeebc4904df1653926102413d088a5c7e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 4 Mar 2022 12:46:55 -0800 Subject: [PATCH 142/186] dax: Fix missing kdoc for dax_device struct dax_device has a member named ops which was undocumented. Add the kdoc. Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20220304204655.3489216-1-ira.weiny@intel.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 6bd565fe2e63..ae838c636cbd 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -21,6 +21,7 @@ * @cdev: optional character interface for "device dax" * @private: dax driver private data * @flags: state and boolean properties + * @ops: operations for this device */ struct dax_device { struct inode inode; From e014f37db1a2d109afa750042ac4d69cf3e3d88e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Mar 2022 10:51:16 -0800 Subject: [PATCH 143/186] xfs: use setattr_copy to set vfs inode attributes Filipe Manana pointed out that XFS' behavior w.r.t. setuid/setgid revocation isn't consistent with btrfs[1] or ext4. Those two filesystems use the VFS function setattr_copy to convey certain attributes from struct iattr into the VFS inode structure. Andrey Zhadchenko reported[2] that XFS uses the wrong user namespace to decide if it should clear setgid and setuid on a file attribute update. This is a second symptom of the problem that Filipe noticed. XFS, on the other hand, open-codes setattr_copy in xfs_setattr_mode, xfs_setattr_nonsize, and xfs_setattr_time. Regrettably, setattr_copy is /not/ a simple copy function; it contains additional logic to clear the setgid bit when setting the mode, and XFS' version no longer matches. The VFS implements its own setuid/setgid stripping logic, which establishes consistent behavior. It's a tad unfortunate that it's scattered across notify_change, should_remove_suid, and setattr_copy but XFS should really follow the Linux VFS. Adapt XFS to use the VFS functions and get rid of the old functions. [1] https://lore.kernel.org/fstests/CAL3q7H47iNQ=Wmk83WcGB-KBJVOEtR9+qGczzCeXJ9Y2KCV25Q@mail.gmail.com/ [2] https://lore.kernel.org/linux-xfs/20220221182218.748084-1-andrey.zhadchenko@virtuozzo.com/ Fixes: 7fa294c8991c ("userns: Allow chown and setgid preservation") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner --- fs/xfs/xfs_iops.c | 56 +++-------------------------------------------- fs/xfs/xfs_pnfs.c | 3 ++- 2 files changed, 5 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a65217f787cf..db97cfede84d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -613,37 +613,6 @@ xfs_vn_getattr( return 0; } -static void -xfs_setattr_mode( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - umode_t mode = iattr->ia_mode; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; -} - -void -xfs_setattr_time( - struct xfs_inode *ip, - struct iattr *iattr) -{ - struct inode *inode = VFS_I(ip); - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (iattr->ia_valid & ATTR_ATIME) - inode->i_atime = iattr->ia_atime; - if (iattr->ia_valid & ATTR_CTIME) - inode->i_ctime = iattr->ia_ctime; - if (iattr->ia_valid & ATTR_MTIME) - inode->i_mtime = iattr->ia_mtime; -} - static int xfs_vn_change_ok( struct user_namespace *mnt_userns, @@ -742,16 +711,6 @@ xfs_setattr_nonsize( gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((inode->i_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) - inode->i_mode &= ~(S_ISUID|S_ISGID); - /* * Change the ownerships and register quota modifications * in the transaction. @@ -763,7 +722,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - inode->i_uid = uid; } if (!gid_eq(igid, gid)) { if (XFS_IS_GQUOTA_ON(mp)) { @@ -774,15 +732,10 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - inode->i_gid = gid; } } - if (mask & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1006,11 +959,8 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } - if (iattr->ia_valid & ATTR_MODE) - xfs_setattr_mode(ip, iattr); - if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) - xfs_setattr_time(ip, iattr); - + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(mnt_userns, inode, iattr); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 4abe17312c2b..37a24f0f7cd4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -319,7 +319,8 @@ xfs_fs_commit_blocks( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_setattr_time(ip, iattr); + ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID))); + setattr_copy(&init_user_ns, inode, iattr); if (update_isize) { i_size_write(inode, iattr->ia_size); ip->i_disk_size = iattr->ia_size; From dd3b015dd806627c6bcacc24b03f1ca23ca085ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 8 Mar 2022 10:51:16 -0800 Subject: [PATCH 144/186] xfs: refactor user/group quota chown in xfs_setattr_nonsize Combine if tests to reduce the indentation levels of the quota chown calls in xfs_setattr_nonsize. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner --- fs/xfs/xfs_iops.c | 60 ++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index db97cfede84d..b34e8e4344a8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -647,10 +647,10 @@ xfs_setattr_nonsize( int mask = iattr->ia_valid; xfs_trans_t *tp; int error; - kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; - kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + kuid_t uid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID; struct xfs_dquot *udqp = NULL, *gdqp = NULL; - struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL; ASSERT((mask & ATTR_SIZE) == 0); @@ -697,42 +697,22 @@ xfs_setattr_nonsize( goto out_dqrele; /* - * Change file ownership. Must be the owner or privileged. + * Register quota modifications in the transaction. Must be the owner + * or privileged. These IDs could have changed since we last looked at + * them. But, we're assured that if the ownership did change while we + * didn't have the inode locked, inode's dquot(s) would have changed + * also. */ - if (mask & (ATTR_UID|ATTR_GID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = inode->i_uid; - igid = inode->i_gid; - gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; - uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (!uid_eq(iuid, uid)) { - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & ATTR_UID); - ASSERT(udqp); - olddquot1 = xfs_qm_vop_chown(tp, ip, - &ip->i_udquot, udqp); - } - } - if (!gid_eq(igid, gid)) { - if (XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_has_pquotino(mp) || - !XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & ATTR_GID); - ASSERT(gdqp); - olddquot2 = xfs_qm_vop_chown(tp, ip, - &ip->i_gdquot, gdqp); - } - } + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) && + !uid_eq(inode->i_uid, iattr->ia_uid)) { + ASSERT(udqp); + old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) && + !gid_eq(inode->i_gid, iattr->ia_gid)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); + ASSERT(gdqp); + old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } setattr_copy(mnt_userns, inode, iattr); @@ -747,8 +727,8 @@ xfs_setattr_nonsize( /* * Release any dquot(s) the inode had kept before chown. */ - xfs_qm_dqrele(olddquot1); - xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(old_udqp); + xfs_qm_dqrele(old_gdqp); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); From 871b9316e7a778ff97bdc34fdb2f2977f616651d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 25 Feb 2022 16:18:41 -0800 Subject: [PATCH 145/186] xfs: reserve quota for dir expansion when linking/unlinking files XFS does not reserve quota for directory expansion when linking or unlinking children from a directory. This means that we don't reject the expansion with EDQUOT when we're at or near a hard limit, which means that unprivileged userspace can use link()/unlink() to exceed quota. The fix for this is nuanced -- link operations don't always expand the directory, and we allow a link to proceed with no space reservation if we don't need to add a block to the directory to handle the addition. Unlink operations generally do not expand the directory (you'd have to free a block and then cause a btree split) and we can defer the directory block freeing if there is no space reservation. Moreover, there is a further bug in that we do not trigger the blockgc workers to try to clear space when we're out of quota. To fix both cases, create a new xfs_trans_alloc_dir function that allocates the transaction, locks and joins the inodes, and reserves quota for the directory. If there isn't sufficient space or quota, we'll switch the caller to reservationless mode. This should prevent quota usage overruns with the least restriction in functionality. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_inode.c | 46 +++++++++---------------- fs/xfs/xfs_trans.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 3 ++ 3 files changed, 106 insertions(+), 29 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 04bf467b1090..766a621b970d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1217,7 +1217,7 @@ xfs_link( { xfs_mount_t *mp = tdp->i_mount; xfs_trans_t *tp; - int error; + int error, nospace_error = 0; int resblks; trace_xfs_link(tdp, target_name); @@ -1236,19 +1236,11 @@ xfs_link( goto std_return; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); - } + error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks, + &tp, &nospace_error); if (error) goto std_return; - xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK, XFS_IEXT_DIR_MANIP_CNT(mp)); if (error) @@ -1306,6 +1298,8 @@ xfs_link( error_return: xfs_trans_cancel(tp); std_return: + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } @@ -2755,6 +2749,7 @@ xfs_remove( xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp = NULL; int is_dir = S_ISDIR(VFS_I(ip)->i_mode); + int dontcare; int error = 0; uint resblks; @@ -2772,31 +2767,24 @@ xfs_remove( goto std_return; /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. + * We try to get the real space reservation first, allowing for + * directory btree deletion(s) implying possible bmap insert(s). If we + * can't get the space reservation then we use 0 instead, and avoid the + * bmap btree insert(s) in the directory code by, if the bmap insert + * tries to happen, instead trimming the LAST block from the directory. + * + * Ignore EDQUOT and ENOSPC being returned via nospace_error because + * the directory code can handle a reservationless update and we don't + * want to prevent a user from trying to free space by deleting things. */ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, - &tp); - } + error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, + &tp, &dontcare); if (error) { ASSERT(error != -ENOSPC); goto std_return; } - xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* * If we're removing a directory perform some additional validation. */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 59e2f9031b9f..3d11f9bb0dbb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1210,3 +1210,89 @@ out_cancel: xfs_trans_cancel(tp); return error; } + +/* + * Allocate an transaction, lock and join the directory and child inodes to it, + * and reserve quota for a directory update. If there isn't sufficient space, + * @dblocks will be set to zero for a reservationless directory update and + * @nospace_error will be set to a negative errno describing the space + * constraint we hit. + * + * The caller must ensure that the on-disk dquots attached to this inode have + * already been allocated and initialized. The ILOCKs will be dropped when the + * transaction is committed or cancelled. + */ +int +xfs_trans_alloc_dir( + struct xfs_inode *dp, + struct xfs_trans_res *resv, + struct xfs_inode *ip, + unsigned int *dblocks, + struct xfs_trans **tpp, + int *nospace_error) +{ + struct xfs_trans *tp; + struct xfs_mount *mp = ip->i_mount; + unsigned int resblks; + bool retried = false; + int error; + +retry: + *nospace_error = 0; + resblks = *dblocks; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + if (error == -ENOSPC) { + *nospace_error = error; + resblks = 0; + error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp); + } + if (error) + return error; + + xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqattach_locked(dp, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + error = xfs_qm_dqattach_locked(ip, false); + if (error) { + /* Caller should have allocated the dquots! */ + ASSERT(error != -ENOENT); + goto out_cancel; + } + + if (resblks == 0) + goto done; + + error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(dp, 0); + retried = true; + goto retry; + } + + *nospace_error = error; + resblks = 0; + error = 0; + } + if (error) + goto out_cancel; + +done: + *tpp = tp; + *dblocks = resblks; + return 0; + +out_cancel: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a487b264a9eb..faa282204498 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -259,6 +259,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv, + struct xfs_inode *ip, unsigned int *dblocks, + struct xfs_trans **tpp, int *nospace_error); static inline void xfs_trans_set_context( From 41667260bc84db4dfe566e3f6ab6da5293d60d8d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Mar 2022 10:10:50 -0800 Subject: [PATCH 146/186] xfs: reserve quota for target dir expansion when renaming files XFS does not reserve quota for directory expansion when renaming children into a directory. This means that we don't reject the expansion with EDQUOT when we're at or near a hard limit, which means that unprivileged userspace can use rename() to exceed quota. Rename operations don't always expand the target directory, and we allow a rename to proceed with no space reservation if we don't need to add a block to the target directory to handle the addition. Moreover, the unlink operation on the source directory generally does not expand the directory (you'd have to free a block and then cause a btree split) and it's probably of little consequence to leave the corner case that renaming a file out of a directory can increase its size. As with link and unlink, there is a further bug in that we do not trigger the blockgc workers to try to clear space when we're out of quota. Because rename is its own special tricky animal, we'll patch xfs_rename directly to reserve quota to the rename transaction. We'll leave cleaning up the rest of xfs_rename for the metadata directory tree patchset. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_inode.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 766a621b970d..35a2489942e5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3097,7 +3097,8 @@ xfs_rename( bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); int spaceres; - int error; + bool retried = false; + int error, nospace_error = 0; trace_xfs_rename(src_dp, target_dp, src_name, target_name); @@ -3121,9 +3122,12 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); +retry: + nospace_error = 0; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { + nospace_error = error; spaceres = 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, &tp); @@ -3177,6 +3181,31 @@ xfs_rename( target_dp, target_name, target_ip, spaceres); + /* + * Try to reserve quota to handle an expansion of the target directory. + * We'll allow the rename to continue in reservationless mode if we hit + * a space usage constraint. If we trigger reservationless mode, save + * the errno if there isn't any free space in the target directory. + */ + if (spaceres != 0) { + error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, + 0, false); + if (error == -EDQUOT || error == -ENOSPC) { + if (!retried) { + xfs_trans_cancel(tp); + xfs_blockgc_free_quota(target_dp, 0); + retried = true; + goto retry; + } + + nospace_error = error; + spaceres = 0; + error = 0; + } + if (error) + goto out_trans_cancel; + } + /* * Check for expected errors before we dirty the transaction * so we can return an error without a transaction abort. @@ -3423,6 +3452,8 @@ out_trans_cancel: out_release_wip: if (wip) xfs_irele(wip); + if (error == -ENOSPC && nospace_error) + error = nospace_error; return error; } From 996b2329b20a89963fa577d495cf057dd7bf129c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Mar 2022 10:16:09 -0800 Subject: [PATCH 147/186] xfs: constify the name argument to various directory functions Various directory functions do not modify their @name parameter, so mark it const to make that clear. This will enable us to mark the global xfs_name_dotdot variable as const to prevent mischief. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 30 +++++++++++++++--------------- fs/xfs/libxfs/xfs_dir2.h | 6 +++--- fs/xfs/libxfs/xfs_dir2_priv.h | 5 +++-- fs/xfs/xfs_inode.c | 6 +++--- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_trace.h | 4 ++-- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 50546eadaae2..6b531a659b1e 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -54,10 +54,10 @@ xfs_mode_to_ftype( */ xfs_dahash_t xfs_ascii_ci_hashname( - struct xfs_name *name) + const struct xfs_name *name) { - xfs_dahash_t hash; - int i; + xfs_dahash_t hash; + int i; for (i = 0, hash = 0; i < name->len; i++) hash = tolower(name->name[i]) ^ rol32(hash, 7); @@ -243,7 +243,7 @@ int xfs_dir_createname( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, + const struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -337,16 +337,16 @@ xfs_dir_cilookup_result( int xfs_dir_lookup( - xfs_trans_t *tp, - xfs_inode_t *dp, - struct xfs_name *name, - xfs_ino_t *inum, /* out: inode number */ - struct xfs_name *ci_name) /* out: actual name if CI match */ + struct xfs_trans *tp, + struct xfs_inode *dp, + const struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - int lock_mode; + struct xfs_da_args *args; + int rval; + int v; /* type-checking value */ + int lock_mode; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); @@ -475,7 +475,7 @@ int xfs_dir_replace( struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, /* name of entry to replace */ + const struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_extlen_t total) /* bmap's total block count */ { @@ -728,7 +728,7 @@ xfs_dir2_namecheck( xfs_dahash_t xfs_dir2_hashname( struct xfs_mount *mp, - struct xfs_name *name) + const struct xfs_name *name) { if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index d03e6098ded9..55e0557000db 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t *inum, + const struct xfs_name *name, xfs_ino_t *inum, struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name, xfs_ino_t ino, xfs_extlen_t tot); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, xfs_ino_t inum, + const struct xfs_name *name, xfs_ino_t inum, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_name *name); diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 711709a2aa53..7404a9ff1a92 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr { }; /* xfs_dir2.c */ -xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name); +xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name); enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args, const unsigned char *name, int len); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -201,7 +201,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, + const struct xfs_name *name); enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, const unsigned char *name, int len); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 35a2489942e5..67ece991d3f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -658,9 +658,9 @@ xfs_ip2xflags( */ int xfs_lookup( - xfs_inode_t *dp, - struct xfs_name *name, - xfs_inode_t **ipp, + struct xfs_inode *dp, + const struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name) { xfs_ino_t inum; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b7e8f14d9fca..740ab13d1aa2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -402,7 +402,7 @@ enum layout_break_reason { int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); -int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, +int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *name, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4a8076ef8cb4..239c8b8a5a85 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); DECLARE_EVENT_CLASS(xfs_namespace_class, - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), TP_ARGS(dp, name), TP_STRUCT__entry( __field(dev_t, dev) @@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class, #define DEFINE_NAMESPACE_EVENT(name) \ DEFINE_EVENT(xfs_namespace_class, name, \ - TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \ TP_ARGS(dp, name)) DEFINE_NAMESPACE_EVENT(xfs_remove); DEFINE_NAMESPACE_EVENT(xfs_link); From 744e6c8ada5d612353a42ce8cd8323dd2364a70d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 9 Mar 2022 10:16:12 -0800 Subject: [PATCH 148/186] xfs: constify xfs_name_dotdot The symbol xfs_name_dotdot is a global variable that the xfs codebase uses here and there to look up directory dotdot entries. Currently it's a non-const variable, which means that it's a mutable global variable. So far nobody's abused this to cause problems, but let's use the compiler to enforce that. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2.c | 6 +++++- fs/xfs/libxfs/xfs_dir2.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 6b531a659b1e..5f1e4799e8fa 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -19,7 +19,11 @@ #include "xfs_error.h" #include "xfs_trace.h" -struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +const struct xfs_name xfs_name_dotdot = { + .name = (const unsigned char *)"..", + .len = 2, + .type = XFS_DIR3_FT_DIR, +}; /* * Convert inode mode to directory entry filetype diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 55e0557000db..b6df3c34b26a 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -21,7 +21,7 @@ struct xfs_dir2_data_unused; struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; -extern struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dotdot; /* * Convert inode mode to directory entry filetype From 6f6dbb819dfc1a35bcb8b709b5c83a3ea8beff75 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 7 Mar 2022 15:59:28 +0300 Subject: [PATCH 149/186] RDMA/irdma: Prevent some integer underflows My static checker complains that: drivers/infiniband/hw/irdma/ctrl.c:3605 irdma_sc_ceq_init() warn: can subtract underflow 'info->dev->hmc_fpm_misc.max_ceqs'? It appears that "info->dev->hmc_fpm_misc.max_ceqs" comes from the firmware in irdma_sc_parse_fpm_query_buf() so, yes, there is a chance that it could be zero. Even if we trust the firmware, it's easy enough to change the condition just as a hardenning measure. Fixes: 3f49d6842569 ("RDMA/irdma: Implement HW Admin Queue OPs") Link: https://lore.kernel.org/r/20220307125928.GE16710@kili Signed-off-by: Dan Carpenter Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/ctrl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 01cf75e9fd48..58c0e181ca2b 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -454,7 +454,7 @@ int irdma_sc_qp_create(struct irdma_sc_qp *qp, struct irdma_create_qp_info *info cqp = qp->dev->cqp; if (qp->qp_uk.qp_id < cqp->dev->hw_attrs.min_hw_qp_id || - qp->qp_uk.qp_id > (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt - 1)) + qp->qp_uk.qp_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_QP].max_cnt) return -EINVAL; wqe = irdma_sc_cqp_get_next_send_wqe(cqp, scratch); @@ -2504,10 +2504,10 @@ static int irdma_sc_cq_create(struct irdma_sc_cq *cq, u64 scratch, int ret_code = 0; cqp = cq->dev->cqp; - if (cq->cq_uk.cq_id > (cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt - 1)) + if (cq->cq_uk.cq_id >= cqp->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].max_cnt) return -EINVAL; - if (cq->ceq_id > (cq->dev->hmc_fpm_misc.max_ceqs - 1)) + if (cq->ceq_id >= cq->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; ceq = cq->dev->ceq[cq->ceq_id]; @@ -3602,7 +3602,7 @@ int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, info->elem_cnt > info->dev->hw_attrs.max_hw_ceq_size) return -EINVAL; - if (info->ceq_id > (info->dev->hmc_fpm_misc.max_ceqs - 1)) + if (info->ceq_id >= info->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; @@ -4148,7 +4148,7 @@ int irdma_sc_ccq_init(struct irdma_sc_cq *cq, struct irdma_ccq_init_info *info) info->num_elem > info->dev->hw_attrs.uk_attrs.max_hw_cq_size) return -EINVAL; - if (info->ceq_id > (info->dev->hmc_fpm_misc.max_ceqs - 1)) + if (info->ceq_id >= info->dev->hmc_fpm_misc.max_ceqs) return -EINVAL; pble_obj_cnt = info->dev->hmc_info->hmc_obj[IRDMA_HMC_IW_PBLE].cnt; From 7e8e611d6a0ff228577b1167335ffefb0f44d5d8 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 7 Mar 2022 22:50:46 +0800 Subject: [PATCH 150/186] RDMA/rxe: Change variable and function argument to proper type The type of wqe length is u32 so in order to avoid overflow and shadow casting change variable and relevant function argument to proper type. Link: https://lore.kernel.org/r/20220307145047.3235675-1-cgxu519@mykernel.net Signed-off-by: Chengguang Xu Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 5eb89052dd66..b28036a7a3b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -359,7 +359,7 @@ static inline int get_mtu(struct rxe_qp *qp) static struct sk_buff *init_req_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - int opcode, int payload, + int opcode, u32 payload, struct rxe_pkt_info *pkt) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); @@ -449,7 +449,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt, struct sk_buff *skb, - int paylen) + u32 paylen) { int err; @@ -497,7 +497,7 @@ static void update_wqe_state(struct rxe_qp *qp, static void update_wqe_psn(struct rxe_qp *qp, struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt, - int payload) + u32 payload) { /* number of packets left to send including current one */ int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu; @@ -540,7 +540,7 @@ static void rollback_state(struct rxe_send_wqe *wqe, } static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_pkt_info *pkt, int payload) + struct rxe_pkt_info *pkt, u32 payload) { qp->req.opcode = pkt->opcode; @@ -612,7 +612,7 @@ int rxe_requester(void *arg) struct sk_buff *skb; struct rxe_send_wqe *wqe; enum rxe_hdr_mask mask; - int payload; + u32 payload; int mtu; int opcode; int ret; From aaaf62e066231f68f937990bc99f728576a2eab5 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 7 Mar 2022 22:50:47 +0800 Subject: [PATCH 151/186] RDMA/rxe: Remove useless argument for update_state() The argument 'payload' is not used in update_state(), so just remove it. Link: https://lore.kernel.org/r/20220307145047.3235675-2-cgxu519@mykernel.net Signed-off-by: Chengguang Xu Reviewed-by: Leon Romanovsky Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index b28036a7a3b8..a3c78b4ac9f1 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -540,7 +540,7 @@ static void rollback_state(struct rxe_send_wqe *wqe, } static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_pkt_info *pkt, u32 payload) + struct rxe_pkt_info *pkt) { qp->req.opcode = pkt->opcode; @@ -747,7 +747,7 @@ next_wqe: goto err; } - update_state(qp, wqe, &pkt, payload); + update_state(qp, wqe, &pkt); goto next_wqe; From 7922d3de4d270a9aedb71212fc0d5ae697ced516 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 9 Mar 2022 20:42:13 +0200 Subject: [PATCH 152/186] Revert "RDMA/core: Fix ib_qp_usecnt_dec() called when error" This reverts commit 7c4a539ec38f4ce400a0f3fcb5ff6c940fcd67bb. which causes to the following error in mlx4. Destroy of kernel CQ shouldn't fail WARNING: CPU: 4 PID: 18064 at include/rdma/ib_verbs.h:3936 mlx4_ib_dealloc_xrcd+0x12e/0x1b0 [mlx4_ib] Modules linked in: bonding ib_ipoib ip_gre ipip tunnel4 geneve rdma_ucm nf_tables ib_umad mlx4_en mlx4_ib ib_uverbs mlx4_core ip6_gre gre ip6_tunnel tunnel6 iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter overlay fuse [last unloaded: mlx4_core] CPU: 4 PID: 18064 Comm: ibv_xsrq_pingpo Not tainted 5.17.0-rc7_master_62c6ecb #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx4_ib_dealloc_xrcd+0x12e/0x1b0 [mlx4_ib] Code: 1e 93 08 00 40 80 fd 01 0f 87 fa f1 04 00 83 e5 01 0f 85 2b ff ff ff 48 c7 c7 20 4f b6 a0 c6 05 fd 92 08 00 01 e8 47 c9 82 e2 <0f> 0b e9 11 ff ff ff 0f b6 2d eb 92 08 00 40 80 fd 01 0f 87 b1 f1 RSP: 0018:ffff8881a4957750 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff8881ac4b6800 RCX: 0000000000000000 RDX: 0000000000000027 RSI: 0000000000000004 RDI: ffffed103492aedc RBP: 0000000000000000 R08: 0000000000000001 R09: ffff8884d2e378eb R10: ffffed109a5c6f1d R11: 0000000000000001 R12: ffff888132620000 R13: ffff8881a4957a90 R14: ffff8881aa2d4000 R15: ffff8881a4957ad0 FS: 00007f0401747740(0000) GS:ffff8884d2e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055f8ae036118 CR3: 000000012fe94005 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ib_dealloc_xrcd_user+0xce/0x120 [ib_core] ib_uverbs_dealloc_xrcd+0xad/0x210 [ib_uverbs] uverbs_free_xrcd+0xe8/0x190 [ib_uverbs] destroy_hw_idr_uobject+0x7a/0x130 [ib_uverbs] uverbs_destroy_uobject+0x164/0x730 [ib_uverbs] uobj_destroy+0x72/0xf0 [ib_uverbs] ib_uverbs_cmd_verbs+0x19fb/0x3110 [ib_uverbs] ib_uverbs_ioctl+0x169/0x260 [ib_uverbs] __x64_sys_ioctl+0x856/0x1550 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 7c4a539ec38f ("RDMA/core: Fix ib_qp_usecnt_dec() called when error") Link: https://lore.kernel.org/r/74c11029adaf449b3b9228a77cc82f39e9e892c8.1646851220.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 + drivers/infiniband/core/uverbs_std_types_qp.c | 1 + drivers/infiniband/core/verbs.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4437f834c0a7..6b6393176b3c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1437,6 +1437,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ret = PTR_ERR(qp); goto err_put; } + ib_qp_usecnt_inc(qp); obj->uevent.uobject.object = qp; obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c index 75353e09c6fe..dd1075466f61 100644 --- a/drivers/infiniband/core/uverbs_std_types_qp.c +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -254,6 +254,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( ret = PTR_ERR(qp); goto err_put; } + ib_qp_usecnt_inc(qp); if (attr.qp_type == IB_QPT_XRC_TGT) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index bc9a83f1ca2d..a9819c40a140 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1245,7 +1245,6 @@ static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, if (ret) goto err_security; - ib_qp_usecnt_inc(qp); rdma_restrack_add(&qp->res); return qp; @@ -1346,6 +1345,8 @@ struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, if (IS_ERR(qp)) return qp; + ib_qp_usecnt_inc(qp); + if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); if (ret) From 087f9c3f2309ed183f7e4b85ae57121d8663224d Mon Sep 17 00:00:00 2001 From: Yongzhi Liu Date: Fri, 11 Mar 2022 09:06:01 -0800 Subject: [PATCH 153/186] RDMA/mlx5: Fix memory leak in error flow for subscribe event routine In case the second xa_insert() fails, the obj_event is not released. Fix the error unwind flow to free that memory to avoid a memory leak. Fixes: 759738537142 ("IB/mlx5: Enable subscription for device events over DEVX") Link: https://lore.kernel.org/r/1647018361-18266-1-git-send-email-lyz_cs@pku.edu.cn Signed-off-by: Yongzhi Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 08b7f6bc56c3..15c0884d1f49 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1886,8 +1886,10 @@ subscribe_event_xa_alloc(struct mlx5_devx_event_table *devx_event_table, key_level2, obj_event, GFP_KERNEL); - if (err) + if (err) { + kfree(obj_event); return err; + } INIT_LIST_HEAD(&obj_event->obj_sub_list); } From 2c25e45267d0c7517578b7203a55fba2b6f6564a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 14 Mar 2022 12:53:46 +0100 Subject: [PATCH 154/186] RDMA/qib: Fix typos in comments Various spelling mistakes in comments. Detected with the help of Coccinelle. Link: https://lore.kernel.org/r/20220314115354.144023-23-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_iba7220.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 80a8dd6c7814..37b628a162e0 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -634,7 +634,7 @@ static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = { QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT, "PCIe completion timeout"), /* - * In practice, it's unlikely wthat we'll see PCIe PLL, or bus + * In practice, it's unlikely that we'll see PCIe PLL, or bus * parity or memory parity error failures, because most likely we * won't be able to talk to the core of the chip. Nonetheless, we * might see them, if they are in parts of the PCIe core that aren't @@ -2988,7 +2988,7 @@ done: * the utility. Names need to be 12 chars or less (w/o newline), for proper * display by utility. * Non-error counters are first. - * Start of "error" conters is indicated by a leading "E " on the first + * Start of "error" counters is indicated by a leading "E " on the first * "error" counter, and doesn't count in label length. * The EgrOvfl list needs to be last so we truncate them at the configured * context count for the device. From 51cad2872435f79c63b81503d8d1c9f143172489 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Mon, 28 Feb 2022 12:36:50 -0600 Subject: [PATCH 155/186] RDMA/irdma: Add support for address handle re-use Address handles (AH) are a limited HW resource and some user applications may create large numbers of identical AH's. Avoid running out of AH's by reusing existing identical ones. Link: https://lore.kernel.org/r/20220228183650.290-1-shiraz.saleem@intel.com Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/irdma/main.c | 2 +- drivers/infiniband/hw/irdma/main.h | 2 + drivers/infiniband/hw/irdma/verbs.c | 215 ++++++++++++++++++++-------- drivers/infiniband/hw/irdma/verbs.h | 3 + 4 files changed, 163 insertions(+), 59 deletions(-) diff --git a/drivers/infiniband/hw/irdma/main.c b/drivers/infiniband/hw/irdma/main.c index 4f4c3454bc43..514453777e07 100644 --- a/drivers/infiniband/hw/irdma/main.c +++ b/drivers/infiniband/hw/irdma/main.c @@ -242,7 +242,7 @@ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_pf *pf rf->gen_ops.request_reset = irdma_request_reset; rf->limits_sel = 7; rf->iwdev = iwdev; - + mutex_init(&iwdev->ah_tbl_lock); iwdev->netdev = vsi->netdev; iwdev->vsi_num = vsi->vsi_num; iwdev->init_state = INITIAL_STATE; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 44365d1dc7b2..5123f5feaa2f 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -332,6 +332,8 @@ struct irdma_device { struct workqueue_struct *cleanup_wq; struct irdma_sc_vsi vsi; struct irdma_cm_core cm_core; + DECLARE_HASHTABLE(ah_hash_tbl, 8); + struct mutex ah_tbl_lock; /* protect AH hash table access */ u32 roce_cwnd; u32 roce_ackcreds; u32 vendor_id; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index e731768c2b2a..46f475394af5 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -4074,17 +4074,47 @@ static int irdma_detach_mcast(struct ib_qp *ibqp, union ib_gid *ibgid, u16 lid) return 0; } -/** - * irdma_create_ah - create address handle - * @ibah: address handle - * @attr: address handle attributes - * @udata: User data - * - * returns 0 on success, error otherwise - */ -static int irdma_create_ah(struct ib_ah *ibah, - struct rdma_ah_init_attr *attr, - struct ib_udata *udata) +static int irdma_create_hw_ah(struct irdma_device *iwdev, struct irdma_ah *ah, bool sleep) +{ + struct irdma_pci_f *rf = iwdev->rf; + int err; + + err = irdma_alloc_rsrc(rf, rf->allocated_ahs, rf->max_ah, &ah->sc_ah.ah_info.ah_idx, + &rf->next_ah); + if (err) + return err; + + err = irdma_ah_cqp_op(rf, &ah->sc_ah, IRDMA_OP_AH_CREATE, sleep, + irdma_gsi_ud_qp_ah_cb, &ah->sc_ah); + + if (err) { + ibdev_dbg(&iwdev->ibdev, "VERBS: CQP-OP Create AH fail"); + goto err_ah_create; + } + + if (!sleep) { + int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD; + + do { + irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); + mdelay(1); + } while (!ah->sc_ah.ah_info.ah_valid && --cnt); + + if (!cnt) { + ibdev_dbg(&iwdev->ibdev, "VERBS: CQP create AH timed out"); + err = -ETIMEDOUT; + goto err_ah_create; + } + } + return 0; + +err_ah_create: + irdma_free_rsrc(iwdev->rf, iwdev->rf->allocated_ahs, ah->sc_ah.ah_info.ah_idx); + + return err; +} + +static int irdma_setup_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *attr) { struct irdma_pd *pd = to_iwpd(ibah->pd); struct irdma_ah *ah = container_of(ibah, struct irdma_ah, ibah); @@ -4093,21 +4123,13 @@ static int irdma_create_ah(struct ib_ah *ibah, struct irdma_device *iwdev = to_iwdev(ibah->pd->device); struct irdma_pci_f *rf = iwdev->rf; struct irdma_sc_ah *sc_ah; - u32 ah_id = 0; struct irdma_ah_info *ah_info; - struct irdma_create_ah_resp uresp; union irdma_sockaddr sgid_addr, dgid_addr; int err; u8 dmac[ETH_ALEN]; - err = irdma_alloc_rsrc(rf, rf->allocated_ahs, rf->max_ah, &ah_id, - &rf->next_ah); - if (err) - return err; - ah->pd = pd; sc_ah = &ah->sc_ah; - sc_ah->ah_info.ah_idx = ah_id; sc_ah->ah_info.vsi = &iwdev->vsi; irdma_sc_init_ah(&rf->sc_dev, sc_ah); ah->sgid_index = ah_attr->grh.sgid_index; @@ -4118,7 +4140,6 @@ static int irdma_create_ah(struct ib_ah *ibah, ah->av.attrs = *ah_attr; ah->av.net_type = rdma_gid_attr_network_type(sgid_attr); ah_info = &sc_ah->ah_info; - ah_info->ah_idx = ah_id; ah_info->pd_idx = pd->sc_pd.pd_id; if (ah_attr->ah_flags & IB_AH_GRH) { ah_info->flow_label = ah_attr->grh.flow_label; @@ -4155,15 +4176,13 @@ static int irdma_create_ah(struct ib_ah *ibah, err = rdma_read_gid_l2_fields(sgid_attr, &ah_info->vlan_tag, ah_info->mac_addr); if (err) - goto error; + return err; ah_info->dst_arpindex = irdma_add_arp(iwdev->rf, ah_info->dest_ip_addr, ah_info->ipv4_valid, dmac); - if (ah_info->dst_arpindex == -1) { - err = -EINVAL; - goto error; - } + if (ah_info->dst_arpindex == -1) + return -EINVAL; if (ah_info->vlan_tag >= VLAN_N_VID && iwdev->dcb_vlan_mode) ah_info->vlan_tag = 0; @@ -4174,43 +4193,38 @@ static int irdma_create_ah(struct ib_ah *ibah, rt_tos2priority(ah_info->tc_tos) << VLAN_PRIO_SHIFT; } - err = irdma_ah_cqp_op(iwdev->rf, sc_ah, IRDMA_OP_AH_CREATE, - attr->flags & RDMA_CREATE_AH_SLEEPABLE, - irdma_gsi_ud_qp_ah_cb, sc_ah); + return 0; +} - if (err) { - ibdev_dbg(&iwdev->ibdev, - "VERBS: CQP-OP Create AH fail"); - goto error; - } +/** + * irdma_ah_exists - Check for existing identical AH + * @iwdev: irdma device + * @new_ah: AH to check for + * + * returns true if AH is found, false if not found. + */ +static bool irdma_ah_exists(struct irdma_device *iwdev, + struct irdma_ah *new_ah) +{ + struct irdma_ah *ah; + u32 key = new_ah->sc_ah.ah_info.dest_ip_addr[0] ^ + new_ah->sc_ah.ah_info.dest_ip_addr[1] ^ + new_ah->sc_ah.ah_info.dest_ip_addr[2] ^ + new_ah->sc_ah.ah_info.dest_ip_addr[3]; - if (!(attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { - int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD; - - do { - irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); - mdelay(1); - } while (!sc_ah->ah_info.ah_valid && --cnt); - - if (!cnt) { - ibdev_dbg(&iwdev->ibdev, - "VERBS: CQP create AH timed out"); - err = -ETIMEDOUT; - goto error; + hash_for_each_possible(iwdev->ah_hash_tbl, ah, list, key) { + /* Set ah_valid and ah_id the same so memcmp can work */ + new_ah->sc_ah.ah_info.ah_idx = ah->sc_ah.ah_info.ah_idx; + new_ah->sc_ah.ah_info.ah_valid = ah->sc_ah.ah_info.ah_valid; + if (!memcmp(&ah->sc_ah.ah_info, &new_ah->sc_ah.ah_info, + sizeof(ah->sc_ah.ah_info))) { + refcount_inc(&ah->refcnt); + new_ah->parent_ah = ah; + return true; } } - if (udata) { - uresp.ah_id = ah->sc_ah.ah_info.ah_idx; - err = ib_copy_to_udata(udata, &uresp, - min(sizeof(uresp), udata->outlen)); - } - return 0; - -error: - irdma_free_rsrc(iwdev->rf, iwdev->rf->allocated_ahs, ah_id); - - return err; + return false; } /** @@ -4223,6 +4237,17 @@ static int irdma_destroy_ah(struct ib_ah *ibah, u32 ah_flags) struct irdma_device *iwdev = to_iwdev(ibah->device); struct irdma_ah *ah = to_iwah(ibah); + if ((ah_flags & RDMA_DESTROY_AH_SLEEPABLE) && ah->parent_ah) { + mutex_lock(&iwdev->ah_tbl_lock); + if (!refcount_dec_and_test(&ah->parent_ah->refcnt)) { + mutex_unlock(&iwdev->ah_tbl_lock); + return 0; + } + hash_del(&ah->parent_ah->list); + kfree(ah->parent_ah); + mutex_unlock(&iwdev->ah_tbl_lock); + } + irdma_ah_cqp_op(iwdev->rf, &ah->sc_ah, IRDMA_OP_AH_DESTROY, false, NULL, ah); @@ -4232,6 +4257,80 @@ static int irdma_destroy_ah(struct ib_ah *ibah, u32 ah_flags) return 0; } +/** + * irdma_create_user_ah - create user address handle + * @ibah: address handle + * @attr: address handle attributes + * @udata: User data + * + * returns 0 on success, error otherwise + */ +static int irdma_create_user_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *attr, + struct ib_udata *udata) +{ + struct irdma_ah *ah = container_of(ibah, struct irdma_ah, ibah); + struct irdma_device *iwdev = to_iwdev(ibah->pd->device); + struct irdma_create_ah_resp uresp; + struct irdma_ah *parent_ah; + int err; + + err = irdma_setup_ah(ibah, attr); + if (err) + return err; + mutex_lock(&iwdev->ah_tbl_lock); + if (!irdma_ah_exists(iwdev, ah)) { + err = irdma_create_hw_ah(iwdev, ah, true); + if (err) { + mutex_unlock(&iwdev->ah_tbl_lock); + return err; + } + /* Add new AH to list */ + parent_ah = kmemdup(ah, sizeof(*ah), GFP_KERNEL); + if (parent_ah) { + u32 key = parent_ah->sc_ah.ah_info.dest_ip_addr[0] ^ + parent_ah->sc_ah.ah_info.dest_ip_addr[1] ^ + parent_ah->sc_ah.ah_info.dest_ip_addr[2] ^ + parent_ah->sc_ah.ah_info.dest_ip_addr[3]; + + ah->parent_ah = parent_ah; + hash_add(iwdev->ah_hash_tbl, &parent_ah->list, key); + refcount_set(&parent_ah->refcnt, 1); + } + } + mutex_unlock(&iwdev->ah_tbl_lock); + + uresp.ah_id = ah->sc_ah.ah_info.ah_idx; + err = ib_copy_to_udata(udata, &uresp, min(sizeof(uresp), udata->outlen)); + if (err) + irdma_destroy_ah(ibah, attr->flags); + + return err; +} + +/** + * irdma_create_ah - create address handle + * @ibah: address handle + * @attr: address handle attributes + * @udata: NULL + * + * returns 0 on success, error otherwise + */ +static int irdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *attr, + struct ib_udata *udata) +{ + struct irdma_ah *ah = container_of(ibah, struct irdma_ah, ibah); + struct irdma_device *iwdev = to_iwdev(ibah->pd->device); + int err; + + err = irdma_setup_ah(ibah, attr); + if (err) + return err; + err = irdma_create_hw_ah(iwdev, ah, attr->flags & RDMA_CREATE_AH_SLEEPABLE); + + return err; +} + /** * irdma_query_ah - Query address handle * @ibah: pointer to address handle @@ -4265,7 +4364,7 @@ static enum rdma_link_layer irdma_get_link_layer(struct ib_device *ibdev, static const struct ib_device_ops irdma_roce_dev_ops = { .attach_mcast = irdma_attach_mcast, .create_ah = irdma_create_ah, - .create_user_ah = irdma_create_ah, + .create_user_ah = irdma_create_user_ah, .destroy_ah = irdma_destroy_ah, .detach_mcast = irdma_detach_mcast, .get_link_layer = irdma_get_link_layer, diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index 541105b728e3..08ba24d0b843 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -45,6 +45,9 @@ struct irdma_ah { struct irdma_av av; u8 sgid_index; union ib_gid dgid; + struct hlist_node list; + refcount_t refcnt; + struct irdma_ah *parent_ah; /* AH from cached list */ }; struct irdma_hmc_pble { From 70f92521584f1d1e8268311ee84413307b0fdea8 Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Thu, 10 Mar 2022 12:28:35 +0800 Subject: [PATCH 156/186] RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT Before destroying MPT, the reserved loopback QPs send loopback IOs (one write operation per SL). Completing these loopback IOs represents that there isn't any outstanding request in MPT, then it's safe to destroy MPT. Link: https://lore.kernel.org/r/20220310042835.38634-1-liangwenpeng@huawei.com Signed-off-by: Yixing Liu Signed-off-by: Wenpeng Liang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 311 +++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 20 ++ drivers/infiniband/hw/hns/hns_roce_mr.c | 6 +- 4 files changed, 335 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 21182ec56f18..3083d6db1d68 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -633,6 +633,7 @@ struct hns_roce_qp { u32 next_sge; enum ib_mtu path_mtu; u32 max_inline_data; + u8 free_mr_en; /* 0: flush needed, 1: unneeded */ unsigned long flush_flag; @@ -889,6 +890,7 @@ struct hns_roce_hw { enum ib_qp_state new_state); int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); + void (*dereg_mr)(struct hns_roce_dev *hr_dev); int (*init_eq)(struct hns_roce_dev *hr_dev); void (*cleanup_eq)(struct hns_roce_dev *hr_dev); int (*write_srqc)(struct hns_roce_srq *srq, void *mb_buf); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 06eb4f00428c..2b0cef17ad45 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2664,6 +2664,194 @@ static void free_dip_list(struct hns_roce_dev *hr_dev) spin_unlock_irqrestore(&hr_dev->dip_list_lock, flags); } +static void free_mr_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + int ret; + int i; + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + if (free_mr->rsv_qp[i]) { + ret = ib_destroy_qp(free_mr->rsv_qp[i]); + if (ret) + ibdev_err(&hr_dev->ib_dev, + "failed to destroy qp in free mr.\n"); + + free_mr->rsv_qp[i] = NULL; + } + } + + if (free_mr->rsv_cq) { + ib_destroy_cq(free_mr->rsv_cq); + free_mr->rsv_cq = NULL; + } + + if (free_mr->rsv_pd) { + ib_dealloc_pd(free_mr->rsv_pd); + free_mr->rsv_pd = NULL; + } +} + +static int free_mr_alloc_res(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_cq_init_attr cq_init_attr = {}; + struct ib_qp_init_attr qp_init_attr = {}; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + int i; + + pd = ib_alloc_pd(ibdev, 0); + if (IS_ERR(pd)) { + ibdev_err(ibdev, "failed to create pd for free mr.\n"); + return PTR_ERR(pd); + } + free_mr->rsv_pd = pd; + + cq_init_attr.cqe = HNS_ROCE_FREE_MR_USED_CQE_NUM; + cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_init_attr); + if (IS_ERR(cq)) { + ibdev_err(ibdev, "failed to create cq for free mr.\n"); + ret = PTR_ERR(cq); + goto create_failed; + } + free_mr->rsv_cq = cq; + + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.send_cq = free_mr->rsv_cq; + qp_init_attr.recv_cq = free_mr->rsv_cq; + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + qp_init_attr.cap.max_send_wr = HNS_ROCE_FREE_MR_USED_SQWQE_NUM; + qp_init_attr.cap.max_send_sge = HNS_ROCE_FREE_MR_USED_SQSGE_NUM; + qp_init_attr.cap.max_recv_wr = HNS_ROCE_FREE_MR_USED_RQWQE_NUM; + qp_init_attr.cap.max_recv_sge = HNS_ROCE_FREE_MR_USED_RQSGE_NUM; + + qp = ib_create_qp(free_mr->rsv_pd, &qp_init_attr); + if (IS_ERR(qp)) { + ibdev_err(ibdev, "failed to create qp for free mr.\n"); + ret = PTR_ERR(qp); + goto create_failed; + } + + free_mr->rsv_qp[i] = qp; + } + + return 0; + +create_failed: + free_mr_exit(hr_dev); + + return ret; +} + +static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev, + struct ib_qp_attr *attr, int sl_num) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + int loopback; + int mask; + int ret; + + hr_qp = to_hr_qp(free_mr->rsv_qp[sl_num]); + hr_qp->free_mr_en = 1; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS; + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE; + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + if (ret) { + ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n", + ret); + return ret; + } + + loopback = hr_dev->loop_idc; + /* Set qpc lbi = 1 incidate loopback IO */ + hr_dev->loop_idc = 1; + + mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | + IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; + attr->qp_state = IB_QPS_RTR; + attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + attr->path_mtu = IB_MTU_256; + attr->dest_qp_num = hr_qp->qpn; + attr->rq_psn = HNS_ROCE_FREE_MR_USED_PSN; + + rdma_ah_set_sl(&attr->ah_attr, (u8)sl_num); + + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + hr_dev->loop_idc = loopback; + if (ret) { + ibdev_err(ibdev, "failed to modify qp to rtr, ret = %d.\n", + ret); + return ret; + } + + mask = IB_QP_STATE | IB_QP_SQ_PSN | IB_QP_RETRY_CNT | IB_QP_TIMEOUT | + IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC; + attr->qp_state = IB_QPS_RTS; + attr->sq_psn = HNS_ROCE_FREE_MR_USED_PSN; + attr->retry_cnt = HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT; + attr->timeout = HNS_ROCE_FREE_MR_USED_QP_TIMEOUT; + ret = ib_modify_qp(&hr_qp->ibqp, attr, mask); + if (ret) + ibdev_err(ibdev, "failed to modify qp to rts, ret = %d.\n", + ret); + + return ret; +} + +static int free_mr_modify_qp(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_qp_attr attr = {}; + int ret; + int i; + + rdma_ah_set_grh(&attr.ah_attr, NULL, 0, 0, 1, 0); + rdma_ah_set_static_rate(&attr.ah_attr, 3); + rdma_ah_set_port_num(&attr.ah_attr, 1); + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + ret = free_mr_modify_rsv_qp(hr_dev, &attr, i); + if (ret) + return ret; + } + + return 0; +} + +static int free_mr_init(struct hns_roce_dev *hr_dev) +{ + int ret; + + ret = free_mr_alloc_res(hr_dev); + if (ret) + return ret; + + ret = free_mr_modify_qp(hr_dev); + if (ret) + goto err_modify_qp; + + return 0; + +err_modify_qp: + free_mr_exit(hr_dev); + + return ret; +} + static int get_hem_table(struct hns_roce_dev *hr_dev) { unsigned int qpc_count; @@ -3244,6 +3432,98 @@ static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) return 0; } +static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); + struct ib_device *ibdev = &hr_dev->ib_dev; + const struct ib_send_wr *bad_wr; + struct ib_rdma_wr rdma_wr = {}; + struct ib_send_wr *send_wr; + int ret; + + send_wr = &rdma_wr.wr; + send_wr->opcode = IB_WR_RDMA_WRITE; + + ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr); + if (ret) { + ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n", + ret); + return ret; + } + + return 0; +} + +static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc); + +static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_v2_free_mr *free_mr = &priv->free_mr; + struct ib_wc wc[ARRAY_SIZE(free_mr->rsv_qp)]; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_qp *hr_qp; + unsigned long end; + int cqe_cnt = 0; + int npolled; + int ret; + int i; + + /* + * If the device initialization is not complete or in the uninstall + * process, then there is no need to execute free mr. + */ + if (priv->handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT || + priv->handle->rinfo.instance_state == HNS_ROCE_STATE_INIT || + hr_dev->state == HNS_ROCE_DEVICE_STATE_UNINIT) + return; + + mutex_lock(&free_mr->mutex); + + for (i = 0; i < ARRAY_SIZE(free_mr->rsv_qp); i++) { + hr_qp = to_hr_qp(free_mr->rsv_qp[i]); + + ret = free_mr_post_send_lp_wqe(hr_qp); + if (ret) { + ibdev_err(ibdev, + "failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n", + hr_qp->qpn, ret); + break; + } + + cqe_cnt++; + } + + end = msecs_to_jiffies(HNS_ROCE_V2_FREE_MR_TIMEOUT) + jiffies; + while (cqe_cnt) { + npolled = hns_roce_v2_poll_cq(free_mr->rsv_cq, cqe_cnt, wc); + if (npolled < 0) { + ibdev_err(ibdev, + "failed to poll cqe for free mr, remain %d cqe.\n", + cqe_cnt); + goto out; + } + + if (time_after(jiffies, end)) { + ibdev_err(ibdev, + "failed to poll cqe for free mr and timeout, remain %d cqe.\n", + cqe_cnt); + goto out; + } + cqe_cnt -= npolled; + } + +out: + mutex_unlock(&free_mr->mutex); +} + +static void hns_roce_v2_dereg_mr(struct hns_roce_dev *hr_dev) +{ + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_send_cmd_to_hw(hr_dev); +} + static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) { return hns_roce_buf_offset(hr_cq->mtr.kmem, n * hr_cq->cqe_size); @@ -4663,6 +4943,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, u8 hr_port; int ret; + /* + * If free_mr_en of qp is set, it means that this qp comes from + * free mr. This qp will perform the loopback operation. + * In the loopback scenario, only sl needs to be set. + */ + if (hr_qp->free_mr_en) { + hr_reg_write(context, QPC_SL, rdma_ah_get_sl(&attr->ah_attr)); + hr_reg_clear(qpc_mask, QPC_SL); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + return 0; + } + ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num : hr_qp->port + 1; hr_port = ib_port - 1; is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) && @@ -6247,6 +6539,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, .modify_qp = hns_roce_v2_modify_qp, + .dereg_mr = hns_roce_v2_dereg_mr, .qp_flow_control_init = hns_roce_v2_qp_flow_control_init, .init_eq = hns_roce_v2_init_eq_table, .cleanup_eq = hns_roce_v2_cleanup_eq_table, @@ -6328,14 +6621,25 @@ static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) ret = hns_roce_init(hr_dev); if (ret) { dev_err(hr_dev->dev, "RoCE Engine init failed!\n"); - goto error_failed_get_cfg; + goto error_failed_cfg; + } + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + ret = free_mr_init(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "failed to init free mr!\n"); + goto error_failed_roce_init; + } } handle->priv = hr_dev; return 0; -error_failed_get_cfg: +error_failed_roce_init: + hns_roce_exit(hr_dev); + +error_failed_cfg: kfree(hr_dev->priv); error_failed_kzalloc: @@ -6357,6 +6661,9 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 12be85f0986e..0d87b627601e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -139,6 +139,18 @@ enum { #define CMD_CSQ_DESC_NUM 1024 #define CMD_CRQ_DESC_NUM 1024 +/* Free mr used parameters */ +#define HNS_ROCE_FREE_MR_USED_CQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_QP_NUM 0x8 +#define HNS_ROCE_FREE_MR_USED_PSN 0x0808 +#define HNS_ROCE_FREE_MR_USED_QP_RETRY_CNT 0x7 +#define HNS_ROCE_FREE_MR_USED_QP_TIMEOUT 0x12 +#define HNS_ROCE_FREE_MR_USED_SQWQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_SQSGE_NUM 0x2 +#define HNS_ROCE_FREE_MR_USED_RQWQE_NUM 128 +#define HNS_ROCE_FREE_MR_USED_RQSGE_NUM 0x2 +#define HNS_ROCE_V2_FREE_MR_TIMEOUT 4500 + enum { NO_ARMED = 0x0, REG_NXT_CEQE = 0x2, @@ -1418,10 +1430,18 @@ struct hns_roce_link_table { #define HNS_ROCE_EXT_LLM_ENTRY(addr, id) (((id) << (64 - 12)) | ((addr) >> 12)) #define HNS_ROCE_EXT_LLM_MIN_PAGES(que_num) ((que_num) * 4 + 2) +struct hns_roce_v2_free_mr { + struct ib_qp *rsv_qp[HNS_ROCE_FREE_MR_USED_QP_NUM]; + struct ib_cq *rsv_cq; + struct ib_pd *rsv_pd; + struct mutex mutex; +}; + struct hns_roce_v2_priv { struct hnae3_handle *handle; struct hns_roce_v2_cmq cmq; struct hns_roce_link_table ext_llm; + struct hns_roce_v2_free_mr free_mr; }; struct hns_roce_dip { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index b58b869339cc..b389738d157f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -119,8 +119,7 @@ static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr); } -static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, - struct hns_roce_mr *mr) +static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { struct ib_device *ibdev = &hr_dev->ib_dev; int ret; @@ -343,6 +342,9 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) struct hns_roce_mr *mr = to_hr_mr(ibmr); int ret = 0; + if (hr_dev->hw->dereg_mr) + hr_dev->hw->dereg_mr(hr_dev); + hns_roce_mr_free(hr_dev, mr); kfree(mr); From 63221acb0c63141cc7650f8eefb148337061e6db Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:07:57 -0600 Subject: [PATCH 157/186] RDMA/rxe: Fix ref error in rxe_av.c The commit referenced below can take a reference to the AH which is never dropped. This only happens in the UD request path. This patch optionally passes that AH back to the caller so that it can hold the reference while the AV is being accessed and then drop it. Code to do this is added to rxe_req.c. The AV is also passed to rxe_prepare in rxe_net.c as an optimization. Fixes: e2fe06c90806 ("RDMA/rxe: Lookup kernel AH from ah index in UD WQEs") Link: https://lore.kernel.org/r/20220304000808.225811-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 19 +++++++++- drivers/infiniband/sw/rxe/rxe_loc.h | 5 ++- drivers/infiniband/sw/rxe/rxe_net.c | 17 +++++---- drivers/infiniband/sw/rxe/rxe_req.c | 55 +++++++++++++++++----------- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 5 files changed, 63 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 38c7b6fb39d7..360a567159fe 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -99,11 +99,14 @@ void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr) av->network_type = type; } -struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp) { struct rxe_ah *ah; u32 ah_num; + if (ahp) + *ahp = NULL; + if (!pkt || !pkt->qp) return NULL; @@ -117,10 +120,22 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) if (ah_num) { /* only new user provider or kernel client */ ah = rxe_pool_get_index(&pkt->rxe->ah_pool, ah_num); - if (!ah || ah->ah_num != ah_num || rxe_ah_pd(ah) != pkt->qp->pd) { + if (!ah) { pr_warn("Unable to find AH matching ah_num\n"); return NULL; } + + if (rxe_ah_pd(ah) != pkt->qp->pd) { + pr_warn("PDs don't match for AH and QP\n"); + rxe_drop_ref(ah); + return NULL; + } + + if (ahp) + *ahp = ah; + else + rxe_drop_ref(ah); + return &ah->av; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 409efeecd581..2ffbe3390668 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -19,7 +19,7 @@ void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); -struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); +struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp); /* rxe_cq.c */ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq, @@ -94,7 +94,8 @@ void rxe_mw_cleanup(struct rxe_pool_elem *arg); /* rxe_net.c */ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); -int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb); +int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb); int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct sk_buff *skb); const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index a8cfa7160478..b06f22ffc5a8 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -271,13 +271,13 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } -static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb) +static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; bool xnet = false; __be16 df = htons(IP_DF); - struct rxe_av *av = rxe_get_av(pkt); struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; @@ -297,11 +297,11 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb) return 0; } -static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb) +static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; - struct rxe_av *av = rxe_get_av(pkt); struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; @@ -322,16 +322,17 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb) return 0; } -int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb) +int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, + struct sk_buff *skb) { int err = 0; if (skb->protocol == htons(ETH_P_IP)) - err = prepare4(pkt, skb); + err = prepare4(av, pkt, skb); else if (skb->protocol == htons(ETH_P_IPV6)) - err = prepare6(pkt, skb); + err = prepare6(av, pkt, skb); - if (ether_addr_equal(skb->dev->dev_addr, rxe_get_av(pkt)->dmac)) + if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) pkt->mask |= RXE_LOOPBACK_MASK; return err; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index a3c78b4ac9f1..3520d55a124b 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -358,6 +358,7 @@ static inline int get_mtu(struct rxe_qp *qp) } static struct sk_buff *init_req_packet(struct rxe_qp *qp, + struct rxe_av *av, struct rxe_send_wqe *wqe, int opcode, u32 payload, struct rxe_pkt_info *pkt) @@ -365,7 +366,6 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct sk_buff *skb; struct rxe_send_wr *ibwr = &wqe->wr; - struct rxe_av *av; int pad = (-payload) & 0x3; int paylen; int solicited; @@ -374,21 +374,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, /* length from start of bth to end of icrc */ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; - - /* pkt->hdr, port_num and mask are initialized in ifc layer */ - pkt->rxe = rxe; - pkt->opcode = opcode; - pkt->qp = qp; - pkt->psn = qp->req.psn; - pkt->mask = rxe_opcode[opcode].mask; - pkt->paylen = paylen; - pkt->wqe = wqe; + pkt->paylen = paylen; /* init skb */ - av = rxe_get_av(pkt); - if (!av) - return NULL; - skb = rxe_init_packet(rxe, av, paylen, pkt); if (unlikely(!skb)) return NULL; @@ -447,13 +435,13 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, return skb; } -static int finish_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_pkt_info *pkt, struct sk_buff *skb, - u32 paylen) +static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, + struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt, + struct sk_buff *skb, u32 paylen) { int err; - err = rxe_prepare(pkt, skb); + err = rxe_prepare(av, pkt, skb); if (err) return err; @@ -608,6 +596,7 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) int rxe_requester(void *arg) { struct rxe_qp *qp = (struct rxe_qp *)arg; + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_pkt_info pkt; struct sk_buff *skb; struct rxe_send_wqe *wqe; @@ -619,6 +608,8 @@ int rxe_requester(void *arg) struct rxe_send_wqe rollback_wqe; u32 rollback_psn; struct rxe_queue *q = qp->sq.queue; + struct rxe_ah *ah; + struct rxe_av *av; rxe_add_ref(qp); @@ -705,14 +696,28 @@ next_wqe: payload = mtu; } - skb = init_req_packet(qp, wqe, opcode, payload, &pkt); + pkt.rxe = rxe; + pkt.opcode = opcode; + pkt.qp = qp; + pkt.psn = qp->req.psn; + pkt.mask = rxe_opcode[opcode].mask; + pkt.wqe = wqe; + + av = rxe_get_av(&pkt, &ah); + if (unlikely(!av)) { + pr_err("qp#%d Failed no address vector\n", qp_num(qp)); + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err_drop_ah; + } + + skb = init_req_packet(qp, av, wqe, opcode, payload, &pkt); if (unlikely(!skb)) { pr_err("qp#%d Failed allocating skb\n", qp_num(qp)); wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; + goto err_drop_ah; } - ret = finish_packet(qp, wqe, &pkt, skb, payload); + ret = finish_packet(qp, av, wqe, &pkt, skb, payload); if (unlikely(ret)) { pr_debug("qp#%d Error during finish packet\n", qp_num(qp)); if (ret == -EFAULT) @@ -720,9 +725,12 @@ next_wqe: else wqe->status = IB_WC_LOC_QP_OP_ERR; kfree_skb(skb); - goto err; + goto err_drop_ah; } + if (ah) + rxe_drop_ref(ah); + /* * To prevent a race on wqe access between requester and completer, * wqe members state and psn need to be set before calling @@ -751,6 +759,9 @@ next_wqe: goto next_wqe; +err_drop_ah: + if (ah) + rxe_drop_ref(ah); err: wqe->state = wqe_state_error; __rxe_do_task(&qp->comp.task); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c369d78fc8e8..b5ebe853748a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -633,7 +633,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, if (ack->mask & RXE_ATMACK_MASK) atmack_set_orig(ack, qp->resp.atomic_orig); - err = rxe_prepare(ack, skb); + err = rxe_prepare(&qp->pri_av, ack, skb); if (err) { kfree_skb(skb); return NULL; From 8a1a0be894da0d06bfbb496cc2dc3057fa83e103 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:07:58 -0600 Subject: [PATCH 158/186] RDMA/rxe: Replace mr by rkey in responder resources Currently rxe saves a copy of MR in responder resources for RDMA reads. Since the responder resources are never freed just over written if more are needed this MR may not have a reference freed until the QP is destroyed. This patch uses the rkey instead of the MR and on subsequent packets of a multipacket read reply message it looks up the MR from the rkey for each packet. This makes it possible for a user to deregister an MR or unbind a MW on the fly and get correct behaviour. Link: https://lore.kernel.org/r/20220304000808.225811-3-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 10 +-- drivers/infiniband/sw/rxe/rxe_resp.c | 123 ++++++++++++++++++-------- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 - 3 files changed, 87 insertions(+), 47 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 5f270cbf18c6..26d461a8d71c 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -135,12 +135,8 @@ static void free_rd_atomic_resources(struct rxe_qp *qp) void free_rd_atomic_resource(struct rxe_qp *qp, struct resp_res *res) { - if (res->type == RXE_ATOMIC_MASK) { + if (res->type == RXE_ATOMIC_MASK) kfree_skb(res->atomic.skb); - } else if (res->type == RXE_READ_MASK) { - if (res->read.mr) - rxe_drop_ref(res->read.mr); - } res->type = 0; } @@ -825,10 +821,8 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->pd) rxe_drop_ref(qp->pd); - if (qp->resp.mr) { + if (qp->resp.mr) rxe_drop_ref(qp->resp.mr); - qp->resp.mr = NULL; - } if (qp_type(qp) == IB_QPT_RC) sk_dst_reset(qp->sk->sk); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index b5ebe853748a..b1ec003f0bb8 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -642,6 +642,78 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, return skb; } +static struct resp_res *rxe_prepare_read_res(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) +{ + struct resp_res *res; + u32 pkts; + + res = &qp->resp.resources[qp->resp.res_head]; + rxe_advance_resp_resource(qp); + free_rd_atomic_resource(qp, res); + + res->type = RXE_READ_MASK; + res->replay = 0; + res->read.va = qp->resp.va + qp->resp.offset; + res->read.va_org = qp->resp.va + qp->resp.offset; + res->read.resid = qp->resp.resid; + res->read.length = qp->resp.resid; + res->read.rkey = qp->resp.rkey; + + pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); + res->first_psn = pkt->psn; + res->cur_psn = pkt->psn; + res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; + + res->state = rdatm_res_state_new; + + return res; +} + +/** + * rxe_recheck_mr - revalidate MR from rkey and get a reference + * @qp: the qp + * @rkey: the rkey + * + * This code allows the MR to be invalidated or deregistered or + * the MW if one was used to be invalidated or deallocated. + * It is assumed that the access permissions if originally good + * are OK and the mappings to be unchanged. + * + * Return: mr on success else NULL + */ +static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + struct rxe_mr *mr; + struct rxe_mw *mw; + + if (rkey_is_mw(rkey)) { + mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8); + if (!mw || mw->rkey != rkey) + return NULL; + + if (mw->state != RXE_MW_STATE_VALID) { + rxe_drop_ref(mw); + return NULL; + } + + mr = mw->mr; + rxe_drop_ref(mw); + } else { + mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); + if (!mr || mr->rkey != rkey) + return NULL; + } + + if (mr->state != RXE_MR_STATE_VALID) { + rxe_drop_ref(mr); + return NULL; + } + + return mr; +} + /* RDMA read response. If res is not NULL, then we have a current RDMA request * being processed or replayed. */ @@ -656,53 +728,26 @@ static enum resp_states read_reply(struct rxe_qp *qp, int opcode; int err; struct resp_res *res = qp->resp.res; + struct rxe_mr *mr; if (!res) { - /* This is the first time we process that request. Get a - * resource - */ - res = &qp->resp.resources[qp->resp.res_head]; - - free_rd_atomic_resource(qp, res); - rxe_advance_resp_resource(qp); - - res->type = RXE_READ_MASK; - res->replay = 0; - - res->read.va = qp->resp.va + - qp->resp.offset; - res->read.va_org = qp->resp.va + - qp->resp.offset; - - res->first_psn = req_pkt->psn; - - if (reth_len(req_pkt)) { - res->last_psn = (req_pkt->psn + - (reth_len(req_pkt) + mtu - 1) / - mtu - 1) & BTH_PSN_MASK; - } else { - res->last_psn = res->first_psn; - } - res->cur_psn = req_pkt->psn; - - res->read.resid = qp->resp.resid; - res->read.length = qp->resp.resid; - res->read.rkey = qp->resp.rkey; - - /* note res inherits the reference to mr from qp */ - res->read.mr = qp->resp.mr; - qp->resp.mr = NULL; - - qp->resp.res = res; - res->state = rdatm_res_state_new; + res = rxe_prepare_read_res(qp, req_pkt); + qp->resp.res = res; } if (res->state == rdatm_res_state_new) { + mr = qp->resp.mr; + qp->resp.mr = NULL; + if (res->read.resid <= mtu) opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; } else { + mr = rxe_recheck_mr(qp, res->read.rkey); + if (!mr) + return RESPST_ERR_RKEY_VIOLATION; + if (res->read.resid > mtu) opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; else @@ -718,10 +763,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, if (!skb) return RESPST_ERR_RNR; - err = rxe_mr_copy(res->read.mr, res->read.va, payload_addr(&ack_pkt), + err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); if (err) pr_err("Failed copying memory\n"); + if (mr) + rxe_drop_ref(mr); if (bth_pad(&ack_pkt)) { u8 *pad = payload_addr(&ack_pkt) + payload; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 6b15251ff67a..e7eff1ca75e9 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -157,7 +157,6 @@ struct resp_res { struct sk_buff *skb; } atomic; struct { - struct rxe_mr *mr; u64 va_org; u32 rkey; u32 length; From c9f4c695835c9c2085065a3adc1b57d2005b508b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:07:59 -0600 Subject: [PATCH 159/186] RDMA/rxe: Reverse the sense of RXE_POOL_NO_ALLOC There is only one remaining object type that allocates its own memory, that is mr. So the sense of RXE_POOL_NO_ALLOC is changed to RXE_POOL_ALLOC. Add checks to rxe_alloc() and rxe_add_to_pool() to make sure the correct call is used for the setting of this flag. Link: https://lore.kernel.org/r/20220304000808.225811-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++++++++++--------- drivers/infiniband/sw/rxe/rxe_pool.h | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 16056b918ace..239c24544ff2 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -21,19 +21,17 @@ static const struct rxe_type_info { .name = "rxe-uc", .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), - .flags = RXE_POOL_NO_ALLOC, }, [RXE_TYPE_PD] = { .name = "rxe-pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), - .flags = RXE_POOL_NO_ALLOC, }, [RXE_TYPE_AH] = { .name = "rxe-ah", .size = sizeof(struct rxe_ah), .elem_offset = offsetof(struct rxe_ah, elem), - .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_AH_INDEX, .max_index = RXE_MAX_AH_INDEX, }, @@ -41,7 +39,7 @@ static const struct rxe_type_info { .name = "rxe-srq", .size = sizeof(struct rxe_srq), .elem_offset = offsetof(struct rxe_srq, elem), - .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, }, @@ -50,7 +48,7 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_qp), .elem_offset = offsetof(struct rxe_qp, elem), .cleanup = rxe_qp_cleanup, - .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_QP_INDEX, .max_index = RXE_MAX_QP_INDEX, }, @@ -58,7 +56,6 @@ static const struct rxe_type_info { .name = "rxe-cq", .size = sizeof(struct rxe_cq), .elem_offset = offsetof(struct rxe_cq, elem), - .flags = RXE_POOL_NO_ALLOC, .cleanup = rxe_cq_cleanup, }, [RXE_TYPE_MR] = { @@ -66,7 +63,7 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_mr), .elem_offset = offsetof(struct rxe_mr, elem), .cleanup = rxe_mr_cleanup, - .flags = RXE_POOL_INDEX, + .flags = RXE_POOL_INDEX | RXE_POOL_ALLOC, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, }, @@ -75,7 +72,7 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_mw), .elem_offset = offsetof(struct rxe_mw, elem), .cleanup = rxe_mw_cleanup, - .flags = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC, + .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, }, @@ -264,6 +261,9 @@ void *rxe_alloc(struct rxe_pool *pool) struct rxe_pool_elem *elem; void *obj; + if (WARN_ON(!(pool->flags & RXE_POOL_ALLOC))) + return NULL; + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) goto out_cnt; @@ -286,6 +286,9 @@ out_cnt: int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) { + if (WARN_ON(pool->flags & RXE_POOL_ALLOC)) + return -EINVAL; + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) goto out_cnt; @@ -310,7 +313,7 @@ void rxe_elem_release(struct kref *kref) if (pool->cleanup) pool->cleanup(elem); - if (!(pool->flags & RXE_POOL_NO_ALLOC)) { + if (pool->flags & RXE_POOL_ALLOC) { obj = elem->obj; kfree(obj); } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 8fc95c6b7b9b..44b944c8c360 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -9,7 +9,7 @@ enum rxe_pool_flags { RXE_POOL_INDEX = BIT(1), - RXE_POOL_NO_ALLOC = BIT(4), + RXE_POOL_ALLOC = BIT(2), }; enum rxe_elem_type { From 3c3e4d582bdc461081abea9de54eb4112a9a6283 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:00 -0600 Subject: [PATCH 160/186] RDMA/rxe: Delete _locked() APIs for pool objects Since caller managed locks for indexed objects are no longer used these APIs are deleted. Link: https://lore.kernel.org/r/20220304000808.225811-5-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 67 ++++------------------------ drivers/infiniband/sw/rxe/rxe_pool.h | 24 ++-------- 2 files changed, 12 insertions(+), 79 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 239c24544ff2..2e3543dde000 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -189,17 +189,6 @@ static int rxe_insert_index(struct rxe_pool *pool, struct rxe_pool_elem *new) return 0; } -int __rxe_add_index_locked(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - int err; - - elem->index = alloc_index(pool); - err = rxe_insert_index(pool, elem); - - return err; -} - int __rxe_add_index(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; @@ -207,55 +196,24 @@ int __rxe_add_index(struct rxe_pool_elem *elem) int err; write_lock_irqsave(&pool->pool_lock, flags); - err = __rxe_add_index_locked(elem); + elem->index = alloc_index(pool); + err = rxe_insert_index(pool, elem); write_unlock_irqrestore(&pool->pool_lock, flags); return err; } -void __rxe_drop_index_locked(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - - clear_bit(elem->index - pool->index.min_index, pool->index.table); - rb_erase(&elem->index_node, &pool->index.tree); -} - void __rxe_drop_index(struct rxe_pool_elem *elem) { struct rxe_pool *pool = elem->pool; unsigned long flags; write_lock_irqsave(&pool->pool_lock, flags); - __rxe_drop_index_locked(elem); + clear_bit(elem->index - pool->index.min_index, pool->index.table); + rb_erase(&elem->index_node, &pool->index.tree); write_unlock_irqrestore(&pool->pool_lock, flags); } -void *rxe_alloc_locked(struct rxe_pool *pool) -{ - struct rxe_pool_elem *elem; - void *obj; - - if (atomic_inc_return(&pool->num_elem) > pool->max_elem) - goto out_cnt; - - obj = kzalloc(pool->elem_size, GFP_ATOMIC); - if (!obj) - goto out_cnt; - - elem = (struct rxe_pool_elem *)((u8 *)obj + pool->elem_offset); - - elem->pool = pool; - elem->obj = obj; - kref_init(&elem->ref_cnt); - - return obj; - -out_cnt: - atomic_dec(&pool->num_elem); - return NULL; -} - void *rxe_alloc(struct rxe_pool *pool) { struct rxe_pool_elem *elem; @@ -321,12 +279,14 @@ void rxe_elem_release(struct kref *kref) atomic_dec(&pool->num_elem); } -void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index) +void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) { - struct rb_node *node; struct rxe_pool_elem *elem; + struct rb_node *node; + unsigned long flags; void *obj; + read_lock_irqsave(&pool->pool_lock, flags); node = pool->index.tree.rb_node; while (node) { @@ -346,17 +306,6 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index) } else { obj = NULL; } - - return obj; -} - -void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) -{ - unsigned long flags; - void *obj; - - read_lock_irqsave(&pool->pool_lock, flags); - obj = rxe_pool_get_index_locked(pool, index); read_unlock_irqrestore(&pool->pool_lock, flags); return obj; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 44b944c8c360..7fec5d96d695 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -68,9 +68,7 @@ int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, /* free resources from object pool */ void rxe_pool_cleanup(struct rxe_pool *pool); -/* allocate an object from pool holding and not holding the pool lock */ -void *rxe_alloc_locked(struct rxe_pool *pool); - +/* allocate an object from pool */ void *rxe_alloc(struct rxe_pool *pool); /* connect already allocated object to pool */ @@ -79,32 +77,18 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem) /* assign an index to an indexed object and insert object into - * pool's rb tree holding and not holding the pool_lock + * pool's rb tree */ -int __rxe_add_index_locked(struct rxe_pool_elem *elem); - -#define rxe_add_index_locked(obj) __rxe_add_index_locked(&(obj)->elem) - int __rxe_add_index(struct rxe_pool_elem *elem); #define rxe_add_index(obj) __rxe_add_index(&(obj)->elem) -/* drop an index and remove object from rb tree - * holding and not holding the pool_lock - */ -void __rxe_drop_index_locked(struct rxe_pool_elem *elem); - -#define rxe_drop_index_locked(obj) __rxe_drop_index_locked(&(obj)->elem) - +/* drop an index and remove object from rb tree */ void __rxe_drop_index(struct rxe_pool_elem *elem); #define rxe_drop_index(obj) __rxe_drop_index(&(obj)->elem) -/* lookup an indexed object from index holding and not holding the pool_lock. - * takes a reference on object - */ -void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index); - +/* lookup an indexed object from index. takes a reference on object */ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); /* cleanup an object when all references are dropped */ From b4a47f6836b9c8fa60ccd8ff64f3c5f5b7d35afa Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:01 -0600 Subject: [PATCH 161/186] RDMA/rxe: Replace obj by elem in declaration Fix a harmless typo replacing obj by elem in the cleanup fields. This has no effect but is confusing. Link: https://lore.kernel.org/r/20220304000808.225811-6-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 2 +- drivers/infiniband/sw/rxe/rxe_pool.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 2e3543dde000..3b50fd3d9d70 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -12,7 +12,7 @@ static const struct rxe_type_info { const char *name; size_t size; size_t elem_offset; - void (*cleanup)(struct rxe_pool_elem *obj); + void (*cleanup)(struct rxe_pool_elem *elem); enum rxe_pool_flags flags; u32 min_index; u32 max_index; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 7fec5d96d695..a8582ad85b1e 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -39,7 +39,7 @@ struct rxe_pool { struct rxe_dev *rxe; const char *name; rwlock_t pool_lock; /* protects pool add/del/search */ - void (*cleanup)(struct rxe_pool_elem *obj); + void (*cleanup)(struct rxe_pool_elem *elem); enum rxe_pool_flags flags; enum rxe_elem_type type; From 3ccffe8abf2febab4642033d4675a20bbade151b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:02 -0600 Subject: [PATCH 162/186] RDMA/rxe: Move max_elem into rxe_type_info Move the maximum number of elements from a parameter in rxe_pool_init to a member of the rxe_type_info array. Link: https://lore.kernel.org/r/20220304000808.225811-7-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 24 ++++++++---------------- drivers/infiniband/sw/rxe/rxe_pool.c | 14 +++++++++++--- drivers/infiniband/sw/rxe/rxe_pool.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index fce3994d8f7a..dc1f9dd70966 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -118,43 +118,35 @@ static int rxe_init_pools(struct rxe_dev *rxe) { int err; - err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC, - rxe->max_ucontext); + err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); if (err) goto err1; - err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD, - rxe->attr.max_pd); + err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); if (err) goto err2; - err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH, - rxe->attr.max_ah); + err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); if (err) goto err3; - err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ, - rxe->attr.max_srq); + err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); if (err) goto err4; - err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP, - rxe->attr.max_qp); + err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); if (err) goto err5; - err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ, - rxe->attr.max_cq); + err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); if (err) goto err6; - err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR, - rxe->attr.max_mr); + err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); if (err) goto err7; - err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW, - rxe->attr.max_mw); + err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); if (err) goto err8; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 3b50fd3d9d70..bc3ae64adba8 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -16,16 +16,19 @@ static const struct rxe_type_info { enum rxe_pool_flags flags; u32 min_index; u32 max_index; + u32 max_elem; } rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_UC] = { .name = "rxe-uc", .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), + .max_elem = UINT_MAX, }, [RXE_TYPE_PD] = { .name = "rxe-pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), + .max_elem = UINT_MAX, }, [RXE_TYPE_AH] = { .name = "rxe-ah", @@ -34,6 +37,7 @@ static const struct rxe_type_info { .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_AH_INDEX, .max_index = RXE_MAX_AH_INDEX, + .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, }, [RXE_TYPE_SRQ] = { .name = "rxe-srq", @@ -42,6 +46,7 @@ static const struct rxe_type_info { .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, + .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, }, [RXE_TYPE_QP] = { .name = "rxe-qp", @@ -51,12 +56,14 @@ static const struct rxe_type_info { .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_QP_INDEX, .max_index = RXE_MAX_QP_INDEX, + .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, }, [RXE_TYPE_CQ] = { .name = "rxe-cq", .size = sizeof(struct rxe_cq), .elem_offset = offsetof(struct rxe_cq, elem), .cleanup = rxe_cq_cleanup, + .max_elem = UINT_MAX, }, [RXE_TYPE_MR] = { .name = "rxe-mr", @@ -66,6 +73,7 @@ static const struct rxe_type_info { .flags = RXE_POOL_INDEX | RXE_POOL_ALLOC, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, + .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, }, [RXE_TYPE_MW] = { .name = "rxe-mw", @@ -75,6 +83,7 @@ static const struct rxe_type_info { .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, + .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, }, }; @@ -104,8 +113,7 @@ out: int rxe_pool_init( struct rxe_dev *rxe, struct rxe_pool *pool, - enum rxe_elem_type type, - unsigned int max_elem) + enum rxe_elem_type type) { const struct rxe_type_info *info = &rxe_type_info[type]; int err = 0; @@ -115,7 +123,7 @@ int rxe_pool_init( pool->rxe = rxe; pool->name = info->name; pool->type = type; - pool->max_elem = max_elem; + pool->max_elem = info->max_elem; pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN); pool->elem_offset = info->elem_offset; pool->flags = info->flags; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index a8582ad85b1e..5f34d232d7f4 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -63,7 +63,7 @@ struct rxe_pool { * pool elements will be allocated out of a slab cache */ int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, - enum rxe_elem_type type, u32 max_elem); + enum rxe_elem_type type); /* free resources from object pool */ void rxe_pool_cleanup(struct rxe_pool *pool); From df34dc9e03bfb9181f6f5405f4dd319340b5b46c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:03 -0600 Subject: [PATCH 163/186] RDMA/rxe: Shorten pool names in rxe_pool.c Replace pool names like "rxe-xx" with "xx". Just reduces clutter. Link: https://lore.kernel.org/r/20220304000808.225811-8-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_pool.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index bc3ae64adba8..c50baeb10bd2 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -19,19 +19,19 @@ static const struct rxe_type_info { u32 max_elem; } rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_UC] = { - .name = "rxe-uc", + .name = "uc", .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), .max_elem = UINT_MAX, }, [RXE_TYPE_PD] = { - .name = "rxe-pd", + .name = "pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), .max_elem = UINT_MAX, }, [RXE_TYPE_AH] = { - .name = "rxe-ah", + .name = "ah", .size = sizeof(struct rxe_ah), .elem_offset = offsetof(struct rxe_ah, elem), .flags = RXE_POOL_INDEX, @@ -40,7 +40,7 @@ static const struct rxe_type_info { .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, }, [RXE_TYPE_SRQ] = { - .name = "rxe-srq", + .name = "srq", .size = sizeof(struct rxe_srq), .elem_offset = offsetof(struct rxe_srq, elem), .flags = RXE_POOL_INDEX, @@ -49,7 +49,7 @@ static const struct rxe_type_info { .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, }, [RXE_TYPE_QP] = { - .name = "rxe-qp", + .name = "qp", .size = sizeof(struct rxe_qp), .elem_offset = offsetof(struct rxe_qp, elem), .cleanup = rxe_qp_cleanup, @@ -59,14 +59,14 @@ static const struct rxe_type_info { .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, }, [RXE_TYPE_CQ] = { - .name = "rxe-cq", + .name = "cq", .size = sizeof(struct rxe_cq), .elem_offset = offsetof(struct rxe_cq, elem), .cleanup = rxe_cq_cleanup, .max_elem = UINT_MAX, }, [RXE_TYPE_MR] = { - .name = "rxe-mr", + .name = "mr", .size = sizeof(struct rxe_mr), .elem_offset = offsetof(struct rxe_mr, elem), .cleanup = rxe_mr_cleanup, @@ -76,7 +76,7 @@ static const struct rxe_type_info { .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, }, [RXE_TYPE_MW] = { - .name = "rxe-mw", + .name = "mw", .size = sizeof(struct rxe_mw), .elem_offset = offsetof(struct rxe_mw, elem), .cleanup = rxe_mw_cleanup, From 3225717f6dfa29a6f03629b7a7f8492e1521d06d Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:04 -0600 Subject: [PATCH 164/186] RDMA/rxe: Replace red-black trees by xarrays Currently the rxe driver uses red-black trees to add indices to the rxe object pools. Linux xarrays provide a better way to implement the same functionality for indices. This patch replaces red-black trees by xarrays for pool objects. Since xarrays already have a spinlock use that in place of the pool rwlock. Make sure that all changes in the xarray(index) and kref(ref counnt) occur atomically. Link: https://lore.kernel.org/r/20220304000808.225811-9-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 80 ++------- drivers/infiniband/sw/rxe/rxe_mr.c | 1 - drivers/infiniband/sw/rxe/rxe_mw.c | 8 - drivers/infiniband/sw/rxe/rxe_pool.c | 227 ++++++++------------------ drivers/infiniband/sw/rxe/rxe_pool.h | 43 ++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 12 -- 6 files changed, 88 insertions(+), 283 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index dc1f9dd70966..2dae7538a2ea 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -114,75 +114,26 @@ static void rxe_init_ports(struct rxe_dev *rxe) } /* init pools of managed objects */ -static int rxe_init_pools(struct rxe_dev *rxe) +static void rxe_init_pools(struct rxe_dev *rxe) { - int err; - - err = rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); - if (err) - goto err1; - - err = rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); - if (err) - goto err2; - - err = rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); - if (err) - goto err3; - - err = rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); - if (err) - goto err4; - - err = rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); - if (err) - goto err5; - - err = rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); - if (err) - goto err6; - - err = rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); - if (err) - goto err7; - - err = rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); - if (err) - goto err8; - - return 0; - -err8: - rxe_pool_cleanup(&rxe->mr_pool); -err7: - rxe_pool_cleanup(&rxe->cq_pool); -err6: - rxe_pool_cleanup(&rxe->qp_pool); -err5: - rxe_pool_cleanup(&rxe->srq_pool); -err4: - rxe_pool_cleanup(&rxe->ah_pool); -err3: - rxe_pool_cleanup(&rxe->pd_pool); -err2: - rxe_pool_cleanup(&rxe->uc_pool); -err1: - return err; + rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); + rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); + rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); + rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); + rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); + rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); + rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); + rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); } /* initialize rxe device state */ -static int rxe_init(struct rxe_dev *rxe) +static void rxe_init(struct rxe_dev *rxe) { - int err; - /* init default device parameters */ rxe_init_device_param(rxe); rxe_init_ports(rxe); - - err = rxe_init_pools(rxe); - if (err) - return err; + rxe_init_pools(rxe); /* init pending mmap list */ spin_lock_init(&rxe->mmap_offset_lock); @@ -194,8 +145,6 @@ static int rxe_init(struct rxe_dev *rxe) rxe->mcg_tree = RB_ROOT; mutex_init(&rxe->usdev_lock); - - return 0; } void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) @@ -217,12 +166,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) */ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) { - int err; - - err = rxe_init(rxe); - if (err) - return err; - + rxe_init(rxe); rxe_set_mtu(rxe, mtu); return rxe_register_device(rxe, ibdev_name); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 453ef3c9d535..35628b8a00b4 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -691,7 +691,6 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) mr->state = RXE_MR_STATE_INVALID; rxe_drop_ref(mr_pd(mr)); - rxe_drop_index(mr); rxe_drop_ref(mr); return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 32dd8c0b8b9e..7df36c40eec2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -20,7 +20,6 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) return ret; } - rxe_add_index(mw); mw->rkey = ibmw->rkey = (mw->elem.index << 8) | rxe_get_next_key(-1); mw->state = (mw->ibmw.type == IB_MW_TYPE_2) ? RXE_MW_STATE_FREE : RXE_MW_STATE_VALID; @@ -329,10 +328,3 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) return mw; } - -void rxe_mw_cleanup(struct rxe_pool_elem *elem) -{ - struct rxe_mw *mw = container_of(elem, typeof(*mw), elem); - - rxe_drop_index(mw); -} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index c50baeb10bd2..87066d04ed18 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -22,19 +22,22 @@ static const struct rxe_type_info { .name = "uc", .size = sizeof(struct rxe_ucontext), .elem_offset = offsetof(struct rxe_ucontext, elem), + .min_index = 1, + .max_index = UINT_MAX, .max_elem = UINT_MAX, }, [RXE_TYPE_PD] = { .name = "pd", .size = sizeof(struct rxe_pd), .elem_offset = offsetof(struct rxe_pd, elem), + .min_index = 1, + .max_index = UINT_MAX, .max_elem = UINT_MAX, }, [RXE_TYPE_AH] = { .name = "ah", .size = sizeof(struct rxe_ah), .elem_offset = offsetof(struct rxe_ah, elem), - .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_AH_INDEX, .max_index = RXE_MAX_AH_INDEX, .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, @@ -43,7 +46,6 @@ static const struct rxe_type_info { .name = "srq", .size = sizeof(struct rxe_srq), .elem_offset = offsetof(struct rxe_srq, elem), - .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, @@ -53,7 +55,6 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_qp), .elem_offset = offsetof(struct rxe_qp, elem), .cleanup = rxe_qp_cleanup, - .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_QP_INDEX, .max_index = RXE_MAX_QP_INDEX, .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, @@ -63,6 +64,8 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_cq), .elem_offset = offsetof(struct rxe_cq, elem), .cleanup = rxe_cq_cleanup, + .min_index = 1, + .max_index = UINT_MAX, .max_elem = UINT_MAX, }, [RXE_TYPE_MR] = { @@ -70,7 +73,7 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_mr), .elem_offset = offsetof(struct rxe_mr, elem), .cleanup = rxe_mr_cleanup, - .flags = RXE_POOL_INDEX | RXE_POOL_ALLOC, + .flags = RXE_POOL_ALLOC, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, @@ -79,44 +82,16 @@ static const struct rxe_type_info { .name = "mw", .size = sizeof(struct rxe_mw), .elem_offset = offsetof(struct rxe_mw, elem), - .cleanup = rxe_mw_cleanup, - .flags = RXE_POOL_INDEX, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, }, }; -static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min) -{ - int err = 0; - - if ((max - min + 1) < pool->max_elem) { - pr_warn("not enough indices for max_elem\n"); - err = -EINVAL; - goto out; - } - - pool->index.max_index = max; - pool->index.min_index = min; - - pool->index.table = bitmap_zalloc(max - min + 1, GFP_KERNEL); - if (!pool->index.table) { - err = -ENOMEM; - goto out; - } - -out: - return err; -} - -int rxe_pool_init( - struct rxe_dev *rxe, - struct rxe_pool *pool, - enum rxe_elem_type type) +void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, + enum rxe_elem_type type) { const struct rxe_type_info *info = &rxe_type_info[type]; - int err = 0; memset(pool, 0, sizeof(*pool)); @@ -131,111 +106,31 @@ int rxe_pool_init( atomic_set(&pool->num_elem, 0); - rwlock_init(&pool->pool_lock); - - if (pool->flags & RXE_POOL_INDEX) { - pool->index.tree = RB_ROOT; - err = rxe_pool_init_index(pool, info->max_index, - info->min_index); - if (err) - goto out; - } - -out: - return err; + xa_init_flags(&pool->xa, XA_FLAGS_ALLOC); + pool->limit.min = info->min_index; + pool->limit.max = info->max_index; } void rxe_pool_cleanup(struct rxe_pool *pool) { - if (atomic_read(&pool->num_elem) > 0) - pr_warn("%s pool destroyed with unfree'd elem\n", - pool->name); - - if (pool->flags & RXE_POOL_INDEX) - bitmap_free(pool->index.table); -} - -static u32 alloc_index(struct rxe_pool *pool) -{ - u32 index; - u32 range = pool->index.max_index - pool->index.min_index + 1; - - index = find_next_zero_bit(pool->index.table, range, pool->index.last); - if (index >= range) - index = find_first_zero_bit(pool->index.table, range); - - WARN_ON_ONCE(index >= range); - set_bit(index, pool->index.table); - pool->index.last = index; - return index + pool->index.min_index; -} - -static int rxe_insert_index(struct rxe_pool *pool, struct rxe_pool_elem *new) -{ - struct rb_node **link = &pool->index.tree.rb_node; - struct rb_node *parent = NULL; - struct rxe_pool_elem *elem; - - while (*link) { - parent = *link; - elem = rb_entry(parent, struct rxe_pool_elem, index_node); - - if (elem->index == new->index) { - pr_warn("element already exists!\n"); - return -EINVAL; - } - - if (elem->index > new->index) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - rb_link_node(&new->index_node, parent, link); - rb_insert_color(&new->index_node, &pool->index.tree); - - return 0; -} - -int __rxe_add_index(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - unsigned long flags; - int err; - - write_lock_irqsave(&pool->pool_lock, flags); - elem->index = alloc_index(pool); - err = rxe_insert_index(pool, elem); - write_unlock_irqrestore(&pool->pool_lock, flags); - - return err; -} - -void __rxe_drop_index(struct rxe_pool_elem *elem) -{ - struct rxe_pool *pool = elem->pool; - unsigned long flags; - - write_lock_irqsave(&pool->pool_lock, flags); - clear_bit(elem->index - pool->index.min_index, pool->index.table); - rb_erase(&elem->index_node, &pool->index.tree); - write_unlock_irqrestore(&pool->pool_lock, flags); + WARN_ON(!xa_empty(&pool->xa)); } void *rxe_alloc(struct rxe_pool *pool) { struct rxe_pool_elem *elem; void *obj; + int err; if (WARN_ON(!(pool->flags & RXE_POOL_ALLOC))) return NULL; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) - goto out_cnt; + goto err_cnt; obj = kzalloc(pool->elem_size, GFP_KERNEL); if (!obj) - goto out_cnt; + goto err_cnt; elem = (struct rxe_pool_elem *)((u8 *)obj + pool->elem_offset); @@ -243,78 +138,86 @@ void *rxe_alloc(struct rxe_pool *pool) elem->obj = obj; kref_init(&elem->ref_cnt); + err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, + &pool->next, GFP_KERNEL); + if (err) + goto err_free; + return obj; -out_cnt: +err_free: + kfree(obj); +err_cnt: atomic_dec(&pool->num_elem); return NULL; } int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) { + int err; + if (WARN_ON(pool->flags & RXE_POOL_ALLOC)) return -EINVAL; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) - goto out_cnt; + goto err_cnt; elem->pool = pool; elem->obj = (u8 *)elem - pool->elem_offset; kref_init(&elem->ref_cnt); + err = xa_alloc_cyclic(&pool->xa, &elem->index, elem, pool->limit, + &pool->next, GFP_KERNEL); + if (err) + goto err_cnt; + return 0; -out_cnt: +err_cnt: atomic_dec(&pool->num_elem); return -EINVAL; } -void rxe_elem_release(struct kref *kref) -{ - struct rxe_pool_elem *elem = - container_of(kref, struct rxe_pool_elem, ref_cnt); - struct rxe_pool *pool = elem->pool; - void *obj; - - if (pool->cleanup) - pool->cleanup(elem); - - if (pool->flags & RXE_POOL_ALLOC) { - obj = elem->obj; - kfree(obj); - } - - atomic_dec(&pool->num_elem); -} - void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) { struct rxe_pool_elem *elem; - struct rb_node *node; + struct xarray *xa = &pool->xa; unsigned long flags; void *obj; - read_lock_irqsave(&pool->pool_lock, flags); - node = pool->index.tree.rb_node; - - while (node) { - elem = rb_entry(node, struct rxe_pool_elem, index_node); - - if (elem->index > index) - node = node->rb_left; - else if (elem->index < index) - node = node->rb_right; - else - break; - } - - if (node) { - kref_get(&elem->ref_cnt); + xa_lock_irqsave(xa, flags); + elem = xa_load(xa, index); + if (elem && kref_get_unless_zero(&elem->ref_cnt)) obj = elem->obj; - } else { + else obj = NULL; - } - read_unlock_irqrestore(&pool->pool_lock, flags); + xa_unlock_irqrestore(xa, flags); return obj; } + +static void rxe_elem_release(struct kref *kref) +{ + struct rxe_pool_elem *elem = container_of(kref, typeof(*elem), ref_cnt); + struct rxe_pool *pool = elem->pool; + + xa_erase(&pool->xa, elem->index); + + if (pool->cleanup) + pool->cleanup(elem); + + if (pool->flags & RXE_POOL_ALLOC) + kfree(elem->obj); + + atomic_dec(&pool->num_elem); +} + +int __rxe_get(struct rxe_pool_elem *elem) +{ + return kref_get_unless_zero(&elem->ref_cnt); +} + +int __rxe_put(struct rxe_pool_elem *elem) +{ + return kref_put(&elem->ref_cnt, rxe_elem_release); +} diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 5f34d232d7f4..d1e05d384b2c 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -8,8 +8,7 @@ #define RXE_POOL_H enum rxe_pool_flags { - RXE_POOL_INDEX = BIT(1), - RXE_POOL_ALLOC = BIT(2), + RXE_POOL_ALLOC = BIT(1), }; enum rxe_elem_type { @@ -29,16 +28,12 @@ struct rxe_pool_elem { void *obj; struct kref ref_cnt; struct list_head list; - - /* only used if indexed */ - struct rb_node index_node; u32 index; }; struct rxe_pool { struct rxe_dev *rxe; const char *name; - rwlock_t pool_lock; /* protects pool add/del/search */ void (*cleanup)(struct rxe_pool_elem *elem); enum rxe_pool_flags flags; enum rxe_elem_type type; @@ -48,21 +43,16 @@ struct rxe_pool { size_t elem_size; size_t elem_offset; - /* only used if indexed */ - struct { - struct rb_root tree; - unsigned long *table; - u32 last; - u32 max_index; - u32 min_index; - } index; + struct xarray xa; + struct xa_limit limit; + u32 next; }; /* initialize a pool of objects with given limit on * number of elements. gets parameters from rxe_type_info * pool elements will be allocated out of a slab cache */ -int rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, +void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, enum rxe_elem_type type); /* free resources from object pool */ @@ -76,29 +66,18 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->elem) -/* assign an index to an indexed object and insert object into - * pool's rb tree - */ -int __rxe_add_index(struct rxe_pool_elem *elem); - -#define rxe_add_index(obj) __rxe_add_index(&(obj)->elem) - -/* drop an index and remove object from rb tree */ -void __rxe_drop_index(struct rxe_pool_elem *elem); - -#define rxe_drop_index(obj) __rxe_drop_index(&(obj)->elem) - /* lookup an indexed object from index. takes a reference on object */ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); -/* cleanup an object when all references are dropped */ -void rxe_elem_release(struct kref *kref); - /* take a reference on an object */ -#define rxe_add_ref(obj) kref_get(&(obj)->elem.ref_cnt) +int __rxe_get(struct rxe_pool_elem *elem); + +#define rxe_add_ref(obj) __rxe_get(&(obj)->elem) /* drop a reference on an object */ -#define rxe_drop_ref(obj) kref_put(&(obj)->elem.ref_cnt, rxe_elem_release) +int __rxe_put(struct rxe_pool_elem *elem); + +#define rxe_drop_ref(obj) __rxe_put(&(obj)->elem) #define rxe_read_ref(obj) kref_read(&(obj)->elem.ref_cnt) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 80df9a8f71a1..f0c5715ac500 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -181,7 +181,6 @@ static int rxe_create_ah(struct ib_ah *ibah, return err; /* create index > 0 */ - rxe_add_index(ah); ah->ah_num = ah->elem.index; if (uresp) { @@ -189,7 +188,6 @@ static int rxe_create_ah(struct ib_ah *ibah, err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { - rxe_drop_index(ah); rxe_drop_ref(ah); return -EFAULT; } @@ -230,7 +228,6 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); - rxe_drop_index(ah); rxe_drop_ref(ah); return 0; } @@ -438,7 +435,6 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, if (err) return err; - rxe_add_index(qp); err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); if (err) goto qp_init; @@ -446,7 +442,6 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, return 0; qp_init: - rxe_drop_index(qp); rxe_drop_ref(qp); return err; } @@ -501,7 +496,6 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return ret; rxe_qp_destroy(qp); - rxe_drop_index(qp); rxe_drop_ref(qp); return 0; } @@ -908,7 +902,6 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) if (!mr) return ERR_PTR(-ENOMEM); - rxe_add_index(mr); rxe_add_ref(pd); rxe_mr_init_dma(pd, access, mr); @@ -932,7 +925,6 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, goto err2; } - rxe_add_index(mr); rxe_add_ref(pd); @@ -944,7 +936,6 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, err3: rxe_drop_ref(pd); - rxe_drop_index(mr); rxe_drop_ref(mr); err2: return ERR_PTR(err); @@ -967,8 +958,6 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, goto err1; } - rxe_add_index(mr); - rxe_add_ref(pd); err = rxe_mr_init_fast(pd, max_num_sg, mr); @@ -979,7 +968,6 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, err2: rxe_drop_ref(pd); - rxe_drop_index(mr); rxe_drop_ref(mr); err1: return ERR_PTR(err); From 3197706abd053275d2a561cfb7dc8f6cfaf7d02c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 3 Mar 2022 18:08:05 -0600 Subject: [PATCH 165/186] RDMA/rxe: Use standard names for ref counting Rename rxe_add_ref() to rxe_get() and rxe_drop_ref() to rxe_put(). Significantly improves readability for new readers. Link: https://lore.kernel.org/r/20220304000808.225811-10-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 4 +-- drivers/infiniband/sw/rxe/rxe_comp.c | 8 +++--- drivers/infiniband/sw/rxe/rxe_mcast.c | 4 +-- drivers/infiniband/sw/rxe/rxe_mr.c | 14 +++++----- drivers/infiniband/sw/rxe/rxe_mw.c | 30 ++++++++++----------- drivers/infiniband/sw/rxe/rxe_net.c | 6 ++--- drivers/infiniband/sw/rxe/rxe_pool.h | 8 +++--- drivers/infiniband/sw/rxe/rxe_qp.c | 28 ++++++++++---------- drivers/infiniband/sw/rxe/rxe_recv.c | 8 +++--- drivers/infiniband/sw/rxe/rxe_req.c | 10 +++---- drivers/infiniband/sw/rxe/rxe_resp.c | 32 +++++++++++----------- drivers/infiniband/sw/rxe/rxe_verbs.c | 38 +++++++++++++-------------- 12 files changed, 94 insertions(+), 96 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 360a567159fe..3b05314ca739 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -127,14 +127,14 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt, struct rxe_ah **ahp) if (rxe_ah_pd(ah) != pkt->qp->pd) { pr_warn("PDs don't match for AH and QP\n"); - rxe_drop_ref(ah); + rxe_put(ah); return NULL; } if (ahp) *ahp = ah; else - rxe_drop_ref(ah); + rxe_put(ah); return &ah->av; } diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index f363fe3fa414..138b3e7d3a5f 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -526,7 +526,7 @@ static void rxe_drain_resp_pkts(struct rxe_qp *qp, bool notify) struct rxe_queue *q = qp->sq.queue; while ((skb = skb_dequeue(&qp->resp_pkts))) { - rxe_drop_ref(qp); + rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } @@ -548,7 +548,7 @@ static void free_pkt(struct rxe_pkt_info *pkt) struct ib_device *dev = qp->ibqp.device; kfree_skb(skb); - rxe_drop_ref(qp); + rxe_put(qp); ib_device_put(dev); } @@ -562,7 +562,7 @@ int rxe_completer(void *arg) enum comp_state state; int ret = 0; - rxe_add_ref(qp); + rxe_get(qp); if (!qp->valid || qp->req.state == QP_STATE_ERROR || qp->req.state == QP_STATE_RESET) { @@ -761,7 +761,7 @@ int rxe_completer(void *arg) done: if (pkt) free_pkt(pkt); - rxe_drop_ref(qp); + rxe_put(qp); return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index c399a29b648b..ae8f11cb704a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -319,7 +319,7 @@ static int __rxe_init_mca(struct rxe_qp *qp, struct rxe_mcg *mcg, atomic_inc(&qp->mcg_num); - rxe_add_ref(qp); + rxe_get(qp); mca->qp = qp; list_add_tail(&mca->qp_list, &mcg->qp_list); @@ -389,7 +389,7 @@ static void __rxe_cleanup_mca(struct rxe_mca *mca, struct rxe_mcg *mcg) atomic_dec(&mcg->qp_num); atomic_dec(&mcg->rxe->mcg_attach); atomic_dec(&mca->qp->mcg_num); - rxe_drop_ref(mca->qp); + rxe_put(mca->qp); kfree(mca); } diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 35628b8a00b4..60a31b718774 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -459,7 +459,7 @@ int copy_data( if (offset >= sge->length) { if (mr) { - rxe_drop_ref(mr); + rxe_put(mr); mr = NULL; } sge++; @@ -504,13 +504,13 @@ int copy_data( dma->resid = resid; if (mr) - rxe_drop_ref(mr); + rxe_put(mr); return 0; err2: if (mr) - rxe_drop_ref(mr); + rxe_put(mr); err1: return err; } @@ -569,7 +569,7 @@ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, (type == RXE_LOOKUP_REMOTE && mr->rkey != key) || mr_pd(mr) != pd || (access && !(access & mr->access)) || mr->state != RXE_MR_STATE_VALID)) { - rxe_drop_ref(mr); + rxe_put(mr); mr = NULL; } @@ -613,7 +613,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey) ret = 0; err_drop_ref: - rxe_drop_ref(mr); + rxe_put(mr); err: return ret; } @@ -690,8 +690,8 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } mr->state = RXE_MR_STATE_INVALID; - rxe_drop_ref(mr_pd(mr)); - rxe_drop_ref(mr); + rxe_put(mr_pd(mr)); + rxe_put(mr); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index 7df36c40eec2..c86b2efd58f2 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -12,11 +12,11 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) struct rxe_dev *rxe = to_rdev(ibmw->device); int ret; - rxe_add_ref(pd); + rxe_get(pd); ret = rxe_add_to_pool(&rxe->mw_pool, mw); if (ret) { - rxe_drop_ref(pd); + rxe_put(pd); return ret; } @@ -35,14 +35,14 @@ static void rxe_do_dealloc_mw(struct rxe_mw *mw) mw->mr = NULL; atomic_dec(&mr->num_mw); - rxe_drop_ref(mr); + rxe_put(mr); } if (mw->qp) { struct rxe_qp *qp = mw->qp; mw->qp = NULL; - rxe_drop_ref(qp); + rxe_put(qp); } mw->access = 0; @@ -60,8 +60,8 @@ int rxe_dealloc_mw(struct ib_mw *ibmw) rxe_do_dealloc_mw(mw); spin_unlock_bh(&mw->lock); - rxe_drop_ref(mw); - rxe_drop_ref(pd); + rxe_put(mw); + rxe_put(pd); return 0; } @@ -170,7 +170,7 @@ static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, mw->length = wqe->wr.wr.mw.length; if (mw->mr) { - rxe_drop_ref(mw->mr); + rxe_put(mw->mr); atomic_dec(&mw->mr->num_mw); mw->mr = NULL; } @@ -178,11 +178,11 @@ static void rxe_do_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe, if (mw->length) { mw->mr = mr; atomic_inc(&mr->num_mw); - rxe_add_ref(mr); + rxe_get(mr); } if (mw->ibmw.type == IB_MW_TYPE_2) { - rxe_add_ref(qp); + rxe_get(qp); mw->qp = qp; } } @@ -233,9 +233,9 @@ err_unlock: spin_unlock_bh(&mw->lock); err_drop_mr: if (mr) - rxe_drop_ref(mr); + rxe_put(mr); err_drop_mw: - rxe_drop_ref(mw); + rxe_put(mw); err: return ret; } @@ -260,13 +260,13 @@ static void rxe_do_invalidate_mw(struct rxe_mw *mw) /* valid type 2 MW will always have a QP pointer */ qp = mw->qp; mw->qp = NULL; - rxe_drop_ref(qp); + rxe_put(qp); /* valid type 2 MW will always have an MR pointer */ mr = mw->mr; mw->mr = NULL; atomic_dec(&mr->num_mw); - rxe_drop_ref(mr); + rxe_put(mr); mw->access = 0; mw->addr = 0; @@ -301,7 +301,7 @@ int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey) err_unlock: spin_unlock_bh(&mw->lock); err_drop_ref: - rxe_drop_ref(mw); + rxe_put(mw); err: return ret; } @@ -322,7 +322,7 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) (mw->length == 0) || (access && !(access & mw->access)) || mw->state != RXE_MW_STATE_VALID)) { - rxe_drop_ref(mw); + rxe_put(mw); return NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index b06f22ffc5a8..c53f4529f098 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -348,7 +348,7 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) rxe_run_task(&qp->req.task, 1); - rxe_drop_ref(qp); + rxe_put(qp); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) @@ -358,7 +358,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) skb->destructor = rxe_skb_tx_dtor; skb->sk = pkt->qp->sk->sk; - rxe_add_ref(pkt->qp); + rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) { @@ -368,7 +368,7 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) } else { pr_err("Unknown layer 3 protocol: %d\n", skb->protocol); atomic_dec(&pkt->qp->skb_out); - rxe_drop_ref(pkt->qp); + rxe_put(pkt->qp); kfree_skb(skb); return -EINVAL; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index d1e05d384b2c..24bcc786c1b3 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -69,16 +69,14 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem); /* lookup an indexed object from index. takes a reference on object */ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index); -/* take a reference on an object */ int __rxe_get(struct rxe_pool_elem *elem); -#define rxe_add_ref(obj) __rxe_get(&(obj)->elem) +#define rxe_get(obj) __rxe_get(&(obj)->elem) -/* drop a reference on an object */ int __rxe_put(struct rxe_pool_elem *elem); -#define rxe_drop_ref(obj) __rxe_put(&(obj)->elem) +#define rxe_put(obj) __rxe_put(&(obj)->elem) -#define rxe_read_ref(obj) kref_read(&(obj)->elem.ref_cnt) +#define rxe_read(obj) kref_read(&(obj)->elem.ref_cnt) #endif /* RXE_POOL_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 26d461a8d71c..62acf890af6c 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -323,11 +323,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; - rxe_add_ref(pd); - rxe_add_ref(rcq); - rxe_add_ref(scq); + rxe_get(pd); + rxe_get(rcq); + rxe_get(scq); if (srq) - rxe_add_ref(srq); + rxe_get(srq); qp->pd = pd; qp->rcq = rcq; @@ -359,10 +359,10 @@ err1: qp->srq = NULL; if (srq) - rxe_drop_ref(srq); - rxe_drop_ref(scq); - rxe_drop_ref(rcq); - rxe_drop_ref(pd); + rxe_put(srq); + rxe_put(scq); + rxe_put(rcq); + rxe_put(pd); return err; } @@ -521,7 +521,7 @@ static void rxe_qp_reset(struct rxe_qp *qp) qp->resp.sent_psn_nak = 0; if (qp->resp.mr) { - rxe_drop_ref(qp->resp.mr); + rxe_put(qp->resp.mr); qp->resp.mr = NULL; } @@ -809,20 +809,20 @@ static void rxe_qp_do_cleanup(struct work_struct *work) rxe_queue_cleanup(qp->sq.queue); if (qp->srq) - rxe_drop_ref(qp->srq); + rxe_put(qp->srq); if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); if (qp->scq) - rxe_drop_ref(qp->scq); + rxe_put(qp->scq); if (qp->rcq) - rxe_drop_ref(qp->rcq); + rxe_put(qp->rcq); if (qp->pd) - rxe_drop_ref(qp->pd); + rxe_put(qp->pd); if (qp->resp.mr) - rxe_drop_ref(qp->resp.mr); + rxe_put(qp->resp.mr); if (qp_type(qp) == IB_QPT_RC) sk_dst_reset(qp->sk->sk); diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index 53924453abef..d09a8b68c962 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -217,7 +217,7 @@ static int hdr_check(struct rxe_pkt_info *pkt) return 0; err2: - rxe_drop_ref(qp); + rxe_put(qp); err1: return -EINVAL; } @@ -288,11 +288,11 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) cpkt = SKB_TO_PKT(cskb); cpkt->qp = qp; - rxe_add_ref(qp); + rxe_get(qp); rxe_rcv_pkt(cpkt, cskb); } else { pkt->qp = qp; - rxe_add_ref(qp); + rxe_get(qp); rxe_rcv_pkt(pkt, skb); skb = NULL; /* mark consumed */ } @@ -397,7 +397,7 @@ void rxe_rcv(struct sk_buff *skb) drop: if (pkt->qp) - rxe_drop_ref(pkt->qp); + rxe_put(pkt->qp); kfree_skb(skb); ib_device_put(&rxe->ib_dev); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 3520d55a124b..ae5fbc79dd5c 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -611,7 +611,7 @@ int rxe_requester(void *arg) struct rxe_ah *ah; struct rxe_av *av; - rxe_add_ref(qp); + rxe_get(qp); next_wqe: if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) @@ -690,7 +690,7 @@ next_wqe: wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; __rxe_do_task(&qp->comp.task); - rxe_drop_ref(qp); + rxe_put(qp); return 0; } payload = mtu; @@ -729,7 +729,7 @@ next_wqe: } if (ah) - rxe_drop_ref(ah); + rxe_put(ah); /* * To prevent a race on wqe access between requester and completer, @@ -761,12 +761,12 @@ next_wqe: err_drop_ah: if (ah) - rxe_drop_ref(ah); + rxe_put(ah); err: wqe->state = wqe_state_error; __rxe_do_task(&qp->comp.task); exit: - rxe_drop_ref(qp); + rxe_put(qp); return -EAGAIN; } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index b1ec003f0bb8..16fc7ea1298d 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -99,7 +99,7 @@ static inline enum resp_states get_req(struct rxe_qp *qp, if (qp->resp.state == QP_STATE_ERROR) { while ((skb = skb_dequeue(&qp->req_pkts))) { - rxe_drop_ref(qp); + rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } @@ -464,8 +464,8 @@ static enum resp_states check_rkey(struct rxe_qp *qp, if (mw->access & IB_ZERO_BASED) qp->resp.offset = mw->addr; - rxe_drop_ref(mw); - rxe_add_ref(mr); + rxe_put(mw); + rxe_get(mr); } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { @@ -508,9 +508,9 @@ static enum resp_states check_rkey(struct rxe_qp *qp, err: if (mr) - rxe_drop_ref(mr); + rxe_put(mr); if (mw) - rxe_drop_ref(mw); + rxe_put(mw); return state; } @@ -694,12 +694,12 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) return NULL; if (mw->state != RXE_MW_STATE_VALID) { - rxe_drop_ref(mw); + rxe_put(mw); return NULL; } mr = mw->mr; - rxe_drop_ref(mw); + rxe_put(mw); } else { mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); if (!mr || mr->rkey != rkey) @@ -707,7 +707,7 @@ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) } if (mr->state != RXE_MR_STATE_VALID) { - rxe_drop_ref(mr); + rxe_put(mr); return NULL; } @@ -768,7 +768,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, if (err) pr_err("Failed copying memory\n"); if (mr) - rxe_drop_ref(mr); + rxe_put(mr); if (bth_pad(&ack_pkt)) { u8 *pad = payload_addr(&ack_pkt) + payload; @@ -1037,7 +1037,7 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, rc = rxe_xmit_packet(qp, &ack_pkt, skb); if (rc) { pr_err_ratelimited("Failed sending ack\n"); - rxe_drop_ref(qp); + rxe_put(qp); } out: return rc; @@ -1066,13 +1066,13 @@ static enum resp_states cleanup(struct rxe_qp *qp, if (pkt) { skb = skb_dequeue(&qp->req_pkts); - rxe_drop_ref(qp); + rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } if (qp->resp.mr) { - rxe_drop_ref(qp->resp.mr); + rxe_put(qp->resp.mr); qp->resp.mr = NULL; } @@ -1216,7 +1216,7 @@ static enum resp_states do_class_d1e_error(struct rxe_qp *qp) } if (qp->resp.mr) { - rxe_drop_ref(qp->resp.mr); + rxe_put(qp->resp.mr); qp->resp.mr = NULL; } @@ -1230,7 +1230,7 @@ static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) struct rxe_queue *q = qp->rq.queue; while ((skb = skb_dequeue(&qp->req_pkts))) { - rxe_drop_ref(qp); + rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } @@ -1250,7 +1250,7 @@ int rxe_responder(void *arg) struct rxe_pkt_info *pkt = NULL; int ret = 0; - rxe_add_ref(qp); + rxe_get(qp); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; @@ -1437,6 +1437,6 @@ int rxe_responder(void *arg) exit: ret = -EAGAIN; done: - rxe_drop_ref(qp); + rxe_put(qp); return ret; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f0c5715ac500..67184b0281a0 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -115,7 +115,7 @@ static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); - rxe_drop_ref(uc); + rxe_put(uc); } static int rxe_port_immutable(struct ib_device *dev, u32 port_num, @@ -149,7 +149,7 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); - rxe_drop_ref(pd); + rxe_put(pd); return 0; } @@ -188,7 +188,7 @@ static int rxe_create_ah(struct ib_ah *ibah, err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { - rxe_drop_ref(ah); + rxe_put(ah); return -EFAULT; } } else if (ah->is_user) { @@ -228,7 +228,7 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); - rxe_drop_ref(ah); + rxe_put(ah); return 0; } @@ -303,7 +303,7 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, if (err) goto err1; - rxe_add_ref(pd); + rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); @@ -313,8 +313,8 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, return 0; err2: - rxe_drop_ref(pd); - rxe_drop_ref(srq); + rxe_put(pd); + rxe_put(srq); err1: return err; } @@ -371,8 +371,8 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) if (srq->rq.queue) rxe_queue_cleanup(srq->rq.queue); - rxe_drop_ref(srq->pd); - rxe_drop_ref(srq); + rxe_put(srq->pd); + rxe_put(srq); return 0; } @@ -442,7 +442,7 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, return 0; qp_init: - rxe_drop_ref(qp); + rxe_put(qp); return err; } @@ -496,7 +496,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return ret; rxe_qp_destroy(qp); - rxe_drop_ref(qp); + rxe_put(qp); return 0; } @@ -809,7 +809,7 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) rxe_cq_disable(cq); - rxe_drop_ref(cq); + rxe_put(cq); return 0; } @@ -902,7 +902,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) if (!mr) return ERR_PTR(-ENOMEM); - rxe_add_ref(pd); + rxe_get(pd); rxe_mr_init_dma(pd, access, mr); return &mr->ibmr; @@ -926,7 +926,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, } - rxe_add_ref(pd); + rxe_get(pd); err = rxe_mr_init_user(pd, start, length, iova, access, mr); if (err) @@ -935,8 +935,8 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, return &mr->ibmr; err3: - rxe_drop_ref(pd); - rxe_drop_ref(mr); + rxe_put(pd); + rxe_put(mr); err2: return ERR_PTR(err); } @@ -958,7 +958,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, goto err1; } - rxe_add_ref(pd); + rxe_get(pd); err = rxe_mr_init_fast(pd, max_num_sg, mr); if (err) @@ -967,8 +967,8 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, return &mr->ibmr; err2: - rxe_drop_ref(pd); - rxe_drop_ref(mr); + rxe_put(pd); + rxe_put(mr); err1: return ERR_PTR(err); } From 7f68d7493ff07a0ad63f63c4a1a4e0781cec0dd2 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 8 Mar 2022 16:55:43 +0200 Subject: [PATCH 166/186] IB/iser: Remove iser_reg_data_sg helper function Open coding it makes the code more readable and simple. Link: https://lore.kernel.org/r/20220308145546.8372-2-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Signed-off-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_memory.c | 31 +++++++---------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index f1fd05d8609d..1317e18cfaa8 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -326,40 +326,26 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, return 0; } -static int iser_reg_data_sg(struct iscsi_iser_task *task, - struct iser_data_buf *mem, - struct iser_fr_desc *desc, bool use_dma_key, - struct iser_mem_reg *reg) -{ - struct iser_device *device = task->iser_conn->ib_conn.device; - - if (use_dma_key) - return iser_reg_dma(device, mem, reg); - - return iser_fast_reg_mr(task, mem, &desc->rsc, reg); -} - int iser_reg_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir dir, bool all_imm) { struct ib_conn *ib_conn = &task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; struct iser_data_buf *mem = &task->data[dir]; struct iser_mem_reg *reg = &task->rdma_reg[dir]; - struct iser_fr_desc *desc = NULL; + struct iser_fr_desc *desc; bool use_dma_key; int err; use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; + if (use_dma_key) + return iser_reg_dma(device, mem, reg); - if (!use_dma_key) { - desc = iser_reg_desc_get_fr(ib_conn); - reg->mem_h = desc; - } - + desc = iser_reg_desc_get_fr(ib_conn); if (scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL) { - err = iser_reg_data_sg(task, mem, desc, use_dma_key, reg); + err = iser_fast_reg_mr(task, mem, &desc->rsc, reg); if (unlikely(err)) goto err_reg; } else { @@ -371,11 +357,12 @@ int iser_reg_mem_fastreg(struct iscsi_iser_task *task, desc->sig_protected = true; } + reg->mem_h = desc; + return 0; err_reg: - if (desc) - iser_reg_desc_put_fr(ib_conn, desc); + iser_reg_desc_put_fr(ib_conn, desc); return err; } From ee4efeaea8837bb7018d188a9eb8837c7ff79561 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 8 Mar 2022 16:55:44 +0200 Subject: [PATCH 167/186] IB/iser: Use iser_fr_desc as registration context After removing the FMR support in iSER, there is only one type of registration context. Replace the void pointer with the explicit structure for registration (struct iser_fr_desc). Link: https://lore.kernel.org/r/20220308145546.8372-3-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Signed-off-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.h | 8 ++++---- drivers/infiniband/ulp/iser/iser_initiator.c | 4 ++-- drivers/infiniband/ulp/iser/iser_memory.c | 8 ++++---- drivers/infiniband/ulp/iser/iser_verbs.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 20af46c4e954..23b922233006 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -203,12 +203,12 @@ struct iser_reg_resources; * * @sge: memory region sg element * @rkey: memory region remote key - * @mem_h: pointer to registration context (FMR/Fastreg) + * @desc: pointer to fast registration context */ struct iser_mem_reg { - struct ib_sge sge; - u32 rkey; - void *mem_h; + struct ib_sge sge; + u32 rkey; + struct iser_fr_desc *desc; }; enum iser_desc_type { diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 2490150d3085..012decf6905a 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -619,13 +619,13 @@ static int iser_check_remote_inv(struct iser_conn *iser_conn, struct ib_wc *wc, struct iser_fr_desc *desc; if (iser_task->dir[ISER_DIR_IN]) { - desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; + desc = iser_task->rdma_reg[ISER_DIR_IN].desc; if (unlikely(iser_inv_desc(desc, rkey))) return -EINVAL; } if (iser_task->dir[ISER_DIR_OUT]) { - desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; + desc = iser_task->rdma_reg[ISER_DIR_OUT].desc; if (unlikely(iser_inv_desc(desc, rkey))) return -EINVAL; } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 1317e18cfaa8..7ec63f4a4663 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -129,7 +129,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, struct iser_fr_desc *desc; struct ib_mr_status mr_status; - desc = reg->mem_h; + desc = reg->desc; if (!desc) return; @@ -146,8 +146,8 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); } - iser_reg_desc_put_fr(&iser_task->iser_conn->ib_conn, reg->mem_h); - reg->mem_h = NULL; + iser_reg_desc_put_fr(&iser_task->iser_conn->ib_conn, reg->desc); + reg->desc = NULL; } static void iser_set_dif_domain(struct scsi_cmnd *sc, @@ -357,7 +357,7 @@ int iser_reg_mem_fastreg(struct iscsi_iser_task *task, desc->sig_protected = true; } - reg->mem_h = desc; + reg->desc = desc; return 0; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 8893bc27d8f5..5dbad68c7390 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -904,7 +904,7 @@ u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; - struct iser_fr_desc *desc = reg->mem_h; + struct iser_fr_desc *desc = reg->desc; unsigned long sector_size = iser_task->sc->device->sector_size; struct ib_mr_status mr_status; int ret; From 80303ee244907e720544da83e10fd0552875a6f0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 8 Mar 2022 16:55:45 +0200 Subject: [PATCH 168/186] IB/iser: Generalize map/unmap dma tasks Avoid code duplication and add the mapping/unmapping of the protection buffers to the iser_dma_map_task_data/iser_dma_unmap_task_data functions. Link: https://lore.kernel.org/r/20220308145546.8372-4-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Signed-off-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.h | 5 +-- drivers/infiniband/ulp/iser/iser_initiator.c | 40 +------------------- drivers/infiniband/ulp/iser/iser_memory.c | 31 +++++++++++++-- 3 files changed, 31 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 23b922233006..7e4faf9c5e9e 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -531,13 +531,12 @@ int iser_post_recvm(struct iser_conn *iser_conn, int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc); int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir); void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, - enum dma_data_direction dir); + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir); int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 012decf6905a..dbc2c268bc0e 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -52,26 +52,13 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) struct iser_mem_reg *mem_reg; int err; struct iser_ctrl *hdr = &iser_task->desc.iser_header; - struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; err = iser_dma_map_task_data(iser_task, - buf_in, ISER_DIR_IN, DMA_FROM_DEVICE); if (err) return err; - if (scsi_prot_sg_count(iser_task->sc)) { - struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN]; - - err = iser_dma_map_task_data(iser_task, - pbuf_in, - ISER_DIR_IN, - DMA_FROM_DEVICE); - if (err) - return err; - } - err = iser_reg_mem_fastreg(iser_task, ISER_DIR_IN, false); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); @@ -106,23 +93,11 @@ static int iser_prepare_write_cmd(struct iscsi_task *task, unsigned int imm_sz, struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; err = iser_dma_map_task_data(iser_task, - buf_out, ISER_DIR_OUT, DMA_TO_DEVICE); if (err) return err; - if (scsi_prot_sg_count(iser_task->sc)) { - struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT]; - - err = iser_dma_map_task_data(iser_task, - pbuf_out, - ISER_DIR_OUT, - DMA_TO_DEVICE); - if (err) - return err; - } - err = iser_reg_mem_fastreg(iser_task, ISER_DIR_OUT, buf_out->data_len == imm_sz); if (err != 0) { @@ -740,27 +715,16 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { - int prot_count = scsi_prot_sg_count(iser_task->sc); if (iser_task->dir[ISER_DIR_IN]) { iser_unreg_mem_fastreg(iser_task, ISER_DIR_IN); - iser_dma_unmap_task_data(iser_task, - &iser_task->data[ISER_DIR_IN], + iser_dma_unmap_task_data(iser_task, ISER_DIR_IN, DMA_FROM_DEVICE); - if (prot_count) - iser_dma_unmap_task_data(iser_task, - &iser_task->prot[ISER_DIR_IN], - DMA_FROM_DEVICE); } if (iser_task->dir[ISER_DIR_OUT]) { iser_unreg_mem_fastreg(iser_task, ISER_DIR_OUT); - iser_dma_unmap_task_data(iser_task, - &iser_task->data[ISER_DIR_OUT], + iser_dma_unmap_task_data(iser_task, ISER_DIR_OUT, DMA_TO_DEVICE); - if (prot_count) - iser_dma_unmap_task_data(iser_task, - &iser_task->prot[ISER_DIR_OUT], - DMA_TO_DEVICE); } } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 7ec63f4a4663..29ae2c6a250a 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -70,10 +70,10 @@ static void iser_reg_desc_put_fr(struct ib_conn *ib_conn, } int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir) { + struct iser_data_buf *data = &iser_task->data[iser_dir]; struct ib_device *dev; iser_task->dir[iser_dir] = 1; @@ -84,17 +84,40 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, iser_err("dma_map_sg failed!!!\n"); return -EINVAL; } + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pdata = &iser_task->prot[iser_dir]; + + pdata->dma_nents = ib_dma_map_sg(dev, pdata->sg, pdata->size, dma_dir); + if (unlikely(pdata->dma_nents == 0)) { + iser_err("protection dma_map_sg failed!!!\n"); + goto out_unmap; + } + } + return 0; + +out_unmap: + ib_dma_unmap_sg(dev, data->sg, data->size, dma_dir); + return -EINVAL; } + void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, - enum dma_data_direction dir) + enum iser_data_dir iser_dir, + enum dma_data_direction dma_dir) { + struct iser_data_buf *data = &iser_task->data[iser_dir]; struct ib_device *dev; dev = iser_task->iser_conn->ib_conn.device->ib_device; - ib_dma_unmap_sg(dev, data->sg, data->size, dir); + ib_dma_unmap_sg(dev, data->sg, data->size, dma_dir); + + if (scsi_prot_sg_count(iser_task->sc)) { + struct iser_data_buf *pdata = &iser_task->prot[iser_dir]; + + ib_dma_unmap_sg(dev, pdata->sg, pdata->size, dma_dir); + } } static int iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, From 2e11a5e459c1c4f1d27430d2707ec7ef77f371ca Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 8 Mar 2022 16:55:46 +0200 Subject: [PATCH 169/186] IB/iser: Fix error flow in case of registration failure During READ/WRITE preparation, in case of failure in memory registration using iser_reg_mem_fastreg we must unmap previously mapped iser task. Link: https://lore.kernel.org/r/20220308145546.8372-5-mgurtovoy@nvidia.com Reviewed-by: Sergey Gorenko Signed-off-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_initiator.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index dbc2c268bc0e..bd5f3b5e1727 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -62,7 +62,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) err = iser_reg_mem_fastreg(iser_task, ISER_DIR_IN, false); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); - return err; + goto out_err; } mem_reg = &iser_task->rdma_reg[ISER_DIR_IN]; @@ -75,6 +75,10 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) (unsigned long long)mem_reg->sge.addr); return 0; + +out_err: + iser_dma_unmap_task_data(iser_task, ISER_DIR_IN, DMA_FROM_DEVICE); + return err; } /* Register user buffer memory and initialize passive rdma @@ -100,9 +104,9 @@ static int iser_prepare_write_cmd(struct iscsi_task *task, unsigned int imm_sz, err = iser_reg_mem_fastreg(iser_task, ISER_DIR_OUT, buf_out->data_len == imm_sz); - if (err != 0) { + if (err) { iser_err("Failed to register write cmd RDMA mem\n"); - return err; + goto out_err; } mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; @@ -129,6 +133,10 @@ static int iser_prepare_write_cmd(struct iscsi_task *task, unsigned int imm_sz, } return 0; + +out_err: + iser_dma_unmap_task_data(iser_task, ISER_DIR_OUT, DMA_TO_DEVICE); + return err; } /* creates a new tx descriptor and adds header regd buffer */ From 87e0eacb176f9500c2063d140c0a1d7fa51ab8a5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Mar 2022 11:39:48 +0300 Subject: [PATCH 170/186] RDMA/nldev: Prevent underflow in nldev_stat_set_counter_dynamic_doit() This code checks "index" for an upper bound but it does not check for negatives. Change the type to unsigned to prevent underflows. Fixes: 3c3c1f141639 ("RDMA/nldev: Allow optional-counter status configuration through RDMA netlink") Link: https://lore.kernel.org/r/20220316083948.GC30941@kili Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index f5aacaf7fb8e..ca24ce34da76 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1951,9 +1951,10 @@ static int nldev_stat_set_counter_dynamic_doit(struct nlattr *tb[], u32 port) { struct rdma_hw_stats *stats; - int rem, i, index, ret = 0; struct nlattr *entry_attr; unsigned long *target; + int rem, i, ret = 0; + u32 index; stats = ib_get_hw_stats_port(device, port); if (!stats) From a9a4bc8c76d747aa40b30e2dfc176c781f353a08 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:10 -0700 Subject: [PATCH 171/186] xfs: log worker needs to start before intent/unlink recovery After 963 iterations of generic/530, it deadlocked during recovery on a pinned inode cluster buffer like so: XFS (pmem1): Starting recovery (logdev: internal) INFO: task kworker/8:0:306037 blocked for more than 122 seconds. Not tainted 5.17.0-rc6-dgc+ #975 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. task:kworker/8:0 state:D stack:13024 pid:306037 ppid: 2 flags:0x00004000 Workqueue: xfs-inodegc/pmem1 xfs_inodegc_worker Call Trace: __schedule+0x30d/0x9e0 schedule+0x55/0xd0 schedule_timeout+0x114/0x160 __down+0x99/0xf0 down+0x5e/0x70 xfs_buf_lock+0x36/0xf0 xfs_buf_find+0x418/0x850 xfs_buf_get_map+0x47/0x380 xfs_buf_read_map+0x54/0x240 xfs_trans_read_buf_map+0x1bd/0x490 xfs_imap_to_bp+0x4f/0x70 xfs_iunlink_map_ino+0x66/0xd0 xfs_iunlink_map_prev.constprop.0+0x148/0x2f0 xfs_iunlink_remove_inode+0xf2/0x1d0 xfs_inactive_ifree+0x1a3/0x900 xfs_inode_unlink+0xcc/0x210 xfs_inodegc_worker+0x1ac/0x2f0 process_one_work+0x1ac/0x390 worker_thread+0x56/0x3c0 kthread+0xf6/0x120 ret_from_fork+0x1f/0x30 task:mount state:D stack:13248 pid:324509 ppid:324233 flags:0x00004000 Call Trace: __schedule+0x30d/0x9e0 schedule+0x55/0xd0 schedule_timeout+0x114/0x160 __down+0x99/0xf0 down+0x5e/0x70 xfs_buf_lock+0x36/0xf0 xfs_buf_find+0x418/0x850 xfs_buf_get_map+0x47/0x380 xfs_buf_read_map+0x54/0x240 xfs_trans_read_buf_map+0x1bd/0x490 xfs_imap_to_bp+0x4f/0x70 xfs_iget+0x300/0xb40 xlog_recover_process_one_iunlink+0x4c/0x170 xlog_recover_process_iunlinks.isra.0+0xee/0x130 xlog_recover_finish+0x57/0x110 xfs_log_mount_finish+0xfc/0x1e0 xfs_mountfs+0x540/0x910 xfs_fs_fill_super+0x495/0x850 get_tree_bdev+0x171/0x270 xfs_fs_get_tree+0x15/0x20 vfs_get_tree+0x24/0xc0 path_mount+0x304/0xba0 __x64_sys_mount+0x108/0x140 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae task:xfsaild/pmem1 state:D stack:14544 pid:324525 ppid: 2 flags:0x00004000 Call Trace: __schedule+0x30d/0x9e0 schedule+0x55/0xd0 io_schedule+0x4b/0x80 xfs_buf_wait_unpin+0x9e/0xf0 __xfs_buf_submit+0x14a/0x230 xfs_buf_delwri_submit_buffers+0x107/0x280 xfs_buf_delwri_submit_nowait+0x10/0x20 xfsaild+0x27e/0x9d0 kthread+0xf6/0x120 ret_from_fork+0x1f/0x30 We have the mount process waiting on an inode cluster buffer read, inodegc doing unlink waiting on the same inode cluster buffer, and the AIL push thread blocked in writeback waiting for the inode cluster buffer to become unpinned. What has happened here is that the AIL push thread has raced with the inodegc process modifying, committing and pinning the inode cluster buffer here in xfs_buf_delwri_submit_buffers() here: blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; } Here >>>>>> if (!xfs_buf_trylock(bp)) continue; Basically, the AIL has found the buffer wasn't pinned and got the lock without blocking, but then the buffer was pinned. This implies the processing here was pre-empted between the pin check and the lock, because the pin count can only be increased while holding the buffer locked. Hence when it has gone to submit the IO, it has blocked waiting for the buffer to be unpinned. With all executing threads now waiting on the buffer to be unpinned, we normally get out of situations like this via the background log worker issuing a log force which will unpinned stuck buffers like this. But at this point in recovery, we haven't started the log worker. In fact, the first thing we do after processing intents and unlinked inodes is *start the log worker*. IOWs, we start it too late to have it break deadlocks like this. Avoid this and any other similar deadlock vectors in intent and unlinked inode recovery by starting the log worker before we recover intents and unlinked inodes. This part of recovery runs as though the filesystem is fully active, so we really should have the same infrastructure running as we normally do at runtime. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Chandan Babu R Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 89fec9a18c34..ffd928cf9a9a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -812,10 +812,9 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; + xfs_log_work_queue(mp); if (xlog_recovery_needed(log)) error = xlog_recover_finish(log); - if (!error) - xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); From dbd0f5299302f8506637592e2373891a748c6990 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:10 -0700 Subject: [PATCH 172/186] xfs: check buffer pin state after locking in delwri_submit AIL flushing can get stuck here: [316649.005769] INFO: task xfsaild/pmem1:324525 blocked for more than 123 seconds. [316649.007807] Not tainted 5.17.0-rc6-dgc+ #975 [316649.009186] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [316649.011720] task:xfsaild/pmem1 state:D stack:14544 pid:324525 ppid: 2 flags:0x00004000 [316649.014112] Call Trace: [316649.014841] [316649.015492] __schedule+0x30d/0x9e0 [316649.017745] schedule+0x55/0xd0 [316649.018681] io_schedule+0x4b/0x80 [316649.019683] xfs_buf_wait_unpin+0x9e/0xf0 [316649.021850] __xfs_buf_submit+0x14a/0x230 [316649.023033] xfs_buf_delwri_submit_buffers+0x107/0x280 [316649.024511] xfs_buf_delwri_submit_nowait+0x10/0x20 [316649.025931] xfsaild+0x27e/0x9d0 [316649.028283] kthread+0xf6/0x120 [316649.030602] ret_from_fork+0x1f/0x30 in the situation where flushing gets preempted between the unpin check and the buffer trylock under nowait conditions: blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; } Here >>>>>> if (!xfs_buf_trylock(bp)) continue; This means submission is stuck until something else triggers a log force to unpin the buffer. To get onto the delwri list to begin with, the buffer pin state has already been checked, and hence it's relatively rare we get a race between flushing and encountering a pinned buffer in delwri submission to begin with. Further, to increase the pin count the buffer has to be locked, so the only way we can hit this race without failing the trylock is to be preempted between the pincount check seeing zero and the trylock being run. Hence to avoid this problem, just invert the order of trylock vs pin check. We shouldn't hit that many pinned buffers here, so optimising away the trylock for pinned buffers should not matter for performance at all. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b45e0d50a405..8867f143598e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2094,12 +2094,13 @@ xfs_buf_delwri_submit_buffers( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { if (!wait_list) { + if (!xfs_buf_trylock(bp)) + continue; if (xfs_buf_ispinned(bp)) { + xfs_buf_unlock(bp); pinned++; continue; } - if (!xfs_buf_trylock(bp)) - continue; } else { xfs_buf_lock(bp); } From 941fbdfd6dd0f1d7961c28123b5460912f678cb5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:11 -0700 Subject: [PATCH 173/186] xfs: xfs_ail_push_all_sync() stalls when racing with updates xfs_ail_push_all_sync() has a loop like this: while max_ail_lsn { prepare_to_wait(ail_empty) target = max_ail_lsn wake_up(ail_task); schedule() } Which is designed to sleep until the AIL is emptied. When xfs_ail_update_finish() moves the tail of the log, it does: if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); So it will only wake up the sync push waiter when the AIL goes empty. If, by the time the push waiter has woken, the AIL has more in it, it will reset the target, wake the push task and go back to sleep. The problem here is that if the AIL is having items added to it when xfs_ail_push_all_sync() is called, then they may get inserted into the AIL at a LSN higher than the target LSN. At this point, xfsaild_push() will see that the target is X, the item LSNs are (X+N) and skip over them, hence never pushing the out. The result of this the AIL will not get emptied by the AIL push thread, hence xfs_ail_finish_update() will never see the AIL being empty even if it moves the tail. Hence xfs_ail_push_all_sync() never gets woken and hence cannot update the push target to capture the items beyond the current target on the LSN. This is a TOCTOU type of issue so the way to avoid it is to not use the push target at all for sync pushes. We know that a sync push is being requested by the fact the ail_empty wait queue is active, hence the xfsaild can just set the target to max_ail_lsn on every push that we see the wait queue active. Hence we no longer will leave items on the AIL that are beyond the LSN sampled at the start of a sync push. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans_ail.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 2a8c8dc54c95..1b52952097c1 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -448,10 +448,22 @@ xfsaild_push( spin_lock(&ailp->ail_lock); - /* barrier matches the ail_target update in xfs_ail_push() */ - smp_rmb(); - target = ailp->ail_target; - ailp->ail_target_prev = target; + /* + * If we have a sync push waiter, we always have to push till the AIL is + * empty. Update the target to point to the end of the AIL so that + * capture updates that occur after the sync push waiter has gone to + * sleep. + */ + if (waitqueue_active(&ailp->ail_empty)) { + lip = xfs_ail_max(ailp); + if (lip) + target = lip->li_lsn; + } else { + /* barrier matches the ail_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->ail_target; + ailp->ail_target_prev = target; + } /* we're done if the AIL is empty or our push has reached the end */ lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); @@ -724,7 +736,6 @@ xfs_ail_push_all_sync( spin_lock(&ailp->ail_lock); while ((lip = xfs_ail_max(ailp)) != NULL) { prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->ail_target = lip->li_lsn; wake_up_process(ailp->ail_task); spin_unlock(&ailp->ail_lock); schedule(); From 70447e0ad9781f84e60e0990888bd8c84987f44e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:11 -0700 Subject: [PATCH 174/186] xfs: async CIL flushes need pending pushes to be made stable When the AIL tries to flush the CIL, it relies on the CIL push ending up on stable storage without having to wait for and manipulate iclog state directly. However, if there is already a pending CIL push when the AIL tries to flush the CIL, it won't set the cil->xc_push_commit_stable flag and so the CIL push will not actively flush the commit record iclog. generic/530 when run on a single CPU test VM can trigger this fairly reliably. This test exercises unlinked inode recovery, and can result in inodes being pinned in memory by ongoing modifications to the inode cluster buffer to record unlinked list modifications. As a result, the first inode unlinked in a buffer can pin the tail of the log whilst the inode cluster buffer is pinned by the current checkpoint that has been pushed but isn't on stable storage because because the cil->xc_push_commit_stable was not set. This results in the log/AIL effectively deadlocking until something triggers the commit record iclog to be pushed to stable storage (i.e. the periodic log worker calling xfs_log_force()). The fix is two-fold - first we should always set the cil->xc_push_commit_stable when xlog_cil_flush() is called, regardless of whether there is already a pending push or not. Second, if the CIL is empty, we should trigger an iclog flush to ensure that the iclogs of the last checkpoint have actually been submitted to disk as that checkpoint may not have been run under stable completion constraints. Reported-and-tested-by: Matthew Wilcox Fixes: 0020a190cf3e ("xfs: AIL needs asynchronous CIL forcing") Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 83a039762b81..25a86e35b4fe 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1243,18 +1243,27 @@ xlog_cil_push_now( if (!async) flush_workqueue(cil->xc_push_wq); + spin_lock(&cil->xc_push_lock); + + /* + * If this is an async flush request, we always need to set the + * xc_push_commit_stable flag even if something else has already queued + * a push. The flush caller is asking for the CIL to be on stable + * storage when the next push completes, so regardless of who has queued + * the push, the flush requires stable semantics from it. + */ + cil->xc_push_commit_stable = async; + /* * If the CIL is empty or we've already pushed the sequence then - * there's no work we need to do. + * there's no more work that we need to do. */ - spin_lock(&cil->xc_push_lock); if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; - cil->xc_push_commit_stable = async; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1352,6 +1361,13 @@ xlog_cil_flush( trace_xfs_log_force(log->l_mp, seq, _RET_IP_); xlog_cil_push_now(log, seq, true); + + /* + * If the CIL is empty, make sure that any previous checkpoint that may + * still be in an active iclog is pushed to stable storage. + */ + if (list_empty(&log->l_cilp->xc_cil)) + xfs_log_force(log->l_mp, 0); } /* From d86142dd7c4e10e50bdb3679b405d748214b2c28 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:12 -0700 Subject: [PATCH 175/186] xfs: log items should have a xlog pointer, not a mount Log items belong to the log, not the xfs_mount. Convert the mount pointer in the log item to a xlog pointer in preparation for upcoming log centric changes to the log items. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_buf_item.c | 5 +++-- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- fs/xfs/xfs_trace.h | 4 ++-- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans.h | 3 ++- 10 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e1f4d7d5a011..761dde155099 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -463,7 +463,7 @@ xfs_bui_item_recover( struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_filblks_t count; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a7a8e4528881..522d450a94b1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -21,6 +21,7 @@ #include "xfs_dquot.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_buf_item_cache; @@ -428,7 +429,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_has_v3inodes(lip->li_mountp) || + if (xfs_has_v3inodes(lip->li_log->l_mp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -616,7 +617,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xfs_is_shutdown(lip->li_mountp); + xlog_is_shutdown(lip->li_log); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 47ef9c9c5c17..0e50f2c9348e 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -604,7 +604,7 @@ xfs_efi_item_recover( struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ffd928cf9a9a..5010ce712a3e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1101,7 +1101,7 @@ xfs_log_item_init( int type, const struct xfs_item_ops *ops) { - item->li_mountp = mp; + item->li_log = mp->m_log; item->li_ailp = mp->m_ail; item->li_type = type; item->li_ops = ops; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 25a86e35b4fe..796e4464f809 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1484,7 +1484,7 @@ bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { - struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; + struct xfs_cil *cil = lip->li_log->l_cilp; if (list_empty(&lip->li_cil)) return false; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d3da67772d57..0d868c93144d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -457,7 +457,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; unsigned int refc_type; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c3966b4c58ef..a22b2d19ef91 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -510,7 +510,7 @@ xfs_rui_item_recover( struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = lip->li_mountp; + struct xfs_mount *mp = lip->li_log->l_mp; enum xfs_rmap_intent_type type; xfs_exntst_t state; int i; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 239c8b8a5a85..b141ef78c755 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(xfs_lsn_t, lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; @@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(xfs_lsn_t, new_lsn) ), TP_fast_assign( - __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->dev = lip->li_log->l_mp->m_super->s_dev; __entry->lip = lip; __entry->type = lip->li_type; __entry->flags = lip->li_flags; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3d11f9bb0dbb..50f22c34716a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -648,7 +648,7 @@ xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_log == tp->t_mountp->m_log); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); ASSERT(list_empty(&lip->li_trans)); ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index faa282204498..de177842b951 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -8,6 +8,7 @@ /* kernel only transaction subsystem defines */ +struct xlog; struct xfs_buf; struct xfs_buftarg; struct xfs_efd_log_item; @@ -31,7 +32,7 @@ struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xlog *li_log; struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ unsigned long li_flags; /* misc flags */ From 8eda87211097195d96d7d12be37dd39d6a7c8b80 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:12 -0700 Subject: [PATCH 176/186] xfs: AIL should be log centric The AIL operates purely on log items, so it is a log centric subsystem. Divorce it from the xfs_mount and instead have it pass around xlog pointers. Signed-off-by: Dave Chinner Reviewed-by: Chandan Babu R Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans_ail.c | 26 +++++++++++++------------- fs/xfs/xfs_trans_priv.h | 3 ++- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 50f22c34716a..917a69f0a6ff 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -775,7 +775,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(xfs_is_shutdown(ailp->ail_mount)); + ASSERT(xlog_is_shutdown(ailp->ail_log)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1b52952097c1..c2ccb98c7bcd 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -398,7 +398,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* @@ -418,7 +418,7 @@ static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->ail_mount; + struct xfs_mount *mp = ailp->ail_log->l_mp; struct xfs_ail_cursor cur; struct xfs_log_item *lip; xfs_lsn_t lsn; @@ -443,7 +443,7 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xlog_cil_flush(mp->m_log); + xlog_cil_flush(ailp->ail_log); } spin_lock(&ailp->ail_lock); @@ -632,7 +632,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - xfs_is_shutdown(ailp->ail_mount)); + xlog_is_shutdown(ailp->ail_log)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -695,7 +695,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || xfs_is_shutdown(ailp->ail_mount) || + if (!lip || xlog_is_shutdown(ailp->ail_log) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -751,7 +751,7 @@ xfs_ail_update_finish( struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->ail_mount; + struct xlog *log = ailp->ail_log; /* if the tail lsn hasn't changed, don't do updates or wakeups. */ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { @@ -759,13 +759,13 @@ xfs_ail_update_finish( return; } - if (!xfs_is_shutdown(mp)) - xlog_assign_tail_lsn_locked(mp); + if (!xlog_is_shutdown(log)) + xlog_assign_tail_lsn_locked(log->l_mp); if (list_empty(&ailp->ail_head)) wake_up_all(&ailp->ail_empty); spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(mp); + xfs_log_space_wake(log->l_mp); } /* @@ -873,13 +873,13 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_mount; + struct xfs_mount *mp = ailp->ail_log->l_mp; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xfs_is_shutdown(mp)) { + if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); @@ -904,7 +904,7 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->ail_mount = mp; + ailp->ail_log = mp->m_log; INIT_LIST_HEAD(&ailp->ail_head); INIT_LIST_HEAD(&ailp->ail_cursors); spin_lock_init(&ailp->ail_lock); @@ -912,7 +912,7 @@ xfs_trans_ail_init( init_waitqueue_head(&ailp->ail_empty); ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->ail_mount->m_super->s_id); + mp->m_super->s_id); if (IS_ERR(ailp->ail_task)) goto out_free_ailp; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 3004aeac9110..f0d79a9050ba 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -6,6 +6,7 @@ #ifndef __XFS_TRANS_PRIV_H__ #define __XFS_TRANS_PRIV_H__ +struct xlog; struct xfs_log_item; struct xfs_mount; struct xfs_trans; @@ -50,7 +51,7 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. */ struct xfs_ail { - struct xfs_mount *ail_mount; + struct xlog *ail_log; struct task_struct *ail_task; struct list_head ail_head; xfs_lsn_t ail_target; From 01728b44ef1b714756607be0210fbcf60c78efce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 17 Mar 2022 09:09:13 -0700 Subject: [PATCH 177/186] xfs: xfs_is_shutdown vs xlog_is_shutdown cage fight I've been chasing a recent resurgence in generic/388 recovery failure and/or corruption events. The events have largely been uninitialised inode chunks being tripped over in log recovery such as: XFS (pmem1): User initiated shutdown received. pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096 XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/xfs/xfs_fsops.c:500). Shutting down filesystem. XFS (pmem1): Please unmount the filesystem and rectify the problem(s) XFS (pmem1): Unmounting Filesystem XFS (pmem1): Mounting V5 Filesystem XFS (pmem1): Starting recovery (logdev: internal) XFS (pmem1): bad inode magic/vsn daddr 8723584 #0 (magic=1818) XFS (pmem1): Metadata corruption detected at xfs_inode_buf_verify+0x180/0x190, xfs_inode block 0x851c80 xfs_inode_buf_verify XFS (pmem1): Unmount and run xfs_repair XFS (pmem1): First 128 bytes of corrupted metadata buffer: 00000000: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000010: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000020: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000030: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000040: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000050: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000060: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ 00000070: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................ XFS (pmem1): metadata I/O error in "xlog_recover_items_pass2+0x52/0xc0" at daddr 0x851c80 len 32 error 117 XFS (pmem1): log mount/recovery failed: error -117 XFS (pmem1): log mount failed There have been isolated random other issues, too - xfs_repair fails because it finds some corruption in symlink blocks, rmap inconsistencies, etc - but they are nowhere near as common as the uninitialised inode chunk failure. The problem has clearly happened at runtime before recovery has run; I can see the ICREATE log item in the log shortly before the actively recovered range of the log. This means the ICREATE was definitely created and written to the log, but for some reason the tail of the log has been moved past the ordered buffer log item that tracks INODE_ALLOC buffers and, supposedly, prevents the tail of the log moving past the ICREATE log item before the inode chunk buffer is written to disk. Tracing the fsstress processes that are running when the filesystem shut down immediately pin-pointed the problem: user shutdown marks xfs_mount as shutdown godown-213341 [008] 6398.022871: console: [ 6397.915392] XFS (pmem1): User initiated shutdown received. ..... aild tries to push ordered inode cluster buffer xfsaild/pmem1-213314 [001] 6398.022974: xfs_buf_trylock: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 16 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_inode_item_push+0x8e xfsaild/pmem1-213314 [001] 6398.022976: xfs_ilock_nowait: dev 259:1 ino 0x851c80 flags ILOCK_SHARED caller xfs_iflush_cluster+0xae xfs_iflush_cluster() checks xfs_is_shutdown(), returns true, calls xfs_iflush_abort() to kill writeback of the inode. Inode is removed from AIL, drops cluster buffer reference. xfsaild/pmem1-213314 [001] 6398.022977: xfs_ail_delete: dev 259:1 lip 0xffff88880247ed80 old lsn 7/20344 new lsn 7/21000 type XFS_LI_INODE flags IN_AIL xfsaild/pmem1-213314 [001] 6398.022978: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 17 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_iflush_abort+0xd7 ..... All inodes on cluster buffer are aborted, then the cluster buffer itself is aborted and removed from the AIL *without writeback*: xfsaild/pmem1-213314 [001] 6398.023011: xfs_buf_error_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_ioend_fail+0x33 xfsaild/pmem1-213314 [001] 6398.023012: xfs_ail_delete: dev 259:1 lip 0xffff8888053efde8 old lsn 7/20344 new lsn 7/20344 type XFS_LI_BUF flags IN_AIL The inode buffer was at 7/20344 when it was removed from the AIL. xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_item_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_done+0x31 xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_relse+0x39 ..... Userspace is still running, doing stuff. an fsstress process runs syncfs() or sync() and we end up in sync_fs_one_sb() which issues a log force. This pushes on the CIL: fsstress-213322 [001] 6398.024430: xfs_fs_sync_fs: dev 259:1 m_features 0x20000000019ff6e9 opstate (clean|shutdown|inodegc|blockgc) s_flags 0x70810000 caller sync_fs_one_sb+0x26 fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x0 caller xfs_fs_sync_fs+0x82 fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x5f caller xfs_log_force+0x7c <...>-194402 [001] 6398.024467: kmem_alloc: size 176 flags 0x14 caller xlog_cil_push_work+0x9f And the CIL fills up iclogs with pending changes. This picks up the current tail from the AIL: <...>-194402 [001] 6398.024497: xlog_iclog_get_space: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x0 flags caller xlog_write+0x149 <...>-194402 [001] 6398.024498: xlog_iclog_switch: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x700005408 flags caller xlog_state_get_iclog_space+0x37e <...>-194402 [001] 6398.024521: xlog_iclog_release: dev 259:1 state XLOG_STATE_WANT_SYNC refcnt 1 offset 32256 lsn 0x700005408 flags caller xlog_write+0x5f9 <...>-194402 [001] 6398.024522: xfs_log_assign_tail_lsn: dev 259:1 new tail lsn 7/21000, old lsn 7/20344, last sync 7/21448 And it moves the tail of the log to 7/21000 from 7/20344. This *moves the tail of the log beyond the ICREATE transaction* that was at 7/20344 and pinned by the inode cluster buffer that was cancelled above. .... godown-213341 [008] 6398.027005: xfs_force_shutdown: dev 259:1 tag logerror flags log_io|force_umount file fs/xfs/xfs_fsops.c line_num 500 godown-213341 [008] 6398.027022: console: [ 6397.915406] pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096 godown-213341 [008] 6398.030551: console: [ 6397.919546] XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/ And finally the log itself is now shutdown, stopping all further writes to the log. But this is too late to prevent the corruption that moving the tail of the log forwards after we start cancelling writeback causes. The fundamental problem here is that we are using the wrong shutdown checks for log items. We've long conflated mount shutdown with log shutdown state, and I started separating that recently with the atomic shutdown state changes in commit b36d4651e165 ("xfs: make forced shutdown processing atomic"). The changes in that commit series are directly responsible for being able to diagnose this issue because it clearly separated mount shutdown from log shutdown. Essentially, once we start cancelling writeback of log items and removing them from the AIL because the filesystem is shut down, we *cannot* update the journal because we may have cancelled the items that pin the tail of the log. That moves the tail of the log forwards without having written the metadata back, hence we have corrupt in memory state and writing to the journal propagates that to the on-disk state. What commit b36d4651e165 makes clear is that log item state needs to change relative to log shutdown, not mount shutdown. IOWs, anything that aborts metadata writeback needs to check log shutdown state because log items directly affect log consistency. Having them check mount shutdown state introduces the above race condition where we cancel metadata writeback before the log shuts down. To fix this, this patch works through all log items and converts shutdown checks to use xlog_is_shutdown() rather than xfs_is_shutdown(), so that we don't start aborting metadata writeback before we shut off journal writes. AFAICT, this race condition is a zero day IO error handling bug in XFS that dates back to the introduction of XLOG_IO_ERROR, XLOG_STATE_IOERROR and XFS_FORCED_SHUTDOWN back in January 1997. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 40 ++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_icache.c | 10 +++++++++- fs/xfs/xfs_inode.c | 15 +++++++++++++-- fs/xfs/xfs_inode_item.c | 12 ++++++++++++ fs/xfs/xfs_qm.c | 8 ++++---- 5 files changed, 70 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8867f143598e..3617d9d2bc73 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -14,6 +14,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_log_recover.h" +#include "xfs_log_priv.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_errortag.h" @@ -813,7 +814,15 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!xfs_is_shutdown(target->bt_mount)) + /* + * Check against log shutdown for error reporting because + * metadata writeback may require a read first and we need to + * report errors in metadata writeback until the log is shut + * down. High level transaction read functions already check + * against mount shutdown, anyway, so we only need to be + * concerned about low level IO interactions here. + */ + if (!xlog_is_shutdown(target->bt_mount->m_log)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -1177,10 +1186,10 @@ xfs_buf_ioend_handle_error( struct xfs_error_cfg *cfg; /* - * If we've already decided to shutdown the filesystem because of I/O - * errors, there's no point in giving this a retry. + * If we've already shutdown the journal because of I/O errors, there's + * no point in giving this a retry. */ - if (xfs_is_shutdown(mp)) + if (xlog_is_shutdown(mp->m_log)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1593,8 +1602,23 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - /* on shutdown we stale and complete the buffer immediately */ - if (xfs_is_shutdown(bp->b_mount)) { + /* + * On log shutdown we stale and complete the buffer immediately. We can + * be called to read the superblock before the log has been set up, so + * be careful checking the log state. + * + * Checking the mount shutdown state here can result in the log tail + * moving inappropriately on disk as the log may not yet be shut down. + * i.e. failing this buffer on mount shutdown can remove it from the AIL + * and move the tail of the log forwards without having written this + * buffer to disk. This corrupts the log tail state in memory, and + * because the log may not be shut down yet, it can then be propagated + * to disk before the log is shutdown. Hence we check log shutdown + * state here rather than mount state to avoid corrupting the log tail + * on shutdown. + */ + if (bp->b_mount->m_log && + xlog_is_shutdown(bp->b_mount->m_log)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1808,10 +1832,10 @@ xfs_buftarg_drain( * If one or more failed buffers were freed, that means dirty metadata * was thrown away. This should only ever happen after I/O completion * handling has elevated I/O error(s) to permanent failures and shuts - * down the fs. + * down the journal. */ if (write_fail) { - ASSERT(xfs_is_shutdown(btp->bt_mount)); + ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9644f938990c..4148cdf7ce4a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -23,6 +23,7 @@ #include "xfs_reflink.h" #include "xfs_ialloc.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" #include @@ -873,7 +874,14 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (xfs_is_shutdown(ip->i_mount)) { + /* + * Check for log shutdown because aborting the inode can move the log + * tail and corrupt in memory state. This is fine if the log is shut + * down, but if the log is still active and only the mount is shut down + * then the in-memory log tail movement caused by the abort can be + * incorrectly propagated to disk. + */ + if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); goto reclaim; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 67ece991d3f5..26227d26f274 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -35,6 +35,7 @@ #include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_log_priv.h" struct kmem_cache *xfs_inode_cache; @@ -3678,7 +3679,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. */ - if (xfs_is_shutdown(mp)) { + if (xlog_is_shutdown(mp->m_log)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -3704,9 +3705,19 @@ xfs_iflush_cluster( } if (error) { + /* + * Shutdown first so we kill the log before we release this + * buffer. If it is an INODE_ALLOC buffer and pins the tail + * of the log, failing it before the _log_ is shut down can + * result in the log tail being moved forward in the journal + * on disk because log writes can still be taking place. Hence + * unpinning the tail will allow the ICREATE intent to be + * removed from the log an recovery will fail with uninitialised + * inode cluster buffers. + */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 90d8e591baf8..11158fa81a09 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,6 +17,7 @@ #include "xfs_trans_priv.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include @@ -720,6 +721,17 @@ xfs_iflush_ail_updates( if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn) continue; + /* + * dgc: Not sure how this happens, but it happens very + * occassionaly via generic/388. xfs_iflush_abort() also + * silently handles this same "under writeback but not in AIL at + * shutdown" condition via xfs_trans_ail_delete(). + */ + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { + ASSERT(xlog_is_shutdown(lip->li_log)); + continue; + } + lsn = xfs_ail_delete_one(ailp, lip); if (!tail_lsn && lsn) tail_lsn = lsn; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 32ac8d9c8940..f165d1a3de1d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -25,6 +25,7 @@ #include "xfs_error.h" #include "xfs_ag.h" #include "xfs_ialloc.h" +#include "xfs_log_priv.h" /* * The global quota manager. There is only one of these for the entire @@ -121,8 +122,7 @@ xfs_qm_dqpurge( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; int error = -EAGAIN; xfs_dqlock(dqp); @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(xfs_is_shutdown(mp) || + ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -172,7 +172,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(mp, xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; From 322794d3355c33adcc4feace0045d85a8e4ed813 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 2 Mar 2022 14:51:53 +0800 Subject: [PATCH 178/186] ceph: fix inode reference leakage in ceph_get_snapdir() The ceph_get_inode() will search for or insert a new inode into the hash for the given vino, and return a reference to it. If new is non-NULL, its reference is consumed. We should release the reference when in error handing cases. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7b1e93c8a0d2..460a4fbd7a65 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent) if (!S_ISDIR(parent->i_mode)) { pr_warn_once("bad snapdir parent type (mode=0%o)\n", parent->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { pr_warn_once("bad snapdir inode type (mode=0%o)\n", inode->i_mode); - return ERR_PTR(-ENOTDIR); + goto err; } inode->i_mode = parent->i_mode; @@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent) } return inode; +err: + if ((inode->i_state & I_NEW)) + discard_new_inode(inode); + else + iput(inode); + return ERR_PTR(-ENOTDIR); } const struct inode_operations ceph_file_iops = { From 1ad3bb28d336978e451980ca6c72be494b8fe2f1 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 2 Mar 2022 15:29:51 +0800 Subject: [PATCH 179/186] ceph: assign the ci only when the inode isn't NULL The ceph_find_inode() may will fail and return NULL. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 991c6c8625b2..f1ad6884d4da 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4166,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); - ci = ceph_inode(inode); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); @@ -4192,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, } goto flush_cap_releases; } + ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { From 8d728c769fd8e5aaaea36288741a4e3493111816 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 8 Mar 2022 07:42:16 -0500 Subject: [PATCH 180/186] ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64() Latencies are of type ktime_t, coverting from jiffies is incorrect. Also, switch to "struct ceph_timespec" for r/w/m latencies. Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 19 +++++++++---------- fs/ceph/metric.h | 11 ++++------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 0fcba68f9a99..454d2c93208e 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -8,6 +8,12 @@ #include "metric.h" #include "mds_client.h" +static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) +{ + struct timespec64 t = ktime_to_timespec64(val); + ceph_encode_timespec64(ts, &t); +} + static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_mds_session *s) { @@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, u64 nr_caps = atomic64_read(&m->total_caps); u32 header_len = sizeof(struct ceph_metric_header); struct ceph_msg *msg; - struct timespec64 ts; s64 sum; s32 items = 0; s32 len; @@ -63,9 +68,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; - jiffies_to_timespec64(sum, &ts); - read->sec = cpu_to_le32(ts.tv_sec); - read->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&read->lat, sum); items++; /* encode the write latency metric */ @@ -75,9 +78,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; - jiffies_to_timespec64(sum, &ts); - write->sec = cpu_to_le32(ts.tv_sec); - write->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&write->lat, sum); items++; /* encode the metadata latency metric */ @@ -87,9 +88,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; - jiffies_to_timespec64(sum, &ts); - meta->sec = cpu_to_le32(ts.tv_sec); - meta->nsec = cpu_to_le32(ts.tv_nsec); + ktime_to_ceph_timespec(&meta->lat, sum); items++; /* encode the dentry lease metric */ diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index bb45608181e7..5b2bb2897056 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -2,7 +2,7 @@ #ifndef _FS_CEPH_MDS_METRIC_H #define _FS_CEPH_MDS_METRIC_H -#include +#include #include #include @@ -60,22 +60,19 @@ struct ceph_metric_cap { /* metric read latency header */ struct ceph_metric_read_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { struct ceph_metric_header header; - __le32 sec; - __le32 nsec; + struct ceph_timespec lat; } __packed; /* metric dentry lease header */ From 367290e6355ce13c17d17f3ec1075beb7ca3224a Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 8 Mar 2022 07:42:17 -0500 Subject: [PATCH 181/186] ceph: track average r/w/m latency Make the math a bit simpler to understand (should not affect execution speeds). Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 27 ++++++++++++++------------- fs/ceph/metric.h | 1 + 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 454d2c93208e..14b6af48b611 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -249,6 +249,7 @@ int ceph_metric_init(struct ceph_client_metric *m) metric->size_max = 0; metric->total = 0; metric->latency_sum = 0; + metric->latency_avg = 0; metric->latency_sq_sum = 0; metric->latency_min = KTIME_MAX; metric->latency_max = 0; @@ -306,20 +307,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m) max = new; \ } -static inline void __update_stdev(ktime_t total, ktime_t lsum, - ktime_t *sq_sump, ktime_t lat) +static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, + ktime_t *sq_sump, ktime_t lat) { - ktime_t avg, sq; + ktime_t avg; - if (unlikely(total == 1)) - return; - - /* the sq is (lat - old_avg) * (lat - new_avg) */ - avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); - sq = lat - avg; - avg = DIV64_U64_ROUND_CLOSEST(lsum, total); - sq = sq * (lat - avg); - *sq_sump += sq; + if (unlikely(total == 1)) { + *lavg = lat; + } else { + /* the sq is (lat - old_avg) * (lat - new_avg) */ + avg = *lavg + div64_s64(lat - *lavg, total); + *sq_sump += (lat - *lavg)*(lat - avg); + *lavg = avg; + } } void ceph_update_metrics(struct ceph_metric *m, @@ -338,6 +338,7 @@ void ceph_update_metrics(struct ceph_metric *m, METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); m->latency_sum += lat; METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); - __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, + lat); spin_unlock(&m->lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 5b2bb2897056..c47ba0074e49 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -137,6 +137,7 @@ struct ceph_metric { u64 size_min; u64 size_max; ktime_t latency_sum; + ktime_t latency_avg; ktime_t latency_sq_sum; ktime_t latency_min; ktime_t latency_max; From 54d7b821a37fdb805ffc8545e536fb228c1113b2 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 8 Mar 2022 07:42:18 -0500 Subject: [PATCH 182/186] ceph: include average/stdev r/w/m latency in mds metrics stdev is computed in `cephfs-top` tool - clients forward square of sums and IO count required to calculate stdev. Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 15 ++++++++++++--- fs/ceph/metric.h | 49 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 14b6af48b611..c47347d2e84e 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -64,31 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, /* encode the read latency metric */ read = (struct ceph_metric_read_latency *)(cap + 1); read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); - read->header.ver = 1; + read->header.ver = 2; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); sum = m->metric[METRIC_READ].latency_sum; ktime_to_ceph_timespec(&read->lat, sum); + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); + read->count = cpu_to_le64(m->metric[METRIC_READ].total); items++; /* encode the write latency metric */ write = (struct ceph_metric_write_latency *)(read + 1); write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); - write->header.ver = 1; + write->header.ver = 2; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); sum = m->metric[METRIC_WRITE].latency_sum; ktime_to_ceph_timespec(&write->lat, sum); + ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg); + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); items++; /* encode the metadata latency metric */ meta = (struct ceph_metric_metadata_latency *)(write + 1); meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); - meta->header.ver = 1; + meta->header.ver = 2; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); sum = m->metric[METRIC_METADATA].latency_sum; ktime_to_ceph_timespec(&meta->lat, sum); + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); items++; /* encode the dentry lease metric */ diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index c47ba0074e49..0d0c44bd3332 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -19,27 +19,39 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_OPENED_INODES, CLIENT_METRIC_TYPE_READ_IO_SIZES, CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, }; /* * This will always have the highest metric bit value * as the last element of the array. */ -#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ - CLIENT_METRIC_TYPE_CAP_INFO, \ - CLIENT_METRIC_TYPE_READ_LATENCY, \ - CLIENT_METRIC_TYPE_WRITE_LATENCY, \ - CLIENT_METRIC_TYPE_METADATA_LATENCY, \ - CLIENT_METRIC_TYPE_DENTRY_LEASE, \ - CLIENT_METRIC_TYPE_OPENED_FILES, \ - CLIENT_METRIC_TYPE_PINNED_ICAPS, \ - CLIENT_METRIC_TYPE_OPENED_INODES, \ - CLIENT_METRIC_TYPE_READ_IO_SIZES, \ - CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ - \ - CLIENT_METRIC_TYPE_MAX, \ +#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ + CLIENT_METRIC_TYPE_CAP_INFO, \ + CLIENT_METRIC_TYPE_READ_LATENCY, \ + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ + \ + CLIENT_METRIC_TYPE_MAX, \ } struct ceph_metric_header { @@ -61,18 +73,27 @@ struct ceph_metric_cap { struct ceph_metric_read_latency { struct ceph_metric_header header; struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric write latency header */ struct ceph_metric_write_latency { struct ceph_metric_header header; struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric metadata latency header */ struct ceph_metric_metadata_latency { struct ceph_metric_header header; struct ceph_timespec lat; + struct ceph_timespec avg; + __le64 sq_sum; + __le64 count; } __packed; /* metric dentry lease header */ From 271251f841a53ce20c4e49047278e06c205a8ebd Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Tue, 8 Mar 2022 07:42:19 -0500 Subject: [PATCH 183/186] ceph: use tracked average r/w/m latencies to display metrics in debugfs Signed-off-by: Venky Shankar Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3cf7c9c1085b..bec3c4549c07 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; - s64 total, sum, avg, min, max, sq; + s64 total, avg, min, max, sq; int i; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); @@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p) m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; - sum = m->latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + avg = m->latency_avg; min = m->latency_min; max = m->latency_max; sq = m->latency_sq_sum; From c38af9825eff8ff76b9d57dd62ebc7ff6050464c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 7 Mar 2022 17:21:21 +0300 Subject: [PATCH 184/186] ceph: uninitialized variable in debug output If read_mapping_folio() fails then "inline_version" is printed without being initialized. [ jlayton: use CEPH_INLINE_NONE instead of "-1" ] Fixes: 083db6fd3e73 ("ceph: uninline the data on a file opened for writing") Signed-off-by: Dan Carpenter Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 46e0881ae8b2..752c421c9922 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1647,9 +1647,10 @@ int ceph_uninline_data(struct file *file) struct ceph_osd_request *req; struct ceph_cap_flush *prealloc_cf; struct folio *folio = NULL; + u64 inline_version = CEPH_INLINE_NONE; struct page *pages[1]; - u64 len, inline_version; int err = 0; + u64 len; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) From f639d9867eea647005dc824e0e24f39ffc50d4e4 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sat, 5 Mar 2022 19:52:59 +0800 Subject: [PATCH 185/186] ceph: fix memory leak in ceph_readdir when note_last_dentry returns error Reset the last_readdir at the same time, and add a comment explaining why we don't free last_readdir when dir_emit returns false. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0cf6afe283e9..eae417d71136 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -478,8 +478,11 @@ more: 2 : (fpos_off(rde->offset) + 1); err = note_last_dentry(dfi, rde->name, rde->name_len, next_offset); - if (err) + if (err) { + ceph_mdsc_put_request(dfi->last_readdir); + dfi->last_readdir = NULL; return err; + } } else if (req->r_reply_info.dir_end) { dfi->next_offset = 2; /* keep last name */ @@ -520,6 +523,12 @@ more: if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { + /* + * NOTE: Here no need to put the 'dfi->last_readdir', + * because when dir_emit stops us it's most likely + * doesn't have enough memory, etc. So for next readdir + * it will continue. + */ dout("filldir stopping us...\n"); return 0; } From 05e815539f3f161585c13a9ab023341bade2c52f Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 7 Mar 2022 17:41:48 +0800 Subject: [PATCH 186/186] cxl/core/port: Fix NULL but dereferenced coccicheck error Fix the following coccicheck warning: ./drivers/cxl/core/port.c:913:21-24: ERROR: port is NULL but dereferenced. The put_device() is only relevent in the is_cxl_root() case. Fixes: 2703c16c75ae ("cxl/core/port: Add switch port enumeration") Signed-off-by: Wan Jiabing Link: https://lore.kernel.org/r/20220307094158.404882-1-wanjiabing@vivo.com Signed-off-by: Dan Williams --- drivers/cxl/core/port.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c1681248f322..2ab1ba4499b3 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -911,7 +911,10 @@ static void cxl_detach_ep(void *data) break; port = find_cxl_port(dport_dev); - if (!port || is_cxl_root(port)) { + if (!port) + continue; + + if (is_cxl_root(port)) { put_device(&port->dev); continue; }