From 96f87ee1811306d0c8cf94b8c37b0e4f725b01d1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:23 +0200 Subject: RDMA: Clean structures from CONFIG_INFINIBAND_ON_DEMAND_PAGING CONFIG_INFINIBAND_ON_DEMAND_PAGING is used in general structures to micro-optimize the memory footprint. Remove it, so it will allow us to simplify various ODP device flows. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a3ceed3a040a..3ddd199ba602 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1504,12 +1504,10 @@ struct ib_ucontext { bool cleanup_retryable; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); struct mutex per_mm_list_lock; struct list_head per_mm_list; -#endif struct ib_rdmacg_object cg_obj; /* -- cgit v1.2.3 From 13859d5df418ea535926e2b57c29d5161c522b9d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:26 +0200 Subject: RDMA/mlx5: Embed into the code flow the ODP config option Convert various places to more readable code, which embeds CONFIG_INFINIBAND_ON_DEMAND_PAGING into the code flow. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 3 --- drivers/infiniband/hw/mlx5/main.c | 41 ++++++++++++++++++------------------ drivers/infiniband/hw/mlx5/mem.c | 5 +---- drivers/infiniband/hw/mlx5/mr.c | 28 +++++++++++------------- include/rdma/ib_umem_odp.h | 26 +++++++++++------------ 5 files changed, 46 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 549d9eedf62e..d4f1a2ef5015 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -234,14 +234,11 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING mutex_init(&ucontext->per_mm_list_lock); INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; -#endif - resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 06ee1f0cb22d..11e9783cefcc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1763,9 +1763,9 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_sys_pages; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; -#endif + if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) + context->ibucontext.invalidate_range = + &mlx5_ib_invalidate_range; if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { err = mlx5_ib_devx_create(dev, true); @@ -1897,12 +1897,10 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING /* All umem's must be destroyed before destroying the ucontext. 
*/ mutex_lock(&ibcontext->per_mm_list_lock); WARN_ON(!list_empty(&ibcontext->per_mm_list)); mutex_unlock(&ibcontext->per_mm_list_lock); -#endif bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); @@ -5722,11 +5720,11 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - cleanup_srcu_struct(&dev->mr_srcu); - drain_workqueue(dev->advise_mr_wq); - destroy_workqueue(dev->advise_mr_wq); -#endif + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + cleanup_srcu_struct(&dev->mr_srcu); + drain_workqueue(dev->advise_mr_wq); + destroy_workqueue(dev->advise_mr_wq); + } kfree(dev->port); } @@ -5779,19 +5777,20 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->memic.memic_lock); dev->memic.dev = mdev; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0); - if (!dev->advise_mr_wq) { - err = -ENOMEM; - goto err_mp; - } + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + dev->advise_mr_wq = + alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0); + if (!dev->advise_mr_wq) { + err = -ENOMEM; + goto err_mp; + } - err = init_srcu_struct(&dev->mr_srcu); - if (err) { - destroy_workqueue(dev->advise_mr_wq); - goto err_mp; + err = init_srcu_struct(&dev->mr_srcu); + if (err) { + destroy_workqueue(dev->advise_mr_wq); + goto err_mp; + } } -#endif return 0; err_mp: diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 549234988bb4..9f90be296ee0 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -111,7 +111,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, *count = i; } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static u64 umem_dma_to_mtt(dma_addr_t umem_dma) { u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; @@ -123,7 +122,6 @@ static u64 umem_dma_to_mtt(dma_addr_t umem_dma) return mtt_entry; } -#endif /* * Populate the given array with bus addresses from the umem. @@ -151,7 +149,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int len; struct scatterlist *sg; int entry; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem->is_odp) { WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); @@ -164,7 +162,6 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, } return; } -#endif i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index c389750f771e..494a90f4348c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -71,10 +71,9 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* Wait until all page fault handlers using the mr complete. */ - synchronize_srcu(&dev->mr_srcu); -#endif + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + /* Wait until all page fault handlers using the mr complete. 
*/ + synchronize_srcu(&dev->mr_srcu); return err; } @@ -254,9 +253,8 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - synchronize_srcu(&dev->mr_srcu); -#endif + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + synchronize_srcu(&dev->mr_srcu); list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { list_del(&mr->list); @@ -1329,8 +1327,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (!start && length == U64_MAX) { + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start && + length == U64_MAX) { if (!(access_flags & IB_ACCESS_ON_DEMAND) || !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return ERR_PTR(-EINVAL); @@ -1340,7 +1338,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(mr); return &mr->ibmr; } -#endif err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); @@ -1401,9 +1398,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - mr->live = 1; -#endif + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + mr->live = 1; + return &mr->ibmr; error: ib_umem_release(umem); @@ -1518,9 +1515,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, } mr->allocated_from_cache = 0; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - mr->live = 1; -#endif + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + mr->live = 1; } else { /* * Send a UMR WQE diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0b1446fe2fab..d3725cf13ecd 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -83,6 +83,19 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) return container_of(umem, struct ib_umem_odp, umem); } +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_ucontext_per_mm { @@ -107,19 +120,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. 
- */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bcnt, u64 access_mask, unsigned long current_seq); -- cgit v1.2.3 From 0ada768517dafa1504ef5986ba04f118b7436960 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:27 +0200 Subject: RDMA/mlx5: Delete declaration of already removed function The implementation of mlx5_core_page_fault_resume() was removed in commit d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA"). This patch removes declaration too. Fixes: d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 54299251d40d..b6f5839f129a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -939,10 +939,6 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, size_t sz); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, - u32 wq_num, u8 type, int error); -#endif int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From b0ea0fa5435f9df7213a9af098558f7dd584d8e8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Jan 2019 11:15:16 +0200 Subject: IB/{core,hw}: Have ib_umem_get extract the ib_ucontext from ib_udata ib_umem_get() can only be called in a method callback, which always has a udata parameter. This allows ib_umem_get() to derive the ucontext pointer directly from the udata without requiring the drivers to find it in some way or another. 
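As a minimal sketch of the resulting driver pattern (foo_reg_user_mr and
struct foo_mr are hypothetical names used for illustration, not functions
added by this series):

	struct foo_mr {
		struct ib_mr ibmr;
		struct ib_umem *umem;
	};

	static struct ib_mr *foo_reg_user_mr(struct ib_pd *pd, u64 start,
					     u64 length, u64 virt_addr,
					     int access_flags,
					     struct ib_udata *udata)
	{
		struct foo_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);

		if (!mr)
			return ERR_PTR(-ENOMEM);

		/*
		 * Previously this was ib_umem_get(pd->uobject->context, ...);
		 * the ucontext is now derived from udata inside ib_umem_get()
		 * via rdma_get_ucontext().
		 */
		mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
		if (IS_ERR(mr->umem)) {
			kfree(mr);
			return ERR_CAST(mr->umem);
		}

		return &mr->ibmr;
	}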
Signed-off-by: Jason Gunthorpe Signed-off-by: Shamir Rabinovitch --- drivers/infiniband/core/umem.c | 9 ++++-- drivers/infiniband/core/uverbs_main.c | 24 ++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 13 ++++----- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/mem.c | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 9 +++--- drivers/infiniband/hw/hns/hns_roce_db.c | 6 ++-- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 7 ++--- drivers/infiniband/hw/hns/hns_roce_qp.c | 13 ++++----- drivers/infiniband/hw/hns/hns_roce_srq.c | 7 ++--- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/cq.c | 17 ++++++------ drivers/infiniband/hw/mlx4/doorbell.c | 6 ++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 +- drivers/infiniband/hw/mlx4/mr.c | 11 ++++---- drivers/infiniband/hw/mlx4/qp.c | 14 ++++++---- drivers/infiniband/hw/mlx4/srq.c | 5 ++-- drivers/infiniband/hw/mlx5/cq.c | 11 ++++---- drivers/infiniband/hw/mlx5/devx.c | 2 +- drivers/infiniband/hw/mlx5/doorbell.c | 6 ++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++- drivers/infiniband/hw/mlx5/mr.c | 22 +++++++-------- drivers/infiniband/hw/mlx5/odp.c | 4 +-- drivers/infiniband/hw/mlx5/qp.c | 40 ++++++++++++--------------- drivers/infiniband/hw/mlx5/srq.c | 5 ++-- drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- drivers/infiniband/hw/qedr/verbs.c | 33 +++++++++++----------- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 3 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 6 ++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c | 4 +-- drivers/infiniband/sw/rdmavt/mr.c | 3 +- drivers/infiniband/sw/rxe/rxe_mr.c | 2 +- include/rdma/ib_umem.h | 8 ++++-- include/rdma/ib_verbs.h | 1 + 38 files changed, 168 insertions(+), 147 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c6144df47ea4..1efe0a74e06b 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -72,15 +72,16 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * If access flags indicate ODP memory, avoid pinning. Instead, stores * the mm for future page fault handling in conjunction with MMU notifiers. 
* - * @context: userspace context to pin memory for + * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned * @dmasync: flush in-flight DMA when the memory region is written */ -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, int access, int dmasync) { + struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; @@ -95,6 +96,10 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct scatterlist *sg, *sg_list_start; unsigned int gup_flags = FOLL_WRITE; + context = rdma_get_ucontext(udata); + if (IS_ERR(context)) + return ERR_CAST(context); + if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index fb0007aa0c27..996f167d1436 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -101,6 +101,30 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile) } EXPORT_SYMBOL(ib_uverbs_get_ucontext_file); +/* rdma_get_ucontext - Return the ucontext from a udata + * @udata: The udata to get the context from + * + * This can only be called from within a uapi method that was passed ib_udata + * as a parameter. It returns the ucontext associated with the udata, or ERR_PTR + * if the udata is NULL or the ucontext has been disassociated. + */ +struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata) +{ + if (!udata) + return ERR_PTR(-EIO); + + /* + * FIXME: Really all cases that get here with a udata will have + * already called ib_uverbs_get_ucontext_file, or located a uobject + * that points to a ucontext. We could store that result in the udata + * so this function can't fail. 
+ */ + return ib_uverbs_get_ucontext_file( + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->ufile); +} +EXPORT_SYMBOL(rdma_get_ucontext); + int uverbs_dealloc_mw(struct ib_mw *mw) { struct ib_pd *pd = mw->pd; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 1e2515e2eb62..9bc637e49faa 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -895,8 +895,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) bytes += (qplib_qp->sq.max_wqe * sizeof(struct sq_psn_search)); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.qpsva, bytes, - IB_ACCESS_LOCAL_WRITE, 1); + umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -908,7 +907,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.qprva, bytes, + umem = ib_umem_get(udata, ureq.qprva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) goto rqfail; @@ -1370,8 +1369,7 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(context, ureq.srqva, bytes, - IB_ACCESS_LOCAL_WRITE, 1); + umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -2622,7 +2620,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, goto fail; } - cq->umem = ib_umem_get(context, req.cq_va, + cq->umem = ib_umem_get(udata, req.cq_va, entries * sizeof(struct cq_base), IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(cq->umem)) { @@ -3589,8 +3587,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, /* The fixed portion of the rkey is the same as the lkey */ mr->ib_mr.rkey = mr->qplib_mr.rkey; - umem = ib_umem_get(ib_pd->uobject->context, start, length, - mr_access_flags, 0); + umem = ib_umem_get(udata, start, length, mr_access_flags, 0); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index b34b1a1bd94b..92ee6761a3bd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -540,7 +540,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + mhp->umem = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); kfree(mhp); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 7b76e6f81aeb..96760a36b9fc 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -537,7 +537,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mhp->rhp = rhp; - mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + mhp->umem = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(mhp->umem)) goto err_free_skb; diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 3a485f50fede..1dfe5627006c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ 
b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -215,7 +215,7 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) EXPORT_SYMBOL_GPL(hns_roce_free_cq); static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev, - struct ib_ucontext *context, + struct ib_udata *udata, struct hns_roce_cq_buf *buf, struct ib_umem **umem, u64 buf_addr, int cqe) { @@ -223,7 +223,7 @@ static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev, u32 page_shift; u32 npages; - *umem = ib_umem_get(context, buf_addr, cqe * hr_dev->caps.cq_entry_sz, + *umem = ib_umem_get(udata, buf_addr, cqe * hr_dev->caps.cq_entry_sz, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -347,7 +347,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, } /* Get user space address, write it into mtt table */ - ret = hns_roce_ib_get_cq_umem(hr_dev, context, &hr_cq->hr_buf, + ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf, &hr_cq->umem, ucmd.buf_addr, cq_entries); if (ret) { @@ -358,7 +358,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && (udata->outlen >= sizeof(resp))) { ret = hns_roce_db_map_user(to_hr_ucontext(context), - ucmd.db_addr, &hr_cq->db); + udata, ucmd.db_addr, + &hr_cq->db); if (ret) { dev_err(dev, "cq record doorbell map failed!\n"); goto err_mtt; diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index e2f93c1ce86a..0c6c1fe87705 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -8,7 +8,8 @@ #include #include "hns_roce_device.h" -int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, +int hns_roce_db_map_user(struct hns_roce_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct hns_roce_db *db) { struct hns_roce_user_db_page *page; @@ -28,8 +29,7 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, refcount_set(&page->refcount, 1); page->user_virt = (virt & PAGE_MASK); - page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, - PAGE_SIZE, 0, 0); + page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0); if (IS_ERR(page->umem)) { ret = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 509e467843f6..9a4be70936e0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1133,7 +1133,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq); void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); -int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, +int hns_roce_db_map_user(struct hns_roce_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct hns_roce_db *db); void hns_roce_db_unmap_user(struct hns_roce_ucontext *context, struct hns_roce_db *db); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index ee5991bd4171..da4fffedb879 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1110,8 +1110,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags, 0); + mr->umem = ib_umem_get(udata, start, 
length, access_flags, 0); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); goto err_free; @@ -1220,8 +1219,8 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, } ib_umem_release(mr->umem); - mr->umem = ib_umem_get(ibmr->uobject->context, start, length, - mr_access_flags, 0); + mr->umem = + ib_umem_get(udata, start, length, mr_access_flags, 0); if (IS_ERR(mr->umem)) { ret = PTR_ERR(mr->umem); mr->umem = NULL; diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index b9ba140b50d4..accf9ce1507d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -613,9 +613,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_rq_sge_list; } - hr_qp->umem = ib_umem_get(ib_pd->uobject->context, - ucmd.buf_addr, hr_qp->buff_size, 0, - 0); + hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr, + hr_qp->buff_size, 0, 0); if (IS_ERR(hr_qp->umem)) { dev_err(dev, "ib_umem_get error for create qp\n"); ret = PTR_ERR(hr_qp->umem); @@ -654,8 +653,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_sq(init_attr)) { ret = hns_roce_db_map_user( - to_hr_ucontext(ib_pd->uobject->context), - ucmd.sdb_addr, &hr_qp->sdb); + to_hr_ucontext(ib_pd->uobject->context), udata, + ucmd.sdb_addr, &hr_qp->sdb); if (ret) { dev_err(dev, "sq record doorbell map failed!\n"); goto err_mtt; @@ -670,8 +669,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) { ret = hns_roce_db_map_user( - to_hr_ucontext(ib_pd->uobject->context), - ucmd.db_addr, &hr_qp->rdb); + to_hr_ucontext(ib_pd->uobject->context), udata, + ucmd.db_addr, &hr_qp->rdb); if (ret) { dev_err(dev, "rq record doorbell map failed!\n"); goto err_sq_dbmap; diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 960b1946c365..8975f858b36f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -252,8 +252,8 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd, goto err_srq; } - srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - srq_buf_size, 0, 0); + srq->umem = + ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0); if (IS_ERR(srq->umem)) { ret = PTR_ERR(srq->umem); goto err_srq; @@ -280,8 +280,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd, goto err_srq_mtt; /* config index queue BA */ - srq->idx_que.umem = ib_umem_get(pd->uobject->context, - ucmd.que_addr, + srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr, srq->idx_que.buf_size, 0, 0); if (IS_ERR(srq->idx_que.umem)) { dev_err(hr_dev->dev, diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 0b675b0742c2..80b66df95362 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1852,7 +1852,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, if (length > I40IW_MAX_MR_SIZE) return ERR_PTR(-EINVAL); - region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + region = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(region)) return (struct ib_mr *)region; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 43512347b4f0..db936c12b5bd 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -134,16 +134,16 @@ static void 
mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf); } -static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context, - struct mlx4_ib_cq_buf *buf, struct ib_umem **umem, - u64 buf_addr, int cqe) +static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata, + struct mlx4_ib_cq_buf *buf, + struct ib_umem **umem, u64 buf_addr, int cqe) { int err; int cqe_size = dev->dev->caps.cqe_size; int shift; int n; - *umem = ib_umem_get(context, buf_addr, cqe * cqe_size, + *umem = ib_umem_get(udata, buf_addr, cqe * cqe_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -213,14 +213,13 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, } buf_addr = (void *)(unsigned long)ucmd.buf_addr; - - err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem, + err = mlx4_ib_get_cq_umem(dev, udata, &cq->buf, &cq->umem, ucmd.buf_addr, entries); if (err) goto err_cq; - err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr, - &cq->db); + err = mlx4_ib_db_map_user(to_mucontext(context), udata, + ucmd.db_addr, &cq->db); if (err) goto err_mtt; @@ -336,7 +335,7 @@ static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq if (!cq->resize_buf) return -ENOMEM; - err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf, + err = mlx4_ib_get_cq_umem(dev, udata, &cq->resize_buf->buf, &cq->resize_umem, ucmd.buf_addr, entries); if (err) { kfree(cq->resize_buf); diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c index c51740986367..3aab71b29ce8 100644 --- a/drivers/infiniband/hw/mlx4/doorbell.c +++ b/drivers/infiniband/hw/mlx4/doorbell.c @@ -41,7 +41,8 @@ struct mlx4_ib_user_db_page { int refcnt; }; -int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct mlx4_db *db) { struct mlx4_ib_user_db_page *page; @@ -61,8 +62,7 @@ int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, - PAGE_SIZE, 0, 0); + page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0); if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e491f3eda6e7..60dc1347c5ab 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -722,7 +722,8 @@ static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); -int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, +int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index c7c85c22e4e3..56639ecd53ad 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,7 +367,8 @@ end: return block_shift; } -static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, + struct ib_udata *udata, u64 
start, u64 length, u64 virt_addr, int access_flags) { @@ -398,7 +399,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, up_read(¤t->mm->mmap_sem); } - return ib_umem_get(context, start, length, access_flags, 0); + return ib_umem_get(udata, start, length, access_flags, 0); } struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, @@ -415,7 +416,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length, + mr->umem = mlx4_get_umem_mr(pd->uobject->context, udata, start, length, virt_addr, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); @@ -506,8 +507,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); mmr->umem = - mlx4_get_umem_mr(mr->uobject->context, start, length, - virt_addr, mr_access_flags); + mlx4_get_umem_mr(mr->uobject->context, udata, start, + length, virt_addr, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 971e9a9ebdaf..e38bab50cecf 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1015,9 +1015,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, (qp->sq.wqe_cnt << qp->sq.wqe_shift); } - qp->umem = ib_umem_get(pd->uobject->context, - (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : - ucmd.wq.buf_addr, qp->buf_size, 0, 0); + qp->umem = + ib_umem_get(udata, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, + qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -1035,9 +1037,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_mtt; if (qp_has_rq(init_attr)) { - err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), + err = mlx4_ib_db_map_user( + to_mucontext(pd->uobject->context), udata, (src == MLX4_IB_QP_SRC) ? 
ucmd.qp.db_addr : - ucmd.wq.db_addr, &qp->db); + ucmd.wq.db_addr, + &qp->db); if (err) goto err_mtt; } diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 4456f1b8921d..498588eac051 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -113,8 +113,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, goto err_srq; } - srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - buf_size, 0, 0); + srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0); if (IS_ERR(srq->umem)) { err = PTR_ERR(srq->umem); goto err_srq; @@ -130,7 +129,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, goto err_mtt; err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), - ucmd.db_addr, &srq->db); + udata, ucmd.db_addr, &srq->db); if (err) goto err_mtt; } else { diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 90f1b0bae5b5..c283c32f30fe 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -707,15 +707,15 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *cqe_size = ucmd.cqe_size; - cq->buf.umem = ib_umem_get(context, ucmd.buf_addr, - entries * ucmd.cqe_size, - IB_ACCESS_LOCAL_WRITE, 1); + cq->buf.umem = + ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size, + IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(cq->buf.umem)) { err = PTR_ERR(cq->buf.umem); return err; } - err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, + err = mlx5_ib_db_map_user(to_mucontext(context), udata, ucmd.db_addr, &cq->db); if (err) goto err_umem; @@ -1111,7 +1111,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, struct ib_umem *umem; int err; int npages; - struct ib_ucontext *context = cq->buf.umem->context; err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (err) @@ -1124,7 +1123,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) return -EINVAL; - umem = ib_umem_get(context, ucmd.buf_addr, + umem = ib_umem_get(udata, ucmd.buf_addr, (size_t)ucmd.cqe_size * entries, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(umem)) { diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 5a588f3cfb1b..8bb711da7ee1 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1195,7 +1195,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, if (err) return err; - obj->umem = ib_umem_get(ucontext, addr, size, access, 0); + obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0); if (IS_ERR(obj->umem)) return PTR_ERR(obj->umem); diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index a0e4e6ddb71a..8f4e5f22b84c 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -43,7 +43,8 @@ struct mlx5_ib_user_db_page { int refcnt; }; -int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct mlx5_db *db) { struct mlx5_ib_user_db_page *page; @@ -63,8 +64,7 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, page->user_virt = (virt & PAGE_MASK); page->refcnt = 0; - page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK, - PAGE_SIZE, 0, 0); + page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0); 
if (IS_ERR(page->umem)) { err = PTR_ERR(page->umem); kfree(page); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 81efa5def8ad..b0a37ca2a714 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1037,7 +1037,8 @@ to_mflow_act(struct ib_flow_action *ibact) return container_of(ibact, struct mlx5_ib_flow_action, ib_action); } -int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, + struct ib_udata *udata, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); @@ -1103,6 +1104,7 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw); int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + struct ib_udata *udata, int access_flags); void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 494a90f4348c..6d763d6a189f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -843,18 +843,17 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev) return MLX5_MAX_UMR_SHIFT; } -static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, - int access_flags, struct ib_umem **umem, - int *npages, int *page_shift, int *ncont, - int *order) +static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, + u64 start, u64 length, int access_flags, + struct ib_umem **umem, int *npages, int *page_shift, + int *ncont, int *order) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *u; int err; *umem = NULL; - u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); + u = ib_umem_get(udata, start, length, access_flags, 0); err = PTR_ERR_OR_ZERO(u); if (err) { mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); @@ -1333,14 +1332,14 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return ERR_PTR(-EINVAL); - mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags); if (IS_ERR(mr)) return ERR_CAST(mr); return &mr->ibmr; } - err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, - &page_shift, &ncont, &order); + err = mr_umem_get(dev, udata, start, length, access_flags, &umem, + &npages, &page_shift, &ncont, &order); if (err < 0) return ERR_PTR(err); @@ -1488,8 +1487,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, flags |= IB_MR_REREG_TRANS; ib_umem_release(mr->umem); mr->umem = NULL; - err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, - &npages, &page_shift, &ncont, &order); + err = mr_umem_get(dev, udata, addr, len, access_flags, + &mr->umem, &npages, &page_shift, &ncont, + &order); if (err) goto err; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 37d6653f9636..8d46b1dc5658 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -492,13 +492,13 @@ next_mr: } struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + struct ib_udata *udata, int access_flags) { - struct ib_ucontext *ctx = 
pd->ibpd.uobject->context; struct mlx5_ib_mr *imr; struct ib_umem *umem; - umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); + umem = ib_umem_get(udata, 0, 0, IB_ACCESS_ON_DEMAND, 0); if (IS_ERR(umem)) return ERR_CAST(umem); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index dd2ae640bc84..529e76f67cb6 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -645,16 +645,14 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev, return bfregi->sys_pages[index_of_sys_page] + offset; } -static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, - struct ib_pd *pd, +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, unsigned long addr, size_t size, - struct ib_umem **umem, - int *npages, int *page_shift, int *ncont, - u32 *offset) + struct ib_umem **umem, int *npages, int *page_shift, + int *ncont, u32 *offset) { int err; - *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); + *umem = ib_umem_get(udata, addr, size, 0, 0); if (IS_ERR(*umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); return PTR_ERR(*umem); @@ -695,10 +693,9 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, } static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct mlx5_ib_rwq *rwq, + struct ib_udata *udata, struct mlx5_ib_rwq *rwq, struct mlx5_ib_create_wq *ucmd) { - struct mlx5_ib_ucontext *context; int page_shift = 0; int npages; u32 offset = 0; @@ -708,9 +705,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (!ucmd->buf_addr) return -EINVAL; - context = to_mucontext(pd->uobject->context); - rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, - rwq->buf_size, 0, 0); + rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0); if (IS_ERR(rwq->umem)) { mlx5_ib_dbg(dev, "umem_get failed\n"); err = PTR_ERR(rwq->umem); @@ -735,7 +730,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, (unsigned long long)ucmd->buf_addr, rwq->buf_size, npages, page_shift, ncont, offset); - err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata, + ucmd->db_addr, &rwq->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_umem; @@ -819,10 +815,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (ucmd.buf_addr && ubuffer->buf_size) { ubuffer->buf_addr = ucmd.buf_addr; - err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, - ubuffer->buf_size, - &ubuffer->umem, &npages, &page_shift, - &ncont, &offset); + err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, + ubuffer->buf_size, &ubuffer->umem, + &npages, &page_shift, &ncont, &offset); if (err) goto err_bfreg; } else { @@ -856,7 +851,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, resp->bfreg_index = MLX5_IB_INVALID_BFREG; qp->bfregn = bfregn; - err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); + err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_free; @@ -1119,6 +1114,7 @@ static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev, } static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct ib_udata *udata, struct mlx5_ib_sq *sq, void *qpin, struct ib_pd *pd) { @@ -1135,9 +1131,9 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, int ncont = 0; u32 offset = 0; - err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, - &sq->ubuffer.umem, &npages, 
&page_shift, - &ncont, &offset); + err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, &ncont, + &offset); if (err) return err; @@ -1374,7 +1370,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) return err; - err = create_raw_packet_qp_sq(dev, sq, in, pd); + err = create_raw_packet_qp_sq(dev, udata, sq, in, pd); if (err) goto err_destroy_tis; @@ -5793,7 +5789,7 @@ static int prepare_user_rq(struct ib_pd *pd, return err; } - err = create_user_rq(dev, pd, rwq, &ucmd); + err = create_user_rq(dev, pd, udata, rwq, &ucmd); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 4e8d18009f58..22bd774e0b4e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -79,8 +79,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); - srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, - 0, 0); + srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0); if (IS_ERR(srq->umem)) { mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); err = PTR_ERR(srq->umem); @@ -104,7 +103,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0); - err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), + err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata, ucmd.db_addr, &srq->db); if (err) { mlx5_ib_dbg(dev, "map doorbell failed\n"); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 82cb6b71ac7c..53fff6aed896 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -931,7 +931,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + mr->umem = ib_umem_get(udata, start, length, acc, ucmd.mr_attrs & MTHCA_MR_DMASYNC); if (IS_ERR(mr->umem)) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 4e7f08ee1907..feb4d259aab9 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2134,7 +2134,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u8 stag_key; int first_page = 1; - region = ib_umem_get(pd->uobject->context, start, length, acc, 0); + region = ib_umem_get(udata, start, length, acc, 0); if (IS_ERR(region)) { return (struct ib_mr *)region; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c46bed0c5513..a8b40cb7679e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -916,7 +916,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(status); - mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + mr->umem = ib_umem_get(udata, start, len, acc, 0); if (IS_ERR(mr->umem)) { status = -EFAULT; goto umem_err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b342a70e2814..ba8cb6559122 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -736,11 
+736,10 @@ static inline int qedr_align_cq_entries(int entries) return aligned_size / QEDR_CQE_SIZE; } -static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx, +static inline int qedr_init_user_queue(struct ib_udata *udata, struct qedr_dev *dev, - struct qedr_userq *q, - u64 buf_addr, size_t buf_len, - int access, int dmasync, + struct qedr_userq *q, u64 buf_addr, + size_t buf_len, int access, int dmasync, int alloc_and_init) { u32 fw_pages; @@ -748,7 +747,7 @@ static inline int qedr_init_user_queue(struct ib_ucontext *ib_ctx, q->buf_addr = buf_addr; q->buf_len = buf_len; - q->umem = ib_umem_get(ib_ctx, q->buf_addr, q->buf_len, access, dmasync); + q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access, dmasync); if (IS_ERR(q->umem)) { DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", PTR_ERR(q->umem)); @@ -905,9 +904,9 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, cq->cq_type = QEDR_CQ_TYPE_USER; - rc = qedr_init_user_queue(ib_ctx, dev, &cq->q, ureq.addr, - ureq.len, IB_ACCESS_LOCAL_WRITE, - 1, 1); + rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr, + ureq.len, IB_ACCESS_LOCAL_WRITE, 1, + 1); if (rc) goto err0; @@ -1344,7 +1343,7 @@ static void qedr_free_srq_kernel_params(struct qedr_srq *srq) hw_srq->phy_prod_pair_addr); } -static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx, +static int qedr_init_srq_user_params(struct ib_udata *udata, struct qedr_srq *srq, struct qedr_create_srq_ureq *ureq, int access, int dmasync) @@ -1352,14 +1351,14 @@ static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx, struct scatterlist *sg; int rc; - rc = qedr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr, + rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr, ureq->srq_len, access, dmasync, 1); if (rc) return rc; - srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr, - sizeof(struct rdma_srq_producers), - access, dmasync); + srq->prod_umem = + ib_umem_get(udata, ureq->prod_pair_addr, + sizeof(struct rdma_srq_producers), access, dmasync); if (IS_ERR(srq->prod_umem)) { qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); ib_umem_release(srq->usrq.umem); @@ -1468,7 +1467,7 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, goto err0; } - rc = qedr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0); + rc = qedr_init_srq_user_params(udata, srq, &ureq, 0, 0); if (rc) goto err0; @@ -1714,14 +1713,14 @@ static int qedr_create_user_qp(struct qedr_dev *dev, } /* SQ - read access only (0), dma sync not required (0) */ - rc = qedr_init_user_queue(ib_ctx, dev, &qp->usq, ureq.sq_addr, + rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr, ureq.sq_len, 0, 0, alloc_and_init); if (rc) return rc; if (!qp->srq) { /* RQ - read access only (0), dma sync not required (0) */ - rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr, + rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr, ureq.rq_len, 0, 0, alloc_and_init); if (rc) return rc; @@ -2719,7 +2718,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, mr->type = QEDR_MR_USER; - mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0); + mr->umem = ib_umem_get(udata, start, len, acc, 0); if (IS_ERR(mr->umem)) { rc = -EFAULT; goto err0; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 0f004c737620..104c7db4704f 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -141,7 +141,7 @@ 
struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, goto err_cq; } - cq->umem = ib_umem_get(context, ucmd.buf_addr, ucmd.buf_size, + cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, IB_ACCESS_LOCAL_WRITE, 1); if (IS_ERR(cq->umem)) { ret = PTR_ERR(cq->umem); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index fa96fa4fb829..a85884e90e84 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -126,8 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(-EINVAL); } - umem = ib_umem_get(pd->uobject->context, start, - length, access_flags, 0); + umem = ib_umem_get(udata, start, length, access_flags, 0); if (IS_ERR(umem)) { dev_warn(&dev->pdev->dev, "could not get umem for mem region\n"); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 3acf74cbe266..5fc444cef011 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -262,8 +262,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, if (!is_srq) { /* set qp->sq.wqe_cnt, shift, buf_size.. */ - qp->rumem = ib_umem_get(pd->uobject->context, - ucmd.rbuf_addr, + qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr, ucmd.rbuf_size, 0, 0); if (IS_ERR(qp->rumem)) { ret = PTR_ERR(qp->rumem); @@ -275,8 +274,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, qp->srq = to_vsrq(init_attr->srq); } - qp->sumem = ib_umem_get(pd->uobject->context, - ucmd.sbuf_addr, + qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr, ucmd.sbuf_size, 0, 0); if (IS_ERR(qp->sumem)) { if (!is_srq) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c index 06ba7c7a2235..951d9d68107a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c @@ -153,9 +153,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd, goto err_srq; } - srq->umem = ib_umem_get(pd->uobject->context, - ucmd.buf_addr, - ucmd.buf_size, 0, 0); + srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0, 0); if (IS_ERR(srq->umem)) { ret = PTR_ERR(srq->umem); goto err_srq; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 49c9541050d4..8b1c1e8dd7ef 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -388,8 +388,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (length == 0) return ERR_PTR(-EINVAL); - umem = ib_umem_get(pd->uobject->context, start, length, - mr_access_flags, 0); + umem = ib_umem_get(udata, start, length, mr_access_flags, 0); if (IS_ERR(umem)) return (void *)umem; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 9d3916b93f23..2438093776a0 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -171,7 +171,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start, void *vaddr; int err; - umem = ib_umem_get(pd->ibpd.uobject->context, start, length, access, 0); + umem = ib_umem_get(udata, start, length, access, 0); if (IS_ERR(umem)) { pr_warn("err %d from rxe_umem_get\n", (int)PTR_ERR(umem)); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 5d3755ec5afa..73af05db04c7 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -36,6 +36,7 @@ #include #include #include +#include struct ib_ucontext; struct 
ib_umem_odp; @@ -80,7 +81,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_USER_MEM -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); @@ -91,9 +92,10 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, #include -static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, +static inline struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, - int access, int dmasync) { + int access, int dmasync) +{ return ERR_PTR(-EINVAL); } static inline void ib_umem_release(struct ib_umem *umem) { } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3ddd199ba602..aa1f126d3383 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4200,6 +4200,7 @@ void rdma_roce_rescan_device(struct ib_device *ibdev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); +struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata); int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); -- cgit v1.2.3 From ea4baf7f116a18382df331db2123d98bc1c3cd83 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:28:30 +0200 Subject: RDMA: Rename port_callback to init_port Most provider routines are callback routines which ib core invokes. _callback suffix doesn't convey information about when such callback is invoked. Therefore, rename port_callback to init_port. Additionally, store the init_port function pointer in ib_device_ops, so that it can be accessed in subsequent patches when binding rdma device to net namespace. 
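As a minimal sketch of the conversion in a provider (the foo_ names are
hypothetical, for illustration only):

	static int foo_init_port(struct ib_device *ibdev, u8 port_num,
				 struct kobject *port_kobj)
	{
		/* Create the driver's per-port sysfs files under port_kobj. */
		return 0;
	}

	static const struct ib_device_ops foo_dev_ops = {
		.init_port = foo_init_port,
	};

	static int foo_register_device(struct ib_device *ibdev)
	{
		ib_set_device_ops(ibdev, &foo_dev_ops);
		/*
		 * No port_callback argument anymore; the core invokes
		 * ops.init_port from add_port() during sysfs registration.
		 */
		return ib_register_device(ibdev, "foo%d");
	}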
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 4 +--- drivers/infiniband/core/device.c | 7 +++---- drivers/infiniband/core/sysfs.c | 16 ++++++---------- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/provider.c | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rdmavt/vt.c | 5 ++--- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- include/rdma/ib_verbs.h | 10 +++++++--- include/rdma/rdma_vt.h | 3 --- 22 files changed, 35 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 3cd830d52967..aca75c74e451 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -54,9 +54,7 @@ struct pkey_index_qp_list { struct list_head qp_list; }; -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8872453e26c0..66867e92ddea 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -574,9 +574,7 @@ port_cleanup: * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
*/ -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name) { int ret; struct ib_client *client; @@ -613,7 +611,7 @@ int ib_register_device(struct ib_device *device, const char *name, goto dev_cleanup; } - ret = ib_device_register_sysfs(device, port_callback); + ret = ib_device_register_sysfs(device); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); @@ -1283,6 +1281,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); + SET_DEVICE_OP(dev_ops, init_port); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 80f68eb0ba5c..7a5679933df6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1015,9 +1015,7 @@ err_free_stats: return; } -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int add_port(struct ib_device *device, int port_num) { struct ib_port *p; struct ib_port_attr attr; @@ -1113,8 +1111,8 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_pkey; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); + if (device->ops.init_port) { + ret = device->ops.init_port(device, port_num, &p->kobj); if (ret) goto err_remove_pkey; } @@ -1308,9 +1306,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(device->ports_kobj); } -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_device_register_sysfs(struct ib_device *device) { struct device *class_dev = &device->dev; int ret; @@ -1330,12 +1326,12 @@ int ib_device_register_sysfs(struct ib_device *device, } if (rdma_cap_ib_switch(device)) { - ret = add_port(device, 0, port_callback); + ret = add_port(device, 0); if (ret) goto err_put; } else { for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); + ret = add_port(device, i); if (ret) goto err_put; } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e7a997f2a537..797a3e943366 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -662,7 +662,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); ibdev->driver_id = RDMA_DRIVER_BNXT_RE; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); - return ib_register_device(ibdev, "bnxt_re%d", NULL); + return ib_register_device(ibdev, "bnxt_re%d"); } static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 92ee6761a3bd..ffdde3cca268 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1409,7 +1409,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); ib_set_device_ops(&dev->ibdev, &iwch_dev_ops); - ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); + ret = 
ib_register_device(&dev->ibdev, "cxgb3_%d"); if (ret) kfree(dev->ibdev.iwcm); return ret; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 586b0c37481f..0a99894b0160 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -634,7 +634,7 @@ void c4iw_register_device(struct work_struct *work) rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops); - ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d"); if (ret) goto err_kfree_iwcm; return; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index ec582d86025f..8957adf58af7 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1622,6 +1622,7 @@ static const struct ib_device_ops hfi1_dev_ops = { .alloc_rdma_netdev = hfi1_vnic_alloc_rn, .get_dev_fw_str = hfi1_get_dev_fw_str, .get_hw_stats = get_hw_stats, + .init_port = hfi1_create_port_files, .modify_device = modify_device, /* keep process mad in the driver */ .process_mad = hfi1_process_mad, @@ -1679,7 +1680,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) /* * Fill in rvt info object. */ - dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 44a07fce0617..46ede58ef3b8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -564,7 +564,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->driver_id = RDMA_DRIVER_HNS; ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); - ret = ib_register_device(ib_dev, "hns_%d", NULL); + ret = ib_register_device(ib_dev, "hns_%d"); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 80b66df95362..af66ab9d150b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2868,7 +2868,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) iwibdev = iwdev->iwibdev; rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group); iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; - ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); + ret = ib_register_device(&iwibdev->ibdev, "i40iw%d"); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1f15ec3e2b83..c3f950d82ed0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2856,7 +2856,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group); ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; - if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) + if (ib_register_device(&ibdev->ib_dev, "mlx4_%d")) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 717e8535b41a..948617a60d44 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6233,7 +6233,7 @@ int 
mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) name = "mlx5_%d"; else name = "mlx5_bond_%d"; - return ib_register_device(&dev->ib_dev, name, NULL); + return ib_register_device(&dev->ib_dev, name); } void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0a939ea5497b..3473c6c51b92 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1338,7 +1338,7 @@ int mthca_register_device(struct mthca_dev *dev) rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; - ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); + ret = ib_register_device(&dev->ib_dev, "mthca%d"); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index feb4d259aab9..34601f0cbd74 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3801,7 +3801,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d"); if (ret) { return ret; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 1f393842453a..f45b996f617f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -243,7 +243,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) } rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; - return ib_register_device(&dev->ibdev, "ocrdma%d", NULL); + return ib_register_device(&dev->ibdev, "ocrdma%d"); } static int ocrdma_alloc_resources(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 75940e2a8791..8e5c76d06855 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -290,7 +290,7 @@ static int qedr_register_device(struct qedr_dev *dev) ib_set_device_ops(&dev->ibdev, &qedr_dev_ops); dev->ibdev.driver_id = RDMA_DRIVER_QEDR; - return ib_register_device(&dev->ibdev, "qedr%d", NULL); + return ib_register_device(&dev->ibdev, "qedr%d"); } /* This function allocates fast-path status block memory */ diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 276304f611ab..ff8dab8e2344 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1494,6 +1494,7 @@ static void qib_fill_device_attr(struct qib_devdata *dd) } static const struct ib_device_ops qib_dev_ops = { + .init_port = qib_create_port_files, .modify_device = qib_modify_device, .process_mad = qib_process_mad, }; @@ -1567,7 +1568,6 @@ int qib_register_ib_device(struct qib_devdata *dd) /* * Fill in rvt info object. 
*/ - dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index b2323a52a0dd..3201dd1899c7 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -422,7 +422,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group); - if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d")) goto err_fwd_dealloc; usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index eaa109dbc96a..1bc415483d9b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -278,7 +278,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) spin_lock_init(&dev->srq_tbl_lock); rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group); - ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL); + ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d"); if (ret) goto err_srq_free; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 8724a1817158..7de7389d0235 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -446,7 +446,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * These functions are not part of verbs specifically but are * required for rdmavt to function. */ - if ((!rdi->driver_f.port_callback) || + if ((!rdi->ibdev.ops.init_port) || (!rdi->driver_f.get_pci_dev)) return -EINVAL; break; @@ -644,8 +644,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ - ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), - rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev)); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); goto bail_wss; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index b20e6e0415f5..43171148e9c5 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1258,7 +1258,7 @@ int rxe_register_device(struct rxe_dev *rxe) rdma_set_device_sysfs_group(dev, &rxe_attr_group); dev->driver_id = RDMA_DRIVER_RXE; - err = ib_register_device(dev, "rxe%d", NULL); + err = ib_register_device(dev, "rxe%d"); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index aa1f126d3383..1d1902fd9f87 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2504,6 +2504,12 @@ struct ib_device_ops { */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u8 port, int index); + /* + * This function is called once for each port when a ib device is + * registered. 
+ */ + int (*init_port)(struct ib_device *device, u8 port_num, + struct kobject *port_sysfs); }; struct ib_device { @@ -2620,9 +2626,7 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index dd0ed8048bb4..acb3bc96dfa7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -250,9 +250,6 @@ struct rvt_driver_provided { */ void (*do_send)(struct rvt_qp *qp); - /* Passed to ib core registration. Callback to create syfs files */ - int (*port_callback)(struct ib_device *, u8, struct kobject *); - /* * Returns a pointer to the undelying hardware's PCI device. This is * used to display information as to what hardware is being referenced -- cgit v1.2.3 From 54747231150f0dddf68f2ee29ec2970fcc433909 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:15:56 +0200 Subject: RDMA: Introduce and use rdma_device_to_ibdev() Introduce and use the rdma_device_to_ibdev() API for those drivers which are registering one sysfs group, and also use it in ib_core. In a subsequent patch, the device->provider_ibdev one-to-one mapping no longer holds true when accessing sysfs entries. Therefore, introduce an API, rdma_device_to_ibdev(), that provides such information. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 -- drivers/infiniband/core/sysfs.c | 12 ++++++------ drivers/infiniband/hw/bnxt_re/main.c | 6 ++++-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 14 ++++++++------ drivers/infiniband/hw/cxgb4/provider.c | 14 ++++++++------ drivers/infiniband/hw/hfi1/sysfs.c | 16 ++++++++-------- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 5 ++--- drivers/infiniband/hw/mlx4/main.c | 7 ++++--- drivers/infiniband/hw/mlx5/main.c | 13 ++++++++----- drivers/infiniband/hw/mthca/mthca_provider.c | 9 ++++++--- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 6 ++++-- drivers/infiniband/hw/qedr/main.c | 3 ++- drivers/infiniband/hw/qib/qib_sysfs.c | 18 +++++++++--------- drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 26 +++++++++++--------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 4 ++-- include/rdma/ib_verbs.h | 23 +++++++++++++++++++++++ 17 files changed, 106 insertions(+), 74 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 66867e92ddea..f8180cf1a004 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -296,8 +296,6 @@ struct ib_device *ib_alloc_device(size_t size) device->dev.class = &ib_class; device_initialize(&device->dev); - dev_set_drvdata(&device->dev, device); - INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); rwlock_init(&device->client_data_lock); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7a5679933df6..c75692802da8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1187,7 +1187,7 @@ err_put: static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev =
container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); @@ -1204,7 +1204,7 @@ static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), @@ -1217,7 +1217,7 @@ static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(((__be16 *) &dev->node_guid)[0]), @@ -1230,7 +1230,7 @@ static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); return sprintf(buf, "%.64s\n", dev->node_desc); } @@ -1239,7 +1239,7 @@ static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; @@ -1258,7 +1258,7 @@ static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_device *dev = container_of(device, struct ib_device, dev); + struct ib_device *dev = rdma_device_to_ibdev(device); ib_get_device_fw_str(dev, buf); strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 797a3e943366..16eecfa5882c 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -538,7 +538,8 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); } @@ -547,7 +548,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + struct bnxt_re_dev *rdev = + rdma_device_to_drv_device(device, struct bnxt_re_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index ffdde3cca268..07c20cd07f33 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1130,8 +1130,9 @@ static int iwch_query_port(struct ib_device *ibdev, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s 
dev 0x%p\n", __func__, dev); return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } @@ -1140,8 +1141,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev; @@ -1154,8 +1155,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, - ibdev.dev); + struct iwch_dev *iwch_dev = + rdma_device_to_drv_device(dev, struct iwch_dev, ibdev); + pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, iwch_dev->rdev.rnic_info.pdev->device); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0a99894b0160..f977f8e7e162 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -376,8 +376,9 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port, static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%d\n", CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); @@ -387,8 +388,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); struct ethtool_drvinfo info; struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0]; @@ -401,8 +402,9 @@ static DEVICE_ATTR_RO(hca_type); static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, - ibdev.dev); + struct c4iw_dev *c4iw_dev = + rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev); + pr_debug("dev 0x%p\n", dev); return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, c4iw_dev->rdev.lldi.pdev->device); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 2be513d4c9da..90f62c4bddba 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -498,7 +498,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -508,7 +508,7 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -524,7 +524,7 @@ static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + 
rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -536,7 +536,7 @@ static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* @@ -555,7 +555,7 @@ static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ @@ -567,7 +567,7 @@ static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); @@ -579,7 +579,7 @@ static ssize_t chip_reset_store(struct device *device, size_t count) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); int ret; @@ -609,7 +609,7 @@ static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = - container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev); struct hfi1_devdata *dd = dd_from_dev(dev); struct hfi1_temp temp; int ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index af66ab9d150b..12b31a8440be 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2139,9 +2139,8 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr) static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct i40iw_ib_device *iwibdev = container_of(dev, - struct i40iw_ib_device, - ibdev.dev); + struct i40iw_ib_device *iwibdev = + rdma_device_to_drv_device(dev, struct i40iw_ib_device, ibdev); u32 hw_rev = iwibdev->iwdev->sc_dev.hw_rev; return sprintf(buf, "%x\n", hw_rev); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c3f950d82ed0..dc2ffd293a11 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2043,7 +2043,7 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static DEVICE_ATTR_RO(hca_type); @@ -2052,7 +2052,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -2061,7 +2061,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { 
struct mlx4_ib_dev *dev = - container_of(device, struct mlx4_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx4_ib_dev, ib_dev); + return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 948617a60d44..4b1b56d54301 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4104,7 +4104,7 @@ static ssize_t fw_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); } @@ -4114,7 +4114,7 @@ static ssize_t reg_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } @@ -4124,7 +4124,8 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } static DEVICE_ATTR_RO(hca_type); @@ -4133,7 +4134,8 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "%x\n", dev->mdev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -4142,7 +4144,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev); + return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 3473c6c51b92..63003b4d2485 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1081,7 +1081,8 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + return sprintf(buf, "%x\n", dev->rev_id); } static DEVICE_ATTR_RO(hw_rev); @@ -1090,7 +1091,8 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + switch (dev->pdev->device) { case PCI_DEVICE_ID_MELLANOX_TAVOR: return sprintf(buf, "MT23108\n"); @@ -1111,7 +1113,8 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct mthca_dev *dev = - container_of(device, struct mthca_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct mthca_dev, ib_dev); + return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); } static DEVICE_ATTR_RO(board_id); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 34601f0cbd74..034156f7e9ed 
100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2560,7 +2560,7 @@ static ssize_t hw_rev_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nes_ib_device *nesibdev = - container_of(dev, struct nes_ib_device, ibdev.dev); + rdma_device_to_drv_device(dev, struct nes_ib_device, ibdev); struct nes_vnic *nesvnic = nesibdev->nesvnic; nes_debug(NES_DBG_INIT, "\n"); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index f45b996f617f..b0491b9ecfe4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -118,7 +118,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocrdma_dev *dev = dev_get_drvdata(device); + struct ocrdma_dev *dev = + rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); } @@ -127,7 +128,8 @@ static DEVICE_ATTR_RO(hw_rev); static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { - struct ocrdma_dev *dev = dev_get_drvdata(device); + struct ocrdma_dev *dev = + rdma_device_to_drv_device(device, struct ocrdma_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); } diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 8e5c76d06855..f85e72b65a10 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -137,7 +137,8 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num, static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { - struct qedr_dev *dev = dev_get_drvdata(device); + struct qedr_dev *dev = + rdma_device_to_drv_device(device, struct qedr_dev, ibdev); return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); } diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index 1cf4ca3f23e3..905206a0c2d5 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -555,7 +555,7 @@ static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } @@ -565,7 +565,7 @@ static ssize_t hca_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -590,7 +590,7 @@ static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. 
*/ @@ -602,7 +602,7 @@ static ssize_t localbus_info_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* The string printed here is already newline-terminated. */ @@ -614,7 +614,7 @@ static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of user ports (contexts) available. */ @@ -630,7 +630,7 @@ static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); /* Return the number of free user ports (contexts) available. */ @@ -642,7 +642,7 @@ static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); buf[sizeof(dd->serial)] = '\0'; @@ -657,7 +657,7 @@ static ssize_t chip_reset_store(struct device *device, size_t count) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; @@ -679,7 +679,7 @@ static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = - container_of(device, struct qib_ibdev, rdi.ibdev.dev); + rdma_device_to_drv_device(device, struct qib_ibdev, rdi.ibdev); struct qib_devdata *dd = dd_from_dev(dev); int ret; int idx; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index a7e4b2ccfaf8..c85d48ae7442 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -50,7 +50,7 @@ static ssize_t board_id_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev = - container_of(device, struct usnic_ib_dev, ib_dev.dev); + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); unsigned short subsystem_device_id; mutex_lock(&us_ibdev->usdev_lock); @@ -67,14 +67,13 @@ static DEVICE_ATTR_RO(board_id); static ssize_t config_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); char *ptr; unsigned left; unsigned n; enum usnic_vnic_res_type res_type; - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); - /* Buffer space limit is 1 page */ ptr = buf; left = PAGE_SIZE; @@ -130,9 +129,8 @@ static DEVICE_ATTR_RO(config); static ssize_t iface_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%s\n", 
netdev_name(us_ibdev->netdev)); @@ -142,9 +140,8 @@ static DEVICE_ATTR_RO(iface); static ssize_t max_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%u\n", kref_read(&us_ibdev->vf_cnt)); @@ -154,10 +151,10 @@ static DEVICE_ATTR_RO(max_vf); static ssize_t qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); int qp_per_vf; - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ], us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]); @@ -169,9 +166,8 @@ static DEVICE_ATTR_RO(qp_per_vf); static ssize_t cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { - struct usnic_ib_dev *us_ibdev; - - us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); + struct usnic_ib_dev *us_ibdev = + rdma_device_to_drv_device(device, struct usnic_ib_dev, ib_dev); return scnprintf(buf, PAGE_SIZE, "%d\n", us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 43171148e9c5..3d01247a28db 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1129,8 +1129,8 @@ static int rxe_detach_mcast(struct ib_qp *ibqp, union ib_gid *mgid, u16 mlid) static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { - struct rxe_dev *rxe = container_of(device, struct rxe_dev, - ib_dev.dev); + struct rxe_dev *rxe = + rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return snprintf(buf, 16, "%s\n", rxe_parent_name(rxe, 1)); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1d1902fd9f87..94b6e1dd4dab 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4241,4 +4241,27 @@ rdma_set_device_sysfs_group(struct ib_device *dev, dev->groups[1] = group; } +/** + * rdma_device_to_ibdev - Get ib_device pointer from device pointer + * + * @device: device pointer for which ib_device pointer to retrieve + * + * rdma_device_to_ibdev() retrieves ib_device pointer from device. + * + */ +static inline struct ib_device *rdma_device_to_ibdev(struct device *device) +{ + return container_of(device, struct ib_device, dev); +} + +/** + * rdma_device_to_drv_device - Helper macro to reach back to driver's + * ib_device holder structure from device pointer. + * + * NOTE: New drivers should not make use of this API; This API is only for + * existing drivers who have exposed sysfs entries using + * rdma_set_device_sysfs_group(). + */ +#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ + container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 7527a7b157d1191b23562ed70154ae93bd65f845 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 17 Jan 2019 20:14:15 +0200 Subject: IB/core: Simplify rdma cgroup registration RDMA cgroup registration routine always returns success, so simplify the function to be void and run the clang formatter over the whole CONFIG_CGROUP_RDMA part of core_priv.h.
This reduces the error-path unwinding for regular registration and for the future net namespace change functionality for rdma devices. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Tejun Heo Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cgroup.c | 5 ++--- drivers/infiniband/core/core_priv.h | 17 +++++++++++------ drivers/infiniband/core/device.c | 8 +------- include/linux/cgroup_rdma.h | 2 +- kernel/cgroup/rdma.c | 5 +---- 5 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c index 126ac5f99db7..388fd04e5f63 100644 --- a/drivers/infiniband/core/cgroup.c +++ b/drivers/infiniband/core/cgroup.c @@ -21,12 +21,11 @@ * Register with the rdma cgroup. Should be called before * exposing rdma device to user space applications to avoid * resource accounting leak. - * Returns 0 on success or otherwise failure code. */ -int ib_device_register_rdmacg(struct ib_device *device) +void ib_device_register_rdmacg(struct ib_device *device) { device->cg_device.name = device->name; - return rdmacg_register_device(&device->cg_device); + rdmacg_register_device(&device->cg_device); } /** diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index aca75c74e451..42a49982f66e 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -115,7 +115,7 @@ void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); #ifdef CONFIG_CGROUP_RDMA -int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, @@ -126,21 +126,26 @@ void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else -static inline int ib_device_register_rdmacg(struct ib_device *device) -{ return 0; } +static inline void ib_device_register_rdmacg(struct ib_device *device) +{ +} static inline void ib_device_unregister_rdmacg(struct ib_device *device) -{ } +{ +} static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ return 0; } +{ + return 0; +} static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) -{ } +{ +} #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 4a9aa6d10c5e..200431c540f2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -599,12 +599,7 @@ int ib_register_device(struct ib_device *device, const char *name) device->index = __dev_new_index(); - ret = ib_device_register_rdmacg(device); - if (ret) { - dev_warn(&device->dev, - "Couldn't register device with rdma cgroup\n"); - goto dev_cleanup; - } + ib_device_register_rdmacg(device); ret = ib_device_register_sysfs(device); if (ret) { @@ -627,7 +622,6 @@ int ib_register_device(struct ib_device *device, const char *name) cg_cleanup: ib_device_unregister_rdmacg(device); -dev_cleanup: cleanup_device(device); out: mutex_unlock(&device_mutex); diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index e94290b29e99..ef1bae2983f3 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@
-39,7 +39,7 @@ struct rdmacg_device { * APIs for RDMA/IB stack to publish when a device wants to * participate in resource accounting */ -int rdmacg_register_device(struct rdmacg_device *device); +void rdmacg_register_device(struct rdmacg_device *device); void rdmacg_unregister_device(struct rdmacg_device *device); /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..1d75ae7f1cb7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -313,10 +313,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. */ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +322,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); -- cgit v1.2.3 From 534fd7aac56a7994d16032f32123def9923e339f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 13 Jan 2019 16:01:17 +0200 Subject: IB/mlx5: Manage indirection mkey upon DEVX flow for ODP Manage the indirection mkey in the DEVX flow to support ODP. To support a page fault event on the indirection mkey, it needs to be part of the device mkey radix tree. Both the creation and the deletion flows for a DEVX object which is an indirection mkey were adapted to handle that.
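The ordering this imposes on teardown can be condensed into a short sketch, taken from the devx_obj_cleanup() flow in the diff below:

/*
 * Teardown ordering sketch: drop the mkey from the radix tree first
 * so a page-fault handler can no longer look it up, destroy the
 * object in firmware, then free the memory only after an SRCU grace
 * period has elapsed.
 */
devx_cleanup_mkey(obj);                 /* radix_tree_delete() */
mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out));
call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, devx_free_indirect_mkey);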
Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 89 +++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx5/main.c | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++ include/linux/mlx5/driver.h | 1 + 4 files changed, 96 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index b7ff2138ac2a..bbf9a26d8fa6 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -17,12 +17,18 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +enum devx_obj_flags { + DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; + u32 flags; + struct mlx5_ib_devx_mr devx_mr; }; struct devx_umem { @@ -1011,6 +1017,36 @@ static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, } } +static int devx_handle_mkey_indirect(struct devx_obj *obj, + struct mlx5_ib_dev *dev, + void *in, void *out) +{ + struct mlx5_mkey_table *table = &dev->mdev->priv.mkey_table; + struct mlx5_ib_devx_mr *devx_mr = &obj->devx_mr; + unsigned long flags; + struct mlx5_core_mkey *mkey; + void *mkc; + u8 key; + int err; + + mkey = &devx_mr->mmkey; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + key = MLX5_GET(mkc, mkc, mkey_7_0); + mkey->key = mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, out, mkey_index)) | key; + mkey->type = MLX5_MKEY_INDIRECT_DEVX; + mkey->iova = MLX5_GET64(mkc, mkc, start_addr); + mkey->size = MLX5_GET64(mkc, mkc, len); + mkey->pd = MLX5_GET(mkc, mkc, pd); + devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); + + write_lock_irqsave(&table->lock, flags); + err = radix_tree_insert(&table->tree, mlx5_base_mkey(mkey->key), + mkey); + write_unlock_irqrestore(&table->lock, flags); + return err; +} + static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, struct devx_obj *obj, void *in, int in_len) @@ -1030,13 +1066,45 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, access_mode |= MLX5_GET(mkc, mkc, access_mode_4_2) << 2; if (access_mode == MLX5_MKC_ACCESS_MODE_KLMS || - access_mode == MLX5_MKC_ACCESS_MODE_KSM) + access_mode == MLX5_MKC_ACCESS_MODE_KSM) { + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + obj->flags |= DEVX_OBJ_FLAGS_INDIRECT_MKEY; return 0; + } MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); return 0; } +static void devx_free_indirect_mkey(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct devx_obj, devx_mr.rcu)); +} + +/* This function to delete from the radix tree needs to be called before + * destroying the underlying mkey. Otherwise a race might occur in case that + * other thread will get the same mkey before this one will be deleted, + * in that case it will fail via inserting to the tree its own data. + * + * Note: + * An error in the destroy is not expected unless there is some other indirect + * mkey which points to this one. In a kernel cleanup flow it will be just + * destroyed in the iterative destruction call. In a user flow, in case + * the application didn't close in the expected order it's its own problem, + * the mkey won't be part of the tree, in both cases the kernel is safe. 
+ */ +static void devx_cleanup_mkey(struct devx_obj *obj) +{ + struct mlx5_mkey_table *table = &obj->mdev->priv.mkey_table; + struct mlx5_core_mkey *del_mkey; + unsigned long flags; + + write_lock_irqsave(&table->lock, flags); + del_mkey = radix_tree_delete(&table->tree, + mlx5_base_mkey(obj->devx_mr.mmkey.key)); + write_unlock_irqrestore(&table->lock, flags); +} + static int devx_obj_cleanup(struct ib_uobject *uobject, enum rdma_remove_reason why) { @@ -1044,10 +1112,21 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, struct devx_obj *obj = uobject->object; int ret; + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) + devx_cleanup_mkey(obj); + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { + struct mlx5_ib_dev *dev = to_mdev(uobject->context->device); + + call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu, + devx_free_indirect_mkey); + return ret; + } + kfree(obj); return ret; } @@ -1108,6 +1187,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) { + err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out); + if (err) + goto obj_destroy; + } + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) goto obj_destroy; @@ -1116,6 +1201,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( return 0; obj_destroy: + if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) + devx_cleanup_mkey(obj); mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); obj_free: kfree(obj); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 61064b7171fc..ae00f994673b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5724,6 +5724,7 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + srcu_barrier(&dev->mr_srcu); cleanup_srcu_struct(&dev->mr_srcu); drain_workqueue(dev->advise_mr_wq); destroy_workqueue(dev->advise_mr_wq); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b0a37ca2a714..819207190343 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -602,6 +602,12 @@ struct mlx5_ib_mw { int ndescs; }; +struct mlx5_ib_devx_mr { + struct mlx5_core_mkey mmkey; + int ndescs; + struct rcu_head rcu; +}; + struct mlx5_ib_umr_context { struct ib_cqe cqe; enum ib_wc_status status; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b6f5839f129a..619d6fee96a1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -364,6 +364,7 @@ struct mlx5_core_sig_ctx { enum { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, + MLX5_MKEY_INDIRECT_DEVX, }; struct mlx5_core_mkey { -- cgit v1.2.3 From da6a496a34f2fdcab14362cdc5068aac385e7b47 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 09:16:08 +0200 Subject: IB/mlx5: Ranges in implicit ODP MR inherit its write access A sub-range in an implicit ODP MR should take its write permission from the MR and not always be set to allow writes; otherwise, sub-ranges created under a read-only MR would be mapped writable.
Fixes: d07d1d70ce1a ("IB/umem: Update on demand page (ODP) support") Signed-off-by: Moni Shoua Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/rdma/ib_umem_odp.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index dc1d7cb15cfa..eb8a5eb65bfa 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -332,9 +332,10 @@ static void put_per_mm(struct ib_umem_odp *umem_odp) mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm = root->per_mm; struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; @@ -349,7 +350,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; - umem->writable = 1; + umem->writable = root->umem.writable; umem->is_odp = 1; odp_data->per_mm = per_mm; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3abdfc3584c0..3e0d5885c026 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -439,7 +439,7 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr->per_mm, addr, + odp = ib_alloc_odp_umem(odp_mr, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index d3725cf13ecd..d0024f53626e 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -116,7 +116,7 @@ struct ib_ucontext_per_mm { }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); -- cgit v1.2.3 From 61b2fe3c62e5269408e264b2348f96467246d537 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 09:16:09 +0200 Subject: IB/mlx5: Remove dead code When CONFIG_INFINIBAND_ON_DEMAND_PAGING is not set, there is no caller of ib_alloc_odp_umem(), so let's remove it.
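A sketch of the idiom the series relies on, as a generic illustration rather than a quote from the tree:

/*
 * Sketch, assuming a guarded call site: IS_ENABLED() lets the
 * compiler discard the dead branch at build time, so once every
 * caller is either guarded like this or only built with ODP enabled,
 * the !CONFIG stub returning ERR_PTR(-EINVAL) has no users left and
 * can be deleted.
 */
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
        odp = ib_alloc_odp_umem(odp_mr, addr, MLX5_IMR_MTT_SIZE);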
Signed-off-by: Moni Shoua Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_umem_odp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index d0024f53626e..dadc96dea39c 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -169,12 +169,6 @@ static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) return -EINVAL; } -static inline struct ib_umem_odp * -ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) -{ - return ERR_PTR(-EINVAL); -} - static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -- cgit v1.2.3 From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:56 +0200 Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation. This object is of the FD type class and will be used to read DEVX async command completions. The core layer should allow the driver to create FD-type objects in a safe mode; this option was added with a matching comment in place. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 1 + drivers/infiniband/core/uverbs_uapi.c | 15 ++++-- drivers/infiniband/hw/mlx5/devx.c | 83 ++++++++++++++++++++++++++++++++ include/rdma/uverbs_types.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 ++++ 5 files changed, 104 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6c4747e61d2b..a260d2f8e0b7 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -801,6 +801,7 @@ void uverbs_close_fd(struct file *f) /* Pairs with filp->private_data in alloc_begin_fd_uobject */ uverbs_uobject_put(uobj); } +EXPORT_SYMBOL(uverbs_close_fd); /* * Drop the ucontext off the ufile and completely disconnect it from the diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 9ae08e4b78a3..7a987acf0c0b 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -188,13 +188,18 @@ static int uapi_merge_obj_tree(struct uverbs_api *uapi, obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* - * Today drivers are only permitted to use idr_class - * types. They cannot use FD types because we currently have - * no way to revoke the fops pointer after device - * disassociation. + * Today drivers are only permitted to use idr_class and + * fd_class types. We can revoke the IDR types during + * disassociation, and the FD types require the driver to use + * struct file_operations.owner to prevent the driver module + * code from unloading while the file is open. This provides + * enough safety that uverbs_close_fd() will continue to work. + * Drivers using FD are responsible to handle disassociation of + * the device on their own.
*/ if (WARN_ON(is_driver && - obj->type_attrs->type_class != &uverbs_idr_class)) + obj->type_attrs->type_class != &uverbs_idr_class && + obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 5a588f3cfb1b..9933bcf83a6b 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1168,6 +1168,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( cmd_out, cmd_out_len); } +struct devx_async_event_queue { + spinlock_t lock; + wait_queue_head_t poll_wait; + struct list_head event_list; +}; + +struct devx_async_cmd_event_file { + struct ib_uobject uobj; + struct devx_async_event_queue ev_queue; +}; + +static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct devx_async_cmd_event_file *ev_file; + + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE); + + ev_file = container_of(uobj, struct devx_async_cmd_event_file, + uobj); + devx_init_event_queue(&ev_file->ev_queue); + return 0; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1313,6 +1345,38 @@ static int devx_umem_cleanup(struct ib_uobject *uobject, return 0; } +static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + return -EINVAL; +} + +static int devx_async_cmd_event_close(struct inode *inode, struct file *filp) +{ + uverbs_close_fd(filp); + return 0; +} + +static __poll_t devx_async_cmd_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return 0; +} + +const struct file_operations devx_async_cmd_event_fops = { + .owner = THIS_MODULE, + .read = devx_async_cmd_event_read, + .poll = devx_async_cmd_event_poll, + .release = devx_async_cmd_event_close, + .llseek = no_llseek, +}; + +static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +}; + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_DEVX_UMEM_REG, UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, @@ -1440,6 +1504,22 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC, + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file), + devx_hot_unplug_async_cmd_event_file, + &devx_async_cmd_event_fops, "[devx_async_cmd]", + O_RDONLY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)); + static bool devx_is_supported(struct ib_device *device) { struct mlx5_ib_dev *dev = to_mdev(device); @@ -1457,5 +1537,8 @@ const struct uapi_definition mlx5_ib_devx_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_DEVX_UMEM, UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UAPI_DEF_IS_OBJ_SUPPORTED(devx_is_supported)), {}, }; diff --git a/include/rdma/uverbs_types.h 
b/include/rdma/uverbs_types.h index acb1bfa3cc99..175d761695e1 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -157,6 +157,7 @@ struct uverbs_obj_fd_type { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; +void uverbs_close_fd(struct file *f); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b8d121d457f1..6ceae29d77cd 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods { MLX5_IB_METHOD_DEVX_UMEM_DEREG, }; +enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_cmd_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, }; enum mlx5_ib_flow_matcher_create_attrs { -- cgit v1.2.3 From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:57 +0200 Subject: IB/mlx5: Introduce async DEVX obj query API Introduce an async DEVX obj query API to get the command response back to user space once it's ready, without blocking on the firmware call. The event's data includes a header with some metadata, followed by the firmware output command data. The header includes: - The input 'wr_id' to let the application recognize the response. The input FD attribute identifies where the event data should be made available. Downstream patches from this series will implement the file ops to let the application read it.
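A rough sketch of how user space might consume these events once the read() file op lands (the FD origin and buffer sizing are assumptions, not a final rdma-core API):

	/* Each event starts with the wr_id header and is followed by the
	 * raw firmware output of the length requested at submission time.
	 */
	struct {
		struct mlx5_ib_uapi_devx_async_cmd_hdr hdr;
		__u8 out_data[512];	/* hypothetical out_len */
	} ev;
	ssize_t n = read(async_fd, &ev, sizeof(ev));

	if (n >= (ssize_t)sizeof(ev.hdr))
		printf("completion for wr_id %llu\n",
		       (unsigned long long)ev.hdr.wr_id);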
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 164 +++++++++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 ++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 + 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 9933bcf83a6b..9ca116155f9c 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -17,6 +18,16 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +struct devx_async_data { + struct mlx5_ib_dev *mdev; + struct list_head list; + struct ib_uobject *fd_uobj; + struct mlx5_async_work cb_work; + u16 cmd_out_len; + /* must be last field in this structure */ + struct mlx5_ib_uapi_devx_async_cmd_hdr hdr; +}; + #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; @@ -1172,11 +1183,13 @@ struct devx_async_event_queue { spinlock_t lock; wait_queue_head_t poll_wait; struct list_head event_list; + atomic_t bytes_in_use; }; struct devx_async_cmd_event_file { struct ib_uobject uobj; struct devx_async_event_queue ev_queue; + struct mlx5_async_ctx async_ctx; }; static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) @@ -1184,6 +1197,7 @@ static void devx_init_event_queue(struct devx_async_event_queue *ev_queue) spin_lock_init(&ev_queue->lock); INIT_LIST_HEAD(&ev_queue->event_list); init_waitqueue_head(&ev_queue->poll_wait); + atomic_set(&ev_queue->bytes_in_use, 0); } static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( @@ -1193,13 +1207,123 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)( struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); ev_file = container_of(uobj, struct devx_async_cmd_event_file, uobj); devx_init_event_queue(&ev_file->ev_queue); + mlx5_cmd_init_async_ctx(mdev->mdev, &ev_file->async_ctx); return 0; } +static void devx_query_callback(int status, struct mlx5_async_work *context) +{ + struct devx_async_data *async_data = + container_of(context, struct devx_async_data, cb_work); + struct ib_uobject *fd_uobj = async_data->fd_uobj; + struct devx_async_cmd_event_file *ev_file; + struct devx_async_event_queue *ev_queue; + unsigned long flags; + + ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file, + uobj); + ev_queue = &ev_file->ev_queue; + + spin_lock_irqsave(&ev_queue->lock, flags); + list_add_tail(&async_data->list, &ev_queue->event_list); + spin_unlock_irqrestore(&ev_queue->lock, flags); + + wake_up_interruptible(&ev_queue->poll_wait); + fput(fd_uobj->object); +} + +#define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */ + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( + struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN); + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE); + u16 cmd_out_len; + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct ib_uobject *fd_uobj; + int err; + int uid; + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + struct devx_async_cmd_event_file *ev_file; + struct devx_async_data *async_data; + + uid = devx_get_uid(c, cmd_in); + 
if (uid < 0) + return uid; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + err = uverbs_get_const(&cmd_out_len, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN); + if (err) + return err; + + if (!devx_is_valid_obj_id(uobj, cmd_in)) + return -EINVAL; + + fd_uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD); + if (IS_ERR(fd_uobj)) + return PTR_ERR(fd_uobj); + + ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file, + uobj); + + if (atomic_add_return(cmd_out_len, &ev_file->ev_queue.bytes_in_use) > + MAX_ASYNC_BYTES_IN_USE) { + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return -EAGAIN; + } + + async_data = kvzalloc(struct_size(async_data, hdr.out_data, + cmd_out_len), GFP_KERNEL); + if (!async_data) { + err = -ENOMEM; + goto sub_bytes; + } + + err = uverbs_copy_from(&async_data->hdr.wr_id, attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID); + if (err) + goto free_async; + + async_data->cmd_out_len = cmd_out_len; + async_data->mdev = mdev; + async_data->fd_uobj = fd_uobj; + + get_file(fd_uobj->object); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in, + uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN), + async_data->hdr.out_data, + async_data->cmd_out_len, + devx_query_callback, &async_data->cb_work); + + if (err) + goto cb_err; + + return 0; + +cb_err: + fput(fd_uobj->object); +free_async: + kvfree(async_data); +sub_bytes: + atomic_sub(cmd_out_len, &ev_file->ev_queue.bytes_in_use); + return err; +} + static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, struct uverbs_attr_bundle *attrs, struct devx_umem *obj) @@ -1353,6 +1477,17 @@ static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, static int devx_async_cmd_event_close(struct inode *inode, struct file *filp) { + struct ib_uobject *uobj = filp->private_data; + struct devx_async_cmd_event_file *comp_ev_file = container_of( + uobj, struct devx_async_cmd_event_file, uobj); + struct devx_async_data *entry, *tmp; + + spin_lock_irq(&comp_ev_file->ev_queue.lock); + list_for_each_entry_safe(entry, tmp, + &comp_ev_file->ev_queue.event_list, list) + kvfree(entry); + spin_unlock_irq(&comp_ev_file->ev_queue.lock); + uverbs_close_fd(filp); return 0; } @@ -1374,6 +1509,11 @@ const struct file_operations devx_async_cmd_event_fops = { static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { + struct devx_async_cmd_event_file *comp_ev_file = + container_of(uobj, struct devx_async_cmd_event_file, + uobj); + + mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx); return 0; }; @@ -1487,6 +1627,27 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + UVERBS_IDR_ANY_OBJECT, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, + u16, UA_MANDATORY), + UVERBS_ATTR_FD(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + 
DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), @@ -1497,7 +1658,8 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY), &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY), - &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY)); + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup), diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ceae29d77cd..8149d224030b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, }; +enum mlx5_ib_devx_obj_query_async_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, @@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods { MLX5_IB_METHOD_DEVX_OBJ_DESTROY, MLX5_IB_METHOD_DEVX_OBJ_MODIFY, MLX5_IB_METHOD_DEVX_OBJ_QUERY, + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, }; enum mlx5_ib_devx_umem_reg_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 4ef62c0e8452..4a701033b93f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +struct mlx5_ib_uapi_devx_async_cmd_hdr { + __aligned_u64 wr_id; + __u8 out_data[]; +}; + #endif -- cgit v1.2.3 From 0b5cb3300ae59ed7e93b465dfa2384a6a4df8eb4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Jan 2019 10:25:20 -0800 Subject: RDMA/srp: Increase max_segment_size The default behavior of the SCSI core is to set the block layer request queue parameter max_segment_size to 64 KB. That means that elements of scatterlists are limited to 64 KB. Since RDMA adapters support larger sizes, increase max_segment_size for the SRP initiator. Notes: - The SCSI max_segment_size parameter was introduced in kernel v5.0. See also commit 50c2e9107f17 ("scsi: introduce a max_segment_size host_template parameters"). - Some other block drivers already set max_segment_size to UINT_MAX, e.g. nbd and rbd. 
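A one-line sketch of the same idea for any RDMA ULP that registers a SCSI host ('shost' and 'ib_dev' stand in for the ULP's own variables):

	/* Cap SG element size at what the device can handle in a single
	 * DMA transfer instead of inheriting the SCSI core's 64 KB default.
	 */
	shost->max_segment_size = ib_dma_max_seg_size(ib_dev);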
Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 1 + include/rdma/ib_verbs.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 0f855294ff3f..29fe46dbdbf1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3823,6 +3823,7 @@ static ssize_t srp_create_target(struct device *dev, target_host->max_id = 1; target_host->max_lun = -1LL; target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; + target_host->max_segment_size = ib_dma_max_seg_size(ibdev); target = host_to_target(target_host); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 94b6e1dd4dab..71ea144ec823 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3715,6 +3715,19 @@ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, return sg_dma_len(sg); } +/** + * ib_dma_max_seg_size - Return the size limit of a single DMA transfer + * @dev: The device to query + * + * The returned value represents a size in bytes. + */ +static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) +{ + struct device_dma_parameters *p = dev->dma_device->dma_parms; + + return p ? p->max_segment_size : UINT_MAX; +} + /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created -- cgit v1.2.3 From 459cc69fa4c17caf21de596693d8a07170820a58 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:49:11 +0200 Subject: RDMA: Provide safe ib_alloc_device() function All callers to ib_alloc_device() provide a larger size than struct ib_device and rely on the fact that struct ib_device is embedded in their driver specific structure as the first member. Provide a safer variant of ib_alloc_device() that checks and enforces this approach to make sure the drivers are using it right. 
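The conversion pattern applied throughout the series looks like this ('struct my_dev' is a made-up driver structure; the real hunks follow below):

	struct my_dev {
		struct ib_device ibdev;	/* must be the first member */
		/* ... driver-private state ... */
	};

	/* The macro checks at build time that 'ibdev' sits at offset 0. */
	struct my_dev *dev = ib_alloc_device(my_dev, ibdev);

	if (!dev)
		return -ENOMEM;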
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 6 +++--- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch.c | 2 +- drivers/infiniband/hw/cxgb4/device.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx5/ib_rep.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_main.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rdmavt/vt.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- include/rdma/ib_verbs.h | 8 +++++++- 19 files changed, 27 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 200431c540f2..b511cfa00bdb 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -268,7 +268,7 @@ static struct class ib_class = { }; /** - * ib_alloc_device - allocate an IB device struct + * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct @@ -277,7 +277,7 @@ static struct class ib_class = { * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; @@ -303,7 +303,7 @@ struct ib_device *ib_alloc_device(size_t size) return device; } -EXPORT_SYMBOL(ib_alloc_device); +EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 16eecfa5882c..08777506d0b1 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -688,7 +688,7 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, struct bnxt_re_dev *rdev; /* Allocate bnxt_re_dev instance here */ - rdev = (struct bnxt_re_dev *)ib_alloc_device(sizeof(*rdev)); + rdev = ib_alloc_device(bnxt_re_dev, ibdev); if (!rdev) { dev_err(NULL, "%s: bnxt_re_dev allocation failure!", ROCE_DRV_MODULE_NAME); diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c index 591de319c178..fb03bc492ef7 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.c +++ b/drivers/infiniband/hw/cxgb3/iwch.c @@ -146,7 +146,7 @@ static void open_rnic_dev(struct t3cdev *tdev) pr_debug("%s t3cdev %p\n", __func__, tdev); pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION); - rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp)); + rnicp = ib_alloc_device(iwch_dev, ibdev); if (!rnicp) { pr_err("Cannot allocate ib device\n"); return; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 9c10fff6dcfb..4b4e2464b705 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -966,7 +966,7 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop) pr_info("%s: On-Chip Queues not supported on this device\n", pci_name(infop->pdev)); - devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp)); + devp = ib_alloc_device(c4iw_dev, 
ibdev); if (!devp) { pr_err("Cannot allocate ib device\n"); return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index b74c742b000c..fa08c22aad66 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -5002,7 +5002,7 @@ static int hns_roce_probe(struct platform_device *pdev) struct hns_roce_dev *hr_dev; struct device *dev = &pdev->dev; - hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev)); + hr_dev = ib_alloc_device(hns_roce_dev, ib_dev); if (!hr_dev) return -ENOMEM; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5c483b437bdd..48a5d6548cd4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -6059,7 +6059,7 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle) struct hns_roce_dev *hr_dev; int ret; - hr_dev = (struct hns_roce_dev *)ib_alloc_device(sizeof(*hr_dev)); + hr_dev = ib_alloc_device(hns_roce_dev, ib_dev); if (!hr_dev) return -ENOMEM; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 12b31a8440be..d4ab46dd9e6c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2762,7 +2762,7 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev struct net_device *netdev = iwdev->netdev; struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context; - iwibdev = (struct i40iw_ib_device *)ib_alloc_device(sizeof(*iwibdev)); + iwibdev = ib_alloc_device(i40iw_ib_device, ibdev); if (!iwibdev) { i40iw_pr_err("iwdev == NULL\n"); return NULL; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index dc2ffd293a11..d66002a31000 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2635,7 +2635,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (num_ports == 0) return NULL; - ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); + ibdev = ib_alloc_device(mlx4_ib_dev, ib_dev); if (!ibdev) { dev_err(&dev->persist->pdev->dev, "Device struct alloc failed\n"); diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 46a9ddc8ca56..6d7b8bad4b61 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -70,7 +70,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { struct mlx5_ib_dev *ibdev; - ibdev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*ibdev)); + ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!ibdev) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 96bf1b2f9dd7..8161acda64e6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6508,7 +6508,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) return mlx5_ib_add_slave_port(mdev); - dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + dev = ib_alloc_device(mlx5_ib_dev, ib_dev); if (!dev) return NULL; diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 92c49bff22bc..fe9654a7af71 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -961,7 +961,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int 
hca_type) /* We can handle large RDMA requests, so allow larger segments. */ dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev); + mdev = ib_alloc_device(mthca_dev, ib_dev); if (!mdev) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 034156f7e9ed..6eb991d40035 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3669,7 +3669,7 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) struct nes_vnic *nesvnic = netdev_priv(netdev); struct nes_device *nesdev = nesvnic->nesdev; - nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device)); + nesibdev = ib_alloc_device(nes_ib_device, ibdev); if (nesibdev == NULL) { return NULL; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index b0491b9ecfe4..88970a6bb555 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -297,7 +297,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) u8 lstate = 0; struct ocrdma_dev *dev; - dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev)); + dev = ib_alloc_device(ocrdma_dev, ibdev); if (!dev) { pr_err("Unable to allocate ib device\n"); return NULL; diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index f85e72b65a10..878e9e23652b 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -853,7 +853,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, struct qedr_dev *dev; int rc = 0; - dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev)); + dev = ib_alloc_device(qedr_dev, ibdev); if (!dev) { pr_err("Unable to allocate ib device\n"); return NULL; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 3201dd1899c7..0c0a288cb585 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -372,7 +372,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) usnic_dbg("\n"); netdev = pci_get_drvdata(dev); - us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev)); + us_ibdev = ib_alloc_device(usnic_ib_dev, ib_dev); if (!us_ibdev) { usnic_err("Device %s context alloc failed\n", netdev_name(pci_get_drvdata(dev))); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5f02276d903..e582beaf9430 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -795,7 +795,7 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, dev_dbg(&pdev->dev, "initializing driver %s\n", pci_name(pdev)); /* Allocate zero-out device */ - dev = (struct pvrdma_dev *)ib_alloc_device(sizeof(*dev)); + dev = ib_alloc_device(pvrdma_dev, ib_dev); if (!dev) { dev_err(&pdev->dev, "failed to allocate IB device\n"); return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 7de7389d0235..b3f0c5578925 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -91,7 +91,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) { struct rvt_dev_info *rdi; - rdi = (struct rvt_dev_info *)ib_alloc_device(size); + rdi = container_of(_ib_alloc_device(size), 
struct rvt_dev_info, ibdev); if (!rdi) return rdi; diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 8fd03ae20efc..19f3c69916b1 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -555,7 +555,7 @@ struct rxe_dev *rxe_net_add(struct net_device *ndev) int err; struct rxe_dev *rxe = NULL; - rxe = (struct rxe_dev *)ib_alloc_device(sizeof(*rxe)); + rxe = ib_alloc_device(rxe_dev, ib_dev); if (!rxe) return NULL; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 71ea144ec823..a1a1e710642c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2621,7 +2621,13 @@ struct ib_client { struct list_head list; }; -struct ib_device *ib_alloc_device(size_t size); +struct ib_device *_ib_alloc_device(size_t size); +#define ib_alloc_device(drv_struct, member) \ + container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct drv_struct, member))), \ + struct drv_struct, member) + void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -- cgit v1.2.3 From 6780c4fa9d6e091b2f206ac429a40e2e8d2e45f3 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 22 Jan 2019 10:08:22 +0200 Subject: RDMA: Add indication for in kernel API support to IB device Drivers that do not provide kernel verbs support should not be used by ib kernel clients at all. If a device does not implement all the verbs mandatory for kverbs usage, mark it as a non-kverbs provider and prevent its usage by all clients except uverbs. The device is marked as a non-kverbs provider using the 'kverbs_provider' flag, which should only be set by the core code. Clients can choose whether kverbs are required for their usage using the 'no_kverbs_req' flag, which is currently set for uverbs only. This patch allows drivers to remove mandatory verbs stubs and simply set the callbacks to NULL. The IB device will be registered as a non-kverbs provider. Note that verbs that are required for the device registration process must be implemented.
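A sketch of a client that opts out of the kverbs requirement, modeled on the uverbs hunk below (the names are hypothetical):

	static struct ib_client my_client = {
		.name          = "my_client",
		.add           = my_add_one,
		.remove        = my_remove_one,
		/* Bind even to devices that have kverbs_provider == false. */
		.no_kverbs_req = true,
	};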
Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 10 ++++++---- drivers/infiniband/core/uverbs_main.c | 1 + include/rdma/ib_verbs.h | 5 +++++ 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index b511cfa00bdb..9d2e108235e9 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -121,13 +121,12 @@ static int ib_device_check_mandatory(struct ib_device *device) }; int i; + device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { - dev_warn(&device->dev, - "Device is missing mandatory function %s\n", - mandatory_table[i].name); - return -EINVAL; + device->kverbs_provider = false; + break; } } @@ -325,6 +324,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client { struct ib_client_data *context; + if (!device->kverbs_provider && !client->no_kverbs_req) + return -EOPNOTSUPP; + context = kmalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 996f167d1436..d628747e058c 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1151,6 +1151,7 @@ static const struct file_operations uverbs_mmap_fops = { static struct ib_client uverbs_client = { .name = "uverbs", + .no_kverbs_req = true, .add = ib_uverbs_add_one, .remove = ib_uverbs_remove_one }; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1a1e710642c..4183a03b46b5 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2565,6 +2565,8 @@ struct ib_device { __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; + /* Indicates kernel verbs support, should not be used in drivers */ + u16 kverbs_provider:1; u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; @@ -2619,6 +2621,9 @@ struct ib_client { const struct sockaddr *addr, void *client_data); struct list_head list; + + /* kverbs are not required by the client */ + u8 no_kverbs_req:1; }; struct ib_device *_ib_alloc_device(size_t size); -- cgit v1.2.3 From 0ad699c0edc97a864177679dd67f2ccd73b07cb7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:48:58 +0200 Subject: RDMA/core: Simplify restrack interface In the current implementation, we have one restrack root per-device and all users are simply providing it directly. Let's simplify the interface and have callers provide the ib_device and internally access the restrack_root. 
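The resulting call-site change, taken from the nldev hunk below:

	/* Before: callers reached into the device for the restrack root. */
	curr = rdma_restrack_count(res, i, task_active_pid_ns(current));

	/* After: callers pass the ib_device and the root stays internal. */
	curr = rdma_restrack_count(device, i, task_active_pid_ns(current));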
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 4 ++-- drivers/infiniband/core/nldev.c | 4 ++-- drivers/infiniband/core/restrack.c | 26 +++++++++++++++++++++----- include/rdma/restrack.h | 23 ++++------------------- 4 files changed, 29 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9d2e108235e9..919e94ff4b25 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -287,7 +287,7 @@ struct ib_device *_ib_alloc_device(size_t size) if (!device) return NULL; - rdma_restrack_init(&device->res); + rdma_restrack_init(device); device->dev.class = &ib_class; device_initialize(&device->dev); @@ -315,7 +315,7 @@ void ib_dealloc_device(struct ib_device *device) WARN_ON(!list_empty(&device->client_data_list)); WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); - rdma_restrack_clean(&device->res); + rdma_restrack_clean(device); put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 1742ff4fbf79..ee98fc9058b1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -314,7 +314,6 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) [RDMA_RESTRACK_CTX] = "ctx", }; - struct rdma_restrack_root *res = &device->res; struct nlattr *table_attr; int ret, i, curr; @@ -328,7 +327,8 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device) for (i = 0; i < RDMA_RESTRACK_MAX; i++) { if (!names[i]) continue; - curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + curr = rdma_restrack_count(device, i, + task_active_pid_ns(current)); ret = fill_res_info_entry(msg, names[i], curr); if (ret) goto err; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 46a5c553c624..0ade3da0a5c7 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -18,8 +18,14 @@ static int fill_res_noop(struct sk_buff *msg, return 0; } -void rdma_restrack_init(struct rdma_restrack_root *res) +/** + * rdma_restrack_init() - initialize resource tracking + * @dev: IB device + */ +void rdma_restrack_init(struct ib_device *dev) { + struct rdma_restrack_root *res = &dev->res; + init_rwsem(&res->rwsem); res->fill_res_entry = fill_res_noop; } @@ -38,11 +44,15 @@ static const char *type2str(enum rdma_restrack_type type) return names[type]; }; -void rdma_restrack_clean(struct rdma_restrack_root *res) +/** + * rdma_restrack_clean() - clean resource tracking + * @dev: IB device + */ +void rdma_restrack_clean(struct ib_device *dev) { + struct rdma_restrack_root *res = &dev->res; struct rdma_restrack_entry *e; char buf[TASK_COMM_LEN]; - struct ib_device *dev; const char *owner; int bkt; @@ -72,10 +82,16 @@ void rdma_restrack_clean(struct rdma_restrack_root *res) pr_err("restrack: %s", CUT_HERE); } -int rdma_restrack_count(struct rdma_restrack_root *res, - enum rdma_restrack_type type, +/** + * rdma_restrack_count() - the current usage of specific object + * @dev: IB device + * @type: actual type of object to operate + * @ns: PID namespace + */ +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns) { + struct rdma_restrack_root *res = &dev->res; struct rdma_restrack_entry *e; u32 cnt = 0; diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 
8f179be9d9a9..f756fc48eee5 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -49,6 +49,7 @@ enum rdma_restrack_type { }; #define RDMA_RESTRACK_HASH_BITS 8 +struct ib_device; struct rdma_restrack_entry; /** @@ -122,25 +123,9 @@ struct rdma_restrack_entry { bool user; }; -/** - * rdma_restrack_init() - initialize resource tracking - * @res: resource tracking root - */ -void rdma_restrack_init(struct rdma_restrack_root *res); - -/** - * rdma_restrack_clean() - clean resource tracking - * @res: resource tracking root - */ -void rdma_restrack_clean(struct rdma_restrack_root *res); - -/** - * rdma_restrack_count() - the current usage of specific object - * @res: resource entry - * @type: actual type of object to operate - * @ns: PID namespace - */ -int rdma_restrack_count(struct rdma_restrack_root *res, +void rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns); -- cgit v1.2.3 From 02da37509705d3ba6a58fe4799a0caf6b4baecb0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:49:02 +0200 Subject: RDMA/core: Use the ops infrastructure to keep all callbacks in one place As preparation to hide rdma_restrack_root, refactor the code to use the ops structure instead of a special callback which is hidden in rdma_restrack_root. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/nldev.c | 28 ++++++++++++++++++---------- drivers/infiniband/core/restrack.c | 7 ------- drivers/infiniband/hw/cxgb4/provider.c | 2 +- include/rdma/ib_verbs.h | 5 +++++ include/rdma/restrack.h | 7 ------- 6 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 919e94ff4b25..b9f725df4195 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1263,6 +1263,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, fill_res_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ee98fc9058b1..25a248847575 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -361,11 +361,19 @@ static int fill_res_name_pid(struct sk_buff *msg, return 0; } +static bool fill_res_entry(struct ib_device *dev, struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + if (!dev->ops.fill_res_entry) + return false; + return dev->ops.fill_res_entry(msg, res); +} + static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); - struct rdma_restrack_root *resroot = &qp->device->res; + struct ib_device *dev = qp->device; struct ib_qp_init_attr qp_init_attr; struct nlattr *entry_attr; struct ib_qp_attr qp_attr; @@ -415,7 +423,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; nla_nest_end(msg, entry_attr); @@ -432,7 +440,7 @@ static int 
fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); - struct rdma_restrack_root *resroot = &id_priv->id.device->res; + struct ib_device *dev = id_priv->id.device; struct rdma_cm_id *cm_id = &id_priv->id; struct nlattr *entry_attr; @@ -474,7 +482,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; nla_nest_end(msg, entry_attr); @@ -490,7 +498,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); - struct rdma_restrack_root *resroot = &cq->device->res; + struct ib_device *dev = cq->device; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); @@ -511,7 +519,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; nla_nest_end(msg, entry_attr); @@ -527,7 +535,7 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); - struct rdma_restrack_root *resroot = &mr->pd->device->res; + struct ib_device *dev = mr->pd->device; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); @@ -548,7 +556,7 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; nla_nest_end(msg, entry_attr); @@ -564,7 +572,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); - struct rdma_restrack_root *resroot = &pd->device->res; + struct ib_device *dev = pd->device; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); @@ -591,7 +599,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, if (fill_res_name_pid(msg, res)) goto err; - if (resroot->fill_res_entry(msg, res)) + if (fill_res_entry(dev, msg, res)) goto err; nla_nest_end(msg, entry_attr); diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bd7770ed4174..f80b37d437ac 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -12,12 +12,6 @@ #include "cma_priv.h" -static int fill_res_noop(struct sk_buff *msg, - struct rdma_restrack_entry *entry) -{ - return 0; -} - /** * rdma_restrack_init() - initialize resource tracking * @dev: IB device @@ -27,7 +21,6 @@ void rdma_restrack_init(struct ib_device *dev) struct rdma_restrack_root *res = &dev->res; init_rwsem(&res->rwsem); - res->fill_res_entry = fill_res_noop; } static const char *type2str(enum rdma_restrack_type type) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index f977f8e7e162..cb5b713bbf39 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -549,6 +549,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .destroy_cq = c4iw_destroy_cq, .destroy_qp = c4iw_destroy_qp, .destroy_srq 
= c4iw_destroy_srq, + .fill_res_entry = fill_res_entry, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = c4iw_get_dma_mr, .get_hw_stats = c4iw_get_mib, @@ -629,7 +630,6 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; dev->ibdev.iwcm->get_qp = c4iw_get_qp; - dev->ibdev.res.fill_res_entry = fill_res_entry; memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4183a03b46b5..5fc3be884444 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2510,6 +2510,11 @@ struct ib_device_ops { */ int (*init_port)(struct ib_device *device, u8 port_num, struct kobject *port_sysfs); + /** + * Allows rdma drivers to add their own restrack attributes. + */ + int (*fill_res_entry)(struct sk_buff *msg, + struct rdma_restrack_entry *entry); }; struct ib_device { diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index f756fc48eee5..cc66cc7a11d3 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -65,13 +65,6 @@ struct rdma_restrack_root { * @hash: global database for all resources per-device */ DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); - /** - * @fill_res_entry: driver-specific fill function - * - * Allows rdma drivers to add their own restrack attributes. - */ - int (*fill_res_entry)(struct sk_buff *msg, - struct rdma_restrack_entry *entry); }; /** -- cgit v1.2.3 From ddf922c31fedd19c5b89a269c35e5c8b68c64327 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:01 -0800 Subject: IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue The OPFN protocol uses the COMPARE_SWAP request to exchange data between the requester and the responder and therefore needs to be stored in the QP's s_ack_queue when the request is received on the responder side. However, because the user does not know anything about the OPFN protocol, this extra entry in the queue cannot be advertised to the user. This patch adds an extra entry in a QP's s_ack_queue. Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 13 +++++++------ drivers/infiniband/hw/hfi1/verbs.c | 1 + include/rdma/rdma_vt.h | 10 +++++++++- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 940e9236c328..8970fc7ffd4b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -122,7 +122,8 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. 
*/ - if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + if (++qp->s_tail_ack_queue > + rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) qp->s_tail_ack_queue = 0; /* FALLTHROUGH */ case OP(SEND_ONLY): @@ -1818,7 +1819,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, if (i) prev = i - 1; else - prev = HFI1_MAX_RDMA_ATOMIC; + prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); if (prev == qp->r_head_ack_queue) { e = NULL; break; @@ -1942,7 +1943,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) unsigned next; next = n + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); @@ -2298,8 +2299,8 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) goto nack_inv; next = qp->r_head_ack_queue + 1; - /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ - if (next > HFI1_MAX_RDMA_ATOMIC) + /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */ + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { @@ -2373,7 +2374,7 @@ send_last: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) goto nack_inv; next = qp->r_head_ack_queue + 1; - if (next > HFI1_MAX_RDMA_ATOMIC) + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); if (unlikely(next == qp->s_tail_ack_queue)) { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c980345cf1e1..ec3899c0874c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1735,6 +1735,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; + dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index acb3bc96dfa7..168e40be183c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -182,6 +182,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 extra_rdma_atomic; u8 reserved_operations; }; @@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) */ static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) { - return rdi->dparms.max_rdma_atomic + 1; + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic + 1; +} + +static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic; } /* -- cgit v1.2.3 From da82334219bc386ef7ea5b4b185a339a973dd513 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:41 +0200 Subject: IB/core: Allocate a bit for SRQ ODP support The ODP support matrix is per operation and per transport. The support for each transport (RC, UD, etc.) is described with a bit field. ODP for SRQ WQEs is considered a different kind of support from ODP for RQ WQs and therefore needs a different capability bit. 
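A sketch of how a consumer would test the new bit, here for the RC transport (this check is an assumed usage, not part of the patch):

	if (device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
	    IB_ODP_SUPPORT_SRQ_RECV)
		/* ODP works for WQEs posted to an SRQ on RC QPs. */
		use_odp_srq = true;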
Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5fc3be884444..5eefdea62831 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -268,6 +268,7 @@ enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_WRITE = 1 << 2, IB_ODP_SUPPORT_READ = 1 << 3, IB_ODP_SUPPORT_ATOMIC = 1 << 4, + IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, }; struct ib_odp_caps { -- cgit v1.2.3 From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:42 +0200 Subject: IB/uverbs: Expose XRC ODP device capabilities Expose XRC ODP capabilities as part of the extended device capabilities. Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 + include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d4f1a2ef5015..68c4ea514faf 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3613,6 +3613,7 @@ static int ib_uverbs_ex_query_device(struct uverbs_attr_bundle *attrs) attr.odp_caps.per_transport_caps.uc_odp_caps; resp.odp_caps.per_transport_caps.ud_odp_caps = attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.xrc_odp_caps = attr.odp_caps.per_transport_caps.xrc_odp_caps; resp.timestamp_mask = attr.timestamp_mask; resp.hca_core_clock = attr.hca_core_clock; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5eefdea62831..8219c07340a9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -277,6 +277,7 @@ struct ib_odp_caps { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; + uint32_t xrc_odp_caps; } per_transport_caps; }; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 480d9a60b68e..0474c7400268 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_tm_caps tm_caps; struct ib_uverbs_cq_moderation_caps cq_moderation_caps; __aligned_u64 max_dm_size; + __u32 xrc_odp_caps; + __u32 reserved; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From a163afc88556e099271a7b423295bc5176fcecce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 31 Jan 2019 08:30:34 -0800 Subject: IB/core: Remove ib_sg_dma_address() and ib_sg_dma_len() Keeping single line wrapper functions is not useful. Hence remove the ib_sg_dma_address() and ib_sg_dma_len() functions. This patch does not change any functionality. 
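The conversion is purely mechanical; every call site in the hunks below follows this shape:

	- sge->addr   = ib_sg_dma_address(dev, sg);
	- sge->length = ib_sg_dma_len(dev, sg);
	+ sge->addr   = sg_dma_address(sg);
	+ sge->length = sg_dma_len(sg);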
Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 12 +++++------- drivers/infiniband/ulp/iser/iser_memory.c | 9 ++++----- drivers/infiniband/ulp/srp/ib_srp.c | 17 +++++++---------- include/rdma/ib_verbs.h | 27 --------------------------- net/rds/ib.h | 12 ++++-------- net/rds/ib_fmr.c | 8 ++++---- net/rds/ib_frmr.c | 4 ++-- net/rds/ib_recv.c | 8 +++----- net/rds/ib_send.c | 15 +++++++-------- 9 files changed, 36 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index d22c4a2ebac6..89a5be3a2f97 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -179,7 +179,6 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 sg_cnt, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; u32 max_sge = dir == DMA_TO_DEVICE ? qp->max_write_sge : qp->max_read_sge; struct ib_sge *sge; @@ -209,8 +208,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.sg_list = sge; for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { - sge->addr = ib_sg_dma_address(dev, sg) + offset; - sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->addr = sg_dma_address(sg) + offset; + sge->length = sg_dma_len(sg) - offset; sge->lkey = qp->pd->local_dma_lkey; total_len += sge->length; @@ -236,14 +235,13 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { - struct ib_device *dev = qp->pd->device; struct ib_rdma_wr *rdma_wr = &ctx->single.wr; ctx->nr_ops = 1; ctx->single.sge.lkey = qp->pd->local_dma_lkey; - ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; - ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + ctx->single.sge.addr = sg_dma_address(sg) + offset; + ctx->single.sge.length = sg_dma_len(sg) - offset; memset(rdma_wr, 0, sizeof(*rdma_wr)); if (dir == DMA_TO_DEVICE) @@ -294,7 +292,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, * Skip to the S/G entry that sg_offset falls into: */ for (;;) { - u32 len = ib_sg_dma_len(dev, sg); + u32 len = sg_dma_len(sg); if (sg_offset < len) break; diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 394d1b9c2ff7..2ba70729d7b0 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -145,9 +145,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data, for_each_sg(data->sg, sg, data->dma_nents, i) iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", - i, (unsigned long)ib_sg_dma_address(ibdev, sg), - sg_page(sg), sg->offset, - sg->length, ib_sg_dma_len(ibdev, sg)); + i, (unsigned long)sg_dma_address(sg), + sg_page(sg), sg->offset, sg->length, sg_dma_len(sg)); } static void iser_dump_page_vec(struct iser_page_vec *page_vec) @@ -204,8 +203,8 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, reg->rkey = device->pd->unsafe_global_rkey; else reg->rkey = 0; - reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); - reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); + reg->sge.addr = sg_dma_address(&sg[0]); + reg->sge.length = sg_dma_len(&sg[0]); iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx," " length=0x%x\n", reg->sge.lkey, reg->rkey, diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 29fe46dbdbf1..84184910f038 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1600,9 +1600,8 @@ static int srp_map_sg_entry(struct srp_map_state *state, { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; - struct ib_device *ibdev = dev->dev; - dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + dma_addr_t dma_addr = sg_dma_address(sg); + unsigned int dma_len = sg_dma_len(sg); unsigned int len = 0; int ret; @@ -1696,13 +1695,11 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch, int count) { struct srp_target_port *target = ch->target; - struct srp_device *dev = target->srp_host->srp_dev; struct scatterlist *sg; int i; for_each_sg(scat, sg, count, i) { - srp_map_desc(state, ib_sg_dma_address(dev->dev, sg), - ib_sg_dma_len(dev->dev, sg), + srp_map_desc(state, sg_dma_address(sg), sg_dma_len(sg), target->global_rkey); } @@ -1852,8 +1849,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, buf->len = cpu_to_be32(data_len); WARN_ON_ONCE((void *)(buf + 1) > (void *)cmd + len); for_each_sg(scat, sg, count, i) { - sge[i].addr = ib_sg_dma_address(ibdev, sg); - sge[i].length = ib_sg_dma_len(ibdev, sg); + sge[i].addr = sg_dma_address(sg); + sge[i].length = sg_dma_len(sg); sge[i].lkey = target->lkey; } req->cmd->num_sge += count; @@ -1874,9 +1871,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, struct srp_direct_buf *buf; buf = (void *)cmd->add_data + cmd->add_cdb_len; - buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); + buf->va = cpu_to_be64(sg_dma_address(scat)); buf->key = cpu_to_be32(target->global_rkey); - buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); + buf->len = cpu_to_be32(sg_dma_len(scat)); req->nmdesc = 0; goto map_complete; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8219c07340a9..f7e8709e48cd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3705,33 +3705,6 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, { dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } -/** - * ib_sg_dma_address - Return the DMA address from a scatter/gather entry - * @dev: The device for which the DMA addresses were created - * @sg: The scatter/gather entry - * - * Note: this function is obsolete. To do: change all occurrences of - * ib_sg_dma_address() into sg_dma_address(). - */ -static inline u64 ib_sg_dma_address(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg_dma_address(sg); -} - -/** - * ib_sg_dma_len - Return the DMA length from a scatter/gather entry - * @dev: The device for which the DMA addresses were created - * @sg: The scatter/gather entry - * - * Note: this function is obsolete. To do: change all occurrences of - * ib_sg_dma_len() into sg_dma_len(). 
- */ -static inline unsigned int ib_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg_dma_len(sg); -} /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer diff --git a/net/rds/ib.h b/net/rds/ib.h index 71ff356ee702..1fd1cac85da2 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -331,10 +331,8 @@ static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev, unsigned int i; for_each_sg(sglist, sg, sg_dma_len, i) { - ib_dma_sync_single_for_cpu(dev, - ib_sg_dma_address(dev, sg), - ib_sg_dma_len(dev, sg), - direction); + ib_dma_sync_single_for_cpu(dev, sg_dma_address(sg), + sg_dma_len(sg), direction); } } #define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu @@ -348,10 +346,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, unsigned int i; for_each_sg(sglist, sg, sg_dma_len, i) { - ib_dma_sync_single_for_device(dev, - ib_sg_dma_address(dev, sg), - ib_sg_dma_len(dev, sg), - direction); + ib_dma_sync_single_for_device(dev, sg_dma_address(sg), + sg_dma_len(sg), direction); } } #define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index e0f70c4051b6..31cf37da4510 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -108,8 +108,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, page_cnt = 0; for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + unsigned int dma_len = sg_dma_len(&scat[i]); + u64 dma_addr = sg_dma_address(&scat[i]); if (dma_addr & ~PAGE_MASK) { if (i > 0) { @@ -148,8 +148,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, page_cnt = 0; for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); - u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); + unsigned int dma_len = sg_dma_len(&scat[i]); + u64 dma_addr = sg_dma_address(&scat[i]); for (j = 0; j < dma_len; j += PAGE_SIZE) dma_pages[page_cnt++] = diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 6431a023ac89..688dcd68d4ea 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -181,8 +181,8 @@ static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, ret = -EINVAL; for (i = 0; i < ibmr->sg_dma_len; ++i) { - unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); - u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); + unsigned int dma_len = sg_dma_len(&ibmr->sg[i]); + u64 dma_addr = sg_dma_address(&ibmr->sg[i]); frmr->sg_byte_len += dma_len; if (dma_addr & ~PAGE_MASK) { diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 2f16146e4ec9..672b91a9e207 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -346,8 +346,8 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; - sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg); - sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg); + sge->addr = sg_dma_address(&recv->r_frag->f_sg); + sge->length = sg_dma_len(&recv->r_frag->f_sg); ret = 0; out: @@ -409,9 +409,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) rdsdebug("recv %p ibinc %p page %p addr %lu\n", recv, recv->r_ibinc, sg_page(&recv->r_frag->f_sg), - (long) ib_sg_dma_address( - ic->i_cm_id->device, - &recv->r_frag->f_sg)); + (long)sg_dma_address(&recv->r_frag->f_sg)); /* XXX when can this fail? 
*/ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, NULL); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 2dcb555e6350..dc5897a3a958 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -645,16 +645,16 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]) { len = min(RDS_FRAG_SIZE, - ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff); + sg_dma_len(scat) - rm->data.op_dmaoff); send->s_wr.num_sge = 2; - send->s_sge[1].addr = ib_sg_dma_address(dev, scat); + send->s_sge[1].addr = sg_dma_address(scat); send->s_sge[1].addr += rm->data.op_dmaoff; send->s_sge[1].length = len; bytes_sent += len; rm->data.op_dmaoff += len; - if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) { + if (rm->data.op_dmaoff == sg_dma_len(scat)) { scat++; rm->data.op_dmasg++; rm->data.op_dmaoff = 0; @@ -808,8 +808,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) } /* Convert our struct scatterlist to struct ib_sge */ - send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); - send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].addr = sg_dma_address(op->op_sg); + send->s_sge[0].length = sg_dma_len(op->op_sg); send->s_sge[0].lkey = ic->i_pd->local_dma_lkey; rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, @@ -921,9 +921,8 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (j = 0; j < send->s_rdma_wr.wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { - len = ib_sg_dma_len(ic->i_cm_id->device, scat); - send->s_sge[j].addr = - ib_sg_dma_address(ic->i_cm_id->device, scat); + len = sg_dma_len(scat); + send->s_sge[j].addr = sg_dma_address(scat); send->s_sge[j].length = len; send->s_sge[j].lkey = ic->i_pd->local_dma_lkey; -- cgit v1.2.3 From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 29 Jan 2019 12:08:50 +0200 Subject: RDMA/rxe: Improve loopback marking Currently a packet is marked for loopback only if the source and destination addresses are equal. This is not enough when multiple gids are present in the rxe device's gid table and the traffic is from one gid to another. Fix it by marking the packet for loopback if the destination MAC address is equal to the source MAC address.
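In short, the loopback decision moves from an IP-level saddr/daddr comparison in prepare4()/prepare6() to a single Ethernet-level test in rxe_prepare(), made possible by rxe_init_av() now stashing the destination MAC in the AV. A minimal sketch of the new test, assuming av->dmac has already been filled in (the helper name is illustrative, not part of the patch):

#include <linux/etherdevice.h>

/*
 * A packet is looped back when its destination MAC matches the MAC of
 * the netdev it would be transmitted on; unlike the old address
 * comparison, this also catches traffic between two gids of the same
 * rxe device.
 */
static bool rxe_pkt_is_loopback(const struct net_device *ndev,
				const u8 *dmac)
{
	return ether_addr_equal(ndev->dev_addr, dmac);
}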
Signed-off-by: Kamal Heib Reviewed-by: Yuval Shaia Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 1 + drivers/infiniband/sw/rxe/rxe_net.c | 9 +++------ include/uapi/rdma/rdma_user_rxe.h | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 27a7dec18874..81ee756c19b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -38,6 +38,7 @@ void rxe_init_av(struct rdma_ah_attr *attr, struct rxe_av *av) { rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); rxe_av_fill_ip_info(av, attr); + memcpy(av->dmac, attr->roce.dmac, ETH_ALEN); } int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 19f3c69916b1..3b162e92e8e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -384,9 +384,6 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb, return -EHOSTUNREACH; } - if (!memcmp(saddr, daddr, sizeof(*daddr))) - pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); @@ -411,9 +408,6 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, return -EHOSTUNREACH; } - if (!memcmp(saddr, daddr, sizeof(*daddr))) - pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); @@ -437,6 +431,9 @@ int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) *crc = rxe_icrc_hdr(pkt, skb); + if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) + pkt->mask |= RXE_LOOPBACK_MASK; + return err; } diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 44ef6a3b7afc..aae2e696bb38 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -58,8 +58,7 @@ struct rxe_global_route { struct rxe_av { __u8 port_num; __u8 network_type; - __u16 reserved1; - __u32 reserved2; + __u8 dmac[6]; struct rxe_global_route grh; union { struct sockaddr_in _sockaddr_in; -- cgit v1.2.3 From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:11 -0800 Subject: RDMA/IWPM: refactor the IWPM message attribute names In order to add new IWPM_NL attributes, the enums for the IWPM command attributes are refactored such that a new attribute can be added without breaking ABI version 3. Instead of sharing nl attribute enums for both request and response messages, we create separate enums for each IWPM message request and reply. This allows us to extend any given IWPM message by adding new attributes for just that message. These new enums are created, though, in a way that avoids breaking ABI version 3.
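The trick that keeps ABI version 3 intact: request messages keep the original attribute values, while a new reply-side enum repeats those values and appends reply-only attributes at the end. A simplified sketch of the pattern, using DEMO_* placeholder names rather than the real IWPM symbols:

/* Request messages keep the original attribute values. */
enum {
	DEMO_NLA_MAPPING_UNSPEC = 0,
	DEMO_NLA_MAPPING_SEQ,		/* 1 */
	DEMO_NLA_MAPPING_ADDR,		/* 2 */
	DEMO_NLA_MAPPING_MAX
};

/*
 * Reply messages get their own enum: values 0-2 stay numerically
 * identical to the request enum, so an ABI version 3 peer still
 * decodes them, and new reply-only attributes extend the enum
 * without renumbering anything a request message uses.
 */
enum {
	DEMO_NLA_RMAPPING_UNSPEC = 0,
	DEMO_NLA_RMAPPING_SEQ,		/* 1, same value as the request SEQ */
	DEMO_NLA_RMAPPING_ADDR,		/* 2 */
	DEMO_NLA_RMAPPING_MAPPED_ADDR,	/* 3 */
	DEMO_NLA_RMAPPING_ERR,		/* 4, reply-only */
	DEMO_NLA_RMAPPING_MAX
};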
Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwpm_msg.c | 40 ++++++++++++++++++++++---------------- include/uapi/rdma/rdma_netlink.h | 21 +++++++++++++++++--- 2 files changed, 41 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 8861c052155a..3a8753595c8f 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -403,10 +403,12 @@ register_pid_response_exit: /* netlink attribute policy for the received response to add mapping request */ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { - [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } + [IWPM_NLA_RMANAGE_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RMANAGE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; /* @@ -430,7 +432,7 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RMANAGE_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -439,9 +441,9 @@ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_ADDR]); mapped_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]); + nla_data(nltb[IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR]); if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) { nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR; @@ -472,11 +474,15 @@ add_mapping_response_exit: /* netlink attribute policy for the response to add and query mapping request * and response with remote address info */ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = { - [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 }, - [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) }, - [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPING_SEQ] = { .type = NLA_U32 }, + [IWPM_NLA_RQUERY_LOCAL_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_REMOTE_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, + [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { + .len = sizeof(struct sockaddr_storage) }, [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; @@ -502,7 +508,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, return -EINVAL; atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); - msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]); + msg_seq = nla_get_u32(nltb[IWPM_NLA_RQUERY_MAPPING_SEQ]); nlmsg_request = iwpm_find_nlmsg_request(msg_seq); if 
(!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", @@ -511,9 +517,9 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) @@ -588,9 +594,9 @@ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); local_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_LOCAL_ADDR]); remote_sockaddr = (struct sockaddr_storage *) - nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]); + nla_data(nltb[IWPM_NLA_RQUERY_REMOTE_ADDR]); mapped_loc_sockaddr = (struct sockaddr_storage *) nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]); mapped_rem_sockaddr = (struct sockaddr_storage *) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2e18b77a817f..42d53e182d5f 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -83,13 +83,20 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, - IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_MANAGE_MAPPING_MAX +}; + +enum { + IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_RMANAGE_MAPPING_SEQ, + IWPM_NLA_RMANAGE_ADDR, + IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + /* The following maintains bisectability of rdma-core */ + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, IWPM_NLA_RMANAGE_MAPPING_ERR, IWPM_NLA_RMANAGE_MAPPING_MAX }; -#define IWPM_NLA_MANAGE_MAPPING_MAX 3 -#define IWPM_NLA_QUERY_MAPPING_MAX 4 #define IWPM_NLA_MAPINFO_SEND_MAX 3 enum { @@ -97,6 +104,14 @@ enum { IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_MAPPING_MAX, +}; + +enum { + IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_RQUERY_MAPPING_SEQ, + IWPM_NLA_RQUERY_LOCAL_ADDR, + IWPM_NLA_RQUERY_REMOTE_ADDR, IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, IWPM_NLA_RQUERY_MAPPED_REM_ADDR, IWPM_NLA_RQUERY_MAPPING_ERR, -- cgit v1.2.3 From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:16 -0800 Subject: RDMA/IWPM: Support no port mapping requirements A soft iwarp driver that uses the host TCP stack via a kernel mode socket does not need port mapping. In fact, if the port map daemon, iwpmd, is running, then iwpmd must not try to create/bind a socket to the actual port for a soft iwarp connection, since the driver already has that socket bound. Yet if the soft iwarp driver wants to interoperate with hard iwarp devices that -are- using port mapping, then the soft iwarp driver's mappings still need to be maintained and advertised by the iwpm protocol. This patch enhances the rdma driver<->iwcm interface to allow an iwarp driver to specify that it does not want port mapping. The iwpm kernel<->iwpmd interface is also enhanced to pass up this information on map requests. Care is taken to interoperate with the current iwpmd version (ABI version 3) and only use the new NL attributes if iwpmd supports ABI version 4.
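The compatibility rule reduces to: never emit a version 4 attribute to a version 3 iwpmd, and quietly refuse a request that depends on one. A hedged sketch of that gate, using the real ibnl_put_attr() helper but DEMO_* placeholders for the IWPM constants:

#include <linux/errno.h>
#include <rdma/rdma_netlink.h>

#define DEMO_UABI_VERSION_MIN	3	/* placeholder for the v3 minimum */
#define DEMO_NLA_FLAGS		4	/* placeholder attribute id */

static int demo_put_flags(struct sk_buff *skb, struct nlmsghdr *nlh,
			  u32 flags, u16 negotiated_abi)
{
	/* a v3 daemon cannot honor a flag-dependent request */
	if (flags && negotiated_abi == DEMO_UABI_VERSION_MIN)
		return -EINVAL;
	/* emit the new attribute only on a negotiated v4+ connection */
	if (negotiated_abi > DEMO_UABI_VERSION_MIN)
		return ibnl_put_attr(skb, nlh, sizeof(u32), &flags,
				     DEMO_NLA_FLAGS);
	return 0;
}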
The ABI version define has also been created in rdma_netlink.h so both kernel and user code can share it. The iwcm and iwpmd negotiate the ABI version to use with a new HELLO netlink message. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwcm.c | 7 +++- drivers/infiniband/core/iwpm_msg.c | 80 +++++++++++++++++++++++++++++++++++-- drivers/infiniband/core/iwpm_util.c | 48 +++++++++++++++++++++- drivers/infiniband/core/iwpm_util.h | 12 ++++++ include/rdma/iw_cm.h | 13 ++++++ include/rdma/iw_portmap.h | 15 ++++++- include/uapi/rdma/rdma_netlink.h | 24 +++++++++++ 7 files changed, 192 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 476abc74178e..350ea2bab78a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -87,7 +87,8 @@ static struct rdma_nl_cbs iwcm_nl_cb_table[RDMA_NL_IWPM_NUM_OPS] = { [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb}, [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb}, [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb}, - [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb} + [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}, + [RDMA_NL_IWPM_HELLO] = {.dump = iwpm_hello_cb} }; static struct workqueue_struct *iwcm_wq; @@ -525,6 +526,8 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; + pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? + IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_IWCM); @@ -543,7 +546,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) return iwpm_create_mapinfo(&cm_id->local_addr, &cm_id->m_local_addr, - RDMA_NL_IWCM); + RDMA_NL_IWCM, pm_msg.flags); } /* diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 3a8753595c8f..2e30e65b0816 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -34,7 +34,7 @@ #include "iwpm_util.h" static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser"; -static int iwpm_ulib_version = 3; +u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; @@ -130,6 +130,7 @@ pid_query_error: * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] + * [IWPM_NLA_MANAGE_FLAGS] */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -173,6 +174,18 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto add_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto add_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_MANAGE_FLAGS); + if (ret) + goto add_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -187,6 +200,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; add_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +add_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -201,6 +215,7 @@ add_mapping_error: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] * 
[IWPM_NLA_QUERY_REMOTE_ADDR] + * [IWPM_NLA_QUERY_FLAGS] */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -251,6 +266,18 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) if (ret) goto query_mapping_error; + /* If flags are required and we're not V4, then return a quiet error */ + if (pm_msg->flags && iwpm_ulib_version == IWPM_UABI_VERSION_MIN) { + ret = -EINVAL; + goto query_mapping_error_nowarn; + } + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), &pm_msg->flags, + IWPM_NLA_QUERY_FLAGS); + if (ret) + goto query_mapping_error; + } + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; @@ -264,6 +291,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) return ret; query_mapping_error: pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client); +query_mapping_error_nowarn: if (skb) dev_kfree_skb(skb); if (nlmsg_request) @@ -379,7 +407,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) /* check device name, ulib name and version */ if (strcmp(pm_msg->dev_name, dev_name) || strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n", __func__, dev_name, iwpm_name, iwpm_version); @@ -387,6 +415,10 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) goto register_pid_response_exit; } iwpm_user_pid = cb->nlh->nlmsg_pid; + iwpm_ulib_version = iwpm_version; + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u. Continuing...", + __func__, iwpm_user_pid); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", __func__, iwpm_user_pid); @@ -661,7 +693,7 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]); iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]); if (strcmp(iwpm_ulib_name, iwpm_name) || - iwpm_version != iwpm_ulib_version) { + iwpm_version < IWPM_UABI_VERSION_MIN) { pr_info("%s: Invalid port mapper name = %s version = %d\n", __func__, iwpm_name, iwpm_version); return ret; @@ -675,6 +707,11 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); iwpm_user_pid = cb->nlh->nlmsg_pid; + + if (iwpm_ulib_version < IWPM_UABI_VERSION) + pr_warn_once("%s: Down level iwpmd/pid %u.
Continuing...", + __func__, iwpm_user_pid); + if (!iwpm_mapinfo_available()) return 0; pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", @@ -754,3 +791,40 @@ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) up(&nlmsg_request->sem); return 0; } + +/* netlink attribute policy for the received hello request */ +static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { + [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } +}; + +/* + * iwpm_hello_cb - Process a port mapper hello request + */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *nltb[IWPM_NLA_HELLO_MAX]; + const char *msg_type = "Hello request"; + u8 nl_client; + u16 abi_version; + int ret = -EINVAL; + + if (iwpm_parse_nlmsg(cb, IWPM_NLA_HELLO_MAX, hello_policy, nltb, + msg_type)) { + pr_info("%s: Unable to parse nlmsg\n", __func__); + return ret; + } + abi_version = nla_get_u16(nltb[IWPM_NLA_HELLO_ABI_VERSION]); + nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type); + if (!iwpm_valid_client(nl_client)) { + pr_info("%s: Invalid port mapper client = %d\n", + __func__, nl_client); + return ret; + } + iwpm_set_registration(nl_client, IWPM_REG_INCOMPL); + atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); + iwpm_ulib_version = min_t(u16, IWPM_UABI_VERSION, abi_version); + pr_debug("Using ABI version %u\n", iwpm_ulib_version); + iwpm_user_pid = cb->nlh->nlmsg_pid; + ret = iwpm_send_hello(nl_client, iwpm_user_pid, iwpm_ulib_version); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index cdb63f3f4de7..363938435476 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -114,7 +114,7 @@ static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, - u8 nl_client) + u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; @@ -132,6 +132,7 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; + map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { @@ -686,6 +687,14 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { + ret = ibnl_put_attr(skb, nlh, sizeof(u32), + &map_info->map_flags, + IWPM_NLA_MAPINFO_FLAGS); + if (ret) + goto send_mapping_info_unlock; + } + nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, @@ -754,3 +763,40 @@ int iwpm_mapinfo_available(void) spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } + +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + u32 msg_seq; + const char *err_str = ""; + int ret = -EINVAL; + + skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); + if (!skb) { + err_str = "Unable to create a nlmsg"; + goto hello_num_error; + } + nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); + msg_seq = 0; + err_str = "Unable to put attribute of abi_version into nlmsg"; + ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, + IWPM_NLA_HELLO_ABI_VERSION); + if (ret) + goto hello_num_error; + nlmsg_end(skb, nlh); + + ret = rdma_nl_unicast(skb, iwpm_pid); + if (ret) { + skb = NULL; + err_str = 
"Unable to send a nlmsg"; + goto hello_num_error; + } + pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); + return 0; +hello_num_error: + pr_info("%s: %s\n", __func__, err_str); + if (skb) + dev_kfree_skb(skb); + return ret; +} diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index af1fc14a0d3d..7e2bcc72f66c 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -78,6 +78,7 @@ struct iwpm_mapping_info { struct sockaddr_storage local_sockaddr; struct sockaddr_storage mapped_sockaddr; u8 nl_client; + u32 map_flags; }; struct iwpm_remote_info { @@ -266,4 +267,15 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, * @msg: Message to print */ void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg); + +/** + * iwpm_send_hello - Send hello response to iwpmd + * + * @nl_client: The index of the netlink client + * @abi_version: The kernel's abi_version + * + * Returns 0 on success or a negative error code + */ +int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version); +extern u16 iwpm_ulib_version; #endif diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 5cd7701db148..48512abd3162 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -105,6 +105,18 @@ struct iw_cm_conn_param { u32 qpn; }; +enum iw_flags { + + /* + * This flag allows the iwcm and iwpmd to still advertise + * mappings but the real and mapped port numbers are the + * same. Further, iwpmd will not bind any user socket to + * reserve the port. This is required for soft iwarp + * to play in the port mapped iwarp space. + */ + IW_F_NO_PORT_MAP = (1 << 0), +}; + struct iw_cm_verbs { void (*add_ref)(struct ib_qp *qp); @@ -127,6 +139,7 @@ struct iw_cm_verbs { int (*destroy_listen)(struct iw_cm_id *cm_id); char ifname[IFNAMSIZ]; + enum iw_flags driver_flags; }; /** diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index fda31673a562..84fac196ef80 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -58,6 +58,7 @@ struct iwpm_sa_data { struct sockaddr_storage mapped_loc_addr; struct sockaddr_storage rem_addr; struct sockaddr_storage mapped_rem_addr; + u32 flags; }; /** @@ -205,9 +206,11 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, * @local_addr: Local ip/tcp address * @mapped_addr: Mapped local ip/tcp address * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, - struct sockaddr_storage *mapped_addr, u8 nl_client); + struct sockaddr_storage *mapped_addr, u8 nl_client, + u32 map_flags); /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address @@ -221,4 +224,14 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr); +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. 
+ */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _IW_PORTMAP_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 42d53e182d5f..0f5263767fb4 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -35,6 +35,19 @@ enum { RDMA_NL_RDMA_CM_NUM_ATTR, }; +/* The minimum version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION_MIN 3 + +/* The latest version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION 4 + +/* iwarp port mapper message flags */ +enum { + + /* Do not map the port for this IWPM request */ + IWPM_FLAGS_NO_PORT_MAP = (1 << 0), +}; + /* iwarp port mapper op-codes */ enum { RDMA_NL_IWPM_REG_PID = 0, @@ -45,6 +58,7 @@ enum { RDMA_NL_IWPM_HANDLE_ERR, RDMA_NL_IWPM_MAPINFO, RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_HELLO, RDMA_NL_IWPM_NUM_OPS }; @@ -83,6 +97,7 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_FLAGS, IWPM_NLA_MANAGE_MAPPING_MAX }; @@ -98,12 +113,14 @@ enum { }; #define IWPM_NLA_MAPINFO_SEND_MAX 3 +#define IWPM_NLA_REMOVE_MAPPING_MAX 3 enum { IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_FLAGS, IWPM_NLA_QUERY_MAPPING_MAX, }; @@ -129,6 +146,7 @@ enum { IWPM_NLA_MAPINFO_UNSPEC = 0, IWPM_NLA_MAPINFO_LOCAL_ADDR, IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_FLAGS, IWPM_NLA_MAPINFO_MAX }; @@ -147,6 +165,12 @@ enum { IWPM_NLA_ERR_MAX }; +enum { + IWPM_NLA_HELLO_UNSPEC = 0, + IWPM_NLA_HELLO_ABI_VERSION, + IWPM_NLA_HELLO_MAX +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. -- cgit v1.2.3 From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 16 Jan 2019 09:55:41 +0200 Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module Netlink statistics exported by rdma-cm never had any working user space component published to the mailing list or to any open source project. Canvassing various proprietary users, and the original requester, we find that there are no real users of this interface. This patch simply removes all occurrences of RDMA CM netlink in favour of the modern nldev implementation, which provides the same information and is accompanied by a widely used user space component. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 83 --------------------------------- drivers/infiniband/core/netlink.c | 4 +- include/uapi/rdma/rdma_netlink.h | 17 +------- 3 files changed, 3 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 81bded0d37d1..e15546ae4d0f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4616,85 +4616,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data) kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message.
- */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS, - NLM_F_MULTI); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), - cma_src_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) - goto out; - if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_dst_addr(id_priv)), - cma_dst_addr(id_priv), - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) - goto out; - - id_stats->pid = task_pid_vnr(id_priv->res.task); - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - nlmsg_end(skb, nlh); - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct rdma_nl_cbs cma_cb_table[RDMA_NL_RDMA_CM_NUM_OPS] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, -}; - static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); @@ -4743,7 +4664,6 @@ static int __init cma_init(void) if (ret) goto err; - rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4759,7 +4679,6 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); @@ -4767,7 +4686,5 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } -MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); - module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 724f5a62e82f..eecfc0b377c9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -56,7 +56,6 @@ EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS] = { - [RDMA_NL_RDMA_CM] = RDMA_NL_RDMA_CM_NUM_OPS, [RDMA_NL_IWCM] = RDMA_NL_IWPM_NUM_OPS, [RDMA_NL_LS] = RDMA_NL_LS_NUM_OPS, [RDMA_NL_NLDEV] = RDMA_NLDEV_NUM_OPS, @@ -181,8 +180,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; } /* FIXME: Convert IWCM to properly handle doit callbacks */ - if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_RDMA_CM || - index == RDMA_NL_IWCM) { + if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { struct netlink_dump_control c = { .dump = cb_table[op].dump, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0f5263767fb4..3a9e681e4257 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -5,8 +5,7 @@ #include enum { - RDMA_NL_RDMA_CM = 1, - RDMA_NL_IWCM, + RDMA_NL_IWCM = 2, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ RDMA_NL_NLDEV, /* RDMA device interface */ @@ -14,8 +13,7 @@ enum { }; enum { - RDMA_NL_GROUP_CM = 1, - RDMA_NL_GROUP_IWPM, + 
RDMA_NL_GROUP_IWPM = 2, RDMA_NL_GROUP_LS, RDMA_NL_NUM_GROUPS }; @@ -24,17 +22,6 @@ enum { #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) -enum { - RDMA_NL_RDMA_CM_ID_STATS = 0, - RDMA_NL_RDMA_CM_NUM_OPS -}; - -enum { - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR, - RDMA_NL_RDMA_CM_NUM_ATTR, -}; - /* The minimum version that the iwpm kernel supports */ #define IWPM_UABI_VERSION_MIN 3 -- cgit v1.2.3 From a2bfd708b17adb6e597e70d4eca824667f2d4e3c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Feb 2019 11:33:22 -0800 Subject: RDMA/iwpm: move kdoc comments to functions Move the iwpm kdoc comments from the prototype declarations to above the function bodies. There are no functional changes in this patch. Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwpm_msg.c | 116 ++++++++++++++++++++-------- drivers/infiniband/core/iwpm_util.c | 40 ++++++++++ include/rdma/iw_portmap.h | 149 ------------------------------------ 3 files changed, 123 insertions(+), 182 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 2e30e65b0816..2452b0ddcf0d 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -38,14 +38,21 @@ u16 iwpm_ulib_version = IWPM_UABI_VERSION_MIN; static int iwpm_user_pid = IWPM_PID_UNDEFINED; static atomic_t echo_nlmsg_seq; +/** + * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid + * + * Returns true if the pid is greater than zero, otherwise returns false + */ int iwpm_valid_pid(void) { return iwpm_user_pid > 0; } -/* - * iwpm_register_pid - Send a netlink query to user space - * for the iwarp port mapper pid +/** + * iwpm_register_pid - Send a netlink query to userspace + * to get the iwarp port mapper pid + * @pm_msg: Contains driver info to send to the userspace port mapper + * @nl_client: The index of the netlink client * * nlmsg attributes: * [IWPM_NLA_REG_PID_SEQ] @@ -124,13 +131,19 @@ pid_query_error: return ret; } -/* - * iwpm_add_mapping - Send a netlink add mapping message - * to the port mapper +/** + * iwpm_add_mapping - Send a netlink add mapping request to + * the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * * nlmsg attributes: * [IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] * [IWPM_NLA_MANAGE_FLAGS] + * + * If the request is successful, the pm_msg stores + * the port mapper response (mapped address info) */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) { @@ -208,9 +221,12 @@ add_mapping_error_nowarn: return ret; } -/* - * iwpm_add_and_query_mapping - Send a netlink add and query - * mapping message to the port mapper +/** + * iwpm_add_and_query_mapping - Send a netlink add and query mapping request + * to the userspace port mapper + * @pm_msg: Contains the local ip/tcp address info to send + * @nl_client: The index of the netlink client + * + * nlmsg attributes: * [IWPM_NLA_QUERY_MAPPING_SEQ] * [IWPM_NLA_QUERY_LOCAL_ADDR] @@ -299,9 +315,13 @@ query_mapping_error_nowarn: return ret; } -/* - * iwpm_remove_mapping - Send a netlink remove mapping message - * to the port mapper +/** + * iwpm_remove_mapping - Send a netlink remove mapping request + * to the userspace port mapper + * + * @local_addr: Local ip/tcp address to remove + * @nl_client: The index of the netlink client + * + * nlmsg attributes: *
[IWPM_NLA_MANAGE_MAPPING_SEQ] * [IWPM_NLA_MANAGE_ADDR] @@ -372,9 +392,14 @@ static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = { [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_register_pid_cb - Process a port mapper response to - * iwpm_register_pid() +/** + * iwpm_register_pid_cb - Process the port mapper response to + * iwpm_register_pid query + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * If successful, the function receives the userspace port mapper pid + * which is used in future communication with the port mapper */ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -443,9 +468,11 @@ static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = { [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_mapping_cb - Process a port mapper response to - * iwpm_add_mapping() +/** + * iwpm_add_mapping_cb - Process the port mapper response to + * iwpm_add_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -518,9 +545,11 @@ static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 } }; -/* - * iwpm_add_and_query_mapping_cb - Process a port mapper response to - * iwpm_add_and_query_mapping() +/** + * iwpm_add_and_query_mapping_cb - Process the port mapper response to + * iwpm_add_and_query_mapping request + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb) @@ -598,9 +627,13 @@ query_mapping_response_exit: return 0; } -/* - * iwpm_remote_info_cb - Process a port mapper message, containing - * the remote connecting peer address info +/** + * iwpm_remote_info_cb - Process remote connecting peer address info, which + * the port mapper has received from the connecting peer + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Stores the IPv4/IPv6 address info in a hash table */ int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -673,8 +706,14 @@ static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = { [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 } }; -/* - * iwpm_mapping_info_cb - Process a port mapper request for mapping info +/** + * iwpm_mapping_info_cb - Process a notification that the userspace + * port mapper daemon is started + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send all the local mapping + * info records to the userspace port mapper */ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -727,9 +766,11 @@ static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = { [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 } }; -/* - * iwpm_ack_mapping_info_cb - Process a port mapper ack for - * the provided mapping info records +/** + * iwpm_ack_mapping_info_cb - Process the port mapper ack for + * the provided local mapping info records + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -755,8 +796,11 @@ static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = { [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 }, }; -/* - * 
iwpm_mapping_error_cb - Process a port mapper error message +/** + * iwpm_mapping_error_cb - Process port mapper notification for error + * + * @skb: + * @cb: Contains the received message (payload and netlink header) */ int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb) { @@ -797,8 +841,14 @@ static const struct nla_policy hello_policy[IWPM_NLA_HELLO_MAX] = { [IWPM_NLA_HELLO_ABI_VERSION] = { .type = NLA_U16 } }; -/* - * iwpm_hello_cb - Process a port mapper hello request +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. */ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb) { diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 363938435476..7b97f6e2075f 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -51,6 +51,12 @@ static DEFINE_SPINLOCK(iwpm_reminfo_lock); static DEFINE_MUTEX(iwpm_admin_lock); static struct iwpm_admin_data iwpm_admin; +/** + * iwpm_init - Allocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes up. + */ int iwpm_init(u8 nl_client) { int ret = 0; @@ -87,6 +93,12 @@ init_exit: static void free_hash_bucket(void); static void free_reminfo_bucket(void); +/** + * iwpm_exit - Deallocate resources for the iwarp port mapper + * @nl_client: The index of the netlink client + * + * Should be called when network interface goes down. + */ int iwpm_exit(u8 nl_client) { @@ -112,6 +124,14 @@ int iwpm_exit(u8 nl_client) static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); +/** + * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address + * info in a hash table + * @local_addr: Local ip/tcp address + * @mapped_addr: Mapped local ip/tcp address + * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags + */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, u8 nl_client, u32 map_flags) @@ -151,6 +171,15 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, return ret; } +/** + * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address + * info from the hash table + * @local_addr: Local ip/tcp address + * @mapped_local_addr: Mapped local ip/tcp address + * + * Returns err code if mapping info is not found in the hash table, + * otherwise returns 0 + */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { @@ -251,6 +280,17 @@ void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } +/** + * iwpm_get_remote_info - Get the remote connecting peer address info + * + * @mapped_loc_addr: Mapped local address of the listening peer + * @mapped_rem_addr: Mapped remote address of the connecting peer + * @remote_addr: To store the remote address of the connecting peer + * @nl_client: The index of the netlink client + * + * The remote address info is retrieved and provided to the client in + * the remote_addr. 
After that it is removed from the hash table + */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index 84fac196ef80..b9fee7feeeb5 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -61,177 +61,28 @@ struct iwpm_sa_data { u32 flags; }; -/** - * iwpm_init - Allocate resources for the iwarp port mapper - * - * Should be called when network interface goes up. - */ int iwpm_init(u8); - -/** - * iwpm_exit - Deallocate resources for the iwarp port mapper - * - * Should be called when network interface goes down. - */ int iwpm_exit(u8); - -/** - * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid - * - * Returns true if the pid is greater than zero, otherwise returns false - */ int iwpm_valid_pid(void); - -/** - * iwpm_register_pid - Send a netlink query to userspace - * to get the iwarp port mapper pid - * @pm_msg: Contains driver info to send to the userspace port mapper - * @nl_client: The index of the netlink client - */ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client); - -/** - * iwpm_add_mapping - Send a netlink add mapping request to - * the userspace port mapper - * @pm_msg: Contains the local ip/tcp address info to send - * @nl_client: The index of the netlink client - * - * If the request is successful, the pm_msg stores - * the port mapper response (mapped address info) - */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); - -/** - * iwpm_add_and_query_mapping - Send a netlink add and query mapping request - * to the userspace port mapper - * @pm_msg: Contains the local and remote ip/tcp address info to send - * @nl_client: The index of the netlink client - * - * If the request is successful, the pm_msg stores the - * port mapper response (mapped local and remote address info) - */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); - -/** - * iwpm_remove_mapping - Send a netlink remove mapping request - * to the userspace port mapper - * - * @local_addr: Local ip/tcp address to remove - * @nl_client: The index of the netlink client - */ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client); - -/** - * iwpm_register_pid_cb - Process the port mapper response to - * iwpm_register_pid query - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * If successful, the function receives the userspace port mapper pid - * which is used in future communication with the port mapper - */ int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_add_mapping_cb - Process the port mapper response to - * iwpm_add_mapping request - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_add_and_query_mapping_cb - Process the port mapper response to - * iwpm_add_and_query_mapping request - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_remote_info_cb - Process remote connecting peer address info, which - * the port mapper has received from the connecting peer - * - * @cb: Contains the received message (payload and netlink header) - * - * Stores the IPv4/IPv6 address info in a hash table - */ int iwpm_remote_info_cb(struct 
sk_buff *, struct netlink_callback *); - -/** - * iwpm_mapping_error_cb - Process port mapper notification for error - * - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_mapping_info_cb - Process a notification that the userspace - * port mapper daemon is started - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * Using the received port mapper pid, send all the local mapping - * info records to the userspace port mapper - */ int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_ack_mapping_info_cb - Process the port mapper ack for - * the provided local mapping info records - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_get_remote_info - Get the remote connecting peer address info - * - * @mapped_loc_addr: Mapped local address of the listening peer - * @mapped_rem_addr: Mapped remote address of the connecting peer - * @remote_addr: To store the remote address of the connecting peer - * @nl_client: The index of the netlink client - * - * The remote address info is retrieved and provided to the client in - * the remote_addr. After that it is removed from the hash table - */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client); - -/** - * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address - * info in a hash table - * @local_addr: Local ip/tcp address - * @mapped_addr: Mapped local ip/tcp address - * @nl_client: The index of the netlink client - * @map_flags: IWPM mapping flags - */ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr, u8 nl_client, u32 map_flags); - -/** - * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address - * info from the hash table - * @local_addr: Local ip/tcp address - * @mapped_addr: Mapped local ip/tcp address - * - * Returns err code if mapping info is not found in the hash table, - * otherwise returns 0 - */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr); -/** - * iwpm_hello_cb - Process a hello message from iwpmd - * - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * Using the received port mapper pid, send the kernel's abi_version - * after adjusting it to support the iwpmd version. - */ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _IW_PORTMAP_H */ -- cgit v1.2.3 From 385156c5f2a61834666f079ee66338f177c65c28 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:29:44 -0800 Subject: IB/hfi: Move RC functions into a header file This patch moves some RC helper functions into a header file so that they can be called from both RC and TID RDMA functions. In addition, a common function for rewinding a request is created in rdmavt so that it can be shared between the qib and hfi1 drivers.
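For reference, after this change both drivers' restart_sge() wrappers collapse into the same two steps; a sketch mirroring the new hfi1 version in the diff below (delta_psn() is hfi1's PSN-distance helper):

static u32 restart_sge_sketch(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
			      u32 psn, u32 pmtu)
{
	/* bytes of the WQE that precede the PSN being restarted */
	u32 len = delta_psn(psn, wqe->psn) * pmtu;

	/* rewind the SGE state to the WQE start, skip len, report the rest */
	return rvt_restart_sge(ss, wqe, len);
}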
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 119 +++++++++++++++---------------------- drivers/infiniband/hw/hfi1/rc.h | 50 ++++++++++++++++ drivers/infiniband/hw/qib/qib_rc.c | 7 +-- drivers/infiniband/sw/rdmavt/rc.c | 13 ++++ include/rdma/rdmavt_qp.h | 10 ++++ 5 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/rc.h (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 092d5eba980f..6e74cd3814b8 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -51,28 +51,48 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs_txreq.h" #include "trace.h" -/* cut down ridiculously long IB macro names */ -#define OP(x) RC_OP(x) - -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp); - -static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, - u32 psn, u32 pmtu) +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled) + __must_hold(&qp->s_lock) { - u32 len; + struct rvt_ack_entry *e = NULL; + u8 i, p; + bool s = true; - len = delta_psn(psn, wqe->psn) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + for (i = qp->r_head_ack_queue; ; i = p) { + if (i == qp->s_tail_ack_queue) + s = false; + if (i) + p = i - 1; + else + p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); + if (p == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[p]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (p == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + s = false; + break; + } + } + if (prev) + *prev = p; + if (prev_ack) + *prev_ack = i; + if (scheduled) + *scheduled = s; + return e; } /** @@ -1229,9 +1249,9 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn) * This is similar to hfi1_send_complete but has to check to be sure * that the SGEs are not being referenced if the SWQE is being resent. */ -static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, - struct rvt_swqe *wqe, - struct hfi1_ibport *ibp) +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) { lockdep_assert_held(&qp->s_lock); /* @@ -1314,8 +1334,8 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * May be called at interrupt level, with the QP s_lock held. * Returns 1 if OK, 0 if current operation should be aborted (NAK). 
*/ -static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, - u64 val, struct hfi1_ctxtdata *rcd) +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) { struct hfi1_ibport *ibp; enum ib_wc_status status; @@ -1754,16 +1774,6 @@ bail: return; } -static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, - struct rvt_qp *qp) -{ - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_NAK; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, &rcd->qp_wait_list); - } -} - static inline void rc_cancel_ack(struct rvt_qp *qp) { qp->r_adefered = 0; @@ -1796,8 +1806,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, struct hfi1_ibport *ibp = rcd_to_iport(rcd); struct rvt_ack_entry *e; unsigned long flags; - u8 i, prev; - int old_req; + u8 prev; + u8 mra; /* most recent ACK */ + bool old_req; trace_hfi1_rcv_error(qp, psn); if (diff > 0) { @@ -1843,29 +1854,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, spin_lock_irqsave(&qp->s_lock, flags); - for (i = qp->r_head_ack_queue; ; i = prev) { - if (i == qp->s_tail_ack_queue) - old_req = 0; - if (i) - prev = i - 1; - else - prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device)); - if (prev == qp->r_head_ack_queue) { - e = NULL; - break; - } - e = &qp->s_ack_queue[prev]; - if (!e->opcode) { - e = NULL; - break; - } - if (cmp_psn(psn, e->psn) >= 0) { - if (prev == qp->s_tail_ack_queue && - cmp_psn(psn, e->lpsn) <= 0) - old_req = 0; - break; - } - } + e = find_prev_entry(qp, psn, &prev, &mra, &old_req); + switch (opcode) { case OP(RDMA_READ_REQUEST): { struct ib_reth *reth; @@ -1940,7 +1930,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the most recent ACK if this request is * after all the previous RDMA reads and atomics. */ - if (i == qp->r_head_ack_queue) { + if (mra == qp->r_head_ack_queue) { spin_unlock_irqrestore(&qp->s_lock, flags); qp->r_nak_state = 0; qp->r_ack_psn = qp->r_psn - 1; @@ -1951,7 +1941,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. */ - qp->s_tail_ack_queue = i; + qp->s_tail_ack_queue = mra; break; } qp->s_ack_state = OP(ACKNOWLEDGE); @@ -1968,17 +1958,6 @@ send_ack: return 0; } -static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) -{ - unsigned next; - - next = n + 1; - if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - next = 0; - qp->s_tail_ack_queue = next; - qp->s_ack_state = OP(ACKNOWLEDGE); -} - static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, u32 rqpn, u8 svc_type) { diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h new file mode 100644 index 000000000000..4329eadcb3df --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. 
+ * + */ + +#ifndef HFI1_RC_H +#define HFI1_RC_H + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) +{ + unsigned int next; + + next = n + 1; + if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + return rvt_restart_sge(ss, wqe, len); +} + +struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev, + u8 *prev_ack, bool *scheduled); +int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val, + struct hfi1_ctxtdata *rcd); +struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct hfi1_ibport *ibp); + +#endif /* HFI1_RC_H */ diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 6fa002940451..50dd9811b088 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len; len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu; - ss->sge = wqe->sg_list[0]; - ss->sg_list = wqe->sg_list + 1; - ss->num_sge = wqe->wr.num_sge; - ss->total_len = wqe->length; - rvt_skip_sge(ss, len, false); - return wqe->length - len; + return rvt_restart_sge(ss, wqe, len); } /** diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 6131cc558bdb..8d71647820a8 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth) } } EXPORT_SYMBOL(rvt_get_credit); + +/* rvt_restart_sge - rewind the sge state for a wqe */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len) +{ + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + rvt_skip_sge(ss, len, false); + return wqe->length - len; +} +EXPORT_SYMBOL(rvt_restart_sge); + diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index cbafb1878669..56a9221378d9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -628,6 +628,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp); */ void rvt_get_credit(struct rvt_qp *qp, u32 aeth); +/** + * rvt_restart_sge - rewind the sge state for a wqe + * @ss: the sge state pointer + * @wqe: the wqe to rewind + * @len: the data length from the start of the wqe in bytes + * + * Returns the remaining data length. + */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); + /** * @qp - the qp pair * @len - the length -- cgit v1.2.3 From 838b6fd2d9ca29998869e4d1ecf4566efe807666 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:07 -0800 Subject: IB/hfi1: TID RDMA RcvArray programming and TID allocation TID entries are used by hfi1 hardware to receive data payload from incoming packets directly into a user buffer and thus avoid data copying by software. 
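To make the RcvArray sizing rules concrete before the implementation below, here is a minimal stand-alone C sketch (an illustration only, not part of the patch) of the chunking constraint that tid_rdma_find_phys_blocks_4k()/tid_flush_pages() enforce further down: each programmed chunk must be a power-of-two number of pages, capped at MAX_EXPECTED_BUFFER. The 1 MiB cap and the 300-page run are assumed example values.

#include <stdio.h>

#define PAGE_SIZE		4096u
#define MAX_EXPECTED_BUFFER	(1024u * 1024u)	/* assumed example cap */

/* Largest power of two <= n (n > 0). */
static unsigned int pow2_below(unsigned int n)
{
	unsigned int p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned int pages = 300;	/* one physically contiguous run */

	while (pages) {
		unsigned int chunk;

		if (pages * PAGE_SIZE > MAX_EXPECTED_BUFFER)
			chunk = MAX_EXPECTED_BUFFER / PAGE_SIZE;
		else
			chunk = pow2_below(pages);

		printf("program one RcvArray entry: %u pages (%u bytes)\n",
		       chunk, chunk * PAGE_SIZE);
		pages -= chunk;
	}
	return 0;
}

A 300-page run comes out as 256-, 32-, 8- and 4-page chunks, which is the kind of breakdown the pageset code below computes before programming the hardware.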
This patch implements the functions for TID allocation, freeing, and
programming TID RcvArray entries in hardware for kernel clients. TID
entries are managed via lists of TID groups similar to PSM. Furthermore,
to track TID resource allocation for each request, software flows are
also allocated and freed as needed. Since software flows consume a large
amount of memory for tracking TID allocation and freeing, it is
generally desirable to allocate them dynamically in the send queue and
only for TID RDMA requests, but to pre-allocate them for the receive
queue, because the send queue could have thousands of entries while the
receive queue has only a limited number of entries.

Signed-off-by: Mitko Haralanov
Signed-off-by: Ashutosh Dixit
Signed-off-by: Mike Marciniszyn
Signed-off-by: Kaike Wan
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/hfi.h          |   2 +
 drivers/infiniband/hw/hfi1/init.c         |   3 +-
 drivers/infiniband/hw/hfi1/tid_rdma.c     | 877 +++++++++++++++++++++++++++++-
 drivers/infiniband/hw/hfi1/tid_rdma.h     | 101 ++++
 drivers/infiniband/hw/hfi1/user_exp_rcv.h |   1 -
 drivers/infiniband/hw/hfi1/verbs.c        |  29 +-
 drivers/infiniband/hw/hfi1/verbs.h        |  34 ++
 drivers/infiniband/sw/rdmavt/qp.c         |   2 +-
 include/rdma/rdmavt_qp.h                  |   2 +
 9 files changed, 1033 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 78aa344c7403..1412ed157c98 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -303,6 +303,8 @@ struct hfi1_ctxtdata {
 	spinlock_t exp_lock;
 	/* Queue for QP's waiting for HW TID flows */
 	struct tid_queue flow_queue;
+	/* Queue for QP's waiting for HW receive array entries */
+	struct tid_queue rarr_queue;
 	/* when waiting for rcv or pioavail */
 	wait_queue_head_t wait;
 	/* uuid from PSM */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 56830a514b92..d13304f7340d 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -372,6 +372,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
 	mutex_init(&rcd->exp_mutex);
 	spin_lock_init(&rcd->exp_lock);
 	INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+	INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
 	hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -1596,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
 		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
 		if (rcd) {
-			hfi1_clear_tids(rcd);
+			hfi1_free_ctxt_rcv_groups(rcd);
 			hfi1_free_ctxt(rcd);
 		}
 	}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 70671212808f..1d02b12590f6 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -8,6 +8,7 @@
 #include "qp.h"
 #include "verbs.h"
 #include "tid_rdma.h"
+#include "exp_rcv.h"
 #include "trace.h"

 #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
@@ -35,8 +36,14 @@ static u32 mask_generation(u32 a)
 #define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
 #define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)

+/* Maximum number of segments in flight per QP request.
*/ #define TID_RDMA_MAX_READ_SEGS_PER_REQ 6 #define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4 +#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \ + TID_RDMA_MAX_WRITE_SEGS_PER_REQ) +#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1) + +#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 @@ -79,6 +86,11 @@ static u32 mask_generation(u32 a) */ static void tid_rdma_trigger_resume(struct work_struct *work); +static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req); +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp); +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req); static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p) { @@ -230,7 +242,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit) BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY); rcd->jkey = TID_RDMA_JKEY; hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey); - return 0; + return hfi1_alloc_ctxt_rcv_groups(rcd); } /** @@ -266,6 +278,7 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr) { struct hfi1_qp_priv *qpriv = qp->priv; + int i, ret; qpriv->rcd = qp_to_rcd(rdi, qp); @@ -278,15 +291,75 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, qpriv->flow_state.generation = KERN_GENERATION_RESERVED; INIT_LIST_HEAD(&qpriv->tid_wait); + if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + struct hfi1_devdata *dd = qpriv->rcd->dd; + + qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES * + sizeof(*qpriv->pages), + GFP_KERNEL, dd->node); + if (!qpriv->pages) + return -ENOMEM; + for (i = 0; i < qp->s_size; i++) { + struct hfi1_swqe_priv *priv; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i); + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.swqe = wqe; + wqe->priv = priv; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv; + + priv = kzalloc_node(sizeof(*priv), GFP_KERNEL, + dd->node); + if (!priv) + return -ENOMEM; + + hfi1_init_trdma_req(qp, &priv->tid_req); + priv->tid_req.e.ack = &qp->s_ack_queue[i]; + + ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, + GFP_KERNEL); + if (ret) { + kfree(priv); + return ret; + } + qp->s_ack_queue[i].priv = priv; + } + } + return 0; } void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) { - struct hfi1_qp_priv *priv = qp->priv; - - if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) - cancel_work_sync(&priv->opfn.opfn_work); + struct hfi1_qp_priv *qpriv = qp->priv; + struct rvt_swqe *wqe; + u32 i; + + if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) { + for (i = 0; i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->priv); + wqe->priv = NULL; + } + for (i = 0; i < rvt_max_atomic(rdi); i++) { + struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv; + + if (priv) + hfi1_kern_exp_rcv_free_flows(&priv->tid_req); + kfree(priv); + qp->s_ack_queue[i].priv = NULL; + } + cancel_work_sync(&qpriv->opfn.opfn_work); + kfree(qpriv->pages); + qpriv->pages = NULL; + } } /* Flow and tid waiter functions */ @@ -540,6 +613,7 @@ void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue); + _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue); } /* Flow functions */ @@ -702,3 +776,796 @@ void 
hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
 		kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
 	}
 }
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+	u8 count = s->count;
+
+	return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups based on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information. This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+					struct page **pages,
+					u32 npages,
+					struct tid_rdma_pageset *list)
+{
+	u32 pagecount, pageidx, setcount = 0, i;
+	void *vaddr, *this_vaddr;
+
+	if (!npages)
+		return 0;
+
+	/*
+	 * Look for sets of physically contiguous pages in the user buffer.
+	 * This will allow us to optimize Expected RcvArray entry usage by
+	 * using the bigger supported sizes.
+	 */
+	vaddr = page_address(pages[0]);
+	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+		this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+		/*
+		 * If the vaddr's are not sequential, pages are not physically
+		 * contiguous.
+		 */
+		if (this_vaddr != (vaddr + PAGE_SIZE)) {
+			/*
+			 * At this point we have to loop over the set of
+			 * physically contiguous pages and break them down into
+			 * sizes supported by the HW.
+			 * There are two main constraints:
+			 * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+			 *    If the total set size is bigger than that
+			 *    program only a MAX_EXPECTED_BUFFER chunk.
+			 * 2. The buffer size has to be a power of two. If
+			 *    it is not, round down to the closest power of
+			 *    2 and program that size.
+			 */
+			while (pagecount) {
+				int maxpages = pagecount;
+				u32 bufsize = pagecount * PAGE_SIZE;
+
+				if (bufsize > MAX_EXPECTED_BUFFER)
+					maxpages =
+						MAX_EXPECTED_BUFFER >>
+						PAGE_SHIFT;
+				else if (!is_power_of_2(bufsize))
+					maxpages =
+						rounddown_pow_of_two(bufsize) >>
+						PAGE_SHIFT;
+
+				list[setcount].idx = pageidx;
+				list[setcount].count = maxpages;
+				pagecount -= maxpages;
+				pageidx += maxpages;
+				setcount++;
+			}
+			pageidx = i;
+			pagecount = 1;
+			vaddr = this_vaddr;
+		} else {
+			vaddr += PAGE_SIZE;
+			pagecount++;
+		}
+	}
+	/* ensure we always return an even number of sets */
+	if (setcount & 1)
+		list[setcount++].count = 0;
+	return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesets
+ *
+ * This routine flushes out accumulated pages.
+ *
+ * To ensure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ * + * Return: + * The new number of sets + */ + +static u32 tid_flush_pages(struct tid_rdma_pageset *list, + u32 *idx, u32 pages, u32 sets) +{ + while (pages) { + u32 maxpages = pages; + + if (maxpages > MAX_EXPECTED_PAGES) + maxpages = MAX_EXPECTED_PAGES; + else if (!is_power_of_2(maxpages)) + maxpages = rounddown_pow_of_two(maxpages); + list[sets].idx = *idx; + list[sets++].count = maxpages; + *idx += maxpages; + pages -= maxpages; + } + /* might need a filler */ + if (sets & 1) + list[sets++].count = 0; + return sets; +} + +/** + * tid_rdma_find_phys_blocks_8k - get groups base on mr info + * @pages - pointer to an array of page structs + * @npages - number of pages + * @list - page set array to return + * + * This routine parses an array of pages to compute pagesets + * in an 8k compatible way. + * + * pages are tested two at a time, i, i + 1 for contiguous + * pages and i - 1 and i contiguous pages. + * + * If any condition is false, any accumlated pages are flushed and + * v0,v1 are emitted as separate PAGE_SIZE pagesets + * + * Otherwise, the current 8k is totaled for a future flush. + * + * Return: + * The number of pagesets + * list set with the returned number of pagesets + * + */ +static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow, + struct page **pages, + u32 npages, + struct tid_rdma_pageset *list) +{ + u32 idx, sets = 0, i; + u32 pagecnt = 0; + void *v0, *v1, *vm1; + + if (!npages) + return 0; + for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) { + /* get a new v0 */ + v0 = page_address(pages[i]); + v1 = i + 1 < npages ? + page_address(pages[i + 1]) : NULL; + /* compare i, i + 1 vaddr */ + if (v1 != (v0 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + /* output v0,v1 as two pagesets */ + list[sets].idx = idx++; + list[sets++].count = 1; + if (v1) { + list[sets].count = 1; + list[sets++].idx = idx++; + } else { + list[sets++].count = 0; + } + vm1 = NULL; + pagecnt = 0; + continue; + } + /* i,i+1 consecutive, look at i-1,i */ + if (vm1 && v0 != (vm1 + PAGE_SIZE)) { + /* flush out pages */ + sets = tid_flush_pages(list, &idx, pagecnt, sets); + pagecnt = 0; + } + /* pages will always be a multiple of 8k */ + pagecnt += 2; + /* save i-1 */ + vm1 = v1; + /* move to next pair */ + } + /* dump residual pages at end */ + sets = tid_flush_pages(list, &idx, npages - idx, sets); + /* by design cannot be odd sets */ + WARN_ON(sets & 1); + return sets; +} + +/** + * Find pages for one segment of a sge array represented by @ss. The function + * does not check the sge, the sge must have been checked for alignment with a + * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of + * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge + * copy maintained in @ss->sge, the original sge is not modified. + * + * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not + * releasing the MR reference count at the same time. Otherwise, we'll "leak" + * references to the MR. This difference requires that we keep track of progress + * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request + * structure. 
+ */ +static u32 kern_find_pages(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + struct tid_rdma_request *req = flow->req; + struct rvt_sge *sge = &ss->sge; + u32 length = flow->req->seg_len; + u32 len = PAGE_SIZE; + u32 i = 0; + + while (length && req->isge < ss->num_sge) { + pages[i++] = virt_to_page(sge->vaddr); + + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (!sge->sge_length) { + if (++req->isge < ss->num_sge) + *sge = ss->sg_list[req->isge - 1]; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + ++sge->m; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + + flow->length = flow->req->seg_len - length; + *last = req->isge == ss->num_sge ? false : true; + return i; +} + +static void dma_unmap_flow(struct tid_rdma_flow *flow) +{ + struct hfi1_devdata *dd; + int i; + struct tid_rdma_pageset *pset; + + dd = flow->req->rcd->dd; + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count && pset->addr) { + dma_unmap_page(&dd->pcidev->dev, + pset->addr, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + pset->mapped = 0; + } + } +} + +static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages) +{ + int i; + struct hfi1_devdata *dd = flow->req->rcd->dd; + struct tid_rdma_pageset *pset; + + for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets; + i++, pset++) { + if (pset->count) { + pset->addr = dma_map_page(&dd->pcidev->dev, + pages[pset->idx], + 0, + PAGE_SIZE * pset->count, + DMA_FROM_DEVICE); + + if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) { + dma_unmap_flow(flow); + return -ENOMEM; + } + pset->mapped = 1; + } + } + return 0; +} + +static inline bool dma_mapped(struct tid_rdma_flow *flow) +{ + return !!flow->pagesets[0].mapped; +} + +/* + * Get pages pointers and identify contiguous physical memory chunks for a + * segment. All segments are of length flow->req->seg_len. + */ +static int kern_get_phys_blocks(struct tid_rdma_flow *flow, + struct page **pages, + struct rvt_sge_state *ss, bool *last) +{ + u8 npages; + + /* Reuse previously computed pagesets, if any */ + if (flow->npagesets) { + if (!dma_mapped(flow)) + return dma_map_flow(flow, pages); + return 0; + } + + npages = kern_find_pages(flow, pages, ss, last); + + if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096)) + flow->npagesets = + tid_rdma_find_phys_blocks_4k(flow, pages, npages, + flow->pagesets); + else + flow->npagesets = + tid_rdma_find_phys_blocks_8k(flow, pages, npages, + flow->pagesets); + + return dma_map_flow(flow, pages); +} + +static inline void kern_add_tid_node(struct tid_rdma_flow *flow, + struct hfi1_ctxtdata *rcd, char *s, + struct tid_group *grp, u8 cnt) +{ + struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++]; + + WARN_ON_ONCE(flow->tnode_cnt >= + (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT)); + if (WARN_ON_ONCE(cnt & 1)) + dd_dev_err(rcd->dd, + "unexpected odd allocation cnt %u map 0x%x used %u", + cnt, grp->map, grp->used); + + node->grp = grp; + node->map = grp->map; + node->cnt = cnt; +} + +/* + * Try to allocate pageset_count TID's from TID groups for a context + * + * This function allocates TID's without moving groups between lists or + * modifying grp->map. This is done as follows, being cogizant of the lists + * between which the TID groups will move: + * 1. 
First allocate complete groups of 8 TID's since this is more efficient, + * these groups will move from group->full without affecting used + * 2. If more TID's are needed allocate from used (will move from used->full or + * stay in used) + * 3. If we still don't have the required number of TID's go back and look again + * at a complete group (will move from group->used) + */ +static int kern_alloc_tids(struct tid_rdma_flow *flow) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + u32 ngroups, pageidx = 0; + struct tid_group *group = NULL, *used; + u8 use; + + flow->tnode_cnt = 0; + ngroups = flow->npagesets / dd->rcv_entries.group_size; + if (!ngroups) + goto used_list; + + /* First look at complete groups */ + list_for_each_entry(group, &rcd->tid_group_list.list, list) { + kern_add_tid_node(flow, rcd, "complete groups", group, + group->size); + + pageidx += group->size; + if (!--ngroups) + break; + } + + if (pageidx >= flow->npagesets) + goto ok; + +used_list: + /* Now look at partially used groups */ + list_for_each_entry(used, &rcd->tid_used_list.list, list) { + use = min_t(u32, flow->npagesets - pageidx, + used->size - used->used); + kern_add_tid_node(flow, rcd, "used groups", used, use); + + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; + } + + /* + * Look again at a complete group, continuing from where we left. + * However, if we are at the head, we have reached the end of the + * complete groups list from the first loop above + */ + if (group && &group->list == &rcd->tid_group_list.list) + goto bail_eagain; + group = list_prepare_entry(group, &rcd->tid_group_list.list, + list); + if (list_is_last(&group->list, &rcd->tid_group_list.list)) + goto bail_eagain; + group = list_next_entry(group, list); + use = min_t(u32, flow->npagesets - pageidx, group->size); + kern_add_tid_node(flow, rcd, "complete continue", group, use); + pageidx += use; + if (pageidx >= flow->npagesets) + goto ok; +bail_eagain: + return -EAGAIN; +ok: + return 0; +} + +static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num, + u32 *pset_idx) +{ + struct hfi1_ctxtdata *rcd = flow->req->rcd; + struct hfi1_devdata *dd = rcd->dd; + struct kern_tid_node *node = &flow->tnode[grp_num]; + struct tid_group *grp = node->grp; + struct tid_rdma_pageset *pset; + u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT; + u32 rcventry, npages = 0, pair = 0, tidctrl; + u8 i, cnt = 0; + + for (i = 0; i < grp->size; i++) { + rcventry = grp->base + i; + + if (node->map & BIT(i) || cnt >= node->cnt) { + rcv_array_wc_fill(dd, rcventry); + continue; + } + pset = &flow->pagesets[(*pset_idx)++]; + if (pset->count) { + hfi1_put_tid(dd, rcventry, PT_EXPECTED, + pset->addr, trdma_pset_order(pset)); + } else { + hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0); + } + npages += pset->count; + + rcventry -= rcd->expected_base; + tidctrl = pair ? 0x3 : rcventry & 0x1 ? 
0x2 : 0x1;
+		/*
+		 * A single TID entry will be used to use a rcvarray pair (with
+		 * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+		 * (b) the group map shows current and the next bits as free
+		 * indicating two consecutive rcvarray entries are available (c)
+		 * we actually need 2 more entries
+		 */
+		pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+			node->cnt >= cnt + 2;
+		if (!pair) {
+			if (!pset->count)
+				tidctrl = 0x1;
+			flow->tid_entry[flow->tidcnt++] =
+				EXP_TID_SET(IDX, rcventry >> 1) |
+				EXP_TID_SET(CTRL, tidctrl) |
+				EXP_TID_SET(LEN, npages);
+			/* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+			flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+			npages = 0;
+		}
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_full_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_group_list,
+				       &rcd->tid_used_list);
+
+		grp->used++;
+		grp->map |= BIT(i);
+		cnt++;
+	}
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+	struct hfi1_ctxtdata *rcd = flow->req->rcd;
+	struct hfi1_devdata *dd = rcd->dd;
+	struct kern_tid_node *node = &flow->tnode[grp_num];
+	struct tid_group *grp = node->grp;
+	u32 rcventry;
+	u8 i, cnt = 0;
+
+	for (i = 0; i < grp->size; i++) {
+		rcventry = grp->base + i;
+
+		if (node->map & BIT(i) || cnt >= node->cnt) {
+			rcv_array_wc_fill(dd, rcventry);
+			continue;
+		}
+
+		hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+		grp->used--;
+		grp->map &= ~BIT(i);
+		cnt++;
+
+		if (grp->used == grp->size - 1)
+			tid_group_move(grp, &rcd->tid_full_list,
+				       &rcd->tid_used_list);
+		else if (!grp->used)
+			tid_group_move(grp, &rcd->tid_used_list,
+				       &rcd->tid_group_list);
+	}
+	if (WARN_ON_ONCE(cnt & 1)) {
+		struct hfi1_ctxtdata *rcd = flow->req->rcd;
+		struct hfi1_devdata *dd = rcd->dd;
+
+		dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+			   cnt, grp->map, grp->used);
+	}
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+	u32 pset_idx = 0;
+	int i;
+
+	flow->npkts = 0;
+	flow->tidcnt = 0;
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_program_rcv_group(flow, i, &pset_idx);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and contiguous physical chunks constituting one segment
+ *     of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ *     TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ *     to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queuing the QP when TID/flow resources are not
+ *     available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of the function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains the number of packets required to send the
+ * segment.
+ * + * hfi1_check_sge_align should be called prior to calling this function and if + * it signals error TID RDMA cannot be used for this sge and this function + * should not be called. + * + * For the queuing, caller must hold the flow->req->qp s_lock from the send + * engine and the function will procure the exp_lock. + * + * Return: + * The function returns -EAGAIN if sufficient number of TID/flow resources to + * map the segment could not be allocated. In this case the function should be + * called again with previous arguments to retry the TID allocation. There are + * no other error returns. The function returns 0 on success. + */ +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last) + __must_hold(&req->qp->s_lock) +{ + struct tid_rdma_flow *flow = &req->flows[req->setup_head]; + struct hfi1_ctxtdata *rcd = req->rcd; + struct hfi1_qp_priv *qpriv = req->qp->priv; + unsigned long flags; + struct rvt_qp *fqp; + u16 clear_tail = req->clear_tail; + + lockdep_assert_held(&req->qp->s_lock); + /* + * We return error if either (a) we don't have space in the flow + * circular buffer, or (b) we already have max entries in the buffer. + * Max entries depend on the type of request we are processing and the + * negotiated TID RDMA parameters. + */ + if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) || + CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >= + req->n_flows) + return -EINVAL; + + /* + * Get pages, identify contiguous physical memory chunks for the segment + * If we can not determine a DMA address mapping we will treat it just + * like if we ran out of space above. + */ + if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) { + hfi1_wait_kmem(flow->req->qp); + return -ENOMEM; + } + + spin_lock_irqsave(&rcd->exp_lock, flags); + if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp)) + goto queue; + + /* + * At this point we know the number of pagesets and hence the number of + * TID's to map the segment. Allocate the TID's from the TID groups. If + * we cannot allocate the required number we exit and try again later + */ + if (kern_alloc_tids(flow)) + goto queue; + /* + * Finally program the TID entries with the pagesets, compute the + * tidarray and enable the HW flow + */ + kern_program_rcvarray(flow); + + /* + * Setup the flow state with relevant information. + * This information is used for tracking the sequence of data packets + * for the segment. + * The flow is setup here as this is the most accurate time and place + * to do so. Doing at a later time runs the risk of the flow data in + * qpriv getting out of sync. 
+	 */
+	memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+	flow->idx = qpriv->flow_state.index;
+	flow->flow_state.generation = qpriv->flow_state.generation;
+	flow->flow_state.spsn = qpriv->flow_state.psn;
+	flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+	flow->flow_state.r_next_psn =
+		full_flow_psn(flow, flow->flow_state.spsn);
+	qpriv->flow_state.psn += flow->npkts;
+
+	dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	tid_rdma_schedule_tid_wakeup(fqp);
+
+	req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+	return 0;
+queue:
+	queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+	return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+	flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for
+ * a TID RDMA request are set up and cleared in FIFO order, which is managed
+ * using a circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+	struct hfi1_ctxtdata *rcd = req->rcd;
+	unsigned long flags;
+	int i;
+	struct rvt_qp *fqp;
+
+	lockdep_assert_held(&req->qp->s_lock);
+	/* Exit if we have nothing in the flow circular buffer */
+	if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+		return -EINVAL;
+
+	spin_lock_irqsave(&rcd->exp_lock, flags);
+
+	for (i = 0; i < flow->tnode_cnt; i++)
+		kern_unprogram_rcv_group(flow, i);
+	/* To prevent double unprogramming */
+	flow->tnode_cnt = 0;
+	/* get head before dropping lock */
+	fqp = first_qp(rcd, &rcd->rarr_queue);
+	spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+	dma_unmap_flow(flow);
+
+	hfi1_tid_rdma_reset_flow(flow);
+	req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+	if (fqp == req->qp) {
+		__trigger_tid_waiter(fqp);
+		rvt_put_qp(fqp);
+	} else {
+		tid_rdma_schedule_tid_wakeup(fqp);
+	}
+
+	return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+	__must_hold(&req->qp->s_lock)
+{
+	/* Use memory barrier for proper ordering */
+	while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+		if (hfi1_kern_exp_rcv_clear(req))
+			break;
+	}
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+	kfree(req->flows);
+	req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue pair
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+	struct hfi1_swqe_priv *p = wqe->priv;
+
+	hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */ +static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req, + gfp_t gfp) +{ + struct tid_rdma_flow *flows; + int i; + + if (likely(req->flows)) + return 0; + flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp, + req->rcd->numa_id); + if (!flows) + return -ENOMEM; + /* mini init */ + for (i = 0; i < MAX_FLOWS; i++) { + flows[i].req = req; + flows[i].npagesets = 0; + flows[i].pagesets[0].mapped = 0; + } + req->flows = flows; + return 0; +} + +static void hfi1_init_trdma_req(struct rvt_qp *qp, + struct tid_rdma_request *req) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + /* + * Initialize various TID RDMA request variables. + * These variables are "static", which is why they + * can be pre-initialized here before the WRs has + * even been submitted. + * However, non-NULL values for these variables do not + * imply that this WQE has been enabled for TID RDMA. + * Drivers should check the WQE's opcode to determine + * if a request is a TID RDMA one or not. + */ + req->qp = qp; + req->rcd = qpriv->rcd; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3bc0aaf9568f..524baf8c8fac 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -6,7 +6,16 @@ #ifndef HFI1_TID_RDMA_H #define HFI1_TID_RDMA_H +#include +#include "common.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + #define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */ +#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT) struct tid_rdma_params { struct rcu_head rcu_head; @@ -36,6 +45,81 @@ struct tid_flow_state { u8 flags; }; +struct tid_rdma_request { + struct rvt_qp *qp; + struct hfi1_ctxtdata *rcd; + union { + struct rvt_swqe *swqe; + struct rvt_ack_entry *ack; + } e; + + struct tid_rdma_flow *flows; /* array of tid flows */ + u16 n_flows; /* size of the flow buffer window */ + u16 setup_head; /* flow index we are setting up */ + u16 clear_tail; /* flow index we are clearing */ + u16 flow_idx; /* flow index most recently set up */ + + u32 seg_len; + + u32 isge; /* index of "current" sge */ +}; + +/* + * When header suppression is used, PSNs associated with a "flow" are + * relevant (and not the PSNs maintained by verbs). Track per-flow + * PSNs here for a TID RDMA segment. + * + */ +struct flow_state { + u32 flags; + u32 resp_ib_psn; /* The IB PSN of the response for this flow */ + u32 generation; /* generation of flow */ + u32 spsn; /* starting PSN in TID space */ + u32 lpsn; /* last PSN in TID space */ + u32 r_next_psn; /* next PSN to be received (in TID space) */ +}; + +struct tid_rdma_pageset { + dma_addr_t addr : 48; /* Only needed for the first page */ + u8 idx: 8; + u8 count : 7; + u8 mapped: 1; +}; + +/** + * kern_tid_node - used for managing TID's in TID groups + * + * @grp_idx: rcd relative index to tid_group + * @map: grp->map captured prior to programming this TID group in HW + * @cnt: Only @cnt of available group entries are actually programmed + */ +struct kern_tid_node { + struct tid_group *grp; + u8 map; + u8 cnt; +}; + +/* Overall info for a TID RDMA segment */ +struct tid_rdma_flow { + /* + * While a TID RDMA segment is being transferred, it uses a QP number + * from the "KDETH section of QP numbers" (which is different from the + * QP number that originated the request). 
Bits 11-15 of these QP + * numbers identify the "TID flow" for the segment. + */ + struct flow_state flow_state; + struct tid_rdma_request *req; + u32 length; + u8 tnode_cnt; + u8 tidcnt; + u8 idx; + u8 npagesets; + u8 npkts; + struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; + struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; + u32 tid_entry[TID_RDMA_MAX_PAGES]; +}; + bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data); bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data); bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data); @@ -43,6 +127,23 @@ void tid_rdma_conn_error(struct rvt_qp *qp); void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p); int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit); +int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req, + struct rvt_sge_state *ss, bool *last); +int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req); +void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req); +void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +/** + * trdma_clean_swqe - clean flows for swqe if large send queue + * @qp: the qp + * @wqe: the send wqe + */ +static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) +{ + if (!wqe->priv) + return; + __trdma_clean_swqe(qp, wqe); +} int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp, struct ib_qp_init_attr *init_attr); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h index e383cc01a2bf..43b105de1d54 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -48,7 +48,6 @@ */ #include "hfi.h" - #include "exp_rcv.h" struct tid_pageset { diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 571bfd549c2a..02c1873a976c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -504,11 +504,28 @@ static void verbs_sdma_complete( hfi1_put_txreq(tx); } +void hfi1_wait_kmem(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_qp *ibqp = &qp->ibqp; + struct ib_device *ibdev = ibqp->device; + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + priv->s_iowait.lock = &dev->iowait_lock; + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + rvt_get_qp(qp); + } +} + static int wait_kmem(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { - struct hfi1_qp_priv *priv = qp->priv; unsigned long flags; int ret = 0; @@ -517,15 +534,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, &ps->wait->tx_head); - if (list_empty(&priv->s_iowait.list)) { - if (list_empty(&dev->memwait)) - mod_timer(&dev->mem_timer, jiffies + 1); - qp->s_flags |= RVT_S_WAIT_KMEM; - list_add_tail(&priv->s_iowait.list, &dev->memwait); - priv->s_iowait.lock = &dev->iowait_lock; - trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); - rvt_get_qp(qp); - } + hfi1_wait_kmem(qp); write_sequnlock(&dev->iowait_lock); hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 9065e470bebb..94f198b47239 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -159,6 +159,7 @@ struct hfi1_qp_priv { struct sdma_engine *s_sde; /* current sde 
*/ struct send_context *s_sendcontext; /* current sendcontext */ struct hfi1_ctxtdata *rcd; /* QP's receive context */ + struct page **pages; /* for TID page scan */ u32 tid_enqueue; /* saved when tid waited */ u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; @@ -173,6 +174,14 @@ struct hfi1_qp_priv { u8 timeout_shift; /* account for number of packets per segment */ }; +struct hfi1_swqe_priv { + struct tid_rdma_request tid_req; +}; + +struct hfi1_ack_priv { + struct tid_rdma_request tid_req; +}; + /* * This structure is used to hold commonly lookedup and computed values during * the send engine progress. @@ -321,6 +330,21 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +/* + * Look through all the active flows for a TID RDMA request and find + * the one (if it exists) that contains the specified PSN. + */ +static inline u32 __full_flow_psn(struct flow_state *state, u32 psn) +{ + return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) | + (psn & HFI1_KDETH_BTH_SEQ_MASK)); +} + +static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn) +{ + return __full_flow_psn(&flow->flow_state, psn); +} + struct verbs_txreq; void hfi1_put_txreq(struct verbs_txreq *tx); @@ -403,6 +427,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); } +void hfi1_wait_kmem(struct rvt_qp *qp); + +static inline void hfi1_trdma_send_complete(struct rvt_qp *qp, + struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + trdma_clean_swqe(qp, wqe); + rvt_send_complete(qp, wqe, status); +} + extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; extern const u8 hdr_len_by_opcode[]; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 16247d2a671d..c8e70cf69a8a 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp) kref_put(&qp->ip->ref, rvt_release_mmap_info); else vfree(qp->r_rq.wq); - vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + vfree(qp->s_wq); kfree(qp); return 0; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56a9221378d9..9095a0b71250 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -174,6 +174,7 @@ struct rvt_swqe { u32 lpsn; /* last packet sequence number */ u32 ssn; /* send sequence number */ u32 length; /* total length of data in sg_list */ + void *priv; /* driver dependent field */ struct rvt_sge sg_list[0]; }; @@ -235,6 +236,7 @@ struct rvt_ack_entry { u32 lpsn; u8 opcode; u8 sent; + void *priv; }; #define RC_QP_SCALING_INTERVAL 5 -- cgit v1.2.3 From 742a3826cf82395e304df99f6494d04b0dd03a84 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:40 -0800 Subject: IB/hfi1: Add functions to build TID RDMA READ request This patch adds the helper functions to build the TID RDMA READ request on the requester side. The key is to allocate TID resources (TID flow and TID entries) and send the resource information to the responder side along with the read request. Since the TID resources are limited, each TID RDMA READ request has to be split into segments with a default segment size of 256K. A software flow is allocated to track the data transaction for each segment. 
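As a quick illustration of the segmentation arithmetic (a hypothetical stand-alone sketch, not driver code; the 2 MiB request, 256K segment size, and 4K MTU are example values):

#include <stdio.h>

int main(void)
{
	unsigned int len = 2u * 1024 * 1024;	/* a 2 MiB TID RDMA READ */
	unsigned int seg_len = 256u * 1024;	/* default segment size */
	unsigned int pmtu = 4096;		/* assumed path MTU */

	unsigned int nsegs = (len + seg_len - 1) / seg_len;
	unsigned int pkts = (seg_len + pmtu - 1) / pmtu;

	/* one software flow tracks the data transaction of each segment */
	printf("%u segments, %u response packets per segment\n", nsegs, pkts);
	return 0;
}

So a 2 MiB read is issued as 8 segments of 64 response packets each, with the per-segment flow state recycled as segments complete.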
The work request opcode, packet opcode, and packet formats for TID RDMA READ protocol are also defined in this patch. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/tid_rdma.c | 200 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 32 ++++++ drivers/infiniband/hw/hfi1/verbs.h | 9 ++ include/rdma/ib_hdrs.h | 9 +- include/rdma/tid_rdma_defs.h | 52 +++++++++ 5 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 include/rdma/tid_rdma_defs.h (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 506b5a59ded5..56c8c10b5a85 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -6,11 +6,27 @@ #include "hfi.h" #include "qp.h" +#include "rc.h" #include "verbs.h" #include "tid_rdma.h" #include "exp_rcv.h" #include "trace.h" +/** + * DOC: TID RDMA READ protocol + * + * This is an end-to-end protocol at the hfi1 level between two nodes that + * improves performance by avoiding data copy on the requester side. It + * converts a qualified RDMA READ request into a TID RDMA READ request on + * the requester side and thereafter handles the request and response + * differently. To be qualified, the RDMA READ request should meet the + * following: + * -- The total data length should be greater than 256K; + * -- The total data length should be a multiple of 4K page size; + * -- Each local scatter-gather entry should be 4K page aligned; + * -- Each local scatter-gather entry should be a multiple of 4K page size; + */ + #define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32) #define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33) #define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34) @@ -18,6 +34,9 @@ #define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37) #define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38) +/* Maximum number of packets within a flow generation. 
*/ +#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT) + #define GENERATION_MASK 0xFFFFF static u32 mask_generation(u32 a) @@ -45,6 +64,9 @@ static u32 mask_generation(u32 a) #define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE) +#define TID_RDMA_DESTQP_FLOW_SHIFT 11 +#define TID_RDMA_DESTQP_FLOW_MASK 0x1f + #define TID_OPFN_QP_CTXT_MASK 0xff #define TID_OPFN_QP_CTXT_SHIFT 56 #define TID_OPFN_QP_KDETH_MASK 0xff @@ -1597,3 +1619,181 @@ u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, return dd->verbs_dev.n_tidwait; } + +/* TID RDMA READ functions */ +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) +{ + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = &req->flows[req->flow_idx]; + struct rvt_qp *qp = req->qp; + struct hfi1_qp_priv *qpriv = qp->priv; + struct hfi1_swqe_priv *wpriv = wqe->priv; + struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req; + struct tid_rdma_params *remote; + u32 req_len = 0; + void *req_addr = NULL; + + /* This is the IB psn used to send the request */ + *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt); + + /* TID Entries for TID RDMA READ payload */ + req_addr = &flow->tid_entry[flow->tid_idx]; + req_len = sizeof(*flow->tid_entry) * + (flow->tidcnt - flow->tid_idx); + + memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req)); + wpriv->ss.sge.vaddr = req_addr; + wpriv->ss.sge.sge_length = req_len; + wpriv->ss.sge.length = wpriv->ss.sge.sge_length; + /* + * We can safely zero these out. Since the first SGE covers the + * entire packet, nothing else should even look at the MR. + */ + wpriv->ss.sge.mr = NULL; + wpriv->ss.sge.m = 0; + wpriv->ss.sge.n = 0; + + wpriv->ss.sg_list = NULL; + wpriv->ss.total_len = wpriv->ss.sge.sge_length; + wpriv->ss.num_sge = 1; + + /* Construct the TID RDMA READ REQ packet header */ + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + + KDETH_RESET(rreq->kdeth0, KVER, 0x1); + KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey); + rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr + + req->cur_seg * req->seg_len + flow->sent); + rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey); + rreq->reth.length = cpu_to_be32(*len); + rreq->tid_flow_psn = + cpu_to_be32((flow->flow_state.generation << + HFI1_KDETH_BTH_SEQ_SHIFT) | + ((flow->flow_state.spsn + flow->pkt) & + HFI1_KDETH_BTH_SEQ_MASK)); + rreq->tid_flow_qp = + cpu_to_be32(qpriv->tid_rdma.local.qp | + ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) << + TID_RDMA_DESTQP_FLOW_SHIFT) | + qpriv->rcd->ctxt); + rreq->verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + *bth2 |= IB_BTH_REQ_ACK; + rcu_read_unlock(); + + /* We are done with this segment */ + flow->sent += *len; + req->cur_seg++; + qp->s_state = TID_OP(READ_REQ); + req->ack_pending++; + req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1); + qpriv->pending_tid_r_segs++; + qp->s_num_rd_atomic++; + + /* Set the TID RDMA READ request payload size */ + *len = req_len; + + return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32); +} + +/* + * @len: contains the data length to read upon entry and the read request + * payload length upon exit. 
+ */ +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len) + __must_hold(&qp->s_lock) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_flow *flow = NULL; + u32 hdwords = 0; + bool last; + bool retry = true; + u32 npkts = rvt_div_round_up_mtu(qp, *len); + + /* + * Check sync conditions. Make sure that there are no pending + * segments before freeing the flow. + */ +sync_check: + if (req->state == TID_REQUEST_SYNC) { + if (qpriv->pending_tid_r_segs) + goto done; + + hfi1_kern_clear_hw_flow(req->rcd, qp); + req->state = TID_REQUEST_ACTIVE; + } + + /* + * If the request for this segment is resent, the tid resources should + * have been allocated before. In this case, req->flow_idx should + * fall behind req->setup_head. + */ + if (req->flow_idx == req->setup_head) { + retry = false; + if (req->state == TID_REQUEST_RESEND) { + /* + * This is the first new segment for a request whose + * earlier segments have been re-sent. We need to + * set up the sge pointer correctly. + */ + restart_sge(&qp->s_sge, wqe, req->s_next_psn, + qp->pmtu); + req->isge = 0; + req->state = TID_REQUEST_ACTIVE; + } + + /* + * Check sync. The last PSN of each generation is reserved for + * RESYNC. + */ + if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) { + req->state = TID_REQUEST_SYNC; + goto sync_check; + } + + /* Allocate the flow if not yet */ + if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp)) + goto done; + + /* + * The following call will advance req->setup_head after + * allocating the tid entries. + */ + if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) { + req->state = TID_REQUEST_QUEUED; + + /* + * We don't have resources for this segment. The QP has + * already been queued. 
+ */ + goto done; + } + } + + /* req->flow_idx should only be one slot behind req->setup_head */ + flow = &req->flows[req->flow_idx]; + flow->pkt = 0; + flow->tid_idx = 0; + flow->sent = 0; + if (!retry) { + /* Set the first and last IB PSN for the flow in use.*/ + flow->flow_state.ib_spsn = req->s_next_psn; + flow->flow_state.ib_lpsn = + flow->flow_state.ib_spsn + flow->npkts - 1; + } + + /* Calculate the next segment start psn.*/ + req->s_next_psn += flow->npkts; + + /* Build the packet header */ + hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len); +done: + return hdwords; +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index 3dbeaa8cb5b3..f692f3ff9419 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -45,6 +45,19 @@ struct tid_flow_state { u8 flags; }; +enum tid_rdma_req_state { + TID_REQUEST_INACTIVE = 0, + TID_REQUEST_INIT, + TID_REQUEST_INIT_RESEND, + TID_REQUEST_ACTIVE, + TID_REQUEST_RESEND, + TID_REQUEST_RESEND_ACTIVE, + TID_REQUEST_QUEUED, + TID_REQUEST_SYNC, + TID_REQUEST_RNR_NAK, + TID_REQUEST_COMPLETE, +}; + struct tid_rdma_request { struct rvt_qp *qp; struct hfi1_ctxtdata *rcd; @@ -60,8 +73,13 @@ struct tid_rdma_request { u16 flow_idx; /* flow index most recently set up */ u32 seg_len; + u32 s_next_psn; /* IB PSN of next segment start for read */ + u32 cur_seg; /* index of current segment */ u32 isge; /* index of "current" sge */ + u32 ack_pending; /* num acks pending for this request */ + + enum tid_rdma_req_state state; }; /* @@ -77,6 +95,10 @@ struct flow_state { u32 spsn; /* starting PSN in TID space */ u32 lpsn; /* last PSN in TID space */ u32 r_next_psn; /* next PSN to be received (in TID space) */ + + /* For tid rdma read */ + u32 ib_spsn; /* starting PSN in Verbs space */ + u32 ib_lpsn; /* last PSn in Verbs space */ }; struct tid_rdma_pageset { @@ -110,11 +132,14 @@ struct tid_rdma_flow { struct flow_state flow_state; struct tid_rdma_request *req; u32 length; + u32 sent; u8 tnode_cnt; u8 tidcnt; + u8 tid_idx; u8 idx; u8 npagesets; u8 npkts; + u8 pkt; struct kern_tid_node tnode[TID_RDMA_MAX_PAGES]; struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES]; u32 tid_entry[TID_RDMA_MAX_PAGES]; @@ -159,4 +184,11 @@ struct cntr_entry; u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry, void *context, int vl, int mode, u64 data); +u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); +u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, u32 *bth1, + u32 *bth2, u32 *len); + #endif /* HFI1_TID_RDMA_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 20729454f181..2965b0957855 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -170,12 +170,16 @@ struct hfi1_qp_priv { struct rvt_qp *owner; u8 hdr_type; /* 9B or 16B */ unsigned long tid_timer_timeout_jiffies; + + /* For TID RDMA READ */ + u32 pending_tid_r_segs; /* Num of pending tid read segments */ u16 pkts_ps; /* packets per segment */ u8 timeout_shift; /* account for number of packets per segment */ }; struct hfi1_swqe_priv { struct tid_rdma_request tid_req; + struct rvt_sge_state ss; /* Used for TID RDMA READ Request */ }; struct hfi1_ack_priv { @@ -331,6 +335,11 @@ static inline u32 delta_psn(u32 a, u32 b) return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; } +static inline struct tid_rdma_request 
*wqe_to_tid_req(struct rvt_swqe *wqe) +{ + return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req; +} + /* * Look through all the active flows for a TID RDMA request and find * the one (if it exists) that contains the specified PSN. diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 6e35416170a3..58a0a0f99e7f 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,6 +100,8 @@ struct ib_atomic_eth { __be64 compare_data; /* potentially unaligned */ } __packed; +#include + union ib_ehdrs { struct { __be32 deth[2]; @@ -117,6 +119,11 @@ union ib_ehdrs { __be32 aeth; __be32 ieth; struct ib_atomic_eth atomic_eth; + /* TID RDMA headers */ + union { + struct tid_rdma_read_req r_req; + struct tid_rdma_read_resp r_rsp; + } tid_rdma; } __packed; struct ib_other_headers { diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h new file mode 100644 index 000000000000..1c431ea32b52 --- /dev/null +++ b/include/rdma/tid_rdma_defs.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ + +#ifndef TID_RDMA_DEFS_H +#define TID_RDMA_DEFS_H + +#include + +struct tid_rdma_read_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_read_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[4]; + __be32 verbs_psn; + __be32 verbs_qp; +}; + +/* + * TID RDMA Opcodes + */ +#define IB_OPCODE_TID_RDMA 0xe0 +enum { + IB_OPCODE_READ_REQ = 0x4, + IB_OPCODE_READ_RESP = 0x5, + + IB_OPCODE(TID_RDMA, READ_REQ), + IB_OPCODE(TID_RDMA, READ_RESP), +}; + +#define TID_OP(x) IB_OPCODE_TID_RDMA_##x + +/* + * Define TID RDMA specific WR opcodes. The ib_wr_opcode + * enum already provides some reserved values for use by + * low level drivers. Two of those are used but renamed + * to be more descriptive. + */ +#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 + +#endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3 From 039cd3daf19b9acbf080054d765cbceac842b6a0 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:57 -0800 Subject: IB/hfi1: Increment the retry timeout value for TID RDMA READ request The RC retry timeout value is based on the estimated time for the response packet to come back. However, for TID RDMA READ request, due to the use of header suppression, the driver is normally not notified for each incoming response packet until the last TID RDMA READ response packet. Consequently, the retry timeout value should be extended to cover the transaction time for the entire length of a segment (default 256K) instead of that for a single packet. This patch addresses the issue by introducing new retry timer functions to account for multiple packets and wrapper functions for backward compatibility. 
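A minimal sketch of that scaling (hypothetical user-space C, not the rdmavt code; the base timeout value is an assumed example): the per-packet retry timeout is left-shifted so it covers every suppressed response packet of a segment, e.g. shift = 6 for a 64-packet segment.

#include <stdio.h>

int main(void)
{
	unsigned long timeout_jiffies = 8;	/* assumed per-packet timeout */
	unsigned int pkts_per_seg = 64;		/* 256K segment / 4K MTU */
	unsigned int shift = 0;

	/* smallest shift with (1 << shift) >= pkts_per_seg */
	while ((1u << shift) < pkts_per_seg)
		shift++;

	/* mirrors the new expiry: jiffies + busy + (timeout_jiffies << shift) */
	printf("shift=%u: timeout %lu -> %lu jiffies\n",
	       shift, timeout_jiffies, timeout_jiffies << shift);
	return 0;
}

The wrappers keep shift = 0, so existing callers of rvt_add_retry_timer() and rvt_mod_retry_timer() see unchanged behavior.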
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 11 ++++++----- include/rdma/rdma_vt.h | 12 +++++++++--- include/rdma/rdmavt_qp.h | 6 +++++- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index c8e70cf69a8a..2769ebdf89fb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2393,11 +2393,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth) } /* - * rvt_add_retry_timer - add/start a retry timer + * rvt_add_retry_timer_ext - add/start a retry timer * @qp - the QP + * @shift - timeout shift to wait for multiple packets * add a retry timer on the QP */ -void rvt_add_retry_timer(struct rvt_qp *qp) +void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift) { struct ib_qp *ibqp = &qp->ibqp; struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); @@ -2405,11 +2406,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_TIMER; /* 4.096 usec. * (1 << qp->timeout) */ - qp->s_timer.expires = jiffies + qp->timeout_jiffies + - rdi->busy_jiffies; + qp->s_timer.expires = jiffies + rdi->busy_jiffies + + (qp->timeout_jiffies << shift); add_timer(&qp->s_timer); } -EXPORT_SYMBOL(rvt_add_retry_timer); +EXPORT_SYMBOL(rvt_add_retry_timer_ext); /** * rvt_add_rnr_timer - add/start an rnr timer diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 168e40be183c..87d66c9630d7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -574,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, /** * rvt_mod_retry_timer - mod a retry timer * @qp - the QP + * @shift - timeout shift to wait for multiple packets * Modify a potentially already running retry timer */ -static inline void rvt_mod_retry_timer(struct rvt_qp *qp) +static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift) { struct ib_qp *ibqp = &qp->ibqp; struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); @@ -584,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_TIMER; /* 4.096 usec. 
* (1 << qp->timeout) */ - mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies + - rdi->busy_jiffies); + mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies + + (qp->timeout_jiffies << shift)); +} + +static inline void rvt_mod_retry_timer(struct rvt_qp *qp) +{ + return rvt_mod_retry_timer_ext(qp, 0); } struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 9095a0b71250..d8d88d023092 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -688,7 +688,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t); void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth); void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); -void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift); +static inline void rvt_add_retry_timer(struct rvt_qp *qp) +{ + rvt_add_retry_timer_ext(qp, 0); +} void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, void *data, u32 length, -- cgit v1.2.3 From c098bbb00cd1986cbb58ed1712643f80ed00fcc3 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:28 -0800 Subject: IB/hfi1: Build TID RDMA WRITE request This patch adds the functions to build TID RDMA WRITE request. The work request opcode, packet opcode, and packet formats for TID RDMA WRITE protocol are also defined in this patch. Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.h | 2 ++ drivers/infiniband/hw/hfi1/tid_rdma.c | 38 ++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/tid_rdma.h | 3 ++ include/rdma/ib_hdrs.h | 5 ++++ include/rdma/tid_rdma_defs.h | 56 +++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index ce25a27aa4a1..f74e2509e8b9 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -64,12 +64,14 @@ extern const struct rvt_operation_params hfi1_post_parms[]; * HFI1_S_AHG_CLEAR - have send engine clear ahg state * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource + * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 */ #define HFI1_S_AHG_VALID 0x80000000 #define HFI1_S_AHG_CLEAR 0x40000000 #define HFI1_S_WAIT_PIO_DRAIN 0x20000000 #define HFI1_S_WAIT_TID_SPACE 0x10000000 +#define HFI1_S_WAIT_TID_RESP 0x08000000 #define HFI1_S_MIN_BIT_MASK 0x01000000 /* diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 0ee79403acaf..089e301d9bcd 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2975,3 +2975,41 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe) exit: rcu_read_unlock(); } + +/* TID RDMA WRITE functions */ + +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + struct tid_rdma_request *req = wqe_to_tid_req(wqe); + struct tid_rdma_params *remote; + + rcu_read_lock(); + remote = rcu_dereference(qpriv->tid_rdma.remote); + /* + * Set the number of flow to be used based on negotiated + * parameters. 
+ */ + req->n_flows = remote->max_write; + req->state = TID_REQUEST_ACTIVE; + + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1); + KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey); + ohdr->u.tid_rdma.w_req.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len)); + ohdr->u.tid_rdma.w_req.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len); + ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn); + *bth1 &= ~RVT_QPN_MASK; + *bth1 |= remote->qp; + qp->s_state = TID_OP(WRITE_REQ); + qp->s_flags |= HFI1_S_WAIT_TID_RESP; + *bth2 |= IB_BTH_REQ_ACK; + *len = 0; + + rcu_read_unlock(); + return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32); +} diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h index a53598ce45b2..baba539b2b80 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.h +++ b/drivers/infiniband/hw/hfi1/tid_rdma.h @@ -233,4 +233,7 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp, setup_tid_rdma_wqe(qp, wqe); } +u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u32 *bth1, u32 *bth2, u32 *len); #endif /* HFI1_TID_RDMA_H */ diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 58a0a0f99e7f..9a90bd031e8c 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -123,6 +123,11 @@ union ib_ehdrs { union { struct tid_rdma_read_req r_req; struct tid_rdma_read_resp r_rsp; + struct tid_rdma_write_req w_req; + struct tid_rdma_write_resp w_rsp; + struct tid_rdma_write_data w_data; + struct tid_rdma_resync resync; + struct tid_rdma_ack ack; } tid_rdma; } __packed; diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h index 1c431ea32b52..08fe47c7ad2c 100644 --- a/include/rdma/tid_rdma_defs.h +++ b/include/rdma/tid_rdma_defs.h @@ -27,16 +27,71 @@ struct tid_rdma_read_resp { __be32 verbs_qp; }; +struct tid_rdma_write_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 reserved[2]; + __be32 verbs_qp; +}; + +struct tid_rdma_write_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[3]; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_write_data { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_resync { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_ack { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[2]; + __be32 tid_flow_psn; + __be32 verbs_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + /* * TID RDMA Opcodes */ #define IB_OPCODE_TID_RDMA 0xe0 enum { + IB_OPCODE_WRITE_REQ = 0x0, + IB_OPCODE_WRITE_RESP = 0x1, + IB_OPCODE_WRITE_DATA = 0x2, + IB_OPCODE_WRITE_DATA_LAST = 0x3, IB_OPCODE_READ_REQ = 0x4, IB_OPCODE_READ_RESP = 0x5, + IB_OPCODE_RESYNC = 0x6, + IB_OPCODE_ACK = 0x7, + IB_OPCODE(TID_RDMA, WRITE_REQ), + IB_OPCODE(TID_RDMA, WRITE_RESP), + IB_OPCODE(TID_RDMA, WRITE_DATA), + IB_OPCODE(TID_RDMA, WRITE_DATA_LAST), IB_OPCODE(TID_RDMA, READ_REQ), IB_OPCODE(TID_RDMA, READ_RESP), + IB_OPCODE(TID_RDMA, RESYNC), + IB_OPCODE(TID_RDMA, ACK), }; #define TID_OP(x) IB_OPCODE_TID_RDMA_##x @@ -47,6 +102,7 @@ enum { * low level drivers. Two of those are used but renamed * to be more descriptive. 
*/ +#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1 #define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 #endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3 From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:48 -0800 Subject: IB/hfi1: Add an s_acked_ack_queue pointer The s_ack_queue is managed by two pointers into the ring: r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of where the next received request is going to be placed and s_tail_ack_queue is the entry of the request currently being processed. This works perfectly fine for normal Verbs as the requests are processed one at a time and the s_tail_ack_queue is not moved until the request that it points to is fully completed. In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and the two pointers can easily be used to determine "queue full" and "queue empty" conditions. The detection of these two conditions is important in determining when an old entry can safely be overwritten with a new received request and the resources associated with the old request can safely be released. When pipelined TID RDMA WRITE is introduced into this mix, things look very different. r_head_ack_queue is still the point at which a newly received request will be inserted, and s_tail_ack_queue is still the currently processed request. However, with pipelined TID RDMA WRITE requests, s_tail_ack_queue moves to the next request once all TID RDMA WRITE responses for that request have been sent. The rest of the protocol for a particular request is managed by other pointers specific to TID RDMA - r_tid_tail and r_tid_ack - which point to the entries for which the next TID RDMA DATA packets are going to arrive and the request for which the next TID RDMA ACK packets are to be generated, respectively. What this means is that entries in the ring which are "behind" s_tail_ack_queue (entries which s_tail_ack_queue has gone past) can no longer be assumed to be complete. This is where the problem is - a newly received request could potentially overwrite a still active TID RDMA WRITE request. The reason why the TID RDMA pointers trail s_tail_ack_queue is that the normal Verbs send engine uses s_tail_ack_queue as the pointer for the next response. Since TID RDMA WRITE responses are processed by the normal Verbs send engine, s_tail_ack_queue had to be moved to the next entry once all TID RDMA WRITE response packets were sent, to get the desired pipelining between requests. Doing otherwise would mean that the normal Verbs send engine would not be able to send the TID RDMA WRITE responses for the next TID RDMA request until the current one is fully completed. This patch introduces the s_acked_ack_queue index to point to the next request to complete on the responder side. For requests other than TID RDMA WRITE, s_acked_ack_queue should always be kept in sync with s_tail_ack_queue. For a TID RDMA WRITE request, it may fall behind s_tail_ack_queue.
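Before this change, a two-pointer ring sufficed: a slot could be reused as soon as s_tail_ack_queue had moved past it. With pipelined TID RDMA WRITE, the "free" test moves to the new pointer. A condensed, self-contained sketch of the rule (toy model only; the real fields live in struct rvt_qp in the diff below):

#include <stdint.h>

/* Toy model of the three-pointer ack ring */
struct ack_ring {
	uint8_t r_head_ack_queue;  /* where the next request lands */
	uint8_t s_tail_ack_queue;  /* request currently being processed */
	uint8_t s_acked_ack_queue; /* next request to complete (new) */
};

static void advance_tail(struct ack_ring *q, uint8_t size,
			 int tid_write_active)
{
	uint8_t next = (uint8_t)((q->s_tail_ack_queue + 1) % size);

	/* Drag the completion pointer along only when nothing is active */
	if (!tid_write_active && q->s_acked_ack_queue == q->s_tail_ack_queue)
		q->s_acked_ack_queue = next;
	q->s_tail_ack_queue = next;
}

/* A new request may claim slot n only when n != q->s_acked_ack_queue. */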
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 33 ++++++++++++++++++++++++++++----- drivers/infiniband/hw/hfi1/rc.h | 1 + drivers/infiniband/hw/hfi1/tid_rdma.c | 2 ++ drivers/infiniband/hw/hfi1/trace_tid.h | 10 ++++++++-- drivers/infiniband/sw/rdmavt/qp.c | 1 + include/rdma/rdmavt_qp.h | 1 + 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 6c9ef572fc69..9dc8e524510e 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -120,6 +120,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_qp_priv *priv = qp->priv; bool last_pkt; u32 delta; + u8 next = qp->s_tail_ack_queue; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -149,9 +150,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, * response has been sent instead of only being * constructed. */ - if (++qp->s_tail_ack_queue > - rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) - qp->s_tail_ack_queue = 0; + if (++next > rvt_size_atomic(&dev->rdi)) + next = 0; + /* + * Only advance the s_acked_ack_queue pointer if there + * have been no TID RDMA requests. + */ + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode != TID_OP(WRITE_REQ) && + qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = next; + qp->s_tail_ack_queue = next; /* FALLTHROUGH */ case OP(SEND_ONLY): case OP(ACKNOWLEDGE): @@ -172,6 +181,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -202,6 +215,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, */ len = e->rdma_sge.sge_length; if (len && !e->rdma_sge.mr) { + if (qp->s_acked_ack_queue == + qp->s_tail_ack_queue) + qp->s_acked_ack_queue = + qp->r_head_ack_queue; qp->s_tail_ack_queue = qp->r_head_ack_queue; goto bail; } @@ -2235,6 +2252,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, e->psn = psn; if (old_req) goto unlock_done; + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2248,6 +2267,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, */ if (!e || e->opcode != (u8)opcode || old_req) goto unlock_done; + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; break; } @@ -2274,6 +2295,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data, * Resend the RDMA read or atomic op which * ACKs this duplicate request. 
*/ + if (qp->s_tail_ack_queue == qp->s_acked_ack_queue) + qp->s_acked_ack_queue = mra; qp->s_tail_ack_queue = mra; break; } @@ -2646,7 +2669,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); @@ -2723,7 +2746,7 @@ send_last: if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; spin_lock_irqsave(&qp->s_lock, flags); - if (unlikely(next == qp->s_tail_ack_queue)) { + if (unlikely(next == qp->s_acked_ack_queue)) { if (!qp->s_ack_queue[next].sent) goto nack_inv_unlck; update_ack_queue(qp, next); diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h index 4329eadcb3df..8e0935b9bf2a 100644 --- a/drivers/infiniband/hw/hfi1/rc.h +++ b/drivers/infiniband/hw/hfi1/rc.h @@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n) if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device))) next = 0; qp->s_tail_ack_queue = next; + qp->s_acked_ack_queue = next; qp->s_ack_state = OP(ACKNOWLEDGE); } diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 089e301d9bcd..c320a99afb35 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -2044,6 +2044,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet, goto unlock; } /* Re-process old requests.*/ + if (qp->s_acked_ack_queue == qp->s_tail_ack_queue) + qp->s_acked_ack_queue = prev; qp->s_tail_ack_queue = prev; /* * Since the qp->s_tail_ack_queue is modified, the diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h index b71638c22d4b..51f5b0e8da71 100644 --- a/drivers/infiniband/hw/hfi1/trace_tid.h +++ b/drivers/infiniband/hw/hfi1/trace_tid.h @@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); #define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \ "r_head_ack_queue %u s_tail_ack_queue %u " \ - "s_ack_state 0x%x " \ + "s_acked_ack_queue %u s_ack_state 0x%x " \ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \ "iow_flags 0x%lx" @@ -62,7 +62,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent); "s_next_psn 0x%x" #define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \ - "s_tail_ack_queue %u " \ + "s_acked_ack_queue %u s_tail_ack_queue %u " \ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \ " diff %d" @@ -671,6 +671,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __field(u8, r_flags) __field(u8, r_head_ack_queue) __field(u8, s_tail_ack_queue) + __field(u8, s_acked_ack_queue) __field(u8, s_ack_state) __field(u8, s_nak_state) __field(u8, r_nak_state) @@ -691,6 +692,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags = qp->r_flags; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_ack_state = qp->s_ack_state; __entry->s_nak_state = qp->s_nak_state; __entry->s_flags = qp->s_flags; @@ -709,6 +711,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */ __entry->r_flags, __entry->r_head_ack_queue, __entry->s_tail_ack_queue, + __entry->s_acked_ack_queue, __entry->s_ack_state, __entry->s_nak_state, __entry->s_flags, @@ -1007,6 +1010,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __field(u32, qpn) __field(u32, s_flags) __field(u8, state) + __field(u8, s_acked_ack_queue) __field(u8, 
s_tail_ack_queue) __field(u8, r_head_ack_queue) __field(u32, opcode) @@ -1019,6 +1023,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn = qp->ibqp.qp_num; __entry->s_flags = qp->s_flags; __entry->state = qp->state; + __entry->s_acked_ack_queue = qp->s_acked_ack_queue; __entry->s_tail_ack_queue = qp->s_tail_ack_queue; __entry->r_head_ack_queue = qp->r_head_ack_queue; __entry->opcode = opcode; @@ -1032,6 +1037,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */ __entry->qpn, __entry->s_flags, __entry->state, + __entry->s_acked_ack_queue, __entry->s_tail_ack_queue, __entry->r_head_ack_queue, __entry->opcode, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2769ebdf89fb..14ec2577bcaa 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -854,6 +854,7 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_mig_state = IB_MIG_MIGRATED; qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; + qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; if (qp->r_rq.wq) { qp->r_rq.wq->head = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d8d88d023092..4ee612ab6cb4 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -375,6 +375,7 @@ struct rvt_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_acked_ack_queue; /* index into s_ack_queue[] */ struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; -- cgit v1.2.3 From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:39 -0800 Subject: IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs This patch integrates TID RDMA WRITE protocol into normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. 
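The new packet opcodes occupy the 0xe0 opcode space reserved for TID RDMA, so WRITE_REQ through WRITE_DATA_LAST come out as 0xe0 through 0xe3 and the send-complete path in the diff below can classify them with a simple range test using TID_OP(). A hedged sketch of that check (the helper name is invented for illustration; the opcode constants come from tid_rdma_defs.h above):

/* Sketch: true for TID RDMA WRITE-path packets (0xe0 .. 0xe3) */
static inline bool is_tid_rdma_write_pkt(u32 opcode)
{
	return opcode >= IB_OPCODE_TID_RDMA_WRITE_REQ &&	/* 0xe0 */
	       opcode <= IB_OPCODE_TID_RDMA_WRITE_DATA_LAST;	/* 0xe3 */
}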
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/qp.c | 7 + drivers/infiniband/hw/hfi1/rc.c | 487 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/tid_rdma.c | 14 + drivers/infiniband/hw/hfi1/user_sdma.c | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +- drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/rdmavt_qp.h | 1 + 7 files changed, 481 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 96632c77f36f..cfd598e4b303 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { .flags = RVT_OPERATION_USE_RESERVE, }, +[IB_WR_TID_RDMA_WRITE] = { + .length = sizeof(struct ib_rdma_wr), + .qpt_support = BIT(IB_QPT_RC), + .flags = RVT_OPERATION_IGN_RNR_CNT, +}, + }; static void flush_list_head(struct list_head *l) @@ -780,6 +786,7 @@ void quiesce_qp(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; hfi1_del_tid_reap_timer(qp); + hfi1_del_tid_retry_timer(qp); iowait_sdma_drain(&priv->s_iowait); qp_pio_drain(qp); flush_tx_list(qp); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fcb733ea8dfb..6d2abea896e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -111,16 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct rvt_ack_entry *e; - u32 hwords; + u32 hwords, hdrlen; u32 len = 0; u32 bth0 = 0, bth2 = 0; u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT); int middle = 0; u32 pmtu = qp->pmtu; - struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_qp_priv *qpriv = qp->priv; bool last_pkt; u32 delta; u8 next = qp->s_tail_ack_queue; + struct tid_rdma_request *req; trace_hfi1_rsp_make_rc_ack(qp, 0); lockdep_assert_held(&qp->s_lock); @@ -128,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) goto bail; - if (priv->hdr_type == HFI1_PKT_TYPE_9B) + if (qpriv->hdr_type == HFI1_PKT_TYPE_9B) /* header size in 32-bit words LRH+BTH = (8+12)/4. */ hwords = 5; else @@ -206,6 +207,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, hwords++; qp->s_ack_rdma_psn = e->psn; bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else if (e->opcode == TID_OP(WRITE_REQ)) { + /* + * If a TID RDMA WRITE RESP is being resent, we have to + * wait for the actual request. All requests that are to + * be resent will have their state set to + * TID_REQUEST_RESEND. When the new request arrives, the + * state will be changed to TID_REQUEST_RESEND_ACTIVE. + */ + req = ack_to_tid_req(e); + if (req->state == TID_REQUEST_RESEND || + req->state == TID_REQUEST_INIT_RESEND) + goto bail; + qp->s_ack_state = TID_OP(WRITE_RESP); + qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg); + goto write_resp; } else if (e->opcode == TID_OP(READ_REQ)) { /* * If a TID RDMA read response is being resent and @@ -267,6 +283,59 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, bth2 = mask_psn(qp->s_ack_rdma_psn++); break; + case TID_OP(WRITE_RESP): +write_resp: + /* + * 1. Check if RVT_S_ACK_PENDING is set. If yes, + * goto normal. + * 2. Attempt to allocate TID resources. + * 3. Remove RVT_S_RESP_PENDING flags from s_flags + * 4. 
If resources not available: + * 4.1 Set RVT_S_WAIT_TID_SPACE + * 4.2 Queue QP on RCD TID queue + * 4.3 Put QP on iowait list. + * 4.4 Build IB RNR NAK with appropriate timeout value + * 4.5 Return indication progress made. + * 5. If resources are available: + * 5.1 Program HW flow CSRs + * 5.2 Build TID RDMA WRITE RESP packet + * 5.3 If more resources needed, do 2.1 - 2.3. + * 5.4 Wake up next QP on RCD TID queue. + * 5.5 Return indication progress made. + */ + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + req = ack_to_tid_req(e); + + /* + * Send scheduled RNR NAK's. RNR NAK's need to be sent at + * segment boundaries, not at request boundaries. Don't change + * s_ack_state because we are still in the middle of a request + */ + if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND && + qp->s_tail_ack_queue == qpriv->r_tid_alloc && + req->cur_seg == req->alloc_seg) { + qpriv->rnr_nak_state = TID_RNR_NAK_SENT; + goto normal_no_state; + } + + bth2 = mask_psn(qp->s_ack_rdma_psn); + hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1, + bth2, &len, + &ps->s_txreq->ss); + if (!hdrlen) + return 0; + + hwords += hdrlen; + bth0 = qp->s_ack_state << 24; + qp->s_ack_rdma_psn++; + if (req->cur_seg != req->total_segs) + break; + + e->sent = 1; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + break; + case TID_OP(READ_RESP): read_resp: e = &qp->s_ack_queue[qp->s_tail_ack_queue]; @@ -298,8 +367,7 @@ normal: * (see above). */ qp->s_ack_state = OP(SEND_ONLY); - qp->s_flags &= ~RVT_S_ACK_PENDING; - ps->s_txreq->ss = NULL; +normal_no_state: if (qp->s_nak_state) ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) | @@ -311,9 +379,11 @@ normal: len = 0; bth0 = OP(ACKNOWLEDGE) << 24; bth2 = mask_psn(qp->s_ack_psn); + qp->s_flags &= ~RVT_S_ACK_PENDING; + ps->s_txreq->ss = NULL; } qp->s_rdma_ack_cnt++; - ps->s_txreq->sde = priv->s_sde; + ps->s_txreq->sde = qpriv->s_sde; ps->s_txreq->s_cur_size = len; ps->s_txreq->hdr_dwords = hwords; hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps); @@ -366,6 +436,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) int middle = 0; int delta; struct tid_rdma_flow *flow = NULL; + struct tid_rdma_params *remote; trace_hfi1_sender_make_rc_req(qp); lockdep_assert_held(&qp->s_lock); @@ -414,7 +485,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto done_free_tx; } - if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT)) goto bail; if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { @@ -586,6 +657,108 @@ no_flow_control: qp->s_cur = 0; break; + case IB_WR_TID_RDMA_WRITE: + if (newreq) { + /* + * Limit the number of TID RDMA WRITE requests. + */ + if (atomic_read(&priv->n_tid_requests) >= + HFI1_TID_RDMA_WRITE_CNT) + goto bail; + + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, + &bth1, &bth2, + &len); + ss = NULL; + if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) { + priv->s_tid_cur = qp->s_cur; + if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = TID_OP(WRITE_RESP); + } + } else if (priv->s_tid_cur == priv->s_tid_head) { + struct rvt_swqe *__w; + struct tid_rdma_request *__r; + + __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur); + __r = wqe_to_tid_req(__w); + + /* + * The s_tid_cur pointer is advanced to s_cur if + * any of the following conditions about the WQE + * to which s_ti_cur currently points to are + * satisfied: + * 1. 
The request is not a TID RDMA WRITE + * request, + * 2. The request is in the INACTIVE or + * COMPLETE states (TID RDMA READ requests + * stay at INACTIVE and TID RDMA WRITE + * transition to COMPLETE when done), + * 3. The request is in the ACTIVE or SYNC + * state and the number of completed + * segments is equal to the total segment + * count. + * (If ACTIVE, the request is waiting for + * ACKs. If SYNC, the request has not + * received any responses because it's + * waiting on a sync point.) + */ + if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE || + __r->state == TID_REQUEST_INACTIVE || + __r->state == TID_REQUEST_COMPLETE || + ((__r->state == TID_REQUEST_ACTIVE || + __r->state == TID_REQUEST_SYNC) && + __r->comp_seg == __r->total_segs)) { + if (priv->s_tid_tail == + priv->s_tid_cur && + priv->s_state == + TID_OP(WRITE_DATA_LAST)) { + priv->s_tid_tail = qp->s_cur; + priv->s_state = + TID_OP(WRITE_RESP); + } + priv->s_tid_cur = qp->s_cur; + } + /* + * A corner case: when the last TID RDMA WRITE + * request was completed, s_tid_head, + * s_tid_cur, and s_tid_tail all point to the + * same location. Other requests are posted and + * s_cur wraps around to the same location, + * where a new TID RDMA WRITE is posted. In + * this case, none of the indices need to be + * updated. However, the priv->s_state should. + */ + if (priv->s_tid_tail == qp->s_cur && + priv->s_state == TID_OP(WRITE_DATA_LAST)) + priv->s_state = TID_OP(WRITE_RESP); + } + req = wqe_to_tid_req(wqe); + if (newreq) { + priv->s_tid_head = qp->s_cur; + priv->pending_tid_w_resp += req->total_segs; + atomic_inc(&priv->n_tid_requests); + atomic_dec(&priv->n_requests); + } else { + req->state = TID_REQUEST_RESEND; + req->comp_seg = delta_psn(bth2, wqe->psn); + /* + * Pull back any segments since we are going + * to re-receive them. + */ + req->setup_head = req->clear_tail; + priv->pending_tid_w_resp += + delta_psn(wqe->lpsn, bth2) + 1; + } + + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case IB_WR_RDMA_READ: /* * Don't allow more operations to be started @@ -745,7 +918,8 @@ no_flow_control: if (qp->s_tail >= qp->s_size) qp->s_tail = 0; } - if (wqe->wr.opcode == IB_WR_RDMA_READ) + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_psn = wqe->lpsn + 1; else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) qp->s_psn = req->s_next_psn; @@ -865,6 +1039,33 @@ no_flow_control: if (qp->s_cur == qp->s_size) qp->s_cur = 0; break; + + case TID_OP(WRITE_RESP): + /* + * This value for s_state is used for restarting a TID RDMA + * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE + * for more). 
+ */ + req = wqe_to_tid_req(wqe); + req->state = TID_REQUEST_RESEND; + rcu_read_lock(); + remote = rcu_dereference(priv->tid_rdma.remote); + req->comp_seg = delta_psn(qp->s_psn, wqe->psn); + len = wqe->length - (req->comp_seg * remote->max_len); + rcu_read_unlock(); + + bth2 = mask_psn(qp->s_psn); + hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1, + &bth2, &len); + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + qp->s_state = TID_OP(WRITE_REQ); + priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1; + priv->s_tid_cur = qp->s_cur; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + case TID_OP(READ_RESP): if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) goto bail; @@ -965,7 +1166,8 @@ no_flow_control: } qp->s_sending_hpsn = bth2; delta = delta_psn(bth2, wqe->psn); - if (delta && delta % HFI1_PSN_CREDIT == 0) + if (delta && delta % HFI1_PSN_CREDIT == 0 && + wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) bth2 |= IB_BTH_REQ_ACK; if (qp->s_flags & RVT_S_SEND_ONE) { qp->s_flags &= ~RVT_S_SEND_ONE; @@ -998,6 +1200,12 @@ bail: bail_no_tx: ps->s_txreq = NULL; qp->s_flags &= ~RVT_S_BUSY; + /* + * If we didn't get a txreq, the QP will be woken up later to try + * again. Set the flags to indicate which work item to wake + * up. + */ + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); return 0; } @@ -1285,6 +1493,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) lockdep_assert_held(&qp->s_lock); qp->s_cur = n; priv->pending_tid_r_segs = 0; + priv->pending_tid_w_resp = 0; qp->s_num_rd_atomic = 0; /* @@ -1342,6 +1551,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn) qp->s_state = OP(RDMA_READ_RESPONSE_LAST); break; + case IB_WR_TID_RDMA_WRITE: + qp->s_state = TID_OP(WRITE_RESP); + break; + case IB_WR_RDMA_READ: qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); break; @@ -1435,7 +1648,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | - RVT_S_WAIT_ACK); + RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP); if (wait) qp->s_flags |= RVT_S_SEND_ONE; reset_psn(qp, psn); @@ -1443,7 +1656,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) /* * Set qp->s_sending_psn to the next PSN after the given one. - * This would be psn+1 except when RDMA reads are present. + * This would be psn+1 except when RDMA reads or TID RDMA ops + * are present. 
*/ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) { @@ -1456,7 +1670,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn) wqe = rvt_get_swqe_ptr(qp, n); if (cmp_psn(psn, wqe->lpsn) <= 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || - wqe->wr.opcode == IB_WR_TID_RDMA_READ) + wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) qp->s_sending_psn = wqe->lpsn + 1; else qp->s_sending_psn = psn + 1; @@ -1479,8 +1694,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) struct rvt_swqe *wqe; struct ib_header *hdr = NULL; struct hfi1_16b_header *hdr_16b = NULL; - u32 opcode; + u32 opcode, head, tail; u32 psn; + struct tid_rdma_request *req; lockdep_assert_held(&qp->s_lock); if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK)) @@ -1507,29 +1723,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) opcode = ib_bth_get_opcode(ohdr); if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) && opcode <= OP(ATOMIC_ACKNOWLEDGE)) || - opcode == TID_OP(READ_RESP)) { + opcode == TID_OP(READ_RESP) || + opcode == TID_OP(WRITE_RESP)) { WARN_ON(!qp->s_rdma_ack_cnt); qp->s_rdma_ack_cnt--; return; } psn = ib_bth_get_psn(ohdr); - reset_sending_psn(qp, psn); + /* + * Don't attempt to reset the sending PSN for packets in the + * KDETH PSN space since the PSN does not match anything. + */ + if (opcode != TID_OP(WRITE_DATA) && + opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC)) + reset_sending_psn(qp, psn); + + /* Handle TID RDMA WRITE packets differently */ + if (opcode >= TID_OP(WRITE_REQ) && + opcode <= TID_OP(WRITE_DATA_LAST)) { + head = priv->s_tid_head; + tail = priv->s_tid_cur; + /* + * s_tid_cur is set to s_tid_head in the case, where + * a new TID RDMA request is being started and all + * previous ones have been completed. + * Therefore, we need to do a secondary check in order + * to properly determine whether we should start the + * RC timer. + */ + wqe = rvt_get_swqe_ptr(qp, tail); + req = wqe_to_tid_req(wqe); + if (head == tail && req->comp_seg < req->total_segs) { + if (tail == 0) + tail = qp->s_size - 1; + else + tail -= 1; + } + } else { + head = qp->s_tail; + tail = qp->s_acked; + } /* * Start timer after a packet requesting an ACK has been sent and * there are still requests that haven't been acked. */ - if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + if ((psn & IB_BTH_REQ_ACK) && tail != head && + opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) && + opcode != TID_OP(RESYNC) && !(qp->s_flags & - (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && - (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { if (opcode == TID_OP(READ_REQ)) rvt_add_retry_timer_ext(qp, priv->timeout_shift); else rvt_add_retry_timer(qp); } + /* Start TID RDMA ACK timer */ + if ((opcode == TID_OP(WRITE_DATA) || + opcode == TID_OP(WRITE_DATA_LAST) || + opcode == TID_OP(RESYNC)) && + (psn & IB_BTH_REQ_ACK) && + !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + /* + * The TID RDMA ACK packet could be received before this + * function is called. Therefore, add the timer only if TID + * RDMA ACK packets are actually pending. 
+ */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + req = wqe_to_tid_req(wqe); + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + req->ack_seg < req->cur_seg) + hfi1_add_tid_retry_timer(qp); + } + while (qp->s_last != qp->s_acked) { u32 s_last; @@ -1628,7 +1899,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, } qp->s_retry = qp->s_retry_cnt; - update_last_psn(qp, wqe->lpsn); + /* + * Don't update the last PSN if the request being completed is + * a TID RDMA WRITE request. + * Completion of the TID RDMA WRITE requests are done by the + * TID RDMA ACKs and as such could be for a request that has + * already been ACKed as far as the IB state machine is + * concerned. + */ + if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE) + update_last_psn(qp, wqe->lpsn); /* * If we are completing a request which is in the process of @@ -1658,6 +1938,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, return wqe; } +static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd) +{ + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + rvt_get_qp(qp); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } +} + +/** + * update_qp_retry_state - Update qp retry state. + * @qp: the QP + * @psn: the packet sequence number of the TID RDMA WRITE RESP. + * @spsn: The start psn for the given TID RDMA WRITE swqe. + * @lpsn: The last psn for the given TID RDMA WRITE swqe. + * + * This function is called to update the qp retry state upon + * receiving a TID WRITE RESP after the qp is scheduled to retry + * a request. + */ +static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn, + u32 lpsn) +{ + struct hfi1_qp_priv *qpriv = qp->priv; + + qp->s_psn = psn + 1; + /* + * If this is the first TID RDMA WRITE RESP packet for the current + * request, change the s_state so that the retry will be processed + * correctly. Similarly, if this is the last TID RDMA WRITE RESP + * packet, change the s_state and advance the s_cur. + */ + if (cmp_psn(psn, lpsn) >= 0) { + qp->s_cur = qpriv->s_tid_cur + 1; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_state = TID_OP(WRITE_REQ); + } else if (!cmp_psn(psn, spsn)) { + qp->s_cur = qpriv->s_tid_cur; + qp->s_state = TID_OP(WRITE_RESP); + } +} + /** * do_rc_ack - process an incoming RC ACK * @qp: the QP the ACK came in on @@ -1679,6 +2007,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, int ret = 0; u32 ack_psn; int diff; + struct rvt_dev_info *rdi; lockdep_assert_held(&qp->s_lock); /* @@ -1725,18 +2054,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, (opcode != TID_OP(READ_RESP) || diff != 0)) || ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && - (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { - /* Retry this request. */ - if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { - qp->r_flags |= RVT_R_RDMAR_SEQ; - hfi1_restart_rc(qp, qp->s_last_psn + 1, 0); - if (list_empty(&qp->rspwait)) { - qp->r_flags |= RVT_R_RSP_SEND; - rvt_get_qp(qp); - list_add_tail(&qp->rspwait, - &rcd->qp_wait_list); - } - } + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) || + (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + (delta_psn(psn, qp->s_last_psn) != 1))) { + set_restart_qp(qp, rcd); /* * No need to process the ACK/NAK since we are * restarting an earlier request. 
@@ -1768,6 +2089,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, hfi1_schedule_send(qp); } } + + /* + * TID RDMA WRITE requests will be completed by the TID RDMA + * ACK packet handler (see tid_rdma.c). + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) + break; + wqe = do_rc_completion(qp, wqe, ibp); if (qp->s_acked == qp->s_tail) break; @@ -1785,17 +2114,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, else rvt_stop_rc_timers(qp); } else if (qp->s_acked != qp->s_tail) { + struct rvt_swqe *__w = NULL; + + if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID) + __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur); + /* - * We are expecting more ACKs so - * mod the retry timer. - */ - rvt_mod_retry_timer(qp); - /* - * We can stop re-sending the earlier packets and - * continue with the next packet the receiver wants. + * Stop timers if we've received all of the TID RDMA + * WRITE * responses. */ - if (cmp_psn(qp->s_psn, psn) <= 0) - reset_psn(qp, psn + 1); + if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode == TID_OP(WRITE_RESP)) { + /* + * Normally, the loop above would correctly + * process all WQEs from s_acked onward and + * either complete them or check for correct + * PSN sequencing. + * However, for TID RDMA, due to pipelining, + * the response may not be for the request at + * s_acked so the above look would just be + * skipped. This does not allow for checking + * the PSN sequencing. It has to be done + * separately. + */ + if (cmp_psn(psn, qp->s_last_psn + 1)) { + set_restart_qp(qp, rcd); + goto bail_stop; + } + /* + * If the psn is being resent, stop the + * resending. + */ + if (qp->s_cur != qp->s_tail && + cmp_psn(qp->s_psn, psn) <= 0) + update_qp_retry_state(qp, psn, + __w->psn, + __w->lpsn); + else if (--qpriv->pending_tid_w_resp) + rvt_mod_retry_timer(qp); + else + rvt_stop_rc_timers(qp); + } else { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + rvt_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets + * and continue with the next packet the + * receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } } else { /* No more acks - kill all timers */ rvt_stop_rc_timers(qp); @@ -1811,6 +2183,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, rvt_get_credit(qp, aeth); qp->s_rnr_retry = qp->s_rnr_retry_cnt; qp->s_retry = qp->s_retry_cnt; + /* + * If the current request is a TID RDMA WRITE request and the + * response is not a TID RDMA WRITE RESP packet, s_last_psn + * can't be advanced. + */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE && + opcode != TID_OP(WRITE_RESP) && + cmp_psn(psn, wqe->psn) >= 0) + return 1; update_last_psn(qp, psn); return 1; @@ -1820,20 +2201,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, goto bail_stop; if (qp->s_flags & RVT_S_WAIT_RNR) goto bail_stop; - if (qp->s_rnr_retry == 0) { + rdi = ib_to_rvt(qp->ibqp.device); + if (qp->s_rnr_retry == 0 && + !((rdi->post_parms[wqe->wr.opcode].flags & + RVT_OPERATION_IGN_RNR_CNT) && + qp->s_rnr_retry_cnt == 0)) { status = IB_WC_RNR_RETRY_EXC_ERR; goto class_b; } - if (qp->s_rnr_retry_cnt < 7) + if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0) qp->s_rnr_retry--; - /* The last valid PSN is the previous PSN. */ - update_last_psn(qp, psn - 1); + /* + * The last valid PSN is the previous PSN. For TID RDMA WRITE + * request, s_last_psn should be incremented only when a TID + * RDMA WRITE RESP is received to avoid skipping lost TID RDMA + * WRITE RESP packets. 
+ */ + if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) { + reset_psn(qp, qp->s_last_psn + 1); + } else { + update_last_psn(qp, psn - 1); + reset_psn(qp, psn); + } ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); - - reset_psn(qp, psn); - qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); rvt_stop_rc_timers(qp); rvt_add_rnr_timer(qp, aeth); @@ -1918,6 +2310,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, while (cmp_psn(psn, wqe->lpsn) > 0) { if (wqe->wr.opcode == IB_WR_RDMA_READ || wqe->wr.opcode == IB_WR_TID_RDMA_READ || + wqe->wr.opcode == IB_WR_TID_RDMA_WRITE || wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) break; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 80111dd1d876..490e47a0f68b 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -3205,6 +3205,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp) do { struct hfi1_swqe_priv *priv = wqe->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); + } while (!ret); + } + for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device))) + i = 0; + /* Free only locally allocated TID entries */ + if (e->opcode != TID_OP(WRITE_REQ)) + continue; + do { + struct hfi1_ack_priv *priv = e->priv; + ret = hfi1_kern_exp_rcv_clear(&priv->tid_req); } while (!ret); } diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index e5e7fad09f32..6764114b886c 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1126,7 +1126,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 0xffffffull), psn = val & mask; if (expct) - psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | + ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); else psn = psn + frags; return psn & mask; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 7b87b77582bd..ab97d71cdd92 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the */ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, [IB_WR_SEND] = IB_WC_SEND, [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, @@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36, [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36, + [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, @@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = { [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv, /* TID RDMA has separate handlers for different opcodes.*/ + [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req, + [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp, + 
[IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data, + [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data, [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req, [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp, + [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync, + [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack, /* UC */ [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, @@ -1332,7 +1345,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_mr_size = U64_MAX; rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; - rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_qp_wr = + (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ? + HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs); rdi->dparms.props.max_send_sge = hfi1_max_sges; rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bee3d21a548e..62ace0b2d17a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -193,6 +193,7 @@ struct hfi1_qp_priv { u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */ u32 r_tid_alloc; /* Request for which we are allocating resources */ u32 pending_tid_w_segs; /* Num of pending tid write segments */ + u32 pending_tid_w_resp; /* Num of pending tid write responses */ u32 alloc_w_segs; /* Number of segments for which write */ /* resources have been allocated for this QP */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 4ee612ab6cb4..f0fbd4063fef 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) -- cgit v1.2.3 From 70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 09:59:15 -0800 Subject: mm: make mm->pinned_vm an atomic64 counter Taking a sleeping lock to _only_ increment a variable is quite the overkill, and pretty much all users do this. Furthermore, some drivers (i.e., infiniband and scif) that need pinned semantics can go to quite some trouble to actually delay the (un)accounting for pinned pages via workqueue when it is not possible to acquire the lock. By making the counter atomic we no longer need to hold the mmap_sem and can simplify some code around it for pinned_vm users. The counter is 64-bit so that we need not worry about overflows from rdma user input controlled from userspace.
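With the counter atomic, the check-then-account sequence no longer needs mmap_sem at all. A sketch of the lockless pattern the conversion enables, assuming the new atomic64 field (illustrative only and not part of this patch; the drivers converted in the diff below still take mmap_sem during this first step):

/*
 * Sketch: lockless pinned-page accounting against RLIMIT_MEMLOCK.
 * The add is speculative and is rolled back on failure.
 */
static int try_account_pinned(struct mm_struct *mm, long npages,
			      unsigned long lock_limit)
{
	s64 new_pinned = atomic64_add_return(npages, &mm->pinned_vm);

	if (new_pinned > (s64)lock_limit && !capable(CAP_IPC_LOCK)) {
		atomic64_sub(npages, &mm->pinned_vm);
		return -ENOMEM;
	}
	return 0;
}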
Reviewed-by: Ira Weiny Reviewed-by: Christoph Lameter Reviewed-by: Daniel Jordan Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 12 ++++++------ drivers/infiniband/hw/hfi1/user_pages.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 4 ++-- drivers/infiniband/hw/usnic/usnic_uiom.c | 8 ++++---- drivers/misc/mic/scif/scif_rma.c | 6 +++--- fs/proc/task_mmu.c | 2 +- include/linux/mm_types.h | 2 +- kernel/events/core.c | 8 ++++---- kernel/fork.c | 2 +- mm/debug.c | 5 +++-- 10 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 1efe0a74e06b..678abe1afcba 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -166,13 +166,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; down_write(&mm->mmap_sem); - if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || - (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { + new_pinned = atomic64_read(&mm->pinned_vm) + npages; + if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) { up_write(&mm->mmap_sem); ret = -ENOMEM; goto out; } - mm->pinned_vm = new_pinned; + atomic64_set(&mm->pinned_vm, new_pinned); up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -234,7 +234,7 @@ umem_release: __ib_umem_release(context->device, umem, 0); vma: down_write(&mm->mmap_sem); - mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); up_write(&mm->mmap_sem); out: if (vma_list) @@ -263,7 +263,7 @@ static void ib_umem_release_defer(struct work_struct *work) struct ib_umem *umem = container_of(work, struct ib_umem, work); down_write(&umem->owning_mm->mmap_sem); - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); @@ -302,7 +302,7 @@ void ib_umem_release(struct ib_umem *umem) } else { down_write(&umem->owning_mm->mmap_sem); } - umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); up_write(&umem->owning_mm->mmap_sem); __ib_umem_release_tail(umem); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index e341e6dcc388..40a6e434190f 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -92,7 +92,7 @@ bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm, size = DIV_ROUND_UP(size, PAGE_SIZE); down_read(&mm->mmap_sem); - pinned = mm->pinned_vm; + pinned = atomic64_read(&mm->pinned_vm); up_read(&mm->mmap_sem); /* First, check the absolute limit against all pinned pages. 
*/ @@ -112,7 +112,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np return ret; down_write(&mm->mmap_sem); - mm->pinned_vm += ret; + atomic64_add(ret, &mm->pinned_vm); up_write(&mm->mmap_sem); return ret; @@ -131,7 +131,7 @@ void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, if (mm) { /* during close after signal, mm can be NULL */ down_write(&mm->mmap_sem); - mm->pinned_vm -= npages; + atomic64_sub(npages, &mm->pinned_vm); up_write(&mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 075f09fb7ce3..c6c81022d313 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -75,7 +75,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, goto bail_release; } - current->mm->pinned_vm += num_pages; + atomic64_add(num_pages, ¤t->mm->pinned_vm); ret = 0; goto bail; @@ -156,7 +156,7 @@ void qib_release_user_pages(struct page **p, size_t num_pages) __qib_release_user_pages(p, num_pages, 1); if (current->mm) { - current->mm->pinned_vm -= num_pages; + atomic64_sub(num_pages, ¤t->mm->pinned_vm); up_write(¤t->mm->mmap_sem); } } diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index ce01a59fccc4..854436a2b437 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -129,7 +129,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, uiomr->owning_mm = mm = current->mm; down_write(&mm->mmap_sem); - locked = npages + current->mm->pinned_vm; + locked = npages + atomic64_read(¤t->mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -187,7 +187,7 @@ out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); else { - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); mmgrab(uiomr->owning_mm); } @@ -441,7 +441,7 @@ static void usnic_uiom_release_defer(struct work_struct *work) container_of(work, struct usnic_uiom_reg, work); down_write(&uiomr->owning_mm->mmap_sem); - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); @@ -469,7 +469,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, } else { down_write(&uiomr->owning_mm->mmap_sem); } - uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + atomic64_sub(usnic_uiom_num_pages(uiomr), &uiomr->owning_mm->pinned_vm); up_write(&uiomr->owning_mm->mmap_sem); __usnic_uiom_release_tail(uiomr); diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c index 749321eb91ae..2448368f181e 100644 --- a/drivers/misc/mic/scif/scif_rma.c +++ b/drivers/misc/mic/scif/scif_rma.c @@ -285,7 +285,7 @@ __scif_dec_pinned_vm_lock(struct mm_struct *mm, } else { down_write(&mm->mmap_sem); } - mm->pinned_vm -= nr_pages; + atomic64_sub(nr_pages, &mm->pinned_vm); up_write(&mm->mmap_sem); return 0; } @@ -299,7 +299,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, return 0; locked = nr_pages; - locked += mm->pinned_vm; + locked += atomic64_read(&mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { dev_err(scif_info.mdev.this_device, @@ -307,7 +307,7 @@ static inline int __scif_check_inc_pinned_vm(struct mm_struct 
*mm, locked, lock_limit); return -ENOMEM; } - mm->pinned_vm = locked; + atomic64_set(&mm->pinned_vm, locked); return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..d2902962244d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -59,7 +59,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) SEQ_PUT_DEC("VmPeak:\t", hiwater_vm); SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm); SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm); - SEQ_PUT_DEC(" kB\nVmPin:\t", mm->pinned_vm); + SEQ_PUT_DEC(" kB\nVmPin:\t", atomic64_read(&mm->pinned_vm)); SEQ_PUT_DEC(" kB\nVmHWM:\t", hiwater_rss); SEQ_PUT_DEC(" kB\nVmRSS:\t", total_rss); SEQ_PUT_DEC(" kB\nRssAnon:\t", anon); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2c471a2c43fa..acea2ea2d6c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long pinned_vm; /* Refcount permanently increased */ + atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..29e9f2473656 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5459,7 +5459,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); @@ -5532,7 +5532,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5680,7 +5680,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5721,7 +5721,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..85e08c379a9e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -981,7 +981,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); diff --git a/mm/debug.c b/mm/debug.c index 0abb987dad9b..7d13941a72f9 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -135,7 +135,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack 
%lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -166,7 +166,8 @@ void dump_mm(const struct mm_struct *mm) mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, + atomic64_read(&mm->pinned_vm), + mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, -- cgit v1.2.3 From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 7 Feb 2019 01:31:27 -0500 Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context User space verbs provider library would need chip context. Changing the ABI to add chip version details in structure. Furthermore, changing the kernel driver ucontext allocation code to initialize the abi structure with appropriate values. As suggested by community, appended the new fields at the bottom of the ABI structure and retaining to older fields as those were in the older versions. Keeping the ABI version at 1 and adding a new field in the ucontext response structure to hold the component mask. The user space library should check pre-defined flags to figure out if a certain feature is supported on not. Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 17 ++++++++++++++--- include/uapi/rdma/bnxt_re-abi.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 08c1725f371a..1d7469e23cde 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3692,9 +3692,10 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_re_uctx_resp resp; struct bnxt_re_ucontext *uctx; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + u32 chip_met_rev_num = 0; int rc; dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", @@ -3719,14 +3720,24 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, } spin_lock_init(&uctx->sh_lock); - resp.dev_id = rdev->en_dev->pdev->devfn; /*Temp, Use idr_alloc instead*/ + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_CCTX; + chip_met_rev_num = rdev->chip_ctx.chip_num; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_REV_SFT; + chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) << + BNXT_RE_CHIP_ID0_CHIP_MET_SFT; + resp.chip_id0 = chip_met_rev_num; + /* Future extension of chip info */ + resp.chip_id1 = 0; + /*Temp, Use idr_alloc instead */ + resp.dev_id = rdev->en_dev->pdev->devfn; resp.max_qp = rdev->qplib_ctx.qpc_count; resp.pg_size = PAGE_SIZE; resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; resp.rsvd = 0; - rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to copy user context"); rc = -EFAULT; diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index a7a6111e50c7..dc52e3cf574c 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -44,6 +44,14 @@ #define BNXT_RE_ABI_VERSION 1 +#define 
BNXT_RE_CHIP_ID0_CHIP_NUM_SFT 0x00 +#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT 0x10 +#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT 0x18 + +enum { + BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL +}; + struct bnxt_re_uctx_resp { __u32 dev_id; __u32 max_qp; @@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp { __u32 cqe_sz; __u32 max_cqd; __u32 rsvd; + __aligned_u64 comp_mask; + __u32 chip_id0; + __u32 chip_id1; }; /* -- cgit v1.2.3 From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Thu, 24 Jan 2019 14:18:15 +0200 Subject: IB/cma: Define option to set ack timeout and pack tos_set Define a new option in 'rdma_set_option' to override the calculated QP timeout when QP attributes are requested to modify a QP. At the same time, pack tos_set into a bitfield. Signed-off-by: Danit Goldberg Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 32 ++++++++++++++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 4 +++- drivers/infiniband/core/ucma.c | 7 +++++++ include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 4 ++++ 5 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e15546ae4d0f..83aa2ad0c27e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -888,6 +888,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->timeout_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); @@ -1130,6 +1131,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = -ENOSYS; + if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) + qp_attr->timeout = id_priv->timeout; + return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); @@ -2490,6 +2494,34 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +/** + * rdma_set_ack_timeout() - Set the ack timeout of QP associated + * with a connection identifier. + * @id: Communication identifier associated with the QP. + * @timeout: Ack timeout to set on the QP, expressed as 4.096 * 2^(timeout) usec. + * + * This function should be called before rdma_connect() on the active side, + * and before rdma_accept() on the passive side. It is applicable to the + * primary path only. The timeout affects only the local side of the QP; it is + * not negotiated with the remote side, and zero disables the timer.
+ * + * Return: 0 for success + */ +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) +{ + struct rdma_id_private *id_priv; + + if (id->qp_type != IB_QPT_RC) + return -EINVAL; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->timeout = timeout; + id_priv->timeout_set = true; + + return 0; +} +EXPORT_SYMBOL(rdma_set_ack_timeout); + static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index cf47c69436a7..ca7307277518 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -84,9 +84,11 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; - bool tos_set; + u8 tos_set:1; + u8 timeout_set:1; u8 reuseaddr; u8 afonly; + u8 timeout; enum ib_gid_type gid_type; /* diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 01d68ed46c1b..7468b26b8a01 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1236,6 +1236,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; + case RDMA_OPTION_ID_ACK_TIMEOUT: + if (optlen != sizeof(u8)) { + ret = -EINVAL; + break; + } + ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); + break; default: ret = -ENOSYS; } diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 60987a5903b7..71f48cfdc24c 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout); /** * rdma_get_service_id - Return the IB service ID for a specified address. * @id: Communication identifier associated with the address. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 0d1e78ebad05..e42940a215a3 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -300,6 +300,10 @@ enum { RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { RDMA_OPTION_IB_PATH = 1 }; -- cgit v1.2.3 From 926ba19b3574f6a80823a42484877ed65e91da9c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:32 -0800 Subject: RDMA/iwcm: add tos_set bool to iw_cm struct This allows drivers to know the tos was actively set by the application. 
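A minimal sketch of how a driver might consume the new flag (illustration only, not part of this patch; example_pick_tos and FOO_DEFAULT_TOS are invented names): fall back to a driver default only when the application did not actively set a TOS.

	/* Sketch: prefer the application-chosen TOS when tos_set is true. */
	#include <rdma/iw_cm.h>

	#define FOO_DEFAULT_TOS	0	/* assumed driver policy */

	static u8 example_pick_tos(const struct iw_cm_id *cm_id)
	{
		if (cm_id->tos_set)
			return cm_id->tos;	/* application set it explicitly */
		return FOO_DEFAULT_TOS;
	}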
Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 ++ include/rdma/iw_cm.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e761ddd09aed..c43512752b8a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2414,6 +2414,7 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) return PTR_ERR(id); id->tos = id_priv->tos; + id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), @@ -3843,6 +3844,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, return PTR_ERR(cm_id); cm_id->tos = id_priv->tos; + cm_id->tos_set = id_priv->tos_set; id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 48512abd3162..0e1f02815643 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -94,7 +94,8 @@ struct iw_cm_id { void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); u8 tos; - bool mapped; + bool tos_set:1; + bool mapped:1; }; struct iw_cm_conn_param { -- cgit v1.2.3 From 805b754d492f6227e1646001bdf85ad4bb819e55 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:44 +0200 Subject: IB/core: Eliminate a hole in MAD agent struct Move the security-related fields above the u8s to eliminate a hole in the struct. pahole before: struct ib_mad_agent { ... u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ u8 port_num; /* 56 1 */ u8 rmpp_version; /* 57 1 */ /* XXX 6 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ void * security; /* 64 8 */ bool smp_allowed; /* 72 1 */ bool lsm_nb_reg; /* 73 1 */ /* XXX 6 bytes hole, try to pack */ struct notifier_block lsm_nb; /* 80 24 */ /* XXX last struct has 4 bytes of padding */ /* size: 104, cachelines: 2, members: 14 */ ... }; pahole after: struct ib_mad_agent { ... u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ void * security; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct notifier_block lsm_nb; /* 64 24 */ /* XXX last struct has 4 bytes of padding */ u8 port_num; /* 88 1 */ u8 rmpp_version; /* 89 1 */ bool smp_allowed; /* 90 1 */ bool lsm_nb_reg; /* 91 1 */ /* size: 96, cachelines: 2, members: 14 */ ... }; Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_mad.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index fdef558e3a2d..1c0b914f199d 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -616,12 +616,12 @@ struct ib_mad_agent { void *context; u32 hi_tid; u32 flags; + void *security; + struct notifier_block lsm_nb; u8 port_num; u8 rmpp_version; - void *security; bool smp_allowed; bool lsm_nb_reg; - struct notifier_block lsm_nb; }; /** -- cgit v1.2.3 From c66f67414c1f88554485bb2a0abf8b5c0d741de7 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:45 +0200 Subject: IB/core: Don't register each MAD agent for LSM notifier When creating many MAD agents in a short period of time, receive packet processing can be delayed long enough to cause timeouts while new agents are being added to the atomic notifier chain with IRQs disabled. Notifier chain registration and unregistration are O(n) operations.
With large numbers of MAD agents being created and destroyed simultaneously, the CPUs spend too much time with interrupts disabled. Instead of each MAD agent registering for its own LSM notification, maintain a list of agents internally and register once; this registration already existed for handling the PKeys. This list is write-mostly, so a normal spin lock is used rather than a read/write lock. All MAD agents must be checked, so a single list is used instead of breaking them down per device. Notifier calls are done under rcu_read_lock, so there isn't a risk of similar packet timeouts while checking the MAD agents' security settings when notified. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Paul Moore Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 5 ++++ drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/security.c | 50 +++++++++++++++++++++---------------- include/rdma/ib_mad.h | 3 +-- 4 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index bcb3e3029a9b..d053110207eb 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -202,6 +202,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +void ib_mad_agent_security_change(void); #else static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) { @@ -267,6 +268,10 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, { return 0; } + +static inline void ib_mad_agent_security_change(void) +{ +} #endif struct ib_device *ib_device_get_by_index(u32 ifindex); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 55221990d946..32cd35c9b21e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -452,6 +452,7 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_DONE; schedule_work(&ib_policy_change_work); + ib_mad_agent_security_change(); return NOTIFY_OK; } diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 7662e9347238..a70d2ba312ed 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -39,6 +39,10 @@ #include "core_priv.h" #include "mad_priv.h" +static LIST_HEAD(mad_agent_list); +/* Lock to protect mad_agent_list */ +static DEFINE_SPINLOCK(mad_agent_list_lock); + static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) { struct pkey_index_qp_list *pkey = NULL; @@ -676,19 +680,18 @@ static int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -static int ib_mad_agent_security_change(struct notifier_block *nb, - unsigned long event, - void *data) +void ib_mad_agent_security_change(void) { - struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); - - if (event != LSM_POLICY_CHANGE) - return NOTIFY_DONE; - - ag->smp_allowed = !security_ib_endport_manage_subnet( - ag->security, dev_name(&ag->device->dev), ag->port_num); - - return NOTIFY_OK; + struct ib_mad_agent *ag; + + spin_lock(&mad_agent_list_lock); + list_for_each_entry(ag, + &mad_agent_list, + mad_agent_sec_list) + WRITE_ONCE(ag->smp_allowed,
!security_ib_endport_manage_subnet(ag->security, + dev_name(&ag->device->dev), ag->port_num)); + spin_unlock(&mad_agent_list_lock); } int ib_mad_agent_security_setup(struct ib_mad_agent *agent, @@ -699,6 +702,8 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (!rdma_protocol_ib(agent->device, agent->port_num)) return 0; + INIT_LIST_HEAD(&agent->mad_agent_sec_list); + ret = security_ib_alloc_security(&agent->security); if (ret) return ret; @@ -706,22 +711,20 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, if (qp_type != IB_QPT_SMI) return 0; + spin_lock(&mad_agent_list_lock); ret = security_ib_endport_manage_subnet(agent->security, dev_name(&agent->device->dev), agent->port_num); if (ret) goto free_security; - agent->lsm_nb.notifier_call = ib_mad_agent_security_change; - ret = register_lsm_notifier(&agent->lsm_nb); - if (ret) - goto free_security; - - agent->smp_allowed = true; - agent->lsm_nb_reg = true; + WRITE_ONCE(agent->smp_allowed, true); + list_add(&agent->mad_agent_sec_list, &mad_agent_list); + spin_unlock(&mad_agent_list_lock); return 0; free_security: + spin_unlock(&mad_agent_list_lock); security_ib_free_security(agent->security); return ret; } @@ -731,8 +734,11 @@ void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) if (!rdma_protocol_ib(agent->device, agent->port_num)) return; - if (agent->lsm_nb_reg) - unregister_lsm_notifier(&agent->lsm_nb); + if (agent->qp->qp_type == IB_QPT_SMI) { + spin_lock(&mad_agent_list_lock); + list_del(&agent->mad_agent_sec_list); + spin_unlock(&mad_agent_list_lock); + } security_ib_free_security(agent->security); } @@ -743,7 +749,7 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) return 0; if (map->agent.qp->qp_type == IB_QPT_SMI) { - if (!map->agent.smp_allowed) + if (!READ_ONCE(map->agent.smp_allowed)) return -EACCES; return 0; } diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 1c0b914f199d..79ba8219e7dc 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -617,11 +617,10 @@ struct ib_mad_agent { u32 hi_tid; u32 flags; void *security; - struct notifier_block lsm_nb; + struct list_head mad_agent_sec_list; u8 port_num; u8 rmpp_version; bool smp_allowed; - bool lsm_nb_reg; }; /** -- cgit v1.2.3 From 30471d4b20335d9bd9ae9b2382a1e1e97d18d86d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:50 +0200 Subject: RDMA/core: Share driver structure size with core Add new macros to be used by drivers when registering the ops structure and by IB/core when calling the allocation routines, so drivers won't need to perform kzalloc/kfree in their paths. The change in the allocation stage allows us to initialize common fields prior to calling the drivers (e.g. restrack).
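To illustrate how these macros compose (a minimal sketch; the "foo" driver below is invented, and the real conversions follow in later patches): the driver embeds the ib_* object as the first member of its private structure and advertises the full size through its ops table, so the core can allocate the whole driver object on its behalf.

	/* Sketch: an invented driver advertising its PD object size. */
	struct foo_pd {
		struct ib_pd ibpd;	/* must be the first member */
		u32 pdn;		/* driver-private state */
	};

	static const struct ib_device_ops foo_dev_ops = {
		/* ... the driver's other ops ... */
		INIT_RDMA_OBJ_SIZE(ib_pd, foo_pd, ibpd),
	};

	/* IB/core can then do: pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); */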
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 ++ include/rdma/ib_verbs.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 32cd35c9b21e..d806a5c7b202 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1228,6 +1228,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) (ptr)->name = ops->name; \ } while (0) +#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2e1f1e885ee5..e29eae4aec84 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2264,6 +2264,19 @@ struct ib_counters_read_attr { struct uverbs_attr_bundle; +#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ + .size_##ib_struct = \ + (sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ + BUILD_BUG_ON_ZERO( \ + !__same_type(((struct drv_struct *)NULL)->member, \ + struct ib_struct))) + +#define rdma_zalloc_drv_obj(ib_dev, ib_type) \ + ((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, GFP_KERNEL)) + +#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct + /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will -- cgit v1.2.3 From 21a428a019c9a6d133e745b529b9bf18c1187e70 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:51 +0200 Subject: RDMA: Handle PD allocations by IB/core Handling the PD allocations in IB/core allows us to simplify drivers and their error flows in their .alloc_pd() paths. The changes in .alloc_pd() go hand in hand with the relevant update in .dealloc_pd(). We will use this opportunity to convert .dealloc_pd() so that it cannot fail, as was suggested a long time ago; such failures do not happen in practice, as we have never seen the WARN_ON print.
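The resulting driver-side contract, sketched here with the invented foo_pd structure from the sketch above (the actual conversions follow below): .alloc_pd() receives core-allocated, zeroed memory and only sets up hardware state, while .dealloc_pd() returns void and must not fail, since IB/core frees the embedding object afterwards.

	/* Sketch: the new .alloc_pd()/.dealloc_pd() shape. */
	static int foo_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
				struct ib_udata *udata)
	{
		struct foo_pd *pd = container_of(ibpd, struct foo_pd, ibpd);

		/* No kzalloc() and no ERR_PTR(): hardware setup only. */
		return foo_hw_alloc_pdn(&pd->pdn);	/* assumed helper */
	}

	static void foo_dealloc_pd(struct ib_pd *ibpd)
	{
		struct foo_pd *pd = container_of(ibpd, struct foo_pd, ibpd);

		foo_hw_free_pdn(pd->pdn);	/* assumed helper; cannot fail */
		/* No kfree(): IB/core frees the object. */
	}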
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/uverbs_cmd.c | 15 ++-- drivers/infiniband/core/uverbs_std_types.c | 2 +- drivers/infiniband/core/verbs.c | 27 +++++--- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 37 ++++------ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 9 ++- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 25 +++---- drivers/infiniband/hw/cxgb4/provider.c | 25 +++---- drivers/infiniband/hw/hns/hns_roce_device.h | 7 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 27 +++++--- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/hns/hns_roce_pd.c | 25 +++---- drivers/infiniband/hw/i40iw/i40iw_utils.c | 1 - drivers/infiniband/hw/i40iw/i40iw_verbs.c | 32 ++++----- drivers/infiniband/hw/mlx4/main.c | 36 ++++------ drivers/infiniband/hw/mlx5/main.c | 48 +++++++------ drivers/infiniband/hw/mthca/mthca_provider.c | 29 +++----- drivers/infiniband/hw/nes/nes_verbs.c | 32 +++------ drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 92 +++++++++++-------------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 6 +- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/verbs.c | 34 +++------ drivers/infiniband/hw/qedr/verbs.h | 6 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 26 ++----- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 7 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 43 ++++-------- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 7 +- drivers/infiniband/sw/rdmavt/pd.c | 29 +++----- drivers/infiniband/sw/rdmavt/pd.h | 7 +- drivers/infiniband/sw/rdmavt/vt.c | 1 + drivers/infiniband/sw/rxe/rxe_pool.c | 60 +++++++++++++--- drivers/infiniband/sw/rxe/rxe_pool.h | 4 ++ drivers/infiniband/sw/rxe/rxe_verbs.c | 16 ++--- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- include/rdma/ib_verbs.h | 9 +-- 39 files changed, 325 insertions(+), 409 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d806a5c7b202..57e1e177921e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1319,6 +1319,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, unmap_fmr); + + SET_OBJ_SIZE(dev_ops, ib_pd); } EXPORT_SYMBOL(ib_set_device_ops); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index aa260cafbd85..5ac143f22df0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -407,9 +407,9 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->ops.alloc_pd(ib_dev, uobj->context, &attrs->driver_udata); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); + pd = rdma_zalloc_drv_obj(ib_dev, ib_pd); + if (!pd) { + ret = -ENOMEM; goto err; } @@ -417,11 +417,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) pd->uobject = uobj; pd->__internal_mr = NULL; atomic_set(&pd->usecnt, 0); + pd->res.type = RDMA_RESTRACK_PD; + + ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); + if (ret) + goto err_alloc; uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; - pd->res.type = RDMA_RESTRACK_PD; 
rdma_restrack_uadd(&pd->res); ret = uverbs_response(attrs, &resp, sizeof(resp)); @@ -432,7 +436,8 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) err_copy: ib_dealloc_pd(pd); - +err_alloc: + kfree(pd); err: uobj_alloc_abort(uobj); return ret; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index cbc72312eb41..f224cb727224 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -188,7 +188,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd((struct ib_pd *)uobject->object); + ib_dealloc_pd(pd); return 0; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3220fb42ecce..de5d895a5054 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -254,10 +254,11 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, { struct ib_pd *pd; int mr_access_flags = 0; + int ret; - pd = device->ops.alloc_pd(device, NULL, NULL); - if (IS_ERR(pd)) - return pd; + pd = rdma_zalloc_drv_obj(device, ib_pd); + if (!pd) + return ERR_PTR(-ENOMEM); pd->device = device; pd->uobject = NULL; @@ -265,6 +266,16 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, atomic_set(&pd->usecnt, 0); pd->flags = flags; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_set_task(&pd->res, caller); + + ret = device->ops.alloc_pd(pd, NULL, NULL); + if (ret) { + kfree(pd); + return ERR_PTR(ret); + } + rdma_restrack_kadd(&pd->res); + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else @@ -275,10 +286,6 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } - pd->res.type = RDMA_RESTRACK_PD; - rdma_restrack_set_task(&pd->res, caller); - rdma_restrack_kadd(&pd->res); - if (mr_access_flags) { struct ib_mr *mr; @@ -329,10 +336,8 @@ void ib_dealloc_pd(struct ib_pd *pd) WARN_ON(atomic_read(&pd->usecnt)); rdma_restrack_del(&pd->res); - /* Making delalloc_pd a void return is a WIP, no driver should return - an error here. 
*/ - ret = pd->device->ops.dealloc_pd(pd); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); + pd->device->ops.dealloc_pd(pd); + kfree(pd); } EXPORT_SYMBOL(ib_dealloc_pd); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 1d7469e23cde..1606571af63d 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -563,41 +563,29 @@ fail: } /* Protection Domains */ -int bnxt_re_dealloc_pd(struct ib_pd *ib_pd) +void bnxt_re_dealloc_pd(struct ib_pd *ib_pd) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; - int rc; bnxt_re_destroy_fence_mr(pd); - if (pd->qplib_pd.id) { - rc = bnxt_qplib_dealloc_pd(&rdev->qplib_res, - &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); - if (rc) - dev_err(rdev_to_dev(rdev), "Failed to deallocate HW PD"); - } - - kfree(pd); - return 0; + if (pd->qplib_pd.id) + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); } -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *ucontext, - struct ib_udata *udata) +int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_re_ucontext *ucntx = container_of(ucontext, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_re_pd *pd; + struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd); int rc; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - pd->rdev = rdev; if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) { dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD"); @@ -637,13 +625,12 @@ struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, if (bnxt_re_create_fence_mr(pd)) dev_warn(rdev_to_dev(rdev), "Failed to create Fence-MR\n"); - return &pd->ib_pd; + return 0; dbfail: - (void)bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, - &pd->qplib_pd); + bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl, + &pd->qplib_pd); fail: - kfree(pd); - return ERR_PTR(rc); + return rc; } /* Address Handles */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index c4af72604b4f..c7cca803cfa3 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -56,8 +56,8 @@ struct bnxt_re_fence_data { }; struct bnxt_re_pd { + struct ib_pd ib_pd; struct bnxt_re_dev *rdev; - struct ib_pd ib_pd; struct bnxt_qplib_pd qplib_pd; struct bnxt_re_fence_data fence; }; @@ -163,10 +163,9 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, u8 port_num); -struct ib_pd *bnxt_re_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int bnxt_re_dealloc_pd(struct ib_pd *pd); +int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void bnxt_re_dealloc_pd(struct ib_pd *pd); struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 0d40a930c192..0a89ef6e5754 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -637,6 +637,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .query_srq 
= bnxt_re_query_srq, .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), }; static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 804c1fc7bfc1..4cc9a6ae2139 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -370,7 +370,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int iwch_deallocate_pd(struct ib_pd *pd) +static void iwch_deallocate_pd(struct ib_pd *pd) { struct iwch_dev *rhp; struct iwch_pd *php; @@ -379,15 +379,13 @@ static int iwch_deallocate_pd(struct ib_pd *pd) rhp = php->rhp; pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid); cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid); - kfree(php); - return 0; } -static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct iwch_pd *php; + struct iwch_pd *php = to_iwch_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct iwch_dev *rhp; @@ -395,12 +393,8 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, rhp = (struct iwch_dev *) ibdev; pdid = cxio_hal_get_pdid(rhp->rdev.rscp); if (!pdid) - return ERR_PTR(-EINVAL); - php = kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - cxio_hal_put_pdid(rhp->rdev.rscp, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -408,11 +402,11 @@ static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { iwch_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php); - return &php->ibpd; + return 0; } static int iwch_dereg_mr(struct ib_mr *ib_mr) @@ -1350,6 +1344,7 @@ static const struct ib_device_ops iwch_dev_ops = { .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, .resize_cq = iwch_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), }; int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index f59bf7e5a589..680b5e98491d 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -209,7 +209,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) return ret; } -static int c4iw_deallocate_pd(struct ib_pd *pd) +static void c4iw_deallocate_pd(struct ib_pd *pd) { struct c4iw_dev *rhp; struct c4iw_pd *php; @@ -221,15 +221,13 @@ static int c4iw_deallocate_pd(struct ib_pd *pd) mutex_lock(&rhp->rdev.stats.lock); rhp->rdev.stats.pd.cur--; mutex_unlock(&rhp->rdev.stats.lock); - kfree(php); - return 0; } -static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct c4iw_pd *php; + struct c4iw_pd *php = to_c4iw_pd(pd); + struct ib_device *ibdev = pd->device; u32 pdid; struct c4iw_dev *rhp; @@ -237,12 +235,8 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp = (struct c4iw_dev *) ibdev; pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table); if (!pdid) - return ERR_PTR(-EINVAL); - php = 
kzalloc(sizeof(*php), GFP_KERNEL); - if (!php) { - c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid); - return ERR_PTR(-ENOMEM); - } + return -EINVAL; + php->pdid = pdid; php->rhp = rhp; if (context) { @@ -250,7 +244,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { c4iw_deallocate_pd(&php->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } mutex_lock(&rhp->rdev.stats.lock); @@ -259,7 +253,7 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev, rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur; mutex_unlock(&rhp->rdev.stats.lock); pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php); - return &php->ibpd; + return 0; } static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index, @@ -570,6 +564,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .query_qp = c4iw_ib_query_qp, .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), }; void c4iw_register_device(struct work_struct *work) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 8ca8d74dfb6a..9ee86daf1700 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1114,10 +1114,9 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *pd, int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags); -struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, - struct ib_ucontext *context, - struct ib_udata *udata); -int hns_roce_dealloc_pd(struct ib_pd *pd); +int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void hns_roce_dealloc_pd(struct ib_pd *pd); struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index fa08c22aad66..a18b88c95995 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -711,13 +711,14 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) struct ib_qp_attr attr = { 0 }; struct hns_roce_v1_priv *priv; struct hns_roce_qp *hr_qp; + struct ib_device *ibdev; struct ib_cq *cq; struct ib_pd *pd; union ib_gid dgid; u64 subnet_prefix; int attr_mask = 0; + int ret = -ENOMEM; int i, j; - int ret; u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 }; u8 phy_port; u8 port = 0; @@ -742,12 +743,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) free_mr->mr_free_cq->ib_cq.cq_context = NULL; atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0); - pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL); - if (IS_ERR(pd)) { - dev_err(dev, "Create pd for reserved loop qp failed!"); - ret = -ENOMEM; + ibdev = &hr_dev->ib_dev; + pd = rdma_zalloc_drv_obj(ibdev, ib_pd); + if (!pd) + goto alloc_mem_failed; + + pd->device = ibdev; + ret = hns_roce_alloc_pd(pd, NULL, NULL); + if (ret) goto alloc_pd_failed; - } + free_mr->mr_free_pd = to_hr_pd(pd); free_mr->mr_free_pd->ibpd.device = &hr_dev->ib_dev; free_mr->mr_free_pd->ibpd.uobject = NULL; @@ -854,10 +859,12 @@ create_lp_qp_failed: dev_err(dev, "Destroy qp %d for mr free failed!\n", i); } - if (hns_roce_dealloc_pd(pd)) - dev_err(dev, "Destroy pd for create_lp_qp failed!\n"); + hns_roce_dealloc_pd(pd); alloc_pd_failed: + kfree(pd); + +alloc_mem_failed: if (hns_roce_ib_destroy_cq(cq))
dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); @@ -891,9 +898,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) if (ret) dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret); - ret = hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd); - if (ret) - dev_err(dev, "Destroy pd for mr_free failed(%d)!\n", ret); + hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd); } static int hns_roce_db_init(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 67a8c4333f4f..ccf10622586c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -472,6 +472,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .query_pkey = hns_roce_query_pkey, .query_port = hns_roce_query_port, .reg_user_mr = hns_roce_reg_user_mr, + INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd), }; static const struct ib_device_ops hns_roce_dev_mr_ops = { diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 4a29b2cb9bab..b9b97c5e97e6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -57,24 +57,19 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev) hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap); } -struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, - struct ib_ucontext *context, - struct ib_udata *udata) +int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ib_dev = ibpd->device; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); struct device *dev = hr_dev->dev; - struct hns_roce_pd *pd; + struct hns_roce_pd *pd = to_hr_pd(ibpd); int ret; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn); if (ret) { - kfree(pd); dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n"); - return ERR_PTR(ret); + return ret; } if (context) { @@ -83,21 +78,17 @@ struct ib_pd *hns_roce_alloc_pd(struct ib_device *ib_dev, if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn); dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n"); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } EXPORT_SYMBOL_GPL(hns_roce_alloc_pd); -int hns_roce_dealloc_pd(struct ib_pd *pd) +void hns_roce_dealloc_pd(struct ib_pd *pd) { hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn); - kfree(to_hr_pd(pd)); - - return 0; } EXPORT_SYMBOL_GPL(hns_roce_dealloc_pd); diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 59e978141ad4..c5a881172524 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -601,7 +601,6 @@ void i40iw_rem_pdusecount(struct i40iw_pd *iwpd, struct i40iw_device *iwdev) if (!atomic_dec_and_test(&iwpd->usecount)) return; i40iw_free_resource(iwdev, iwdev->allocated_pds, iwpd->sc_pd.pd_id); - kfree(iwpd); } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index d4ab46dd9e6c..28449ad57b37 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -312,16 +312,15 @@ static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_ /** * i40iw_alloc_pd - allocate protection domain - * @ibdev: device pointer from stack + * @pd: PD pointer * @context: user context 
created during alloc * @udata: user data */ -static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct i40iw_pd *iwpd; - struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); struct i40iw_sc_dev *dev = &iwdev->sc_dev; struct i40iw_alloc_pd_resp uresp; struct i40iw_sc_pd *sc_pd; @@ -330,19 +329,13 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, int err; if (iwdev->closing) - return ERR_PTR(-ENODEV); + return -ENODEV; err = i40iw_alloc_resource(iwdev, iwdev->allocated_pds, iwdev->max_pd, &pd_id, &iwdev->next_pd); if (err) { i40iw_pr_err("alloc resource failed\n"); - return ERR_PTR(err); - } - - iwpd = kzalloc(sizeof(*iwpd), GFP_KERNEL); - if (!iwpd) { - err = -ENOMEM; - goto free_res; + return err; } sc_pd = &iwpd->sc_pd; @@ -361,25 +354,23 @@ static struct ib_pd *i40iw_alloc_pd(struct ib_device *ibdev, } i40iw_add_pdusecount(iwpd); - return &iwpd->ibpd; + return 0; + error: - kfree(iwpd); -free_res: i40iw_free_resource(iwdev, iwdev->allocated_pds, pd_id); - return ERR_PTR(err); + return err; } /** * i40iw_dealloc_pd - deallocate pd * @ibpd: ptr of pd to be deallocated */ -static int i40iw_dealloc_pd(struct ib_pd *ibpd) +static void i40iw_dealloc_pd(struct ib_pd *ibpd) { struct i40iw_pd *iwpd = to_iwpd(ibpd); struct i40iw_device *iwdev = to_iwdev(ibpd->device); i40iw_rem_pdusecount(iwpd, iwdev); - return 0; } /** @@ -2750,6 +2741,7 @@ static const struct ib_device_ops i40iw_dev_ops = { .query_qp = i40iw_query_qp, .reg_user_mr = i40iw_reg_user_mr, .req_notify_cq = i40iw_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd), }; /** diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d66002a31000..c0f6aea7ed7c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1186,38 +1186,27 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) } } -static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct mlx4_ib_pd *pd; + struct mlx4_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; int err; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; - if (context) - if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { - mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); - kfree(pd); - return ERR_PTR(-EFAULT); - } - return &pd->ibpd; + if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) { + mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); + return -EFAULT; + } + return 0; } -static int mlx4_ib_dealloc_pd(struct ib_pd *pd) +static void mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); - kfree(pd); - - return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, @@ -2580,6 +2569,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .req_notify_cq = mlx4_ib_arm_cq, .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd), }; static const struct ib_device_ops 
mlx4_ib_dev_wq_ops = { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 76d6c2557d0c..f9cddc6f2ab6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2280,30 +2280,24 @@ int mlx5_ib_dealloc_dm(struct ib_dm *ibdm) return 0; } -static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct mlx5_ib_pd *pd = to_mpd(ibpd); + struct ib_device *ibdev = ibpd->device; struct mlx5_ib_alloc_pd_resp resp; - struct mlx5_ib_pd *pd; int err; u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; u16 uid = 0; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - uid = context ? to_mucontext(context)->devx_uid : 0; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); MLX5_SET(alloc_pd_in, in, uid, uid); err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), out, sizeof(out)); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; pd->pdn = MLX5_GET(alloc_pd_out, out, pd); pd->uid = uid; @@ -2311,23 +2305,19 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } -static int mlx5_ib_dealloc_pd(struct ib_pd *pd) +static void mlx5_ib_dealloc_pd(struct ib_pd *pd) { struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); - kfree(mpd); - - return 0; } enum { @@ -4680,23 +4670,28 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; + struct ib_device *ibdev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; int port; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); + ibdev = &dev->ib_dev; mutex_init(&devr->mutex); - devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); - if (IS_ERR(devr->p0)) { - ret = PTR_ERR(devr->p0); - goto error0; - } - devr->p0->device = &dev->ib_dev; + devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd); + if (!devr->p0) + return -ENOMEM; + + devr->p0->device = ibdev; devr->p0->uobject = NULL; atomic_set(&devr->p0->usecnt, 0); + ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL); + if (ret) + goto error0; + devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL); if (IS_ERR(devr->c0)) { ret = PTR_ERR(devr->c0); @@ -4794,6 +4789,7 @@ error2: error1: mlx5_ib_dealloc_pd(devr->p0); error0: + kfree(devr->p0); return ret; } @@ -4809,6 +4805,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_xrcd(devr->x1); mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + kfree(devr->p0); /* Make sure no change P_Key work items are still executing */ for (port = 0; port < dev->num_ports; ++port) @@ -5938,6 +5935,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), }; static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = { diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 1bb67562c8c8..2c754bc226f3 100644 --- 
a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -374,40 +374,30 @@ static int mthca_mmap_uar(struct ib_ucontext *context, return 0; } -static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct mthca_pd *pd; + struct ib_device *ibdev = ibpd->device; + struct mthca_pd *pd = to_mpd(ibpd); int err; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - err = mthca_pd_alloc(to_mdev(ibdev), !context, pd); - if (err) { - kfree(pd); - return ERR_PTR(err); - } + if (err) + return err; if (context) { if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) { mthca_pd_free(to_mdev(ibdev), pd); - kfree(pd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &pd->ibpd; + return 0; } -static int mthca_dealloc_pd(struct ib_pd *pd) +static void mthca_dealloc_pd(struct ib_pd *pd) { mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); - kfree(pd); - - return 0; } static struct ib_ah *mthca_ah_create(struct ib_pd *pd, @@ -1228,6 +1218,7 @@ static const struct ib_device_ops mthca_dev_ops = { .query_qp = mthca_query_qp, .reg_user_mr = mthca_reg_user_mr, .resize_cq = mthca_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd), }; static const struct ib_device_ops mthca_dev_arbel_srq_ops = { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6eb991d40035..f18b28ae4bd9 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -658,10 +658,11 @@ static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) /** * nes_alloc_pd */ -static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, struct ib_udata *udata) +static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct nes_pd *nespd; + struct ib_device *ibdev = pd->device; + struct nes_pd *nespd = to_nespd(pd); struct nes_vnic *nesvnic = to_nesvnic(ibdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; @@ -676,15 +677,8 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds, nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD); - if (err) { - return ERR_PTR(err); - } - - nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL); - if (!nespd) { - nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - return ERR_PTR(-ENOMEM); - } + if (err) + return err; nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", nespd, dev_name(&nesvnic->nesibdev->ibdev.dev)); @@ -700,16 +694,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) { nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n"); nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - kfree(nespd); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } uresp.pd_id = nespd->pd_id; uresp.mmap_db_index = nespd->mmap_db_index; if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) { nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num); - kfree(nespd); - return ERR_PTR(-EFAULT); + return -EFAULT; } set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells); @@ -718,14 +710,14 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, } 
nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd); - return &nespd->ibpd; + return 0; } /** * nes_dealloc_pd */ -static int nes_dealloc_pd(struct ib_pd *ibpd) +static void nes_dealloc_pd(struct ib_pd *ibpd) { struct nes_ucontext *nesucontext; struct nes_pd *nespd = to_nespd(ibpd); @@ -748,9 +740,6 @@ static int nes_dealloc_pd(struct ib_pd *ibpd) nespd->pd_id, nespd); nes_free_resource(nesadapter, nesadapter->allocated_pds, (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12)); - kfree(nespd); - - return 0; } @@ -3658,6 +3647,7 @@ static const struct ib_device_ops nes_dev_ops = { .query_qp = nes_query_qp, .reg_user_mr = nes_reg_user_mr, .req_notify_cq = nes_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), }; /** diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 88970a6bb555..0de83c92691f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -179,6 +179,7 @@ static const struct ib_device_ops ocrdma_dev_ops = { .reg_user_mr = ocrdma_reg_user_mr, .req_notify_cq = ocrdma_arm_cq, .resize_cq = ocrdma_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd), }; static const struct ib_device_ops ocrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 2a62936bef4d..980ba97188ff 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -367,17 +367,12 @@ static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd) return status; } -static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, - struct ocrdma_ucontext *uctx, - struct ib_udata *udata) +static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd, + struct ocrdma_ucontext *uctx, + struct ib_udata *udata) { - struct ocrdma_pd *pd = NULL; int status; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - if (udata && uctx && dev->attr.max_dpp_pds) { pd->dpp_enabled = ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; @@ -386,15 +381,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, dev->attr.wqe_size) : 0; } - if (dev->pd_mgr->pd_prealloc_valid) { - status = ocrdma_get_pd_num(dev, pd); - if (status == 0) { - return pd; - } else { - kfree(pd); - return ERR_PTR(status); - } - } + if (dev->pd_mgr->pd_prealloc_valid) + return ocrdma_get_pd_num(dev, pd); retry: status = ocrdma_mbx_alloc_pd(dev, pd); @@ -403,13 +391,11 @@ retry: pd->dpp_enabled = false; pd->num_dpp_qp = 0; goto retry; - } else { - kfree(pd); - return ERR_PTR(status); } + return status; } - return pd; + return 0; } static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, @@ -418,30 +404,33 @@ static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, return (uctx->cntxt_pd == pd); } -static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, +static void _ocrdma_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd) { - int status; - if (dev->pd_mgr->pd_prealloc_valid) - status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); + ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled); else - status = ocrdma_mbx_dealloc_pd(dev, pd); - - kfree(pd); - return status; + ocrdma_mbx_dealloc_pd(dev, pd); } static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev, struct ocrdma_ucontext *uctx, struct ib_udata *udata) { - int status = 0; + struct ib_device *ibdev = &dev->ibdev; + struct ib_pd *pd; + int status; + + pd = 
rdma_zalloc_drv_obj(ibdev, ib_pd); + if (!pd) + return -ENOMEM; + + pd->device = ibdev; + uctx->cntxt_pd = get_ocrdma_pd(pd); - uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata); - if (IS_ERR(uctx->cntxt_pd)) { - status = PTR_ERR(uctx->cntxt_pd); - uctx->cntxt_pd = NULL; + status = _ocrdma_alloc_pd(dev, uctx->cntxt_pd, uctx, udata); + if (status) { + kfree(uctx->cntxt_pd); goto err; } @@ -460,6 +449,7 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) pr_err("%s(%d) Freeing in use pdid=0x%x.\n", __func__, dev->id, pd->id); } + kfree(uctx->cntxt_pd); uctx->cntxt_pd = NULL; (void)_ocrdma_dealloc_pd(dev, pd); return 0; @@ -537,6 +527,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, return &ctx->ibucontext; cpy_err: + ocrdma_dealloc_ucontext_pd(ctx); pd_err: ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len); map_err: @@ -658,10 +649,10 @@ dpp_map_err: return status; } -struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct ocrdma_pd *pd; struct ocrdma_ucontext *uctx = NULL; @@ -677,11 +668,10 @@ struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev, } } - pd = _ocrdma_alloc_pd(dev, uctx, udata); - if (IS_ERR(pd)) { - status = PTR_ERR(pd); + pd = get_ocrdma_pd(ibpd); + status = _ocrdma_alloc_pd(dev, pd, uctx, udata); + if (status) goto exit; - } pd_mapping: if (udata && context) { @@ -689,25 +679,22 @@ pd_mapping: if (status) goto err; } - return &pd->ibpd; + return 0; err: - if (is_uctx_pd) { + if (is_uctx_pd) ocrdma_release_ucontext_pd(uctx); - } else { - if (_ocrdma_dealloc_pd(dev, pd)) - pr_err("%s: _ocrdma_dealloc_pd() failed\n", __func__); - } + else + _ocrdma_dealloc_pd(dev, pd); exit: - return ERR_PTR(status); + return status; } -int ocrdma_dealloc_pd(struct ib_pd *ibpd) +void ocrdma_dealloc_pd(struct ib_pd *ibpd) { struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_ucontext *uctx = NULL; - int status = 0; u64 usr_db; uctx = pd->uctx; @@ -721,11 +708,10 @@ int ocrdma_dealloc_pd(struct ib_pd *ibpd) if (is_ucontext_pd(uctx, pd)) { ocrdma_release_ucontext_pd(uctx); - return status; + return; } } - status = _ocrdma_dealloc_pd(dev, pd); - return status; + _ocrdma_dealloc_pd(dev, pd); } static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index b69cfdce7970..1fd66721c930 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -70,9 +70,9 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *); int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); -struct ib_pd *ocrdma_alloc_pd(struct ib_device *, - struct ib_ucontext *, struct ib_udata *); -int ocrdma_dealloc_pd(struct ib_pd *pd); +int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx, + struct ib_udata *udata); +void ocrdma_dealloc_pd(struct ib_pd *pd); struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 878e9e23652b..44ce4989dcef 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -239,6 +239,7 @@ static const struct 
ib_device_ops qedr_dev_ops = { .reg_user_mr = qedr_reg_user_mr, .req_notify_cq = qedr_arm_cq, .resize_cq = qedr_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd), }; static int qedr_register_device(struct qedr_dev *dev) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 989f08633fbe..a06d2258394a 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -450,11 +450,12 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) vma->vm_page_prot); } -struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, struct ib_udata *udata) +int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct qedr_dev *dev = get_qedr_dev(ibdev); - struct qedr_pd *pd; + struct qedr_pd *pd = get_qedr_pd(ibpd); u16 pd_id; int rc; @@ -463,16 +464,12 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, if (!dev->rdma_ctx) { DP_ERR(dev, "invalid RDMA context\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - rc = dev->ops->rdma_alloc_pd(dev->rdma_ctx, &pd_id); if (rc) - goto err; + return rc; pd->pd_id = pd_id; @@ -485,36 +482,23 @@ struct ib_pd *qedr_alloc_pd(struct ib_device *ibdev, if (rc) { DP_ERR(dev, "copy error pd_id=0x%x.\n", pd_id); dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd_id); - goto err; + return rc; } pd->uctx = get_qedr_ucontext(context); pd->uctx->pd = pd; } - return &pd->ibpd; - -err: - kfree(pd); - return ERR_PTR(rc); + return 0; } -int qedr_dealloc_pd(struct ib_pd *ibpd) +void qedr_dealloc_pd(struct ib_pd *ibpd) { struct qedr_dev *dev = get_qedr_dev(ibpd->device); struct qedr_pd *pd = get_qedr_pd(ibpd); - if (!pd) { - pr_err("Invalid PD received in dealloc_pd\n"); - return -EINVAL; - } - DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id); dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id); - - kfree(pd); - - return 0; } static void qedr_free_pbl(struct qedr_dev *dev, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 1852b7012bf4..97a6ff3f9afb 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -47,9 +47,9 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); int qedr_dealloc_ucontext(struct ib_ucontext *); int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); -struct ib_pd *qedr_alloc_pd(struct ib_device *, - struct ib_ucontext *, struct ib_udata *); -int qedr_dealloc_pd(struct ib_pd *pd); +int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx, + struct ib_udata *udata); +void qedr_dealloc_pd(struct ib_pd *pd); struct ib_cq *qedr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 1ec155823716..256ad2f236c8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -352,6 +352,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_port = usnic_ib_query_port, .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, + INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), }; /* Start of PF discovery section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 9dea18106247..0ced89b51448 100644 --- 
a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -456,37 +456,23 @@ int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, return 0; } -struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct usnic_ib_pd *pd; + struct usnic_ib_pd *pd = to_upd(ibpd); void *umem_pd; - usnic_dbg("\n"); - - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) - return ERR_PTR(-ENOMEM); - umem_pd = pd->umem_pd = usnic_uiom_alloc_pd(); if (IS_ERR_OR_NULL(umem_pd)) { - kfree(pd); - return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM); + return umem_pd ? PTR_ERR(umem_pd) : -ENOMEM; } - usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", - pd, context, dev_name(&ibdev->dev)); - return &pd->ibpd; + return 0; } -int usnic_ib_dealloc_pd(struct ib_pd *pd) +void usnic_ib_dealloc_pd(struct ib_pd *pd) { - usnic_info("freeing domain 0x%p\n", pd); - usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd); - kfree(pd); - return 0; } struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 99a6d81c2bcd..44a9d2f82bf5 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -51,10 +51,9 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index, struct net_device *usnic_get_netdev(struct ib_device *device, u8 port_num); int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); -struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int usnic_ib_dealloc_pd(struct ib_pd *pd); +int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata); +void usnic_ib_dealloc_pd(struct ib_pd *pd); struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e582beaf9430..47e653d2495c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -195,6 +195,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .query_qp = pvrdma_query_qp, .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd), }; static const struct ib_device_ops pvrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index fafb2add3b44..f44220f72e05 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -438,37 +438,29 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) /** * pvrdma_alloc_pd - allocate protection domain - * @ibdev: the IB device + * @ibpd: PD pointer * @context: user context * @udata: user data * * @return: the ib_pd protection domain pointer on success, otherwise errno. 
*/ -struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct pvrdma_pd *pd; + struct ib_device *ibdev = ibpd->device; + struct pvrdma_pd *pd = to_vpd(ibpd); struct pvrdma_dev *dev = to_vdev(ibdev); - union pvrdma_cmd_req req; - union pvrdma_cmd_resp rsp; + union pvrdma_cmd_req req = {}; + union pvrdma_cmd_resp rsp = {}; struct pvrdma_cmd_create_pd *cmd = &req.create_pd; struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp; struct pvrdma_alloc_pd_resp pd_resp = {0}; int ret; - void *ptr; /* Check allowed max pds */ if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ptr = ERR_PTR(-ENOMEM); - goto err; - } - - memset(cmd, 0, sizeof(*cmd)); cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD; cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP); @@ -476,8 +468,7 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to allocate protection domain, error: %d\n", ret); - ptr = ERR_PTR(ret); - goto freepd; + goto err; } pd->privileged = !context; @@ -490,18 +481,16 @@ struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to copy back protection domain\n"); pvrdma_dealloc_pd(&pd->ibpd); - return ERR_PTR(-EFAULT); + return -EFAULT; } } /* u32 pd handle */ - return &pd->ibpd; + return 0; -freepd: - kfree(pd); err: atomic_dec(&dev->num_pds); - return ptr; + return ret; } /** @@ -510,14 +499,13 @@ err: * * @return: 0 on success, otherwise errno. */ -int pvrdma_dealloc_pd(struct ib_pd *pd) +void pvrdma_dealloc_pd(struct ib_pd *pd) { struct pvrdma_dev *dev = to_vdev(pd->device); - union pvrdma_cmd_req req; + union pvrdma_cmd_req req = {}; struct pvrdma_cmd_destroy_pd *cmd = &req.destroy_pd; int ret; - memset(cmd, 0, sizeof(*cmd)); cmd->hdr.cmd = PVRDMA_CMD_DESTROY_PD; cmd->pd_handle = to_vpd(pd)->pd_handle; @@ -527,10 +515,7 @@ int pvrdma_dealloc_pd(struct ib_pd *pd) "could not dealloc protection domain, error: %d\n", ret); - kfree(to_vpd(pd)); atomic_dec(&dev->num_pds); - - return 0; } /** diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index f7f758d60110..ed91baad1ffa 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -399,10 +399,9 @@ int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata); int pvrdma_dealloc_ucontext(struct ib_ucontext *context); -struct ib_pd *pvrdma_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int pvrdma_dealloc_pd(struct ib_pd *ibpd); +int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void pvrdma_dealloc_pd(struct ib_pd *ibpd); struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/sw/rdmavt/pd.c b/drivers/infiniband/sw/rdmavt/pd.c index dcc1870b8d23..6033054b22fa 100644 --- a/drivers/infiniband/sw/rdmavt/pd.c +++ b/drivers/infiniband/sw/rdmavt/pd.c @@ -50,7 +50,7 @@ /** * rvt_alloc_pd - allocate a 
protection domain - * @ibdev: ib device + * @ibpd: PD * @context: optional user context * @udata: optional user data * @@ -58,19 +58,14 @@ * * Return: 0 on success */ -struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata) +int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { + struct ib_device *ibdev = ibpd->device; struct rvt_dev_info *dev = ib_to_rvt(ibdev); - struct rvt_pd *pd; - struct ib_pd *ret; + struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); + int ret = 0; - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } /* * While we could continue allocating protecetion domains, being * constrained only by system resources. The IBTA spec defines that @@ -81,8 +76,7 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, spin_lock(&dev->n_pds_lock); if (dev->n_pds_allocated == dev->dparms.props.max_pd) { spin_unlock(&dev->n_pds_lock); - kfree(pd); - ret = ERR_PTR(-ENOMEM); + ret = -ENOMEM; goto bail; } @@ -92,8 +86,6 @@ struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, /* ib_alloc_pd() will initialize pd->ibpd. */ pd->user = !!udata; - ret = &pd->ibpd; - bail: return ret; } @@ -104,16 +96,11 @@ bail: * * Return: always 0 */ -int rvt_dealloc_pd(struct ib_pd *ibpd) +void rvt_dealloc_pd(struct ib_pd *ibpd) { - struct rvt_pd *pd = ibpd_to_rvtpd(ibpd); struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); spin_lock(&dev->n_pds_lock); dev->n_pds_allocated--; spin_unlock(&dev->n_pds_lock); - - kfree(pd); - - return 0; } diff --git a/drivers/infiniband/sw/rdmavt/pd.h b/drivers/infiniband/sw/rdmavt/pd.h index 1892ca4a9746..7a887e4a45e7 100644 --- a/drivers/infiniband/sw/rdmavt/pd.h +++ b/drivers/infiniband/sw/rdmavt/pd.h @@ -50,9 +50,8 @@ #include -struct ib_pd *rvt_alloc_pd(struct ib_device *ibdev, - struct ib_ucontext *context, - struct ib_udata *udata); -int rvt_dealloc_pd(struct ib_pd *ibpd); +int rvt_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); +void rvt_dealloc_pd(struct ib_pd *ibpd); #endif /* DEF_RDMAVTPD_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index b3f0c5578925..a19832c73d5a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -436,6 +436,7 @@ static const struct ib_device_ops rvt_dev_ops = { .req_notify_cq = rvt_req_notify_cq, .resize_cq = rvt_resize_cq, .unmap_fmr = rvt_unmap_fmr, + INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), }; static noinline int check_support(struct rvt_dev_info *rdi, int verb) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index b5c91df22047..cd3f14629ba8 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -46,6 +46,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_PD] = { .name = "rxe-pd", .size = sizeof(struct rxe_pd), + .flags = RXE_POOL_NO_ALLOC, }, [RXE_TYPE_AH] = { .name = "rxe-ah", @@ -119,8 +120,10 @@ static void rxe_cache_clean(size_t cnt) for (i = 0; i < cnt; i++) { type = &rxe_type_info[i]; - kmem_cache_destroy(type->cache); - type->cache = NULL; + if (!(type->flags & RXE_POOL_NO_ALLOC)) { + kmem_cache_destroy(type->cache); + type->cache = NULL; + } } } @@ -134,14 +137,17 @@ int rxe_cache_init(void) for (i = 0; i < RXE_NUM_TYPES; i++) { type = &rxe_type_info[i]; size = ALIGN(type->size, RXE_POOL_ALIGN); - type->cache = kmem_cache_create(type->name, size, - RXE_POOL_ALIGN, - RXE_POOL_CACHE_FLAGS, NULL); - if 
(!type->cache) { - pr_err("Unable to init kmem cache for %s\n", - type->name); - err = -ENOMEM; - goto err1; + if (!(type->flags & RXE_POOL_NO_ALLOC)) { + type->cache = + kmem_cache_create(type->name, size, + RXE_POOL_ALIGN, + RXE_POOL_CACHE_FLAGS, NULL); + if (!type->cache) { + pr_err("Unable to init kmem cache for %s\n", + type->name); + err = -ENOMEM; + goto err1; + } } } @@ -415,6 +421,37 @@ out_put_pool: return NULL; } +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem) +{ + unsigned long flags; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); + + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); + return -EINVAL; + } + kref_get(&pool->ref_cnt); + read_unlock_irqrestore(&pool->pool_lock, flags); + + kref_get(&pool->rxe->ref_cnt); + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_put_pool; + + elem->pool = pool; + kref_init(&elem->ref_cnt); + + return 0; + +out_put_pool: + atomic_dec(&pool->num_elem); + rxe_dev_put(pool->rxe); + rxe_pool_put(pool); + return -EINVAL; +} + void rxe_elem_release(struct kref *kref) { struct rxe_pool_entry *elem = @@ -424,7 +461,8 @@ void rxe_elem_release(struct kref *kref) if (pool->cleanup) pool->cleanup(elem); - kmem_cache_free(pool_cache(pool), elem); + if (!(pool->flags & RXE_POOL_NO_ALLOC)) + kmem_cache_free(pool_cache(pool), elem); atomic_dec(&pool->num_elem); rxe_dev_put(pool->rxe); rxe_pool_put(pool); diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 72968c29e01f..2f2cff1cbe43 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -41,6 +41,7 @@ enum rxe_pool_flags { RXE_POOL_ATOMIC = BIT(0), RXE_POOL_INDEX = BIT(1), RXE_POOL_KEY = BIT(2), + RXE_POOL_NO_ALLOC = BIT(4), }; enum rxe_elem_type { @@ -131,6 +132,9 @@ void rxe_pool_cleanup(struct rxe_pool *pool); /* allocate an object from pool */ void *rxe_alloc(struct rxe_pool *pool); +/* connect already allocated object to pool */ +int rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem); + /* assign an index to an indexed object and insert object into * pool's rb tree */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index cc5a05124ece..051c3930e808 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -191,23 +191,20 @@ static int rxe_port_immutable(struct ib_device *dev, u8 port_num, return 0; } -static struct ib_pd *rxe_alloc_pd(struct ib_device *dev, - struct ib_ucontext *context, - struct ib_udata *udata) +static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context, + struct ib_udata *udata) { - struct rxe_dev *rxe = to_rdev(dev); - struct rxe_pd *pd; + struct rxe_dev *rxe = to_rdev(ibpd->device); + struct rxe_pd *pd = to_rpd(ibpd); - pd = rxe_alloc(&rxe->pd_pool); - return pd ? 
&pd->ibpd : ERR_PTR(-ENOMEM); + return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem); } -static int rxe_dealloc_pd(struct ib_pd *ibpd) +static void rxe_dealloc_pd(struct ib_pd *ibpd) { struct rxe_pd *pd = to_rpd(ibpd); rxe_drop_ref(pd); - return 0; } static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, @@ -1183,6 +1180,7 @@ static const struct ib_device_ops rxe_dev_ops = { .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .resize_cq = rxe_resize_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), }; int rxe_register_device(struct rxe_dev *rxe) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 74e04801d34d..70839d3f55d9 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -66,8 +66,8 @@ struct rxe_ucontext { }; struct rxe_pd { + struct ib_pd ibpd; struct rxe_pool_entry pelem; - struct ib_pd ibpd; }; struct rxe_ah { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e29eae4aec84..854d7816787c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2385,10 +2385,9 @@ struct ib_device_ops { int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); - struct ib_pd *(*alloc_pd)(struct ib_device *device, - struct ib_ucontext *context, - struct ib_udata *udata); - int (*dealloc_pd)(struct ib_pd *pd); + int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); + void (*dealloc_pd)(struct ib_pd *pd); struct ib_ah *(*create_ah)(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata); @@ -2530,6 +2529,8 @@ struct ib_device_ops { */ int (*fill_res_entry)(struct sk_buff *msg, struct rdma_restrack_entry *entry); + + DECLARE_RDMA_OBJ_SIZE(ib_pd); }; struct ib_device { -- cgit v1.2.3
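
Every driver conversion in the series above follows the same mechanism: the INIT_RDMA_OBJ_SIZE(ib_pd, ..., ibpd) entries record the size of the driver's PD structure and the offset of the struct ib_pd embedded in it, the core then allocates the whole object (rdma_zalloc_drv_obj) and passes the embedded ib_pd to the alloc_pd callback, and the driver recovers its own structure with a container_of() wrapper. That is also why the dealloc_pd callbacks all stop calling kfree(): the containing object is owned, and eventually freed, by the core. The following is a minimal user-space sketch of that pattern, not the kernel code itself; struct drv_pd, struct rdma_obj_size and core_zalloc_pd() are invented stand-ins for the real macro and allocator.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct ib_pd { int handle; };                                 /* stand-in core object */
struct drv_pd { unsigned long drv_key; struct ib_pd ibpd; }; /* stand-in driver PD */

/* What the INIT_RDMA_OBJ_SIZE() macro records at compile time */
struct rdma_obj_size { size_t total; size_t pd_offset; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-in for rdma_zalloc_drv_obj(): the core owns the allocation */
static struct ib_pd *core_zalloc_pd(const struct rdma_obj_size *sz)
{
	char *obj = calloc(1, sz->total);

	return obj ? (struct ib_pd *)(obj + sz->pd_offset) : NULL;
}

int main(void)
{
	const struct rdma_obj_size sz = {
		.total = sizeof(struct drv_pd),
		.pd_offset = offsetof(struct drv_pd, ibpd),
	};
	struct ib_pd *ibpd = core_zalloc_pd(&sz);
	struct drv_pd *pd;

	if (!ibpd)
		return 1;
	/* The driver side of the pattern: its to_xxx_pd() helper */
	pd = container_of(ibpd, struct drv_pd, ibpd);
	pd->drv_key = 42;
	ibpd->handle = 7;
	printf("drv_pd=%p ibpd=%p key=%lu handle=%d\n",
	       (void *)pd, (void *)ibpd, pd->drv_key, ibpd->handle);
	free(pd);	/* one allocation covers both objects */
	return 0;
}

Running it prints two pointers that differ by offsetof(struct drv_pd, ibpd), which is exactly the relationship the to_rpd()/to_vpd()/get_ocrdma_pd() style helpers in the diffs rely on.
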
From 652432f33c01b2edaa5b2550b423cd894b1c7b9a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:50 -0700 Subject: RDMA/device: Get rid of reg_state This really has no purpose anymore; the refcount can be used to tell if the device is still registered. Keeping it around just invites misuse. Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/device.c | 8 ++------ include/rdma/ib_verbs.h | 6 ------ 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 872662a84b16..1c54ded776d0 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -243,7 +243,7 @@ static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - WARN_ON(dev->reg_state == IB_DEV_REGISTERED); + WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); kfree(dev->port_pkey_list); @@ -316,8 +316,7 @@ EXPORT_SYMBOL(_ib_alloc_device); void ib_dealloc_device(struct ib_device *device) { WARN_ON(!list_empty(&device->client_data_list)); - WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && - device->reg_state != IB_DEV_UNINITIALIZED); + WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); put_device(&device->dev); } @@ -602,7 +601,6 @@ int ib_register_device(struct ib_device *device, const char *name) } refcount_set(&device->refcount, 1); - device->reg_state = IB_DEV_REGISTERED; list_for_each_entry(client, &client_list, list) if (!add_client_context(device, client) && client->add) client->add(device); @@ -673,8 +671,6 @@ void ib_unregister_device(struct ib_device *device) } write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); - - device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 854d7816787c..d8ba987e8b29 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2572,12 +2572,6 @@ struct ib_device { struct kobject *ports_kobj; struct list_head port_list; - enum { - IB_DEV_UNINITIALIZED, - IB_DEV_REGISTERED, - IB_DEV_UNREGISTERED - } reg_state; - int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; -- cgit v1.2.3 From e59178d895afa29b671323f8265a1e50afe989e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:52 -0700 Subject: RDMA/devices: Use xarray to store the clients This gives each client a unique ID and will let us move client_data to use xarray, and revise the locking scheme. Clients have to be added/removed in strict FIFO/LIFO order as they interdepend. To support this the client_ids are assigned to increase in FIFO order. The existing linked list is kept to support reverse iteration until xarray can get a reverse iteration API. Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/device.c | 50 ++++++++++++++++++++++++++++++++++++---- include/rdma/ib_verbs.h | 3 ++- 2 files changed, 47 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3a80f96c2919..f87d85659359 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -65,15 +65,17 @@ struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and client_list contain devices and clients after their +/* The device_list and clients contain devices and clients after their * registration has completed, and the devices and clients are removed * during unregistration.
*/ static LIST_HEAD(device_list); static LIST_HEAD(client_list); +#define CLIENT_REGISTERED XA_MARK_1 +static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); /* * device_mutex and lists_rwsem protect access to both device_list and - * client_list. device_mutex protects writer access by device and client + * clients. device_mutex protects writer access by device and client * registration / de-registration. lists_rwsem protects reader access to * these lists. Iterators of these lists must lock it for read, while updates * to the lists must be done with a write lock. A special case is when the @@ -564,6 +566,7 @@ int ib_register_device(struct ib_device *device, const char *name) { int ret; struct ib_client *client; + unsigned long index; setup_dma_device(device); @@ -608,7 +611,7 @@ int ib_register_device(struct ib_device *device, const char *name) refcount_set(&device->refcount, 1); - list_for_each_entry(client, &client_list, list) + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) if (!add_client_context(device, client) && client->add) client->add(device); @@ -680,6 +683,32 @@ void ib_unregister_device(struct ib_device *device) } EXPORT_SYMBOL(ib_unregister_device); +static int assign_client_id(struct ib_client *client) +{ + int ret; + + /* + * The add/remove callbacks must be called in FIFO/LIFO order. To + * achieve this we assign client_ids so they are sorted in + * registration order, and retain a linked list we can reverse iterate + * to get the LIFO order. The extra linked list can go away if xarray + * learns to reverse iterate. + */ + if (list_empty(&client_list)) + client->client_id = 0; + else + client->client_id = + list_last_entry(&client_list, struct ib_client, list) + ->client_id; + ret = xa_alloc(&clients, &client->client_id, INT_MAX, client, + GFP_KERNEL); + if (ret) + goto out; + +out: + return ret; +} + /** * ib_register_client - Register an IB client * @client:Client to register @@ -696,15 +725,21 @@ EXPORT_SYMBOL(ib_unregister_device); int ib_register_client(struct ib_client *client) { struct ib_device *device; + int ret; mutex_lock(&device_mutex); + ret = assign_client_id(client); + if (ret) { + mutex_unlock(&device_mutex); + return ret; + } list_for_each_entry(device, &device_list, core_list) if (!add_client_context(device, client) && client->add) client->add(device); down_write(&lists_rwsem); - list_add_tail(&client->list, &client_list); + xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); mutex_unlock(&device_mutex); @@ -729,7 +764,7 @@ void ib_unregister_client(struct ib_client *client) mutex_lock(&device_mutex); down_write(&lists_rwsem); - list_del(&client->list); + xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); list_for_each_entry(device, &device_list, core_list) { @@ -765,6 +800,10 @@ void ib_unregister_client(struct ib_client *client) kfree(found_context); } + down_write(&lists_rwsem); + list_del(&client->list); + xa_erase(&clients, client->client_id); + up_write(&lists_rwsem); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); @@ -1422,6 +1461,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); + WARN_ON(!xa_empty(&clients)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d8ba987e8b29..cc15820513cd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2610,7 +2610,7 @@ struct ib_device { }; struct ib_client { - char *name; + const char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); @@ -2637,6 +2637,7 @@ struct ib_client { const struct sockaddr *addr, void *client_data); struct list_head list; + u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; -- cgit v1.2.3 From 0df91bb67334eebaf73d4ba32567e16d55f4f116 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:53 -0700 Subject: RDMA/devices: Use xarray to store the client_data Now that we have a small ID for each client we can use xarray instead of linearly searching linked lists for client data. This will give much faster and more scalable client data lookup, and will let us revise the locking scheme. Since xarray can store 'going_down' using a mark, just entirely eliminate the struct ib_client_data and directly store the client_data value in the xarray. However this does require a special iterator as we must still iterate over any NULL client_data values. Also eliminate the client_data_lock in favour of internal xarray locking. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 348 +++++++++++++++++++-------------------- include/rdma/ib_verbs.h | 23 ++- 2 files changed, 186 insertions(+), 185 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f87d85659359..5096593b99e9 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -51,30 +51,72 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); -struct ib_client_data { - struct list_head list; - struct ib_client *client; - void * data; - /* The device or client is going down. Do not call client or device - * callbacks other than remove(). */ - bool going_down; -}; - struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); -/* The device_list and clients contain devices and clients after their - * registration has completed, and the devices and clients are removed - * during unregistration. */ -static LIST_HEAD(device_list); +/* + * devices contains devices that have had their names assigned. The + * devices may not be registered. Users that care about the registration + * status need to call ib_device_try_get() on the device to ensure it is + * registered, and keep it registered, for the required duration. + * + */ +static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); + +/* + * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the + * object is guaranteed to be and remain registered for the duration of the + * lock. + */ +#define DEVICE_REGISTERED XA_MARK_1 + static LIST_HEAD(client_list); #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +/* + * If client_data is registered then the corresponding client must also still + * be registered. + */ +#define CLIENT_DATA_REGISTERED XA_MARK_1 +/* + * xarray has this behavior where it won't iterate over NULL values stored in + * allocated arrays.
So we need our own iterator to see all values stored in + * the array. This does the same thing as xa_for_each except that it also + * returns NULL valued entries if the array is allocating. Simplified to only + * work on simple xarrays. + */ +static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, + xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + entry = xas_find_marked(&xas, ULONG_MAX, filter); + if (xa_is_zero(entry)) + break; + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) { + *indexp = xas.xa_index; + if (xa_is_zero(entry)) + return NULL; + return entry; + } + return XA_ERROR(-ENOENT); +} +#define xan_for_each_marked(xa, index, entry, filter) \ + for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ + !xa_is_err(entry); \ + (index)++, entry = xan_find_marked(xa, &(index), filter)) + +/* + * device_mutex and lists_rwsem protect access to both devices and * clients. device_mutex protects writer access by device and client * registration / de-registration. lists_rwsem protects reader access to * these lists. Iterators of these lists must lock it for read, while updates @@ -135,17 +177,6 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } -static struct ib_device *__ib_device_get_by_index(u32 index) -{ - struct ib_device *device; - - list_for_each_entry(device, &device_list, core_list) - if (device->index == index) - return device; - - return NULL; -} - /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. @@ -155,7 +186,7 @@ struct ib_device *ib_device_get_by_index(u32 index) struct ib_device *device; down_read(&lists_rwsem); - device = __ib_device_get_by_index(index); + device = xa_load(&devices, index); if (device) { if (!ib_device_try_get(device)) device = NULL; @@ -181,8 +212,9 @@ EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; + unsigned long index; - list_for_each_entry(device, &device_list, core_list) + xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; @@ -216,12 +248,13 @@ out: static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; + unsigned long index; struct ida inuse; int rc; int i; ida_init(&inuse); - list_for_each_entry(device, &device_list, core_list) { + xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) @@ -256,6 +289,7 @@ static void ib_device_release(struct device *device) ib_security_release_port_pkey_list(dev); kfree(dev->port_pkey_list); kfree(dev->port_immutable); + xa_destroy(&dev->client_data); kfree(dev); } @@ -306,8 +340,11 @@ struct ib_device *_ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - rwlock_init(&device->client_data_lock); - INIT_LIST_HEAD(&device->client_data_list); + /* + * client_data needs to be alloc because we don't want our mark to be + * destroyed if the user stores NULL in the client data. 
+ */ + xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); @@ -323,7 +360,7 @@ EXPORT_SYMBOL(_ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { - WARN_ON(!list_empty(&device->client_data_list)); + WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); put_device(&device->dev); @@ -332,26 +369,20 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { - struct ib_client_data *context; + void *entry; if (!device->kverbs_provider && !client->no_kverbs_req) return -EOPNOTSUPP; - context = kmalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return -ENOMEM; - - context->client = client; - context->data = NULL; - context->going_down = false; - down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_add(&context->list, &device->client_data_list); - write_unlock_irq(&device->client_data_lock); + entry = xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL); + if (!xa_is_err(entry)) + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); up_write(&lists_rwsem); - return 0; + return xa_err(entry); } static int verify_immutable(const struct ib_device *dev, u8 port) @@ -428,9 +459,10 @@ static int setup_port_pkey_list(struct ib_device *device) static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; + unsigned long index; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { int i; for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { @@ -461,28 +493,48 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } -/** - * __dev_new_index - allocate an device index - * - * Returns a suitable unique value for a new device interface - * number. It assumes that there are less than 2^32-1 ib devices - * will be present in the system. +/* + * Assign the unique string device name and the unique device index. */ -static u32 __dev_new_index(void) +static int assign_name(struct ib_device *device, const char *name) { - /* - * The device index to allow stable naming. - * Similar to struct net -> ifindex. 
- */ - static u32 index; + static u32 last_id; + int ret; - for (;;) { - if (!(++index)) - index = 1; + /* Assign a unique name to the device */ + if (strchr(name, '%')) + ret = alloc_name(device, name); + else + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); - if (!__ib_device_get_by_index(index)) - return index; + /* Cyclically allocate a user visible ID for the device */ + device->index = last_id; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, GFP_KERNEL); + if (ret == -ENOSPC) { + device->index = 0; + ret = xa_alloc(&devices, &device->index, INT_MAX, device, + GFP_KERNEL); } + if (ret) + goto out; + last_id = device->index + 1; + + ret = 0; +out: + return ret; +} + +static void release_name(struct ib_device *device) +{ + xa_erase(&devices, device->index); } static void setup_dma_device(struct ib_device *device) @@ -572,34 +624,21 @@ int ib_register_device(struct ib_device *device, const char *name) mutex_lock(&device_mutex); - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; + ret = assign_name(device, name); + if (ret) goto out; - } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = setup_device(device); if (ret) - goto out; + goto out_name; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out; + goto out_name; } - device->index = __dev_new_index(); - ib_device_register_rdmacg(device); ret = ib_device_register_sysfs(device); @@ -616,7 +655,7 @@ int ib_register_device(struct ib_device *device, const char *name) client->add(device); down_write(&lists_rwsem); - list_add_tail(&device->core_list, &device_list); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&lists_rwsem); mutex_unlock(&device_mutex); return 0; @@ -624,6 +663,8 @@ int ib_register_device(struct ib_device *device, const char *name) cg_cleanup: ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); +out_name: + release_name(device); out: mutex_unlock(&device_mutex); return ret; @@ -638,8 +679,8 @@ EXPORT_SYMBOL(ib_register_device); */ void ib_unregister_device(struct ib_device *device) { - struct ib_client_data *context, *tmp; - unsigned long flags; + struct ib_client *client; + unsigned long index; /* * Wait for all netlink command callers to finish working on the @@ -651,34 +692,31 @@ void ib_unregister_device(struct ib_device *device) mutex_lock(&device_mutex); down_write(&lists_rwsem); - list_del(&device->core_list); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - context->going_down = true; - write_unlock_irq(&device->client_data_lock); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + xa_for_each (&clients, index, client) + xa_clear_mark(&device->client_data, index, + CLIENT_DATA_REGISTERED); downgrade_write(&lists_rwsem); - list_for_each_entry(context, &device->client_data_list, list) { - if (context->client->remove) - context->client->remove(device, context->data); - } + list_for_each_entry_reverse(client, &client_list, list) + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED) && + client->remove) + 
client->remove(device, xa_load(&device->client_data, + client->client_id)); up_read(&lists_rwsem); ib_device_unregister_sysfs(device); ib_device_unregister_rdmacg(device); + release_name(device); + mutex_unlock(&device_mutex); ib_cache_cleanup_one(device); down_write(&lists_rwsem); - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { - list_del(&context->list); - kfree(context); - } - write_unlock_irqrestore(&device->client_data_lock, flags); + xa_destroy(&device->client_data); up_write(&lists_rwsem); } EXPORT_SYMBOL(ib_unregister_device); @@ -725,6 +763,7 @@ out: int ib_register_client(struct ib_client *client) { struct ib_device *device; + unsigned long index; int ret; mutex_lock(&device_mutex); @@ -734,7 +773,7 @@ int ib_register_client(struct ib_client *client) return ret; } - list_for_each_entry(device, &device_list, core_list) + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) if (!add_client_context(device, client) && client->add) client->add(device); @@ -758,8 +797,8 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context; struct ib_device *device; + unsigned long index; mutex_lock(&device_mutex); @@ -767,37 +806,19 @@ void ib_unregister_client(struct ib_client *client) xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&lists_rwsem); - list_for_each_entry(device, &device_list, core_list) { - struct ib_client_data *found_context = NULL; - + xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->going_down = true; - found_context = context; - break; - } - write_unlock_irq(&device->client_data_lock); + xa_clear_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); up_write(&lists_rwsem); if (client->remove) - client->remove(device, found_context ? - found_context->data : NULL); - - if (!found_context) { - dev_warn(&device->dev, - "No client context found for %s\n", - client->name); - continue; - } + client->remove(device, xa_load(&device->client_data, + client->client_id)); down_write(&lists_rwsem); - write_lock_irq(&device->client_data_lock); - list_del(&found_context->list); - write_unlock_irq(&device->client_data_lock); + xa_erase(&device->client_data, client->client_id); up_write(&lists_rwsem); - kfree(found_context); } down_write(&lists_rwsem); @@ -808,59 +829,28 @@ void ib_unregister_client(struct ib_client *client) } EXPORT_SYMBOL(ib_unregister_client); -/** - * ib_get_client_data - Get IB client context - * @device:Device to get context for - * @client:Client to get context for - * - * ib_get_client_data() returns client context set with - * ib_set_client_data(). 
- */ -void *ib_get_client_data(struct ib_device *device, struct ib_client *client) -{ - struct ib_client_data *context; - void *ret = NULL; - unsigned long flags; - - read_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - ret = context->data; - break; - } - read_unlock_irqrestore(&device->client_data_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_get_client_data); - /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * - * ib_set_client_data() sets client context that can be retrieved with - * ib_get_client_data(). + * ib_set_client_data() sets client context data that can be retrieved with + * ib_get_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { - struct ib_client_data *context; - unsigned long flags; - - write_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry(context, &device->client_data_list, list) - if (context->client == client) { - context->data = data; - goto out; - } + void *rc; - dev_warn(&device->dev, "No client context found for %s\n", - client->name); + if (WARN_ON(IS_ERR(data))) + data = NULL; -out: - write_unlock_irqrestore(&device->client_data_lock, flags); + rc = xa_store(&device->client_data, client->client_id, data, + GFP_KERNEL); + WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); @@ -1018,9 +1008,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *cookie) { struct ib_device *dev; + unsigned long index; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&lists_rwsem); } @@ -1034,12 +1025,13 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { + unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&lists_rwsem); - list_for_each_entry(dev, &device_list, core_list) { + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; @@ -1212,26 +1204,25 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, const struct sockaddr *addr) { struct net_device *net_dev = NULL; - struct ib_client_data *context; + unsigned long index; + void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; down_read(&lists_rwsem); - list_for_each_entry(context, &dev->client_data_list, list) { - struct ib_client *client = context->client; + xan_for_each_marked (&dev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); - if (context->going_down) + if (!client || !client->get_net_dev_by_params) continue; - if (client->get_net_dev_by_params) { - net_dev = client->get_net_dev_by_params(dev, port, pkey, - gid, addr, - context->data); - if (net_dev) - break; - } + net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, + addr, client_data); + if (net_dev) + break; } up_read(&lists_rwsem); @@ -1462,6 +1453,7 @@ static void __exit ib_core_cleanup(void) /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); WARN_ON(!xa_empty(&clients)); + WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cc15820513cd..8558f31ca46f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,12 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - rwlock_t client_data_lock; - struct list_head core_list; - /* Access to the client_data_list is protected by the client_data_lock - * rwlock and the lists_rwsem read-write semaphore - */ - struct list_head client_data_list; + struct xarray client_data; struct ib_cache cache; /** @@ -2660,7 +2655,21 @@ void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns the client context data set with + * ib_set_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. + */ +static inline void *ib_get_client_data(struct ib_device *device, + struct ib_client *client) +{ + return xa_load(&device->client_data, client->client_id); +} void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, -- cgit v1.2.3 From 921eab1143aadf976a42cac4605b4d35159b355d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:54 -0700 Subject: RDMA/devices: Re-organize device.c locking The locking here started out with a single lock that covered everything and then has lately veered into crazy town. The fundamental problem is that several places need to iterate over a linked list, but also need to drop their locks to avoid deadlock during client callbacks. xarray's restartable iteration offers a simple solution to the problem. Once all the lists are xarrays we can drop locks in the places that need that and rely on xarray to provide consistency and locking for the data structure. The resulting simplification is that each of the three lists has a dedicated rwsem that must be held when working with the list it covers. One data structure is no longer covered by multiple locks. The sleeping semaphore is selected because the read side generally needs to be held over something sleeping, and using RCU reader locking in those cases is overkill. In the process this simplifies the entire registration/unregistration flow to be the expected list of setups and the reversed list of matching teardowns, and the registration lock 'refcount' can now be revised to be released after the ULPs are removed, providing a very sane semantic for this feature. 
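
The resulting pattern can be illustrated outside the kernel: an entry is first made visible in its table, only later gains a REGISTERED mark, and readers test the mark while holding the read side of that table's rwsem. The sketch below is a minimal user-space model under those assumptions, not kernel code; a pthread rwlock stands in for the kernel rwsem, a boolean field stands in for the XA_MARK_1 style mark, and struct entry, add_entry() and the other names are invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define TABLE_SIZE 4

struct entry {
	const char *name;
	bool present;		/* stage 1: stored in the table at all */
	bool registered;	/* stage 2: the REGISTERED mark */
};

static pthread_rwlock_t table_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static struct entry table[TABLE_SIZE];

/* Stage 1: reserve the slot so the name/index are taken */
static void add_entry(int i, const char *name)
{
	pthread_rwlock_wrlock(&table_rwsem);
	table[i] = (struct entry){ .name = name, .present = true };
	pthread_rwlock_unlock(&table_rwsem);
}

/* Stage 2: setting the mark is what makes the entry visible to users */
static void enable_entry(int i)
{
	pthread_rwlock_wrlock(&table_rwsem);
	table[i].registered = true;
	pthread_rwlock_unlock(&table_rwsem);
}

/* Readers holding the read side may trust the mark for the duration */
static void for_each_registered(void (*fn)(const struct entry *))
{
	pthread_rwlock_rdlock(&table_rwsem);
	for (int i = 0; i < TABLE_SIZE; i++)
		if (table[i].present && table[i].registered)
			fn(&table[i]);
	pthread_rwlock_unlock(&table_rwsem);
}

static void show(const struct entry *e)
{
	printf("registered: %s\n", e->name);
}

int main(void)
{
	add_entry(0, "dev0");
	for_each_registered(show);	/* prints nothing yet */
	enable_entry(0);
	for_each_registered(show);	/* now prints dev0 */
	return 0;
}

Unregistration runs the two stages in reverse: clear the mark first so readers stop seeing the entry, then tear the object down and erase its slot, which is the shape disable_device() takes in the diff that follows.
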
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 361 ++++++++++++++++++++++++--------------- include/rdma/ib_verbs.h | 1 + 2 files changed, 222 insertions(+), 140 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5096593b99e9..3325be4f91a5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -56,6 +55,29 @@ struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +/* + * Each of the three rwsem locks (devices, clients, client_data) protects the + * xarray of the same name. Specifically it allows the caller to assert that + * the MARK will/will not be changing under the lock, and for devices and + * clients, that the value in the xarray is still a valid pointer. Change of + * the MARK is linked to the object state, so holding the lock and testing the + * MARK also asserts that the contained object is in a certain state. + * + * This is used to build a two stage register/unregister flow where objects + * can continue to be in the xarray even though they are still in progress to + * register/unregister. + * + * The xarray itself provides additional locking, and restartable iteration, + * which is also relied on. + * + * Locks should not be nested, with the exception of client_data, which is + * allowed to nest under the read side of the other two locks. + * + * The devices_rwsem also protects the device name list, any change or + * assignment of device name must also hold the write side to guarantee unique + * names. + */ + /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration @@ -64,17 +86,13 @@ EXPORT_SYMBOL_GPL(ib_wq); * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); - -/* - * Note that if the *rwsem is held and the *_REGISTERED mark is seen then the - * object is guaranteed to be and remain registered for the duration of the - * lock. - */ +static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static LIST_HEAD(client_list); #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); +static DECLARE_RWSEM(clients_rwsem); /* * If client_data is registered then the corresponding client must also still @@ -115,20 +133,6 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) -/* - * device_mutex and lists_rwsem protect access to both devices and - * clients. device_mutex protects writer access by device and client - * registration / de-registration. lists_rwsem protects reader access to - * these lists. Iterators of these lists must lock it for read, while updates - * to the lists must be done with a write lock. A special case is when the - * device_mutex is locked. In this case locking the lists for read access is - * not necessary as the device_mutex implies it. - * - * lists_rwsem also protects access to the client data list. 
- */ -static DEFINE_MUTEX(device_mutex); -static DECLARE_RWSEM(lists_rwsem); - static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -185,13 +189,13 @@ struct ib_device *ib_device_get_by_index(u32 index) { struct ib_device *device; - down_read(&lists_rwsem); + down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!ib_device_try_get(device)) device = NULL; } - up_read(&lists_rwsem); + up_read(&devices_rwsem); return device; } @@ -225,7 +229,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) { int ret; - mutex_lock(&device_mutex); + down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { ret = 0; goto out; @@ -241,7 +245,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); out: - mutex_unlock(&device_mutex); + up_write(&devices_rwsem); return ret; } @@ -253,6 +257,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name) int rc; int i; + lockdep_assert_held_exclusive(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; @@ -345,6 +350,7 @@ struct ib_device *_ib_alloc_device(size_t size) * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); + init_rwsem(&device->client_data_rwsem); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); @@ -367,22 +373,86 @@ void ib_dealloc_device(struct ib_device *device) } EXPORT_SYMBOL(ib_dealloc_device); -static int add_client_context(struct ib_device *device, struct ib_client *client) +/* + * add_client_context() and remove_client_context() must be safe against + * parallel calls on the same device - registration/unregistration of both the + * device and client can be occurring in parallel. + * + * The routines need to be a fence, any caller must not return until the add + * or remove is fully completed. + */ +static int add_client_context(struct ib_device *device, + struct ib_client *client) { - void *entry; + int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) - return -EOPNOTSUPP; + return 0; + + down_write(&device->client_data_rwsem); + /* + * Another caller to add_client_context got here first and has already + * completely initialized context. 
+ */ + if (xa_get_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED)) + goto out; + + ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, + GFP_KERNEL)); + if (ret) + goto out; + downgrade_write(&device->client_data_rwsem); + if (client->add) + client->add(device); + + /* Readers shall not see a client until add has been completed */ + xa_set_mark(&device->client_data, client->client_id, + CLIENT_DATA_REGISTERED); + up_read(&device->client_data_rwsem); + return 0; + +out: + up_write(&device->client_data_rwsem); + return ret; +} + +static void remove_client_context(struct ib_device *device, + unsigned int client_id) +{ + struct ib_client *client; + void *client_data; - down_write(&lists_rwsem); - entry = xa_store(&device->client_data, client->client_id, NULL, - GFP_KERNEL); - if (!xa_is_err(entry)) - xa_set_mark(&device->client_data, client->client_id, - CLIENT_DATA_REGISTERED); - up_write(&lists_rwsem); + down_write(&device->client_data_rwsem); + if (!xa_get_mark(&device->client_data, client_id, + CLIENT_DATA_REGISTERED)) { + up_write(&device->client_data_rwsem); + return; + } + client_data = xa_load(&device->client_data, client_id); + xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); + client = xa_load(&clients, client_id); + downgrade_write(&device->client_data_rwsem); - return xa_err(entry); + /* + * Notice we cannot be holding any exclusive locks when calling the + * remove callback as the remove callback can recurse back into any + * public functions in this module and thus try for any locks those + * functions take. + * + * For this reason clients and drivers should not call the + * unregistration functions while holding any locks. + * + * It is tempting to drop the client_data_rwsem too, but this is + * required to ensure that unregister_client does not return until all + * clients are completely unregistered, which is required to avoid + * module unloading races.
+ */ + if (client->remove) + client->remove(device, client_data); + + xa_erase(&device->client_data, client_id); + up_read(&device->client_data_rwsem); } static int verify_immutable(const struct ib_device *dev, u8 port) @@ -461,7 +531,7 @@ static void ib_policy_change_task(struct work_struct *work) struct ib_device *dev; unsigned long index; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { int i; @@ -478,7 +548,7 @@ static void ib_policy_change_task(struct work_struct *work) ib_security_cache_change(dev, i, sp); } } - up_read(&lists_rwsem); + up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, @@ -501,6 +571,7 @@ static int assign_name(struct ib_device *device, const char *name) static u32 last_id; int ret; + down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); @@ -528,13 +599,17 @@ static int assign_name(struct ib_device *device, const char *name) last_id = device->index + 1; ret = 0; + out: + up_write(&devices_rwsem); return ret; } static void release_name(struct ib_device *device) { + down_write(&devices_rwsem); xa_erase(&devices, device->index); + up_write(&devices_rwsem); } static void setup_dma_device(struct ib_device *device) @@ -572,11 +647,18 @@ static void setup_dma_device(struct ib_device *device) } } +/* + * setup_device() allocates memory and sets up data that requires calling the + * device ops, this is the only reason these actions are not done during + * ib_alloc_device. It is undone by ib_dealloc_device(). + */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; + setup_dma_device(device); + ret = ib_device_check_mandatory(device); if (ret) return ret; @@ -605,6 +687,54 @@ static int setup_device(struct ib_device *device) return 0; } +static void disable_device(struct ib_device *device) +{ + struct ib_client *client; + + WARN_ON(!refcount_read(&device->refcount)); + + down_write(&devices_rwsem); + xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + list_for_each_entry_reverse(client, &client_list, list) + remove_client_context(device, client->client_id); + up_read(&clients_rwsem); + + /* Pairs with refcount_set in enable_device */ + ib_device_put(device); + wait_for_completion(&device->unreg_completion); +} + +/* + * An enabled device is visible to all clients and to all the public facing + * APIs that return a device pointer. 
+ */ +static int enable_device(struct ib_device *device) +{ + struct ib_client *client; + unsigned long index; + int ret; + + refcount_set(&device->refcount, 1); + down_write(&devices_rwsem); + xa_set_mark(&devices, device->index, DEVICE_REGISTERED); + up_write(&devices_rwsem); + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + ret = add_client_context(device, client); + if (ret) { + up_read(&clients_rwsem); + disable_device(device); + return ret; + } + } + up_read(&clients_rwsem); + return 0; +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -617,26 +747,20 @@ static int setup_device(struct ib_device *device) int ib_register_device(struct ib_device *device, const char *name) { int ret; - struct ib_client *client; - unsigned long index; - - setup_dma_device(device); - - mutex_lock(&device_mutex); ret = assign_name(device, name); if (ret) - goto out; + return ret; ret = setup_device(device); if (ret) - goto out_name; + goto out; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out_name; + goto out; } ib_device_register_rdmacg(device); @@ -648,25 +772,19 @@ int ib_register_device(struct ib_device *device, const char *name) goto cg_cleanup; } - refcount_set(&device->refcount, 1); - - xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) - if (!add_client_context(device, client) && client->add) - client->add(device); + ret = enable_device(device); + if (ret) + goto sysfs_cleanup; - down_write(&lists_rwsem); - xa_set_mark(&devices, device->index, DEVICE_REGISTERED); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); return 0; +sysfs_cleanup: + ib_device_unregister_sysfs(device); cg_cleanup: ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); -out_name: - release_name(device); out: - mutex_unlock(&device_mutex); + release_name(device); return ret; } EXPORT_SYMBOL(ib_register_device); @@ -679,45 +797,11 @@ EXPORT_SYMBOL(ib_register_device); */ void ib_unregister_device(struct ib_device *device) { - struct ib_client *client; - unsigned long index; - - /* - * Wait for all netlink command callers to finish working on the - * device. - */ - ib_device_put(device); - wait_for_completion(&device->unreg_completion); - - mutex_lock(&device_mutex); - - down_write(&lists_rwsem); - xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); - xa_for_each (&clients, index, client) - xa_clear_mark(&device->client_data, index, - CLIENT_DATA_REGISTERED); - downgrade_write(&lists_rwsem); - - list_for_each_entry_reverse(client, &client_list, list) - if (xa_get_mark(&device->client_data, client->client_id, - CLIENT_DATA_REGISTERED) && - client->remove) - client->remove(device, xa_load(&device->client_data, - client->client_id)); - up_read(&lists_rwsem); - + disable_device(device); ib_device_unregister_sysfs(device); ib_device_unregister_rdmacg(device); - - release_name(device); - - mutex_unlock(&device_mutex); - ib_cache_cleanup_one(device); - - down_write(&lists_rwsem); - xa_destroy(&device->client_data); - up_write(&lists_rwsem); + release_name(device); } EXPORT_SYMBOL(ib_unregister_device); @@ -725,6 +809,7 @@ static int assign_client_id(struct ib_client *client) { int ret; + down_write(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. 
To * achieve this we assign client_ids so they are sorted in
@@ -743,7 +828,11 @@ static int assign_client_id(struct ib_client *client)
 if (ret)
 goto out;
 
+ xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
+ list_add_tail(&client->list, &client_list);
+
 out:
+ up_write(&clients_rwsem);
 return ret;
 }
 
@@ -766,23 +855,20 @@ int ib_register_client(struct ib_client *client)
 unsigned long index;
 int ret;
 
- mutex_lock(&device_mutex);
 ret = assign_client_id(client);
- if (ret) {
- mutex_unlock(&device_mutex);
+ if (ret)
 return ret;
- }
-
- xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED)
- if (!add_client_context(device, client) && client->add)
- client->add(device);
-
- down_write(&lists_rwsem);
- xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
- up_write(&lists_rwsem);
-
- mutex_unlock(&device_mutex);
+ down_read(&devices_rwsem);
+ xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
+ ret = add_client_context(device, client);
+ if (ret) {
+ up_read(&devices_rwsem);
+ ib_unregister_client(client);
+ return ret;
+ }
+ }
+ up_read(&devices_rwsem);
 return 0;
 }
 EXPORT_SYMBOL(ib_register_client);
@@ -794,38 +880,31 @@ EXPORT_SYMBOL(ib_register_client);
 * Upper level users use ib_unregister_client() to remove their client
 * registration. When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
+ *
+ * This is a full fence: once it returns, no client callbacks will be
+ * called or still be running in another thread.
 */
 void ib_unregister_client(struct ib_client *client)
 {
 struct ib_device *device;
 unsigned long index;
 
- mutex_lock(&device_mutex);
-
- down_write(&lists_rwsem);
+ down_write(&clients_rwsem);
 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
- up_write(&lists_rwsem);
-
- xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
- down_write(&lists_rwsem);
- xa_clear_mark(&device->client_data, client->client_id,
- CLIENT_DATA_REGISTERED);
- up_write(&lists_rwsem);
-
- if (client->remove)
- client->remove(device, xa_load(&device->client_data,
- client->client_id));
-
- down_write(&lists_rwsem);
- xa_erase(&device->client_data, client->client_id);
- up_write(&lists_rwsem);
- }
+ up_write(&clients_rwsem);
+ /*
+ * Every device still known must be serialized to make sure we are
+ * done with the client callbacks before we return.
+ */ + down_read(&devices_rwsem); + xa_for_each (&devices, index, device) + remove_client_context(device, client->client_id); + up_read(&devices_rwsem); - down_write(&lists_rwsem); + down_write(&clients_rwsem); list_del(&client->list); xa_erase(&clients, client->client_id); - up_write(&lists_rwsem); - mutex_unlock(&device_mutex); + up_write(&clients_rwsem); } EXPORT_SYMBOL(ib_unregister_client); @@ -1010,10 +1089,10 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, struct ib_device *dev; unsigned long index; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); - up_read(&lists_rwsem); + up_read(&devices_rwsem); } /** @@ -1030,15 +1109,14 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, unsigned int idx = 0; int ret = 0; - down_read(&lists_rwsem); + down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } - - up_read(&lists_rwsem); + up_read(&devices_rwsem); return ret; } @@ -1196,6 +1274,7 @@ EXPORT_SYMBOL(ib_find_pkey); * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. + * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, @@ -1210,8 +1289,11 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, if (!rdma_protocol_ib(dev, port)) return NULL; - down_read(&lists_rwsem); - + /* + * Holding the read side guarantees that the client will not become + * unregistered while we are calling get_net_dev_by_params() + */ + down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); @@ -1224,8 +1306,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, if (net_dev) break; } - - up_read(&lists_rwsem); + up_read(&dev->client_data_rwsem); return net_dev; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8558f31ca46f..135fab2c016c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,6 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; + struct rw_semaphore client_data_rwsem; struct xarray client_data; struct ib_cache cache; -- cgit v1.2.3 From d901b2760dc6cd5fbbf2eac31d71d94baa6c4aef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Jan 2019 11:40:21 -0700 Subject: lib/scatterlist: Provide a DMA page iterator Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o backing pages") introduced the sg_page_iter_dma_address() function without providing a way to use it in the general case. If the sg_dma_len() is not equal to the sg length callers cannot safely use the for_each_sg_page/sg_page_iter_dma_address combination. Resolve this API mistake by providing a DMA specific iterator, for_each_sg_dma_page(), that uses the right length so sg_page_iter_dma_address() works as expected with all sglists. A new iterator type is introduced to provide compile-time safety against wrongly mixing accessors and iterators. 
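As an illustration only (a hypothetical caller with made-up names, not part
of this patch), the new iterator is meant to walk the entry count returned
by dma_map_sg():

        static void prog_dma_pages(struct device *dev, struct sg_table *sgt)
        {
                struct sg_dma_page_iter dma_iter;
                int nents;

                /* dma_nents must be the value returned by dma_map_sg() */
                nents = dma_map_sg(dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE);
                if (!nents)
                        return;

                for_each_sg_dma_page (sgt->sgl, &dma_iter, nents, 0) {
                        dma_addr_t addr = sg_page_iter_dma_address(&dma_iter);

                        /* hand one PAGE_SIZE chunk at 'addr' to the device */
                }

                dma_unmap_sg(dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE);
        }

Passing such an iterator to sg_page_iter_page(), or a plain sg_page_iter to
sg_page_iter_dma_address(), now fails to compile, which is the compile-time
safety referred to above.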
Acked-by: Christoph Hellwig (for scatterlist)
Acked-by: Thomas Hellstrom
Acked-by: Sakari Ailus (ipu3-cio2)
Signed-off-by: Jason Gunthorpe
---
 .clang-format | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c | 8 ++++-
 drivers/media/pci/intel/ipu3/ipu3-cio2.c | 4 +--
 include/linux/scatterlist.h | 49 ++++++++++++++++++++++++------
 lib/scatterlist.c | 26 ++++++++++++++++
 5 files changed, 76 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/.clang-format b/.clang-format
index bc2ffb2a0b53..335ce29ab813 100644
--- a/.clang-format
+++ b/.clang-format
@@ -240,6 +240,7 @@ ForEachMacros:
 - 'for_each_set_bit'
 - 'for_each_set_bit_from'
 - 'for_each_sg'
+ - 'for_each_sg_dma_page'
 - 'for_each_sg_page'
 - 'for_each_sibling_event'
 - '__for_each_thread'
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index 31786b200afc..a3357ff7540d 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
@@ -311,7 +311,13 @@ static dma_addr_t __vmw_piter_dma_addr(struct vmw_piter *viter)
 
 static dma_addr_t __vmw_piter_sg_addr(struct vmw_piter *viter)
 {
- return sg_page_iter_dma_address(&viter->iter);
+ /*
+ * FIXME: This driver wrongly mixes DMA and CPU SG list iteration and
+ * needs revision. See
+ * https://lore.kernel.org/lkml/20190104223531.GA1705@ziepe.ca/
+ */
+ return sg_page_iter_dma_address(
+ container_of(&viter->iter, struct sg_dma_page_iter, base));
 }
 
diff --git a/drivers/media/pci/intel/ipu3/ipu3-cio2.c b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
index cdb79ae2d8dc..9fbfbda74171 100644
--- a/drivers/media/pci/intel/ipu3/ipu3-cio2.c
+++ b/drivers/media/pci/intel/ipu3/ipu3-cio2.c
@@ -846,7 +846,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb)
 unsigned int pages = DIV_ROUND_UP(vb->planes[0].length, CIO2_PAGE_SIZE);
 unsigned int lops = DIV_ROUND_UP(pages + 1, entries_per_page);
 struct sg_table *sg;
- struct sg_page_iter sg_iter;
+ struct sg_dma_page_iter sg_iter;
 int i, j;
 
 if (lops <= 0 || lops > CIO2_MAX_LOPS) {
@@ -873,7 +873,7 @@ static int cio2_vb2_buf_init(struct vb2_buffer *vb)
 b->offset = sg->sgl->offset;
 
 i = j = 0;
- for_each_sg_page(sg->sgl, &sg_iter, sg->nents, 0) {
+ for_each_sg_dma_page (sg->sgl, &sg_iter, sg->nents, 0) {
 if (!pages--)
 break;
 b->lop[i][j] = sg_page_iter_dma_address(&sg_iter) >> PAGE_SHIFT;
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index b96f0d0b5b8f..b4be960c7e5d 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -339,12 +339,12 @@ int sg_alloc_table_chained(struct sg_table *table, int nents,
 /*
 * sg page iterator
 *
- * Iterates over sg entries page-by-page. On each successful iteration,
- * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter)
- * to get the current page and its dma address. @piter->sg will point to the
- * sg holding this page and @piter->sg_pgoffset to the page's page offset
- * within the sg. The iteration will stop either when a maximum number of sg
- * entries was reached or a terminating sg (sg_last(sg) == true) was reached.
+ * Iterates over sg entries page-by-page. On each successful iteration, you
+ * can call sg_page_iter_page(@piter) to get the current page. @piter->sg
+ * will point to the sg holding this page and @piter->sg_pgoffset to the
+ * page's page offset within the sg. The iteration will stop either when a
+ * maximum number of sg entries was reached or a terminating sg
+ * (sg_last(sg) == true) was reached.
*/
 struct sg_page_iter {
 struct scatterlist *sg; /* sg holding the page */
@@ -356,7 +356,19 @@ struct sg_page_iter {
 * next step */
 };
 
+/*
+ * sg page iterator for DMA addresses
+ *
+ * This is the same as sg_page_iter, however you can call
+ * sg_page_iter_dma_address(@dma_iter) to get the page's DMA
+ * address. sg_page_iter_page() cannot be called on this iterator.
+ */
+struct sg_dma_page_iter {
+ struct sg_page_iter base;
+};
+
 bool __sg_page_iter_next(struct sg_page_iter *piter);
+bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter);
 void __sg_page_iter_start(struct sg_page_iter *piter,
 struct scatterlist *sglist, unsigned int nents,
 unsigned long pgoffset);
@@ -372,11 +384,13 @@ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter)
 /**
 * sg_page_iter_dma_address - get the dma address of the current page held by
 * the page iterator.
- * @piter: page iterator holding the page
+ * @dma_iter: page iterator holding the page
 */
-static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter)
+static inline dma_addr_t
+sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter)
 {
- return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT);
+ return sg_dma_address(dma_iter->base.sg) +
+ (dma_iter->base.sg_pgoffset << PAGE_SHIFT);
 }
 
 /**
@@ -385,11 +399,28 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter)
 * @piter: page iterator to hold current page, sg, sg_pgoffset
 * @nents: maximum number of sg entries to iterate over
 * @pgoffset: starting page offset
+ *
+ * Callers may use sg_page_iter_page() to get each page pointer.
 */
 #define for_each_sg_page(sglist, piter, nents, pgoffset) \
 for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \
 __sg_page_iter_next(piter);)
 
+/**
+ * for_each_sg_dma_page - iterate over the pages of the given sg list
+ * @sglist: sglist to iterate over
+ * @dma_iter: page iterator to hold current page
+ * @dma_nents: maximum number of sg entries to iterate over; this is the value
+ * returned from dma_map_sg
+ * @pgoffset: starting page offset
+ *
+ * Callers may use sg_page_iter_dma_address() to get each page's DMA address.
+ */
+#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \
+ for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \
+ pgoffset); \
+ __sg_page_iter_dma_next(dma_iter);)
+
 /*
 * Mapping sg iterator
 *
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 9ba349e775ef..739dc9fe2c55 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -625,6 +625,32 @@ bool __sg_page_iter_next(struct sg_page_iter *piter)
 }
 EXPORT_SYMBOL(__sg_page_iter_next);
 
+static int sg_dma_page_count(struct scatterlist *sg)
+{
+ return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT;
+}
+
+bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter)
+{
+ struct sg_page_iter *piter = &dma_iter->base;
+
+ if (!piter->__nents || !piter->sg)
+ return false;
+
+ piter->sg_pgoffset += piter->__pg_advance;
+ piter->__pg_advance = 1;
+
+ while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) {
+ piter->sg_pgoffset -= sg_dma_page_count(piter->sg);
+ piter->sg = sg_next(piter->sg);
+ if (!--piter->__nents || !piter->sg)
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(__sg_page_iter_dma_next);
+
 /**
 * sg_miter_start - start mapping iteration over a sg list
 * @miter: sg mapping iter to be started
--
cgit v1.2.3


From 3d9dfd060391928bd615db62ecddea5e1255edfd Mon Sep 17 00:00:00 2001
From: Shamir Rabinovitch
Date: Thu, 7 Feb 2019 18:44:47 +0200
Subject: IB/uverbs: Add ib_ucontext to uverbs_attr_bundle sent from ioctl and cmd flows

Add ib_ucontext to the uverbs_attr_bundle sent down the ioctl and cmd flows as soon as the flow has ib_uobject. In addition, remove the rdma_get_ucontext helper function that is only used by ib_umem_get.

Signed-off-by: Shamir Rabinovitch
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/rdma_core.c | 32 ++++++++++++++++++++++++++++++++
 drivers/infiniband/core/umem.c | 10 +++++++---
 drivers/infiniband/core/uverbs_cmd.c | 2 +-
 drivers/infiniband/core/uverbs_ioctl.c | 3 +++
 drivers/infiniband/core/uverbs_main.c | 25 +------------------------
 include/rdma/ib_verbs.h | 2 --
 include/rdma/uverbs_ioctl.h | 1 +
 include/rdma/uverbs_std_types.h | 18 +++++++++++++-----
 8 files changed, 58 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c
index a260d2f8e0b7..96f919fe86e7 100644
--- a/drivers/infiniband/core/rdma_core.c
+++ b/drivers/infiniband/core/rdma_core.c
@@ -438,6 +438,38 @@ free:
 uverbs_uobject_put(uobj);
 return ERR_PTR(ret);
 }
+struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
+ u32 object_id,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+
+ uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+ object_id, UVERBS_LOOKUP_READ);
+ if (IS_ERR(uobj))
+ return uobj;
+
+ attrs->context = uobj->context;
+
+ return uobj;
+}
+
+struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
+ u32 object_id,
+ struct uverbs_attr_bundle *attrs)
+{
+ struct ib_uobject *uobj;
+
+ uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
+ object_id, UVERBS_LOOKUP_WRITE);
+
+ if (IS_ERR(uobj))
+ return uobj;
+
+ attrs->context = uobj->context;
+
+ return uobj;
+}
 
 static struct ib_uobject *
 alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b69d3efa8712..fe5551562dbc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -96,9 +96,13 @@ struct ib_umem *ib_umem_get(struct ib_udata
*udata, unsigned long addr, struct scatterlist *sg, *sg_list_start; unsigned int gup_flags = FOLL_WRITE; - context = rdma_get_ucontext(udata); - if (IS_ERR(context)) - return ERR_CAST(context); + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5ac143f22df0..c2e7733a7fe0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2634,7 +2634,7 @@ void flow_resources_add(struct ib_uflow_resources *uflow_res, } EXPORT_SYMBOL(flow_resources_add); -static int kern_spec_to_ib_spec_action(const struct uverbs_attr_bundle *attrs, +static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 0ca04d224015..e1379949e663 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -213,6 +213,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, ret = PTR_ERR(attr->uobjects[i]); break; } + pbundle->bundle.context = attr->uobjects[i]->context; } attr->len = i; @@ -330,6 +331,7 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, uattr->data_s64); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); + pbundle->bundle.context = o_attr->uobject->context; __set_bit(attr_bkey, pbundle->uobj_finalize); if (spec->u.obj.access == UVERBS_ACCESS_NEW) { @@ -592,6 +594,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, pbundle->method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; + pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ pbundle->radix = &uapi->radix; pbundle->radix_slots = slot; pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index accc61cc93ac..70b7d80431a9 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -101,30 +101,6 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile) } EXPORT_SYMBOL(ib_uverbs_get_ucontext_file); -/* rdma_get_ucontext - Return the ucontext from a udata - * @udata: The udata to get the context from - * - * This can only be called from within a uapi method that was passed ib_udata - * as a parameter. It returns the ucontext associated with the udata, or ERR_PTR - * if the udata is NULL or the ucontext has been disassociated. - */ -struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata) -{ - if (!udata) - return ERR_PTR(-EIO); - - /* - * FIXME: Really all cases that get here with a udata will have - * already called ib_uverbs_get_ucontext_file, or located a uobject - * that points to a ucontext. We could store that result in the udata - * so this function can't fail. 
- */ - return ib_uverbs_get_ucontext_file( - container_of(udata, struct uverbs_attr_bundle, driver_udata) - ->ufile); -} -EXPORT_SYMBOL(rdma_get_ucontext); - int uverbs_dealloc_mw(struct ib_mw *mw) { struct ib_pd *pd = mw->pd; @@ -719,6 +695,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, memset(bundle.attr_present, 0, sizeof(bundle.attr_present)); bundle.ufile = file; + bundle.context = NULL; /* only valid if bundle has uobject */ if (!method_elm->is_ex) { size_t in_len = hdr.in_words * 4 - sizeof(hdr); size_t out_len = hdr.out_words * 4; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 135fab2c016c..64ee7c08be22 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4247,8 +4247,6 @@ void rdma_roce_rescan_device(struct ib_device *ibdev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); -struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata); - int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 27da906beea7..b14a9ee786e9 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -652,6 +652,7 @@ struct uverbs_attr_bundle { struct ib_udata driver_udata; struct ib_udata ucore; struct ib_uverbs_file *ufile; + struct ib_ucontext *context; DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN); struct uverbs_attr attrs[]; }; diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 883abcf6d36e..794c47565971 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -48,9 +48,12 @@ #define uobj_get_type(_attrs, _object) \ uapi_get_object((_attrs)->ufile->device->uapi, _object) +struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs); + #define uobj_get_read(_type, _id, _attrs) \ - rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ - _uobj_check_id(_id), UVERBS_LOOKUP_READ) + _uobj_get_read(_type, _uobj_check_id(_id), _attrs) #define ufd_get_read(_type, _fdnum, _attrs) \ rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ @@ -67,9 +70,12 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj) ((struct ib_##_object *)_uobj_get_obj_read( \ uobj_get_read(_type, _id, _attrs))) +struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs); + #define uobj_get_write(_type, _id, _attrs) \ - rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ - _uobj_check_id(_id), UVERBS_LOOKUP_WRITE) + _uobj_get_write(_type, _uobj_check_id(_id), _attrs) int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, const struct uverbs_attr_bundle *attrs); @@ -123,8 +129,10 @@ __uobj_alloc(const struct uverbs_api_object *obj, { struct ib_uobject *uobj = rdma_alloc_begin_uobject(obj, attrs->ufile); - if (!IS_ERR(uobj)) + if (!IS_ERR(uobj)) { *ib_dev = uobj->context->device; + attrs->context = uobj->context; + } return uobj; } -- cgit v1.2.3 From 730623f4a56fa42d4559715ff2f4a5c32b3ae8bf Mon Sep 17 00:00:00 2001 From: Shamir Rabinovitch Date: Thu, 7 Feb 2019 18:44:48 +0200 Subject: IB/verbs: Add helper function rdma_udata_to_drv_context Helper function to get driver's context out of ib_udata wrapped in uverbs_attr_bundle for user objects or NULL for kernel objects. 
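As an illustration only (a hypothetical driver with made-up names), a driver
whose struct foo_ucontext embeds its struct ib_ucontext as the member
'ibucontext' can resolve the context straight from udata:

        static int foo_create_thing(struct ib_pd *pd, struct ib_udata *udata)
        {
                struct foo_ucontext *uctx = rdma_udata_to_drv_context(
                        udata, struct foo_ucontext, ibucontext);

                /* uctx is NULL for kernel consumers (udata == NULL) */
                if (uctx)
                        foo_program_user_doorbell(uctx);
                return 0;
        }

This is the pattern the next patch applies across the drivers in place of
the pd->uobject->context dereference.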
Signed-off-by: Shamir Rabinovitch
Signed-off-by: Jason Gunthorpe
---
 include/rdma/uverbs_ioctl.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h
index b14a9ee786e9..28570ac2b6a0 100644
--- a/include/rdma/uverbs_ioctl.h
+++ b/include/rdma/uverbs_ioctl.h
@@ -664,6 +664,23 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b
 attrs_bundle->attr_present);
 }
 
+/**
+ * rdma_udata_to_drv_context - Helper macro to get the driver's context out of
+ * ib_udata which is embedded in uverbs_attr_bundle.
+ *
+ * If udata is not NULL this cannot fail. Otherwise a NULL udata will result
+ * in a NULL ucontext pointer, as a safety precaution. Callers should be using
+ * 'udata' to determine if the driver call is in user or kernel mode, not
+ * 'ucontext'.
+ *
+ */
+#define rdma_udata_to_drv_context(udata, drv_dev_struct, member) \
+ (udata ? container_of(container_of(udata, struct uverbs_attr_bundle, \
+ driver_udata) \
+ ->context, \
+ drv_dev_struct, member) : \
+ (drv_dev_struct *)NULL)
+
 #define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT)
 
 static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle,
--
cgit v1.2.3


From 89944450547334aa6655e0cd4aec8df1897a205a Mon Sep 17 00:00:00 2001
From: Shamir Rabinovitch
Date: Thu, 7 Feb 2019 18:44:49 +0200
Subject: IB/{hw,sw}: Remove 'uobject->context' dependency in object creation APIs

Now that we have the udata passed to all the ib_xxx object creation APIs and the additional macro 'rdma_udata_to_drv_context' to get the ib_ucontext from ib_udata stored in uverbs_attr_bundle, we can finally start to remove the dependency of the drivers on ib_xxx->uobject->context.

Signed-off-by: Shamir Rabinovitch
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 20 ++++----
 drivers/infiniband/hw/cxgb3/iwch_provider.c | 4 +-
 drivers/infiniband/hw/cxgb4/qp.c | 9 ++--
 drivers/infiniband/hw/hns/hns_roce_qp.c | 21 ++++----
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 9 ++--
 drivers/infiniband/hw/mlx4/mr.c | 12 ++---
 drivers/infiniband/hw/mlx4/qp.c | 72 +++++++++++++++------------
 drivers/infiniband/hw/mlx4/srq.c | 9 ++--
 drivers/infiniband/hw/mlx5/devx.c | 21 ++++----
 drivers/infiniband/hw/mlx5/qp.c | 58 ++++++++++++----------
 drivers/infiniband/hw/mlx5/srq.c | 8 ++--
 drivers/infiniband/hw/mthca/mthca_provider.c | 20 ++++----
 drivers/infiniband/hw/mthca/mthca_qp.c | 13 +++--
 drivers/infiniband/hw/mthca/mthca_srq.c | 21 ++++----
 drivers/infiniband/hw/nes/nes_verbs.c | 12 ++---
 drivers/infiniband/hw/qedr/verbs.c | 2 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 5 +-
 drivers/infiniband/sw/rdmavt/qp.c | 5 +-
 drivers/infiniband/sw/rdmavt/srq.c | 5 +-
 drivers/infiniband/sw/rdmavt/vt.c | 4 --
 drivers/infiniband/sw/rxe/rxe_qp.c | 8 ++--
 drivers/infiniband/sw/rxe/rxe_verbs.c | 6 ++-
 include/rdma/rdma_vt.h | 5 ++
 23 files changed, 194 insertions(+), 155 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 2ed778683c6b..83bf6f5d67c0 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include
 
 #include "bnxt_ulp.h"
@@ -722,12 +723,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
 /* Write AVID to shared page.
*/ if (udata) { - struct ib_ucontext *ib_uctx = ib_pd->uobject->context; - struct bnxt_re_ucontext *uctx; + struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); unsigned long flag; u32 *wrptr; - uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); spin_lock_irqsave(&uctx->sh_lock, flag); wrptr = (u32 *)(uctx->shpg + BNXT_RE_AVID_OFFT); *wrptr = ah->qplib_ah.id; @@ -872,10 +872,9 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp; struct ib_umem *umem; int bytes = 0, psn_sz; - struct ib_ucontext *context = pd->ib_pd.uobject->context; - struct bnxt_re_ucontext *cntx = container_of(context, - struct bnxt_re_ucontext, - ib_uctx); + struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; @@ -1359,10 +1358,9 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; struct ib_umem *umem; int bytes = 0; - struct ib_ucontext *context = pd->ib_pd.uobject->context; - struct bnxt_re_ucontext *cntx = container_of(context, - struct bnxt_re_ucontext, - ib_uctx); + struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context( + udata, struct bnxt_re_ucontext, ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 80dff6804e48..b74fd90a22dc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "cxio_hal.h" #include "iwch.h" @@ -825,7 +826,8 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd, * Kernel users need more wq space for fastreg WRs which can take * 2 WR fragments. */ - ucontext = udata ? to_iwch_ucontext(pd->uobject->context) : NULL; + ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext, + ibucontext); if (!ucontext && wqsize < (rqsize + (2 * sqsize))) wqsize = roundup_pow_of_two(rqsize + roundup_pow_of_two(attrs->cap.max_send_wr * 2)); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 0fe87b9c1e10..2509f65f4420 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -31,6 +31,7 @@ */ #include +#include #include "iw_cxgb4.h" @@ -2136,7 +2137,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; unsigned int sqsize, rqsize = 0; - struct c4iw_ucontext *ucontext; + struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct c4iw_ucontext, ibucontext); int ret; struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm; struct c4iw_mm_entry *rq_db_key_mm = NULL, *ma_sync_key_mm = NULL; @@ -2170,8 +2172,6 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, if (sqsize < 8) sqsize = 8; - ucontext = udata ? to_c4iw_ucontext(pd->uobject->context) : NULL; - qhp = kzalloc(sizeof(*qhp), GFP_KERNEL); if (!qhp) return ERR_PTR(-ENOMEM); @@ -2718,7 +2718,8 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs, rqsize = attrs->attr.max_wr + 1; rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16)); - ucontext = udata ? 
to_c4iw_ucontext(pd->uobject->context) : NULL; + ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext, + ibucontext); srq = kzalloc(sizeof(*srq), GFP_KERNEL); if (!srq) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 728ad8b6de2f..57c76eafef2f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "hns_roce_common.h" #include "hns_roce_device.h" #include "hns_roce_hem.h" @@ -560,6 +561,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, struct device *dev = hr_dev->dev; struct hns_roce_ib_create_qp ucmd; struct hns_roce_ib_create_qp_resp resp = {}; + struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct hns_roce_ucontext, ibucontext); unsigned long qpn = 0; int ret = 0; u32 page_shift; @@ -670,9 +673,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, (udata->inlen >= sizeof(ucmd)) && (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_sq(init_attr)) { - ret = hns_roce_db_map_user( - to_hr_ucontext(ib_pd->uobject->context), udata, - ucmd.sdb_addr, &hr_qp->sdb); + ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr, + &hr_qp->sdb); if (ret) { dev_err(dev, "sq record doorbell map failed!\n"); goto err_mtt; @@ -686,9 +688,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) { - ret = hns_roce_db_map_user( - to_hr_ucontext(ib_pd->uobject->context), udata, - ucmd.db_addr, &hr_qp->rdb); + ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr, + &hr_qp->rdb); if (ret) { dev_err(dev, "rq record doorbell map failed!\n"); goto err_sq_dbmap; @@ -838,9 +839,7 @@ err_wrid: if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) - hns_roce_db_unmap_user( - to_hr_ucontext(ib_pd->uobject->context), - &hr_qp->rdb); + hns_roce_db_unmap_user(uctx, &hr_qp->rdb); } else { kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); @@ -852,9 +851,7 @@ err_sq_dbmap: (udata->inlen >= sizeof(ucmd)) && (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_sq(init_attr)) - hns_roce_db_unmap_user( - to_hr_ucontext(ib_pd->uobject->context), - &hr_qp->sdb); + hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index d5fb2b927587..76b4d1218696 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "i40iw.h" /** @@ -556,7 +557,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, struct i40iw_device *iwdev = to_iwdev(ibpd->device); struct i40iw_cqp *iwcqp = &iwdev->cqp; struct i40iw_qp *iwqp; - struct i40iw_ucontext *ucontext; + struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct i40iw_ucontext, ibucontext); struct i40iw_create_qp_req req; struct i40iw_create_qp_resp uresp; u32 qp_num = 0; @@ -665,7 +667,6 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, } iwqp->ctx_info.qp_compl_ctx = req.user_compl_ctx; iwqp->user_mode = 1; - ucontext = to_ucontext(ibpd->uobject->context); if (req.user_wqe_buffers) { struct i40iw_pbl *iwpbl; @@ -1819,7 +1820,8 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, { 
struct i40iw_pd *iwpd = to_iwpd(pd); struct i40iw_device *iwdev = to_iwdev(pd->device); - struct i40iw_ucontext *ucontext; + struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct i40iw_ucontext, ibucontext); struct i40iw_pble_alloc *palloc; struct i40iw_pbl *iwpbl; struct i40iw_mr *iwmr; @@ -1860,7 +1862,6 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, iwmr->region = region; iwmr->ibmr.pd = pd; iwmr->ibmr.device = pd->device; - ucontext = to_ucontext(pd->uobject->context); iwmr->page_size = PAGE_SIZE; iwmr->page_msk = PAGE_MASK; diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 56639ecd53ad..395379a480cb 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,8 +367,7 @@ end: return block_shift; } -static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, - struct ib_udata *udata, u64 start, +static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, u64 length, u64 virt_addr, int access_flags) { @@ -416,8 +415,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = mlx4_get_umem_mr(pd->uobject->context, udata, start, length, - virt_addr, access_flags); + mr->umem = + mlx4_get_umem_mr(udata, start, length, virt_addr, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -506,9 +505,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = - mlx4_get_umem_mr(mr->uobject->context, udata, start, - length, virt_addr, mr_access_flags); + mmr->umem = mlx4_get_umem_mr(udata, start, length, virt_addr, + mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index e38bab50cecf..429a59c5801c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,8 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); -static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state, + struct ib_udata *udata); enum { MLX4_IB_ACK_REQ_FREQ = 8, @@ -863,6 +865,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, int err; struct mlx4_ib_sqp *sqp = NULL; struct mlx4_ib_qp *qp; + struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx4_ib_ucontext, ibucontext); enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; struct mlx4_ib_cq *mcq; unsigned long flags; @@ -1038,7 +1042,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user( - to_mucontext(pd->uobject->context), udata, + context, udata, (src == MLX4_IB_QP_SRC) ? 
ucmd.qp.db_addr : ucmd.wq.db_addr, &qp->db); @@ -1112,8 +1116,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } } } else if (src == MLX4_IB_RWQ_SRC) { - err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, - range_size, &qpn); + err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn); if (err) goto err_wrid; } else { @@ -1184,8 +1187,7 @@ err_qpn: if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); else if (src == MLX4_IB_RWQ_SRC) - mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), - qp, 0); + mlx4_ib_release_wqn(context, qp, 0); else mlx4_qp_release_range(dev->dev, qpn, 1); } @@ -1195,7 +1197,7 @@ err_proxy: err_wrid: if (udata) { if (qp_has_rq(init_attr)) - mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); + mlx4_ib_db_unmap_user(context, &qp->db); } else { kvfree(qp->sq.wrid); kvfree(qp->rq.wrid); @@ -1942,7 +1944,8 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) * Go over all RSS QP's childes (WQs) and apply their HW state according to * their logic state if the RSS QP is the first RSS QP associated for the WQ. */ -static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) +static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num, + struct ib_udata *udata) { int err = 0; int i; @@ -1966,7 +1969,7 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) } wq->port = port_num; if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { - err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata); if (err) { mutex_unlock(&wq->mutex); break; @@ -1988,7 +1991,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) - if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, + udata)) pr_warn("failed to reverse WQN=0x%06x\n", ibwq->wq_num); wq->rss_usecnt--; @@ -2000,7 +2004,8 @@ static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) return err; } -static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, + struct ib_udata *udata) { int i; @@ -2011,7 +2016,7 @@ static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) mutex_lock(&wq->mutex); if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) - if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata)) pr_warn("failed to reverse WQN=%x\n", ibwq->wq_num); wq->rss_usecnt--; @@ -2043,9 +2048,10 @@ static void fill_qp_rss_context(struct mlx4_qp_context *context, static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, - enum ib_qp_state cur_state, enum ib_qp_state new_state) + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + struct ib_udata *udata) { - struct ib_uobject *ibuobject; struct ib_srq *ibsrq; const struct ib_gid_attr *gid_attr = NULL; struct ib_rwq_ind_table *rwq_ind_tbl; @@ -2054,6 +2060,8 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, struct mlx4_ib_qp *qp; struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; + struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx4_ib_ucontext, ibucontext); struct mlx4_qp_context *context; enum mlx4_qp_optpar optpar = 0; int sqd_event; @@ -2065,7 +2073,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type 
src_type, struct ib_wq *ibwq; ibwq = (struct ib_wq *)src; - ibuobject = ibwq->uobject; ibsrq = NULL; rwq_ind_tbl = NULL; qp_type = IB_QPT_RAW_PACKET; @@ -2076,7 +2083,6 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, struct ib_qp *ibqp; ibqp = (struct ib_qp *)src; - ibuobject = ibqp->uobject; ibsrq = ibqp->srq; rwq_ind_tbl = ibqp->rwq_ind_tbl; qp_type = ibqp->qp_type; @@ -2161,11 +2167,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, context->param3 |= cpu_to_be32(1 << 30); } - if (ibuobject) + if (ucontext) context->usr_page = cpu_to_be32( - mlx4_to_hw_uar_index(dev->dev, - to_mucontext(ibuobject->context) - ->uar.index)); + mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index)); else context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); @@ -2297,7 +2301,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); /* Set "fast registration enabled" for all kernel QPs */ - if (!ibuobject) + if (!ucontext) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & IB_QP_RNR_RETRY) { @@ -2434,7 +2438,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, else sqd_event = 0; - if (!ibuobject && + if (!ucontext && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->rlkey_roce_mode |= (1 << 4); @@ -2445,7 +2449,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, * headroom is stamped so that the hardware doesn't start * processing stale work requests. */ - if (!ibuobject && + if (!ucontext && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; @@ -2509,7 +2513,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { - if (!ibuobject) { + if (!ucontext) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, ibsrq ? 
to_msrq(ibsrq) : NULL); if (send_cq != recv_cq) @@ -2735,16 +2739,17 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { - err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num, + udata); if (err) goto out; } err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, - cur_state, new_state); + cur_state, new_state, udata); if (ibqp->rwq_ind_tbl && err) - bring_down_rss_rwqs(ibqp->rwq_ind_tbl); + bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata); if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -4122,7 +4127,8 @@ static int ib_wq2qp_state(enum ib_wq_state state) } } -static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state, + struct ib_udata *udata) { struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); enum ib_qp_state qp_cur_state; @@ -4146,7 +4152,8 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) attr_mask = IB_QP_PORT; err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, - attr_mask, IB_QPS_RESET, IB_QPS_INIT); + attr_mask, IB_QPS_RESET, IB_QPS_INIT, + udata); if (err) { pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", ibwq->wq_num); @@ -4158,12 +4165,13 @@ static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) attr_mask = 0; err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, - qp_cur_state, qp_new_state); + qp_cur_state, qp_new_state, udata); if (err && (qp_cur_state == IB_QPS_INIT)) { qp_new_state = IB_QPS_RESET; if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, - attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + attr_mask, IB_QPS_INIT, IB_QPS_RESET, + udata)) { pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", ibwq->wq_num); qp_new_state = IB_QPS_INIT; @@ -4226,7 +4234,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, * WQ, so we can apply its port on the WQ. 
*/ if (qp->rss_usecnt) - err = _mlx4_ib_modify_wq(ibwq, new_state); + err = _mlx4_ib_modify_wq(ibwq, new_state, udata); if (!err) ibwq->state = new_state; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 498588eac051..381cf899bcef 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -37,6 +37,7 @@ #include "mlx4_ib.h" #include +#include static void *get_wqe(struct mlx4_ib_srq *srq, int n) { @@ -73,6 +74,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx4_ib_ucontext, ibucontext); struct mlx4_ib_srq *srq; struct mlx4_wqe_srq_next_seg *next; struct mlx4_wqe_data_seg *scatter; @@ -128,8 +131,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; - err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), - udata, ucmd.db_addr, &srq->db); + err = mlx4_ib_db_map_user(ucontext, udata, ucmd.db_addr, + &srq->db); if (err) goto err_mtt; } else { @@ -202,7 +205,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, err_wrid: if (udata) - mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); + mlx4_ib_db_unmap_user(ucontext, &srq->db); else kvfree(srq->wrid); diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 8e6d23d6859f..eaa055007f28 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1151,7 +1151,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( void *cmd_out; struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); - struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; struct devx_obj *obj; @@ -1227,8 +1228,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT); struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE); - struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); - struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); void *cmd_out; int err; int uid; @@ -1268,11 +1270,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT); struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE); - struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); void *cmd_out; int err; int uid; - struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); uid = devx_get_uid(c, cmd_in); if (uid < 0) @@ -1370,11 +1373,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)( attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE); u16 cmd_out_len; - struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); struct ib_uobject *fd_uobj; 
int err; int uid; - struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + struct mlx5_ib_dev *mdev = to_mdev(c->ibucontext.device); struct devx_async_cmd_event_file *ev_file; struct devx_async_data *async_data; @@ -1530,7 +1534,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); u32 obj_id; - struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_ucontext *c = rdma_udata_to_drv_context( + &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); int err; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 4f2bc101b061..6b1f0e76900b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -794,6 +794,8 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_udata *udata, struct mlx5_ib_rwq *rwq, struct mlx5_ib_create_wq *ucmd) { + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); int page_shift = 0; int npages; u32 offset = 0; @@ -828,8 +830,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, (unsigned long long)ucmd->buf_addr, rwq->buf_size, npages, page_shift, ncont, offset); - err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata, - ucmd->db_addr, &rwq->db); + err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_umem; @@ -877,7 +878,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, return err; } - context = to_mucontext(pd->uobject->context); + context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext, + ibucontext); if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { uar_index = bfregn_to_uar_index(dev, &context->bfregi, ucmd.bfreg_index, true); @@ -1456,9 +1458,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; - struct ib_uobject *uobj = pd->uobject; - struct ib_ucontext *ucontext = uobj->context; - struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); int err; u32 tdn = mucontext->tdn; u16 uid = to_mpd(pd)->uid; @@ -1572,9 +1573,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { - struct ib_uobject *uobj = pd->uobject; - struct ib_ucontext *ucontext = uobj->context; - struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_create_qp_resp resp = {}; int inlen; int err; @@ -1916,6 +1916,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_create_qp_resp resp = {}; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; @@ -2018,8 +2020,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_QP_FLAG_TYPE_DCT)) return 
-EINVAL; - err = get_qp_user_index(to_mucontext(pd->uobject->context), - &ucmd, udata->inlen, &uidx); + err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx); if (err) return err; @@ -2503,8 +2504,11 @@ static const char *ib_qp_type_str(enum ib_qp_type type) static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd) + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) { + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_qp *qp; int err = 0; u32 uidx = MLX5_IB_DEFAULT_UIDX; @@ -2513,8 +2517,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, if (!attr->srq || !attr->recv_cq) return ERR_PTR(-EINVAL); - err = get_qp_user_index(to_mucontext(pd->uobject->context), - ucmd, sizeof(*ucmd), &uidx); + err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx); if (err) return ERR_PTR(err); @@ -2596,15 +2599,17 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, int err; struct ib_qp_init_attr mlx_init_attr; struct ib_qp_init_attr *init_attr = verbs_init_attr; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); if (pd) { dev = to_mdev(pd->device); if (init_attr->qp_type == IB_QPT_RAW_PACKET) { - if (!udata) { + if (!ucontext) { mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); return ERR_PTR(-EINVAL); - } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + } else if (!ucontext->cqe_version) { mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); return ERR_PTR(-EINVAL); } @@ -2636,7 +2641,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } } else { - return mlx5_ib_create_dct(pd, init_attr, &ucmd); + return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata); } } @@ -3274,14 +3279,12 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp_base *qp_base, - u8 port_num) + u8 port_num, struct ib_udata *udata) { - struct mlx5_ib_ucontext *ucontext = NULL; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); unsigned int tx_port_affinity; - if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context) - ucontext = to_mucontext(pd->ibpd.uobject->context); - if (ucontext) { tx_port_affinity = (unsigned int)atomic_add_return( 1, &ucontext->tx_port_affinity) % @@ -3304,8 +3307,10 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, - enum ib_qp_state cur_state, enum ib_qp_state new_state, - const struct mlx5_ib_modify_qp *ucmd) + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + const struct mlx5_ib_modify_qp *ucmd, + struct ib_udata *udata) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { [MLX5_QP_STATE_RST] = { @@ -3396,7 +3401,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, (ibqp->qp_type == IB_QPT_XRC_TGT)) { if (dev->lag_active) { u8 p = mlx5_core_native_port_num(dev->mdev); - tx_affinity = get_tx_affinity(dev, pd, base, p); + tx_affinity = get_tx_affinity(dev, pd, base, p, + udata); context->flags |= cpu_to_be32(tx_affinity << 24); } } @@ -3879,7 +3885,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, 
cur_state, - new_state, &ucmd); + new_state, &ucmd, udata); out: mutex_unlock(&qp->mutex); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 22bd774e0b4e..1ec1beb1296b 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -47,6 +47,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd = {}; + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); size_t ucmdlen; int err; int npages; @@ -71,8 +73,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, return -EINVAL; if (in->type != IB_SRQT_BASIC) { - err = get_srq_user_index(to_mucontext(pd->uobject->context), - &ucmd, udata->inlen, &uidx); + err = get_srq_user_index(ucontext, &ucmd, udata->inlen, &uidx); if (err) return err; } @@ -103,8 +104,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, mlx5_ib_populate_pas(dev, srq->umem, page_shift, in->pas, 0); - err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), udata, - ucmd.db_addr, &srq->db); + err = mlx5_ib_db_map_user(ucontext, udata, ucmd.db_addr, &srq->db); if (err) { mlx5_ib_dbg(dev, "map doorbell failed\n"); goto err_in; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 516c8cf9c0fd..80c3af217d96 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -435,7 +436,8 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd, struct ib_udata *udata) { struct mthca_create_srq ucmd; - struct mthca_ucontext *context = NULL; + struct mthca_ucontext *context = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); struct mthca_srq *srq; int err; @@ -447,8 +449,6 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd, return ERR_PTR(-ENOMEM); if (udata) { - context = to_mucontext(pd->uobject->context); - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { err = -EFAULT; goto err_free; @@ -510,6 +510,8 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { + struct mthca_ucontext *context = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); struct mthca_create_qp ucmd; struct mthca_qp *qp; int err; @@ -522,15 +524,11 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd, case IB_QPT_UC: case IB_QPT_UD: { - struct mthca_ucontext *context; - qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); if (udata) { - context = to_mucontext(pd->uobject->context); - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { kfree(qp); return ERR_PTR(-EFAULT); @@ -568,8 +566,6 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd, &init_attr->cap, qp, udata); if (err && udata) { - context = to_mucontext(pd->uobject->context); - mthca_unmap_user_db(to_mdev(pd->device), &context->uar, context->db_tab, @@ -898,6 +894,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, { struct mthca_dev *dev = to_mdev(pd->device); struct sg_dma_page_iter sg_iter; + struct mthca_ucontext *context = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); struct mthca_mr *mr; struct mthca_reg_mr ucmd; u64 *pages; @@ -906,12 +904,12 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 
start, u64 length, int write_mtt_size; if (udata->inlen < sizeof ucmd) { - if (!to_mucontext(pd->uobject->context)->reg_mr_warned) { + if (!context->reg_mr_warned) { mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", current->comm); mthca_warn(dev, " Update libmthca to fix this.\n"); } - ++to_mucontext(pd->uobject->context)->reg_mr_warned; + ++context->reg_mr_warned; ucmd.mr_attrs = 0; } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) return ERR_PTR(-EFAULT); diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 4e5b5cc17f1d..7a5b25d13faa 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "mthca_dev.h" #include "mthca_cmd.h" @@ -554,10 +555,14 @@ static int mthca_path_set(struct mthca_dev *dev, const struct rdma_ah_attr *ah, static int __mthca_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, - enum ib_qp_state cur_state, enum ib_qp_state new_state) + enum ib_qp_state cur_state, + enum ib_qp_state new_state, + struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); + struct mthca_ucontext *context = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); struct mthca_mailbox *mailbox; struct mthca_qp_param *qp_param; struct mthca_qp_context *qp_context; @@ -619,8 +624,7 @@ static int __mthca_modify_qp(struct ib_qp *ibqp, /* leave arbel_sched_queue as 0 */ if (qp->ibqp.uobject) - qp_context->usr_page = - cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index); + qp_context->usr_page = cpu_to_be32(context->uar.index); else qp_context->usr_page = cpu_to_be32(dev->driver_uar.index); qp_context->local_qpn = cpu_to_be32(qp->qpn); @@ -913,7 +917,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, goto out; } - err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state, + udata); out: mutex_unlock(&qp->mutex); diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index b8333c79e3fa..06b920385512 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -36,6 +36,8 @@ #include +#include + #include "mthca_dev.h" #include "mthca_cmd.h" #include "mthca_memfree.h" @@ -96,17 +98,19 @@ static void mthca_tavor_init_srq_context(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_srq *srq, struct mthca_tavor_srq_context *context, - bool is_user) + struct ib_udata *udata) { + struct mthca_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); + memset(context, 0, sizeof *context); context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4)); context->state_pd = cpu_to_be32(pd->pd_num); context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); - if (is_user) - context->uar = - cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + if (udata) + context->uar = cpu_to_be32(ucontext->uar.index); else context->uar = cpu_to_be32(dev->driver_uar.index); } @@ -115,8 +119,10 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev, struct mthca_pd *pd, struct mthca_srq *srq, struct mthca_arbel_srq_context *context, - bool is_user) + struct ib_udata *udata) { + struct mthca_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mthca_ucontext, ibucontext); int logsize, max; memset(context, 
0, sizeof *context); @@ -131,9 +137,8 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev, context->lkey = cpu_to_be32(srq->mr.ibmr.lkey); context->db_index = cpu_to_be32(srq->db_index); context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29); - if (is_user) - context->logstride_usrpage |= - cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index); + if (udata) + context->logstride_usrpage |= cpu_to_be32(ucontext->uar.index); else context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index); context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 138844299d73..38480b7708eb 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "nes.h" @@ -974,7 +975,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, struct nes_adapter *nesadapter = nesdev->nesadapter; struct nes_qp *nesqp; struct nes_cq *nescq; - struct nes_ucontext *nes_ucontext; + struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( + udata, struct nes_ucontext, ibucontext); struct nes_hw_cqp_wqe *cqp_wqe; struct nes_cqp_request *cqp_request; struct nes_create_qp_req req; @@ -1055,9 +1057,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, } if (req.user_qp_buffer) nesqp->nesuqp_addr = req.user_qp_buffer; - if (udata && (ibpd->uobject->context)) { + if (udata) { nesqp->user_mode = 1; - nes_ucontext = to_nesucontext(ibpd->uobject->context); if (virt_wqs) { err = 1; list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) { @@ -1078,7 +1079,6 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, } } - nes_ucontext = to_nesucontext(ibpd->uobject->context); nesqp->mmap_sq_db_index = find_next_zero_bit(nes_ucontext->allocated_wqs, NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq); @@ -2099,7 +2099,8 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct nes_adapter *nesadapter = nesdev->nesadapter; struct ib_mr *ibmr = ERR_PTR(-EINVAL); struct sg_dma_page_iter dma_iter; - struct nes_ucontext *nes_ucontext; + struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( + udata, struct nes_ucontext, ibucontext); struct nes_pbl *nespbl; struct nes_mr *nesmr; struct ib_umem *region; @@ -2342,7 +2343,6 @@ reg_user_mr_err: return ERR_PTR(-ENOMEM); } nesmr->region = region; - nes_ucontext = to_nesucontext(pd->uobject->context); pbl_depth = region->length >> 12; pbl_depth += (region->length & (4096-1)) ? 
1 : 0; nespbl->pbl_size = pbl_depth*sizeof(u64); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a613ebde322f..d51bc3ede9d1 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1434,7 +1434,7 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, hw_srq->max_wr = init_attr->attr.max_wr; hw_srq->max_sges = init_attr->attr.max_sge; - if (udata && ibpd->uobject && ibpd->uobject->context) { + if (udata) { if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { DP_ERR(dev, "create srq: problem copying data from user space\n"); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 0ced89b51448..b4575b1c2a62 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -37,6 +37,7 @@ #include #include +#include #include "usnic_abi.h" #include "usnic_ib.h" @@ -482,7 +483,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, int err; struct usnic_ib_dev *us_ibdev; struct usnic_ib_qp_grp *qp_grp; - struct usnic_ib_ucontext *ucontext; + struct usnic_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct usnic_ib_ucontext, ibucontext); int cq_cnt; struct usnic_vnic_res_spec res_spec; struct usnic_ib_create_qp_cmd cmd; @@ -490,7 +492,6 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, usnic_dbg("\n"); - ucontext = to_uucontext(pd->uobject->context); us_ibdev = to_usdev(pd->device); if (init_attr->create_flags) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 6b1b2b75ef60..72664f0a52ee 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "qp.h" #include "vt.h" #include "trace.h" @@ -956,6 +957,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, size_t sg_list_sz; struct ib_qp *ret = ERR_PTR(-ENOMEM); struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); + struct rvt_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct rvt_ucontext, ibucontext); void *priv = NULL; size_t sqsize; @@ -1129,7 +1132,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz; qp->ip = rvt_create_mmap_info(rdi, s, - ibpd->uobject->context, + &ucontext->ibucontext, qp->r_rq.wq); if (!qp->ip) { ret = ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 78e06fc456c5..895b3fabd0bf 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "srq.h" #include "vt.h" @@ -77,6 +78,8 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, struct ib_udata *udata) { struct rvt_dev_info *dev = ib_to_rvt(ibpd->device); + struct rvt_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct rvt_ucontext, ibucontext); struct rvt_srq *srq; u32 sz; struct ib_srq *ret; @@ -119,7 +122,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz; srq->ip = - rvt_create_mmap_info(dev, s, ibpd->uobject->context, + rvt_create_mmap_info(dev, s, &ucontext->ibucontext, srq->rq.wq); if (!srq->ip) { ret = ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index a19832c73d5a..ee9c82cb3b6b 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -284,10 +284,6 @@ static int 
rvt_query_gid(struct ib_device *ibdev, u8 port_num, &gid->global.interface_id); } -struct rvt_ucontext { - struct ib_ucontext ibucontext; -}; - static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext *ibucontext) { diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index be5d76b2bcca..09ede70dc1e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "rxe.h" #include "rxe_loc.h" @@ -343,7 +344,8 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, struct rxe_cq *rcq = to_rcq(init->recv_cq); struct rxe_cq *scq = to_rcq(init->send_cq); struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL; - struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; + struct rxe_ucontext *ucontext = + rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc); rxe_add_ref(pd); rxe_add_ref(rcq); @@ -358,11 +360,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, rxe_qp_init_misc(rxe, qp, init); - err = rxe_qp_init_req(rxe, qp, init, context, uresp); + err = rxe_qp_init_req(rxe, qp, init, &ucontext->ibuc, uresp); if (err) goto err1; - err = rxe_qp_init_resp(rxe, qp, init, context, uresp); + err = rxe_qp_init_resp(rxe, qp, init, &ucontext->ibuc, uresp); if (err) goto err2; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 051c3930e808..ffca654c8697 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -33,6 +33,7 @@ #include #include +#include #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -320,8 +321,9 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, int err; struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); + struct rxe_ucontext *ucontext = + rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc); struct rxe_srq *srq; - struct ib_ucontext *context = udata ? ibpd->uobject->context : NULL; struct rxe_create_srq_resp __user *uresp = NULL; if (udata) { @@ -344,7 +346,7 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd, rxe_add_ref(pd); srq->pd = pd; - err = rxe_srq_from_init(rxe, srq, init, context, uresp); + err = rxe_srq_from_init(rxe, srq, init, &ucontext->ibuc, uresp); if (err) goto err2; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 87d66c9630d7..4c257aff7d32 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -186,6 +186,11 @@ struct rvt_driver_params { u8 reserved_operations; }; +/* User context */ +struct rvt_ucontext { + struct ib_ucontext ibucontext; +}; + /* Protection domain */ struct rvt_pd { struct ib_pd ibpd; -- cgit v1.2.3 From fd47c2f99f04249d1ba82c422d1818dcbe193908 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:43 +0200 Subject: RDMA/restrack: Convert internal DB from hash to XArray The addition of .doit callbacks poses a new access pattern: resource entries are reached by a user-visible index. Back then, the legacy DB was implemented as a hash because per-index access wasn't needed and XArray hadn't been accepted yet. Acceptance of XArray together with per-index access requires a refresh of the DB implementation.
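A hedged illustration of the per-index access this enables (hypothetical helper, not part of the patch; only the xa_load() call is the point):

#include <linux/types.h>
#include <linux/xarray.h>

/*
 * Illustrative only: once entries live in an XArray keyed by their
 * user-visible ID, a .doit handler can reach an entry directly
 * instead of walking a hash bucket.
 */
static void *restrack_find_sketch(struct xarray *xa, u32 id)
{
        return xa_load(xa, id);         /* NULL when the ID is unused */
}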
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 13 ++--- drivers/infiniband/core/restrack.c | 103 +++++++++++++++++++++++++------------ include/rdma/restrack.h | 22 +++++--- 3 files changed, 93 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 5601fa968244..4bf890ae6e28 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -970,6 +970,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, int start = cb->args[0]; bool has_cap_net_admin; struct nlmsghdr *nlh; + unsigned long id; u32 index, port = 0; bool filled = false; @@ -1020,7 +1021,12 @@ static int res_get_common_dumpit(struct sk_buff *skb, has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); down_read(&device->res.rwsem); - hash_for_each_possible(device->res.hash, res, node, res_type) { + /* + * FIXME: if the skip ahead is something common this loop should + * use xas_for_each & xas_pause to optimize, we can have a lot of + * objects. + */ + xa_for_each(&device->res.xa[res_type], id, res) { if (idx < start) goto next; @@ -1047,11 +1053,6 @@ static int res_get_common_dumpit(struct sk_buff *skb, rdma_restrack_put(res); if (ret == -EMSGSIZE) - /* - * There is a chance to optimize here. - * It can be done by using list_prepare_entry - * and list_for_each_entry_continue afterwards. - */ break; if (ret) goto res_err; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index f80b37d437ac..b4f302811858 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -12,6 +12,28 @@ #include "cma_priv.h" +static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, + u32 *next) +{ + int err; + + *id = *next; + if (*next == U32_MAX) + *id = 0; + + xa_lock(xa); + err = __xa_alloc(xa, id, U32_MAX, entry, GFP_KERNEL); + if (err && *next != U32_MAX) { + *id = 0; + err = __xa_alloc(xa, id, *next, entry, GFP_KERNEL); + } + + if (!err) + *next = *id + 1; + xa_unlock(xa); + return err; +} + /** * rdma_restrack_init() - initialize resource tracking * @dev: IB device @@ -19,6 +41,10 @@ void rdma_restrack_init(struct ib_device *dev) { struct rdma_restrack_root *res = &dev->res; + int i; + + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) + xa_init_flags(&res->xa[i], XA_FLAGS_ALLOC); init_rwsem(&res->rwsem); } @@ -46,33 +72,42 @@ void rdma_restrack_clean(struct ib_device *dev) struct rdma_restrack_root *res = &dev->res; struct rdma_restrack_entry *e; char buf[TASK_COMM_LEN]; + bool found = false; const char *owner; - int bkt; - - if (hash_empty(res->hash)) - return; - - dev = container_of(res, struct ib_device, res); - pr_err("restrack: %s", CUT_HERE); - dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); - hash_for_each(res->hash, bkt, e, node) { - if (rdma_is_kernel_res(e)) { - owner = e->kern_name; - } else { - /* - * There is no need to call get_task_struct here, - * because we can be here only if there are more - * get_task_struct() call than put_task_struct(). 
- */ - get_task_comm(buf, e->task); - owner = buf; + int i; + + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { + if (!xa_empty(&res->xa[i])) { + unsigned long index; + + if (!found) { + pr_err("restrack: %s", CUT_HERE); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); + } + xa_for_each(&res->xa[i], index, e) { + if (rdma_is_kernel_res(e)) { + owner = e->kern_name; + } else { + /* + * There is no need to call get_task_struct here, + * because we can be here only if there are more + * get_task_struct() call than put_task_struct(). + */ + get_task_comm(buf, e->task); + owner = buf; + } + + pr_err("restrack: %s %s object allocated by %s is not freed\n", + rdma_is_kernel_res(e) ? "Kernel" : + "User", + type2str(e->type), owner); + } + found = true; } - - pr_err("restrack: %s %s object allocated by %s is not freed\n", - rdma_is_kernel_res(e) ? "Kernel" : "User", - type2str(e->type), owner); + xa_destroy(&res->xa[i]); } - pr_err("restrack: %s", CUT_HERE); + if (found) + pr_err("restrack: %s", CUT_HERE); } /** @@ -86,10 +121,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, { struct rdma_restrack_root *res = &dev->res; struct rdma_restrack_entry *e; + unsigned long index = 0; u32 cnt = 0; down_read(&res->rwsem); - hash_for_each_possible(res->hash, e, node, type) { + xa_for_each(&res->xa[type], index, e) { if (ns == &init_pid_ns || (!rdma_is_kernel_res(e) && ns == task_active_pid_ns(e->task))) @@ -166,16 +202,20 @@ EXPORT_SYMBOL(rdma_restrack_set_task); static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); + int ret; if (!dev) return; kref_init(&res->kref); init_completion(&res->comp); - res->valid = true; down_write(&dev->res.rwsem); - hash_add(dev->res.hash, &res->node, res->type); + ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res, + &dev->res.next_id[res->type]); + + if (!ret) + res->valid = true; up_write(&dev->res.rwsem); } @@ -241,15 +281,14 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) if (!dev) return; - rdma_restrack_put(res); - - wait_for_completion(&res->comp); - down_write(&dev->res.rwsem); - hash_del(&res->node); + xa_erase(&dev->res.xa[res->type], res->id); res->valid = false; up_write(&dev->res.rwsem); + rdma_restrack_put(res); + wait_for_completion(&res->comp); + out: if (res->task) { put_task_struct(res->task); diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index cc66cc7a11d3..16e11b4c3ec3 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * enum rdma_restrack_type - HW objects to track @@ -48,7 +49,6 @@ enum rdma_restrack_type { RDMA_RESTRACK_MAX }; -#define RDMA_RESTRACK_HASH_BITS 8 struct ib_device; struct rdma_restrack_entry; @@ -62,9 +62,17 @@ struct rdma_restrack_root { */ struct rw_semaphore rwsem; /** - * @hash: global database for all resources per-device + * @xa: Array of XArray structures to hold restrack entries. + * We want to use array of XArrays because insertion is type + * dependent. For types with xisiting unique ID (like QPN), + * we will insert to that unique index. For other types, + * we insert based on pointers and auto-allocate unique index. 
*/ - DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); + struct xarray xa[RDMA_RESTRACK_MAX]; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id[RDMA_RESTRACK_MAX]; }; /** @@ -102,10 +110,6 @@ struct rdma_restrack_entry { * @kern_name: name of owner for the kernel created entities. */ const char *kern_name; - /** - * @node: hash table entry - */ - struct hlist_node node; /** * @type: various objects in restrack database */ @@ -114,6 +118,10 @@ struct rdma_restrack_entry { * @user: user resource */ bool user; + /** + * @id: ID to expose to users + */ + u32 id; }; void rdma_restrack_init(struct ib_device *dev); -- cgit v1.2.3 From 18c4c66f76d99df89ad682ba25bafb9227e8ec30 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:44 +0200 Subject: RDMA/restrack: Translate from ID to restrack object Add a new general helper to get a restrack entry given its ID and respective type. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 25 +++++++++++++++++++++++++ include/rdma/restrack.h | 3 +++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index b4f302811858..ac97167da81c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -256,6 +256,31 @@ int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_restrack_get); +/** + * rdma_restrack_get_byid() - translate from ID to restrack object + * @dev: IB device + * @type: resource track type + * @id: ID to take a look + * + * Return: Pointer to restrack entry or -ENOENT in case of error. + */ +struct rdma_restrack_entry * +rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, u32 id) +{ + struct rdma_restrack_root *rt = &dev->res; + struct rdma_restrack_entry *res; + + down_read(&dev->res.rwsem); + res = xa_load(&rt->xa[type], id); + if (!res || !rdma_restrack_get(res)) + res = ERR_PTR(-ENOENT); + up_read(&dev->res.rwsem); + + return res; +} +EXPORT_SYMBOL(rdma_restrack_get_byid); + static void restrack_release(struct kref *kref) { struct rdma_restrack_entry *res; diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 16e11b4c3ec3..44ce32cc0b51 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -179,4 +179,7 @@ int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value); int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value); +struct rdma_restrack_entry *rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, + u32 id); #endif /* _RDMA_RESTRACK_H_ */ -- cgit v1.2.3 From 48118527186fb255461ebf3685ab0f1c2680bd9c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:46 +0200 Subject: RDMA/restrack: Reduce scope of synchronization lock while updating DB XArray uses an internal lock for updates. This means that our external RW lock is needed only to ensure that an entry is not deleted while we are iterating over the list.
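A minimal sketch of the resulting locking split (hypothetical function and names; it assumes the xa[]/rwsem layout from the previous patches):

#include <linux/printk.h>
#include <linux/rwsem.h>
#include <linux/xarray.h>

/*
 * The XArray spinlock already serializes __xa_alloc() and xa_erase()
 * against each other, so a reader takes the rwsem only to keep
 * entries from being erased while it iterates.
 */
static void restrack_walk_sketch(struct xarray *xa, struct rw_semaphore *rwsem)
{
        unsigned long id;
        void *entry;

        down_read(rwsem);               /* blocks only the erase path */
        xa_for_each(xa, id, entry)      /* insertions are safe regardless */
                pr_info("restrack entry %lu\n", id);
        up_read(rwsem);
}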
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 2 -- include/rdma/restrack.h | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index ac97167da81c..076ef6475df8 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -210,13 +210,11 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) kref_init(&res->kref); init_completion(&res->comp); - down_write(&dev->res.rwsem); ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res, &dev->res.next_id[res->type]); if (!ret) res->valid = true; - up_write(&dev->res.rwsem); } /** diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 44ce32cc0b51..53e1a7fb7355 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -58,7 +58,8 @@ struct rdma_restrack_entry; */ struct rdma_restrack_root { /* - * @rwsem: Read/write lock to protect lists + * @rwsem: Read/write lock to protect erase of entry. + * Lists and insertions are protected by XArray internal lock. */ struct rw_semaphore rwsem; /** -- cgit v1.2.3 From 41eda65c6100930d95bb854a0114f3544593070c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:47 +0200 Subject: RDMA/restrack: Hide restrack DB from IB/core There is no need to expose internals of restrack DB to IB/core. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 6 ++- drivers/infiniband/core/nldev.c | 17 ++++---- drivers/infiniband/core/restrack.c | 83 ++++++++++++++++++++++++++------------ drivers/infiniband/core/restrack.h | 39 ++++++++++++++++++ include/rdma/ib_verbs.h | 7 ++-- include/rdma/restrack.h | 28 ------------- 6 files changed, 114 insertions(+), 66 deletions(-) create mode 100644 drivers/infiniband/core/restrack.h (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3eddc6e67a16..f7e206033d39 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -45,6 +45,7 @@ #include #include "core_priv.h" +#include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); @@ -338,7 +339,10 @@ struct ib_device *_ib_alloc_device(size_t size) if (!device) return NULL; - rdma_restrack_init(device); + if (rdma_restrack_init(device)) { + kfree(device); + return NULL; + } device->dev.class = &ib_class; device->groups[0] = &ib_dev_attr_group; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0cd95f80f7b4..54312f9626a1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -39,6 +39,7 @@ #include "core_priv.h" #include "cma_priv.h" +#include "restrack.h" static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, @@ -1027,6 +1028,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, unsigned long id; u32 index, port = 0; bool filled = false; + struct xarray *xa; err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); @@ -1074,13 +1076,14 @@ static int res_get_common_dumpit(struct sk_buff *skb, has_cap_net_admin = netlink_capable(cb->skb, CAP_NET_ADMIN); - down_read(&device->res.rwsem); + xa = &device->res->xa[res_type]; + down_read(&device->res->rwsem); /* * FIXME: if the skip ahead is something common this loop should * use xas_for_each & xas_pause to 
optimize, we can have a lot of * objects. */ - xa_for_each(&device->res.xa[res_type], id, res) { + xa_for_each(xa, id, res) { if (idx < start) goto next; @@ -1101,13 +1104,13 @@ static int res_get_common_dumpit(struct sk_buff *skb, if (!entry_attr) { ret = -EMSGSIZE; rdma_restrack_put(res); - up_read(&device->res.rwsem); + up_read(&device->res->rwsem); break; } - up_read(&device->res.rwsem); + up_read(&device->res->rwsem); ret = fe->fill_res_func(skb, has_cap_net_admin, res, port); - down_read(&device->res.rwsem); + down_read(&device->res->rwsem); /* * Return resource back, but it won't be released till * the &device->res.rwsem will be released for write. @@ -1125,7 +1128,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, nla_nest_end(skb, entry_attr); next: idx++; } - up_read(&device->res.rwsem); + up_read(&device->res->rwsem); nla_nest_end(skb, table_attr); nlmsg_end(skb, nlh); @@ -1143,7 +1146,7 @@ next: idx++; res_err: nla_nest_cancel(skb, table_attr); - up_read(&device->res.rwsem); + up_read(&device->res->rwsem); err: nlmsg_cancel(skb, nlh); diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 076ef6475df8..6a4b76c66bcb 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -9,8 +9,10 @@ #include #include #include +#include #include "cma_priv.h" +#include "restrack.h" static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, u32 *next) @@ -35,18 +37,27 @@ static int rt_xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, } /** - * rdma_restrack_init() - initialize resource tracking + * rdma_restrack_init() - initialize and allocate resource tracking * @dev: IB device + * + * Return: 0 on success */ -void rdma_restrack_init(struct ib_device *dev) +int rdma_restrack_init(struct ib_device *dev) { - struct rdma_restrack_root *res = &dev->res; + struct rdma_restrack_root *rt; int i; + dev->res = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!dev->res) + return -ENOMEM; + + rt = dev->res; + for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) - xa_init_flags(&res->xa[i], XA_FLAGS_ALLOC); + xa_init_flags(&rt->xa[i], XA_FLAGS_ALLOC); + init_rwsem(&rt->rwsem); - init_rwsem(&res->rwsem); + return 0; } static const char *type2str(enum rdma_restrack_type type) @@ -69,7 +80,7 @@ static const char *type2str(enum rdma_restrack_type type) */ void rdma_restrack_clean(struct ib_device *dev) { - struct rdma_restrack_root *res = &dev->res; + struct rdma_restrack_root *rt = dev->res; struct rdma_restrack_entry *e; char buf[TASK_COMM_LEN]; bool found = false; @@ -77,14 +88,16 @@ void rdma_restrack_clean(struct ib_device *dev) int i; for (i = 0 ; i < RDMA_RESTRACK_MAX; i++) { - if (!xa_empty(&res->xa[i])) { + struct xarray *xa = &dev->res->xa[i]; + + if (!xa_empty(xa)) { unsigned long index; if (!found) { pr_err("restrack: %s", CUT_HERE); dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); } - xa_for_each(&res->xa[i], index, e) { + xa_for_each(xa, index, e) { if (rdma_is_kernel_res(e)) { owner = e->kern_name; } else { @@ -104,10 +117,12 @@ void rdma_restrack_clean(struct ib_device *dev) } found = true; } - xa_destroy(&res->xa[i]); + xa_destroy(xa); } if (found) pr_err("restrack: %s", CUT_HERE); + + kfree(rt); } /** @@ -119,19 +134,19 @@ void rdma_restrack_clean(struct ib_device *dev) int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns) { - struct rdma_restrack_root *res = &dev->res; + struct xarray *xa = &dev->res->xa[type]; struct rdma_restrack_entry *e; unsigned 
long index = 0; u32 cnt = 0; - down_read(&res->rwsem); - xa_for_each(&res->xa[type], index, e) { + down_read(&dev->res->rwsem); + xa_for_each(xa, index, e) { if (ns == &init_pid_ns || (!rdma_is_kernel_res(e) && ns == task_active_pid_ns(e->task))) cnt++; } - up_read(&res->rwsem); + up_read(&dev->res->rwsem); return cnt; } EXPORT_SYMBOL(rdma_restrack_count); @@ -202,17 +217,19 @@ EXPORT_SYMBOL(rdma_restrack_set_task); static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); + struct rdma_restrack_root *rt; + struct xarray *xa; int ret; if (!dev) return; + rt = dev->res; + xa = &dev->res->xa[res->type]; + kref_init(&res->kref); init_completion(&res->comp); - - ret = rt_xa_alloc_cyclic(&dev->res.xa[res->type], &res->id, res, - &dev->res.next_id[res->type]); - + ret = rt_xa_alloc_cyclic(xa, &res->id, res, &rt->next_id[res->type]); if (!ret) res->valid = true; } @@ -266,14 +283,14 @@ struct rdma_restrack_entry * rdma_restrack_get_byid(struct ib_device *dev, enum rdma_restrack_type type, u32 id) { - struct rdma_restrack_root *rt = &dev->res; + struct xarray *xa = &dev->res->xa[type]; struct rdma_restrack_entry *res; - down_read(&dev->res.rwsem); - res = xa_load(&rt->xa[type], id); + down_read(&dev->res->rwsem); + res = xa_load(xa, id); if (!res || !rdma_restrack_get(res)) res = ERR_PTR(-ENOENT); - up_read(&dev->res.rwsem); + up_read(&dev->res->rwsem); return res; } @@ -295,19 +312,33 @@ EXPORT_SYMBOL(rdma_restrack_put); void rdma_restrack_del(struct rdma_restrack_entry *res) { - struct ib_device *dev; + struct ib_device *dev = res_to_dev(res); + struct xarray *xa; if (!res->valid) goto out; - dev = res_to_dev(res); + /* + * All objects except CM_ID set valid device immediately + * after new object is created, it means that for not valid + * objects will still have "dev". + * + * It is not the case for CM_ID, newly created object has + * this field set to NULL and it is set in _cma_attach_to_dev() + * only. + * + * Because we don't want to add any conditions on call + * to rdma_restrack_del(), the check below protects from + * NULL-dereference. + */ if (!dev) return; - down_write(&dev->res.rwsem); - xa_erase(&dev->res.xa[res->type], res->id); + xa = &dev->res->xa[res->type]; + down_write(&dev->res->rwsem); + xa_erase(xa, res->id); res->valid = false; - up_write(&dev->res.rwsem); + up_write(&dev->res->rwsem); rdma_restrack_put(res); wait_for_completion(&res->comp); diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h new file mode 100644 index 000000000000..cf89ef0b8ed5 --- /dev/null +++ b/drivers/infiniband/core/restrack.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2017-2019 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_CORE_RESTRACK_H_ +#define _RDMA_CORE_RESTRACK_H_ + +#include +#include + +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /* + * @rwsem: Read/write lock to protect erase of entry. + * Lists and insertions are protected by XArray internal lock. + */ + struct rw_semaphore rwsem; + /** + * @xa: Array of XArray structures to hold restrack entries. + * We want to use array of XArrays because insertion is type + * dependent. For types with xisiting unique ID (like QPN), + * we will insert to that unique index. For other types, + * we insert based on pointers and auto-allocate unique index. 
+ */ + struct xarray xa[RDMA_RESTRACK_MAX]; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id[RDMA_RESTRACK_MAX]; }; + + +int rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +#endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 64ee7c08be22..2a17c2b30073 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2533,6 +2533,8 @@ struct ib_device_ops { DECLARE_RDMA_OBJ_SIZE(ib_pd); }; +struct rdma_restrack_root; + struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; @@ -2589,10 +2591,7 @@ struct ib_device { #endif u32 index; - /* - * Implementation details of the RDMA core, don't use in drivers - */ - struct rdma_restrack_root res; + struct rdma_restrack_root *res; const struct uapi_definition *driver_def; enum rdma_driver_id driver_id; diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 53e1a7fb7355..ecf3c7702a4f 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -7,7 +7,6 @@ #define _RDMA_RESTRACK_H_ #include -#include #include #include #include @@ -50,31 +49,6 @@ enum rdma_restrack_type { }; struct ib_device; -struct rdma_restrack_entry; - -/** - * struct rdma_restrack_root - main resource tracking management - * entity, per-device - */ -struct rdma_restrack_root { - /* - * @rwsem: Read/write lock to protect erase of entry. - * Lists and insertions are protected by XArray internal lock. - */ - struct rw_semaphore rwsem; - /** - * @xa: Array of XArray structures to hold restrack entries. - * We want to use array of XArrays because insertion is type - * dependent. For types with xisiting unique ID (like QPN), - * we will insert to that unique index. For other types, - * we insert based on pointers and auto-allocate unique index. - */ - struct xarray xa[RDMA_RESTRACK_MAX]; - /** - * @next_id: Next ID to support cyclic allocation - */ - u32 next_id[RDMA_RESTRACK_MAX]; -}; /** * struct rdma_restrack_entry - metadata per-entry @@ -125,8 +99,6 @@ struct rdma_restrack_entry { u32 id; }; -void rdma_restrack_init(struct ib_device *dev); -void rdma_restrack_clean(struct ib_device *dev); int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns); -- cgit v1.2.3 From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:49 +0200 Subject: RDMA/nldev: Share with user-space object IDs Give user-space tools a unique identifier for PD, MR, CQ and CM_ID objects, so they can query them with .doit callbacks. QP .doit is not supported yet, until all drivers are updated to provide an LQPN equal to their restrack ID.
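For illustration, a sketch of how a .doit handler could consume these IDs (hypothetical function; the real per-object handlers arrive in later patches):

#include <linux/err.h>
#include <rdma/restrack.h>

static int res_get_pd_sketch(struct ib_device *dev, u32 pdn)
{
        struct rdma_restrack_entry *res;

        res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_PD, pdn);
        if (IS_ERR(res))
                return PTR_ERR(res);

        /* ... fill a netlink reply from 'res' here ... */

        rdma_restrack_put(res);
        return 0;
}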
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 20 ++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 9 +++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9b4f891771c4..81d7ee3dcb20 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -108,6 +108,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -466,6 +470,9 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, bool has_cap_net_admin, &cm_id->route.addr.dst_addr)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CM_IDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -494,6 +501,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -522,6 +532,9 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -552,6 +565,9 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -893,6 +909,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_cmd = RDMA_NLDEV_CMD_RES_CM_ID_GET, .nldev_attr = RDMA_NLDEV_ATTR_RES_CM_ID, .entry = RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CM_IDN, }, [RDMA_RESTRACK_CQ] = { .fill_res_func = fill_res_cq_entry, @@ -900,6 +917,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_CQ, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_CQ_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_CQN, }, [RDMA_RESTRACK_MR] = { .fill_res_func = fill_res_mr_entry, @@ -907,6 +925,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_MR, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_MR_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_MRN, }, [RDMA_RESTRACK_PD] = { .fill_res_func = fill_res_pd_entry, @@ -914,6 +933,7 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .nldev_attr = RDMA_NLDEV_ATTR_RES_PD, .flags = NLDEV_PER_DEV, .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, + .id = RDMA_NLDEV_ATTR_RES_PDN, }, }; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3a9e681e4257..43362132e0d7 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -456,6 +456,15 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ + /* + * Indexes to get/set secific entry, + * for QP use RDMA_NLDEV_ATTR_RES_LQPN + */ + 
RDMA_NLDEV_ATTR_RES_PDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ + RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + /* * Always the end */ -- cgit v1.2.3 From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:50 +0200 Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects PD, MR and QP objects have parent objects: contexts and PDs. The exposed parent IDs make it possible to correlate the various objects and simplify debug investigation. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 18 ++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 81d7ee3dcb20..e6c7cc510556 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -112,6 +112,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -420,6 +421,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, qp->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -503,6 +508,10 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + cq->uobject->context->res.id)) + goto err; if (fill_res_name_pid(msg, res)) goto err; @@ -535,6 +544,10 @@ static int fill_res_mr_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_MRN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, mr->pd->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; @@ -568,6 +581,11 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PDN, res->id)) goto err; + if (!rdma_is_kernel_res(res) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CTXN, + pd->uobject->context->res.id)) + goto err; + if (fill_res_name_pid(msg, res)) goto err; diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 43362132e0d7..4ebbcfb2c6ef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -464,6 +464,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ /* * Always the end -- cgit v1.2.3 From ea1075edcbab7d92f4e4ccf5490043f796bf78be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:47 -0700 Subject: RDMA: Add and use rdma_for_each_port We have many loops iterating over all of the end port numbers on a struct ib_device; simplify them with a for_each helper.
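A usage sketch of the helper (hypothetical function; the macro definition itself is added below in include/rdma/ib_verbs.h):

#include <rdma/ib_verbs.h>

static unsigned int count_ib_ports(struct ib_device *device)
{
        /* must be unsigned int: the macro BUILD_BUG()s on any other type */
        unsigned int port, n = 0;

        rdma_for_each_port (device, port)
                if (rdma_protocol_ib(device, port))
                        n++;
        return n;
}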
Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- .clang-format | 1 + drivers/infiniband/core/cache.c | 6 +++--- drivers/infiniband/core/cma.c | 7 +++---- drivers/infiniband/core/device.c | 26 ++++++++++++-------------- drivers/infiniband/core/mad.c | 4 ++-- drivers/infiniband/core/nldev.c | 4 ++-- drivers/infiniband/core/security.c | 11 +++++++---- drivers/infiniband/core/sysfs.c | 12 +++--------- drivers/infiniband/core/user_mad.c | 9 +++++---- drivers/infiniband/ulp/ipoib/ipoib_main.c | 4 ++-- drivers/infiniband/ulp/srp/ib_srp.c | 5 +++-- include/rdma/ib_verbs.h | 10 ++++++++++ 12 files changed, 53 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/.clang-format b/.clang-format index 335ce29ab813..201a4f531b90 100644 --- a/.clang-format +++ b/.clang-format @@ -361,6 +361,7 @@ ForEachMacros: - 'radix_tree_for_each_slot' - 'radix_tree_for_each_tagged' - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_port' - 'resource_list_for_each_entry' - 'resource_list_for_each_entry_safe' - 'rhl_for_each_entry_rcu' diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 2338d0b3a0ca..3d137d8381a9 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1428,7 +1428,7 @@ static void ib_cache_event(struct ib_event_handler *handler, int ib_cache_setup_one(struct ib_device *device) { - int p; + unsigned int p; int err; rwlock_init(&device->cache.lock); @@ -1447,8 +1447,8 @@ int ib_cache_setup_one(struct ib_device *device) return err; } - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - ib_cache_update(device, p + rdma_start_port(device), true); + rdma_for_each_port (device, p) + ib_cache_update(device, p, true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c43512752b8a..68c997be2429 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -659,7 +659,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; - u8 port; + unsigned int port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) @@ -673,8 +673,7 @@ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { - for (port = rdma_start_port(cma_dev->device); - port <= rdma_end_port(cma_dev->device); port++) { + rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? 
&iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; @@ -4548,7 +4547,7 @@ static void cma_add_one(struct ib_device *device) if (!cma_dev->default_roce_tos) goto free_gid_type; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f7e206033d39..71582f848a9c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -470,10 +470,8 @@ static int verify_immutable(const struct ib_device *dev, u8 port) static int read_port_immutable(struct ib_device *device) { + unsigned int port; int ret; - u8 start_port = rdma_start_port(device); - u8 end_port = rdma_end_port(device); - u8 port; /** * device->port_immutable is indexed directly by the port number to make @@ -482,13 +480,13 @@ static int read_port_immutable(struct ib_device *device) * Therefore port_immutable is declared as a 1 based array with * potential empty slots at the beginning. */ - device->port_immutable = kcalloc(end_port + 1, - sizeof(*device->port_immutable), - GFP_KERNEL); + device->port_immutable = + kcalloc(rdma_end_port(device) + 1, + sizeof(*device->port_immutable), GFP_KERNEL); if (!device->port_immutable) return -ENOMEM; - for (port = start_port; port <= end_port; ++port) { + rdma_for_each_port (device, port) { ret = device->ops.get_port_immutable( device, port, &device->port_immutable[port]); if (ret) @@ -540,9 +538,9 @@ static void ib_policy_change_task(struct work_struct *work) down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { - int i; + unsigned int i; - for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + rdma_for_each_port (dev, i) { u64 sp; int ret = ib_get_cached_subnet_prefix(dev, i, @@ -1060,10 +1058,9 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_callback cb, void *cookie) { - u8 port; + unsigned int port; - for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); - port++) + rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = NULL; @@ -1217,9 +1214,10 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; - int ret, port, i; + unsigned int port; + int ret, i; - for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { + rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7870823bac47..e742a6a2c138 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3326,9 +3326,9 @@ error: static void ib_mad_remove_device(struct ib_device *device, void *client_data) { - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { if (!rdma_cap_ib_mad(device, i)) continue; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index e7350d9d60e9..85f6f2bcce40 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -774,7 +774,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, u32 idx = 0; u32 ifindex; int err; - u32 p; + unsigned int p; err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, NULL); @@ -786,7 +786,7 @@ static 
int nldev_port_get_dumpit(struct sk_buff *skb, if (!device) return -EINVAL; - for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { /* * The dumpit function returns all information from specific * index. This specific index is taken from the netlink diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index dad6a94a43f3..492702b83600 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -422,12 +422,15 @@ void ib_close_shared_qp_security(struct ib_qp_security *sec) int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) { - u8 i = rdma_start_port(dev); + unsigned int i; bool is_ib = false; int ret; - while (i <= rdma_end_port(dev) && !is_ib) + rdma_for_each_port (dev, i) { is_ib = rdma_protocol_ib(dev, i++); + if (is_ib) + break; + } /* If this isn't an IB device don't create the security context */ if (!is_ib) @@ -561,9 +564,9 @@ void ib_security_cache_change(struct ib_device *device, void ib_security_release_port_pkey_list(struct ib_device *device) { struct pkey_index_qp_list *pkey, *tmp_pkey; - int i; + unsigned int i; - for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + rdma_for_each_port (device, i) { list_for_each_entry_safe(pkey, tmp_pkey, &device->port_pkey_list[i].pkey_list, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9335b15c2e38..9b6a065bdfa5 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1308,23 +1308,17 @@ static void ib_free_port_attrs(struct ib_device *device) static int ib_setup_port_attrs(struct ib_device *device) { + unsigned int port; int ret; - int i; device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj); if (!device->ports_kobj) return -ENOMEM; - if (rdma_cap_ib_switch(device)) { - ret = add_port(device, 0); + rdma_for_each_port (device, port) { + ret = add_port(device, port); if (ret) goto err_put; - } else { - for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i); - if (ret) - goto err_put; - } } return 0; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 3ebd211a87ed..02b7947ab215 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1323,14 +1323,15 @@ free: static void ib_umad_remove_one(struct ib_device *device, void *client_data) { struct ib_umad_device *umad_dev = client_data; - int i; + unsigned int i; if (!umad_dev) return; - for (i = 0; i <= rdma_end_port(device) - rdma_start_port(device); ++i) { - if (rdma_cap_ib_mad(device, i + rdma_start_port(device))) - ib_umad_kill_port(&umad_dev->ports[i]); + rdma_for_each_port (device, i) { + if (rdma_cap_ib_mad(device, i)) + ib_umad_kill_port( + &umad_dev->ports[i - rdma_start_port(device)]); } /* balances kref_init() */ ib_umad_dev_put(umad_dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ee4cca80f00b..48eda16db1a7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2495,7 +2495,7 @@ static void ipoib_add_one(struct ib_device *device) struct list_head *dev_list; struct net_device *dev; struct ipoib_dev_priv *priv; - int p; + unsigned int p; int count = 0; dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); @@ -2504,7 +2504,7 @@ static void ipoib_add_one(struct ib_device *device) INIT_LIST_HEAD(dev_list); - for (p = rdma_start_port(device); p <= 
rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { if (!rdma_protocol_ib(device, p)) continue; dev = ipoib_add_port("ib%d", device, p); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 84184910f038..151f4eba84b8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -4127,7 +4127,8 @@ static void srp_add_one(struct ib_device *device) struct srp_device *srp_dev; struct ib_device_attr *attr = &device->attrs; struct srp_host *host; - int mr_page_shift, p; + int mr_page_shift; + unsigned int p; u64 max_pages_per_mr; unsigned int flags = 0; @@ -4194,7 +4195,7 @@ static void srp_add_one(struct ib_device *device) WARN_ON_ONCE(srp_dev->global_rkey == 0); } - for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { + rdma_for_each_port (device, p) { host = srp_add_port(srp_dev, p); if (host) list_add_tail(&host->list, &srp_dev->dev_list); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2a17c2b30073..fa0edd6ae33c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2827,6 +2827,16 @@ static inline u8 rdma_start_port(const struct ib_device *device) return rdma_cap_ib_switch(device) ? 0 : 1; } +/** + * rdma_for_each_port - Iterate over all valid port numbers of the IB device + * @device - The struct ib_device * to iterate over + * @iter - The unsigned int to store the port number + */ +#define rdma_for_each_port(device, iter) \ + for (iter = rdma_start_port(device + BUILD_BUG_ON_ZERO(!__same_type( \ + unsigned int, iter))); \ + iter <= rdma_end_port(device); (iter)++) + /** * rdma_end_port - Return the last valid port number for the device * specified -- cgit v1.2.3 From 8ceb1357b33790193e9d55d2d09bcfd6bd59dd6d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:48 -0700 Subject: RDMA/device: Consolidate ib_device per_port data into one place There is no reason to have three allocations of per-port data. Combine them and make the lifetime of all the per-port data match the struct ib_device. Following patches will require more port-specific data; now there is a good place to put it.
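The consolidated shape, abridged from the ib_verbs.h hunk of this patch (sketch only; the field list is not exhaustive):

/* One allocation, indexed directly by port number. */
struct ib_port_data {
        /* was ib_device->port_immutable */
        struct ib_port_immutable immutable;

        /* was ib_device->port_pkey_list */
        spinlock_t pkey_list_lock;
        struct list_head pkey_list;
};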
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 4 +-- drivers/infiniband/core/device.c | 70 ++++++++++++------------------------ drivers/infiniband/core/security.c | 24 ++++++------- include/rdma/ib_verbs.h | 74 ++++++++++++++++++++++---------------- 4 files changed, 78 insertions(+), 94 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 3d137d8381a9..9d0e8aca741a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -881,8 +881,8 @@ static int _gid_table_setup_one(struct ib_device *ib_dev) for (port = 0; port < ib_dev->phys_port_cnt; port++) { u8 rdma_port = port + rdma_start_port(ib_dev); - table = alloc_gid_table( - ib_dev->port_immutable[rdma_port].gid_tbl_len); + table = alloc_gid_table( + ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 71582f848a9c..8d7d63a60ef5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -293,8 +293,7 @@ static void ib_device_release(struct device *device) WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); - kfree(dev->port_pkey_list); - kfree(dev->port_immutable); + kfree(dev->port_data); xa_destroy(&dev->client_data); kfree(dev); } @@ -468,27 +467,31 @@ static int verify_immutable(const struct ib_device *dev, u8 port) rdma_max_mad_size(dev, port) != 0); } -static int read_port_immutable(struct ib_device *device) +static int setup_port_data(struct ib_device *device) { unsigned int port; int ret; - /** - * device->port_immutable is indexed directly by the port number to make + /* + * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * - * Therefore port_immutable is declared as a 1 based array with - * potential empty slots at the beginning. + * Therefore port_data is declared as a 1 based array with potential + * empty slots at the beginning. */ - device->port_immutable = - kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_immutable), GFP_KERNEL); - if (!device->port_immutable) + device->port_data = kcalloc(rdma_end_port(device) + 1, + sizeof(*device->port_data), GFP_KERNEL); + if (!device->port_data) return -ENOMEM; rdma_for_each_port (device, port) { - ret = device->ops.get_port_immutable( - device, port, &device->port_immutable[port]); + struct ib_port_data *pdata = &device->port_data[port]; + + spin_lock_init(&pdata->pkey_list_lock); + INIT_LIST_HEAD(&pdata->pkey_list); + + ret = device->ops.get_port_immutable(device, port, + &pdata->immutable); if (ret) return ret; @@ -507,30 +510,6 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str) } EXPORT_SYMBOL(ib_get_device_fw_str); -static int setup_port_pkey_list(struct ib_device *device) -{ - int i; - - /** - * device->port_pkey_list is indexed directly by the port number, - * Therefore it is declared as a 1 based array with potential empty - * slots at the beginning. 
- */ - device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_pkey_list), - GFP_KERNEL); - - if (!device->port_pkey_list) - return -ENOMEM; - - for (i = 0; i < (rdma_end_port(device) + 1); i++) { - spin_lock_init(&device->port_pkey_list[i].list_lock); - INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); - } - - return 0; -} - static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; @@ -668,10 +647,9 @@ static int setup_device(struct ib_device *device) if (ret) return ret; - ret = read_port_immutable(device); + ret = setup_port_data(device); if (ret) { - dev_warn(&device->dev, - "Couldn't create per port immutable data\n"); + dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } @@ -683,12 +661,6 @@ static int setup_device(struct ib_device *device) return ret; } - ret = setup_port_pkey_list(device); - if (ret) { - dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - return ret; - } - return 0; } @@ -1221,7 +1193,8 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, if (!rdma_protocol_ib(device, port)) continue; - for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { + for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; + ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) return ret; @@ -1253,7 +1226,8 @@ int ib_find_pkey(struct ib_device *device, u16 tmp_pkey; int partial_ix = -1; - for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) { + for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; + ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 492702b83600..1ab423b19f77 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -49,16 +49,15 @@ static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) struct pkey_index_qp_list *tmp_pkey; struct ib_device *dev = pp->sec->dev; - spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); - list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[pp->port_num].pkey_list, - pkey_index_list) { + spin_lock(&dev->port_data[pp->port_num].pkey_list_lock); + list_for_each_entry (tmp_pkey, &dev->port_data[pp->port_num].pkey_list, + pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { pkey = tmp_pkey; break; } } - spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + spin_unlock(&dev->port_data[pp->port_num].pkey_list_lock); return pkey; } @@ -263,12 +262,12 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) if (!pkey) return -ENOMEM; - spin_lock(&dev->port_pkey_list[port_num].list_lock); + spin_lock(&dev->port_data[port_num].pkey_list_lock); /* Check for the PKey again. A racing process may * have created it. 
*/ list_for_each_entry(tmp_pkey, - &dev->port_pkey_list[port_num].pkey_list, + &dev->port_data[port_num].pkey_list, pkey_index_list) { if (tmp_pkey->pkey_index == pp->pkey_index) { kfree(pkey); @@ -283,9 +282,9 @@ static int port_pkey_list_insert(struct ib_port_pkey *pp) spin_lock_init(&pkey->qp_list_lock); INIT_LIST_HEAD(&pkey->qp_list); list_add(&pkey->pkey_index_list, - &dev->port_pkey_list[port_num].pkey_list); + &dev->port_data[port_num].pkey_list); } - spin_unlock(&dev->port_pkey_list[port_num].list_lock); + spin_unlock(&dev->port_data[port_num].pkey_list_lock); } spin_lock(&pkey->qp_list_lock); @@ -551,9 +550,8 @@ void ib_security_cache_change(struct ib_device *device, { struct pkey_index_qp_list *pkey; - list_for_each_entry(pkey, - &device->port_pkey_list[port_num].pkey_list, - pkey_index_list) { + list_for_each_entry (pkey, &device->port_data[port_num].pkey_list, + pkey_index_list) { check_pkey_qps(pkey, device, port_num, @@ -569,7 +567,7 @@ void ib_security_release_port_pkey_list(struct ib_device *device) rdma_for_each_port (device, i) { list_for_each_entry_safe(pkey, tmp_pkey, - &device->port_pkey_list[i].pkey_list, + &device->port_data[i].pkey_list, pkey_index_list) { list_del(&pkey->pkey_index_list); kfree(pkey); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fa0edd6ae33c..b42e257814f7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2198,6 +2198,13 @@ struct ib_port_immutable { u32 max_mad_size; }; +struct ib_port_data { + struct ib_port_immutable immutable; + + spinlock_t pkey_list_lock; + struct list_head pkey_list; +}; + /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, @@ -2243,12 +2250,6 @@ struct rdma_netdev_alloc_params { struct net_device *netdev, void *param); }; -struct ib_port_pkey_list { - /* Lock to hold while modifying the list. 
*/ - spinlock_t list_lock; - struct list_head pkey_list; -}; - struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; @@ -2549,14 +2550,12 @@ struct ib_device { struct ib_cache cache; /** - * port_immutable is indexed by port number + * port_data is indexed by port number */ - struct ib_port_immutable *port_immutable; + struct ib_port_data *port_data; int num_comp_vectors; - struct ib_port_pkey_list *port_pkey_list; - struct iw_cm_verbs *iwcm; struct module *owner; @@ -2860,34 +2859,38 @@ static inline int rdma_is_port_valid(const struct ib_device *device, static inline bool rdma_is_grh_required(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - RDMA_CORE_PORT_IB_GRH_REQUIRED; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); + return device->port_data[port_num].immutable.core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) @@ -2898,12 +2901,14 @@ static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_USNIC; } /** @@ -2920,7 +2925,8 @@ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_n */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_MAD; } /** @@ -2944,8 +2950,8 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 
port_num) { - return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD; + return (device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD; } /** @@ -2970,7 +2976,8 @@ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_SMI; } /** @@ -2990,7 +2997,8 @@ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_CM; } /** @@ -3007,7 +3015,8 @@ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IW_CM; } /** @@ -3027,7 +3036,8 @@ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_SA; } /** @@ -3067,7 +3077,8 @@ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_AF_IB; } /** @@ -3088,7 +3099,8 @@ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_ETH_AH; } /** @@ -3102,7 +3114,7 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) { - return (device->port_immutable[port_num].core_cap_flags & + return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } @@ -3120,7 +3132,7 @@ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].max_mad_size; + return device->port_data[port_num].immutable.max_mad_size; } /** -- cgit v1.2.3 From 8faea9fd4a3914f12cd343e10810ec5f4215ddd6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:49 -0700 Subject: RDMA/cache: Move the cache per-port data into the main ib_port_data Like the other cases there is no real reason to have another array just for the cache. This larger conversion gets its own patch.
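For illustration only, a sketch of the resulting access pattern (a hypothetical helper mirroring ib_get_cached_lmc() from the diff below): the port number now indexes port_data directly, so the old '- rdma_start_port(device)' offset disappears:

static int read_cached_lmc(struct ib_device *device, u8 port_num, u8 *lmc)
{
	unsigned long flags;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	read_lock_irqsave(&device->cache.lock, flags);
	*lmc = device->port_data[port_num].cache.lmc;
	read_unlock_irqrestore(&device->cache.lock, flags);

	return 0;
}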
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 89 ++++++++++++++--------------------------- include/rdma/ib_verbs.h | 3 +- 2 files changed, 33 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 9d0e8aca741a..a28dc1901c80 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -185,7 +185,7 @@ EXPORT_SYMBOL(ib_cache_gid_parse_type_str); static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) { - return device->cache.ports[port - rdma_start_port(device)].gid; + return device->port_data[port].cache.gid; } static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) @@ -765,7 +765,7 @@ err_free_table: return NULL; } -static void release_gid_table(struct ib_device *device, u8 port, +static void release_gid_table(struct ib_device *device, struct ib_gid_table *table) { bool leak = false; @@ -863,31 +863,27 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static void gid_table_release_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - release_gid_table(ib_dev, port, table); - ib_dev->cache.ports[port].gid = NULL; + rdma_for_each_port (ib_dev, p) { + release_gid_table(ib_dev, ib_dev->port_data[p].cache.gid); + ib_dev->port_data[p].cache.gid = NULL; } } static int _gid_table_setup_one(struct ib_device *ib_dev) { - u8 port; struct ib_gid_table *table; + unsigned int rdma_port; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - u8 rdma_port = port + rdma_start_port(ib_dev); - + rdma_for_each_port (ib_dev, rdma_port) { table = alloc_gid_table( ib_dev->port_data[rdma_port].immutable.gid_tbl_len); if (!table) goto rollback_table_setup; gid_table_reserve_default(ib_dev, rdma_port, table); - ib_dev->cache.ports[port].gid = table; + ib_dev->port_data[rdma_port].cache.gid = table; } return 0; @@ -898,14 +894,11 @@ rollback_table_setup: static void gid_table_cleanup_one(struct ib_device *ib_dev) { - struct ib_gid_table *table; - u8 port; + unsigned int p; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table); - } + rdma_for_each_port (ib_dev, p) + cleanup_gid_table_port(ib_dev, p, + ib_dev->port_data[p].cache.gid); } static int gid_table_setup_one(struct ib_device *ib_dev) @@ -983,17 +976,17 @@ const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - u8 p; + unsigned int p; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - for (p = 0; p < device->phys_port_cnt; p++) { + rdma_for_each_port(device, p) { struct ib_gid_table *table; unsigned long flags; int index; - table = device->cache.ports[p].gid; + table = device->port_data[p].cache.gid; read_lock_irqsave(&table->rwlock, flags); index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); if (index >= 0) { @@ -1025,7 +1018,7 @@ int ib_get_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; if (index < 0 || index >= cache->table_len) ret = -EINVAL; @@ -1043,14 +1036,12 @@ int 
ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx) { unsigned long flags; - int p; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - p = port_num - rdma_start_port(device); read_lock_irqsave(&device->cache.lock, flags); - *sn_pfx = device->cache.ports[p].subnet_prefix; + *sn_pfx = device->port_data[port_num].cache.subnet_prefix; read_unlock_irqrestore(&device->cache.lock, flags); return 0; @@ -1073,7 +1064,7 @@ int ib_find_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1113,7 +1104,7 @@ int ib_find_exact_cached_pkey(struct ib_device *device, read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; + cache = device->port_data[port_num].cache.pkey; *index = -1; @@ -1141,7 +1132,7 @@ int ib_get_cached_lmc(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; + *lmc = device->port_data[port_num].cache.lmc; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1159,8 +1150,7 @@ int ib_get_cached_port_state(struct ib_device *device, return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *port_state = device->cache.ports[port_num - - rdma_start_port(device)].port_state; + *port_state = device->port_data[port_num].cache.port_state; read_unlock_irqrestore(&device->cache.lock, flags); return ret; @@ -1361,16 +1351,13 @@ static void ib_cache_update(struct ib_device *device, write_lock_irq(&device->cache.lock); - old_pkey_cache = device->cache.ports[port - - rdma_start_port(device)].pkey; + old_pkey_cache = device->port_data[port].cache.pkey; - device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; - device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; - device->cache.ports[port - rdma_start_port(device)].port_state = - tprops->state; + device->port_data[port].cache.pkey = pkey_cache; + device->port_data[port].cache.lmc = tprops->lmc; + device->port_data[port].cache.port_state = tprops->state; - device->cache.ports[port - rdma_start_port(device)].subnet_prefix = - tprops->subnet_prefix; + device->port_data[port].cache.subnet_prefix = tprops->subnet_prefix; write_unlock_irq(&device->cache.lock); if (enforce_security) @@ -1433,19 +1420,9 @@ int ib_cache_setup_one(struct ib_device *device) rwlock_init(&device->cache.lock); - device->cache.ports = - kcalloc(rdma_end_port(device) - rdma_start_port(device) + 1, - sizeof(*device->cache.ports), - GFP_KERNEL); - if (!device->cache.ports) - return -ENOMEM; - err = gid_table_setup_one(device); - if (err) { - kfree(device->cache.ports); - device->cache.ports = NULL; + if (err) return err; - } rdma_for_each_port (device, p) ib_cache_update(device, p, true); @@ -1458,10 +1435,7 @@ int ib_cache_setup_one(struct ib_device *device) void ib_cache_release_one(struct ib_device *device) { - int p; - - if (!device->cache.ports) - return; + unsigned int p; /* * The release function frees all the cache elements. @@ -1469,11 +1443,10 @@ void ib_cache_release_one(struct ib_device *device) * all the device's resources when the cache could no * longer be accessed. 
*/ - for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - kfree(device->cache.ports[p].pkey); + rdma_for_each_port (device, p) + kfree(device->port_data[p].cache.pkey); gid_table_release_one(device); - kfree(device->cache.ports); } void ib_cache_cleanup_one(struct ib_device *device) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b42e257814f7..50b7ebc2885e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2186,7 +2186,6 @@ struct ib_port_cache { struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; - struct ib_port_cache *ports; }; struct iw_cm_verbs; @@ -2203,6 +2202,8 @@ struct ib_port_data { spinlock_t pkey_list_lock; struct list_head pkey_list; + + struct ib_port_cache cache; }; /* rdma netdev type - specifies protocol type */ -- cgit v1.2.3 From c2261dd76b549754c14c8ac7cadadd0993b182d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:50 -0700 Subject: RDMA/device: Add ib_device_set_netdev() as an alternative to get_netdev The associated netdev should not actually be very dynamic, so for most drivers there is no reason for a callback like this. Provide an API to inform the core code about the net dev affiliation and use a core maintained data structure instead. This allows the core code to be more aware of the ndev relationship which will allow some new APIs based around this. This also uses locking that makes some kind of sense, many drivers had a confusing RCU lock, or missing locking which isn't right. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 24 +++--- drivers/infiniband/core/core_priv.h | 3 + drivers/infiniband/core/device.c | 166 ++++++++++++++++++++++++++++++++---- drivers/infiniband/core/nldev.c | 4 +- drivers/infiniband/core/verbs.c | 5 +- include/rdma/ib_verbs.h | 7 ++ 6 files changed, 171 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index a28dc1901c80..43c67e5f43c6 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -547,21 +547,19 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, unsigned long mask; int ret; - if (ib_dev->ops.get_netdev) { - idev = ib_dev->ops.get_netdev(ib_dev, port); - if (idev && attr->ndev != idev) { - union ib_gid default_gid; - - /* Adding default GIDs in not permitted */ - make_default_gid(idev, &default_gid); - if (!memcmp(gid, &default_gid, sizeof(*gid))) { - dev_put(idev); - return -EPERM; - } - } - if (idev) + idev = ib_device_get_netdev(ib_dev, port); + if (idev && attr->ndev != idev) { + union ib_gid default_gid; + + /* Adding default GIDs is not permitted */ + make_default_gid(idev, &default_gid); + if (!memcmp(gid, &default_gid, sizeof(*gid))) { dev_put(idev); + return -EPERM; + } } + if (idev) + dev_put(idev); mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE | diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index eeabe9ca8427..08c690249594 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -66,6 +66,9 @@ typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port); + void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, diff --git 
a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8d7d63a60ef5..7680a64a98bc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,7 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) +static void free_netdevs(struct ib_device *ib_dev); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -290,6 +291,7 @@ static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); + free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); @@ -371,6 +373,9 @@ EXPORT_SYMBOL(_ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + /* Expedite releasing netdev references */ + free_netdevs(device); + WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); @@ -461,16 +466,16 @@ static void remove_client_context(struct ib_device *device, up_read(&device->client_data_rwsem); } -static int verify_immutable(const struct ib_device *dev, u8 port) -{ - return WARN_ON(!rdma_cap_ib_mad(dev, port) && - rdma_max_mad_size(dev, port) != 0); -} - -static int setup_port_data(struct ib_device *device) +static int alloc_port_data(struct ib_device *device) { unsigned int port; - int ret; + + if (device->port_data) + return 0; + + /* This can only be called once the physical port range is defined */ + if (WARN_ON(!device->phys_port_cnt)) + return -EINVAL; /* * device->port_data is indexed directly by the port number to make @@ -489,6 +494,28 @@ static int setup_port_data(struct ib_device *device) spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); + spin_lock_init(&pdata->netdev_lock); + } + return 0; +} + +static int verify_immutable(const struct ib_device *dev, u8 port) +{ + return WARN_ON(!rdma_cap_ib_mad(dev, port) && + rdma_max_mad_size(dev, port) != 0); +} + +static int setup_port_data(struct ib_device *device) +{ + unsigned int port; + int ret; + + ret = alloc_port_data(device); + if (ret) + return ret; + + rdma_for_each_port (device, port) { + struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); @@ -682,6 +709,9 @@ static void disable_device(struct ib_device *device) /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(device); } /* @@ -1012,6 +1042,114 @@ int ib_query_port(struct ib_device *device, } EXPORT_SYMBOL(ib_query_port); +/** + * ib_device_set_netdev - Associate the ib_dev with an underlying net_device + * @ib_dev: Device to modify + * @ndev: net_device to affiliate, may be NULL + * @port: IB port the net_device is connected to + * + * Drivers should use this to link the ib_device to a netdev so the netdev + * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be + * affiliated with any port. + * + * The caller must ensure that the given ndev is not unregistered or + * unregistering, and that either the ib_device is unregistered or + * ib_device_set_netdev() is called with NULL when the ndev sends a + * NETDEV_UNREGISTER event. 
+ */ +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port) +{ + struct net_device *old_ndev; + struct ib_port_data *pdata; + unsigned long flags; + int ret; + + /* + * Drivers wish to call this before ib_register_driver, so we have to + * setup the port data early. + */ + ret = alloc_port_data(ib_dev); + if (ret) + return ret; + + if (!rdma_is_port_valid(ib_dev, port)) + return -EINVAL; + + pdata = &ib_dev->port_data[port]; + spin_lock_irqsave(&pdata->netdev_lock, flags); + if (pdata->netdev == ndev) { + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + return 0; + } + old_ndev = pdata->netdev; + + if (ndev) + dev_hold(ndev); + pdata->netdev = ndev; + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + + if (old_ndev) + dev_put(old_ndev); + + return 0; +} +EXPORT_SYMBOL(ib_device_set_netdev); + +static void free_netdevs(struct ib_device *ib_dev) +{ + unsigned long flags; + unsigned int port; + + rdma_for_each_port (ib_dev, port) { + struct ib_port_data *pdata = &ib_dev->port_data[port]; + + spin_lock_irqsave(&pdata->netdev_lock, flags); + if (pdata->netdev) { + dev_put(pdata->netdev); + pdata->netdev = NULL; + } + spin_unlock_irqrestore(&pdata->netdev_lock, flags); + } +} + +struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, + unsigned int port) +{ + struct ib_port_data *pdata; + struct net_device *res; + + if (!rdma_is_port_valid(ib_dev, port)) + return NULL; + + pdata = &ib_dev->port_data[port]; + + /* + * New drivers should use ib_device_set_netdev() not the legacy + * get_netdev(). + */ + if (ib_dev->ops.get_netdev) + res = ib_dev->ops.get_netdev(ib_dev, port); + else { + spin_lock(&pdata->netdev_lock); + res = pdata->netdev; + if (res) + dev_hold(res); + spin_unlock(&pdata->netdev_lock); + } + + /* + * If we are starting to unregister expedite things by preventing + * propagation of an unregistering netdev. 
+ */ + if (res && res->reg_state != NETREG_REGISTERED) { + dev_put(res); + return NULL; + } + + return res; +} + /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query @@ -1034,16 +1172,8 @@ void ib_enum_roce_netdev(struct ib_device *ib_dev, rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { - struct net_device *idev = NULL; - - if (ib_dev->ops.get_netdev) - idev = ib_dev->ops.get_netdev(ib_dev, port); - - if (idev && - idev->reg_state >= NETREG_UNREGISTERED) { - dev_put(idev); - idev = NULL; - } + struct net_device *idev = + ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 85f6f2bcce40..1980ddc5f7bc 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -268,9 +268,7 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) return -EMSGSIZE; - if (device->ops.get_netdev) - netdev = device->ops.get_netdev(device, port); - + netdev = ib_device_get_netdev(device, port); if (netdev && net_eq(dev_net(netdev), net)) { ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index de5d895a5054..5a5e83f5f0fc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1723,10 +1723,7 @@ int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; - if (!dev->ops.get_netdev) - return -EOPNOTSUPP; - - netdev = dev->ops.get_netdev(dev, port_num); + netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 50b7ebc2885e..7f81a313c01b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2204,6 +2204,9 @@ struct ib_port_data { struct list_head pkey_list; struct ib_port_cache cache; + + spinlock_t netdev_lock; + struct net_device *netdev; }; /* rdma netdev type - specifies protocol type */ @@ -3996,6 +3999,10 @@ void ib_device_put(struct ib_device *device); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port); +struct net_device *ib_device_netdev(struct ib_device *dev, u8 port); + struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq(struct ib_wq *wq); -- cgit v1.2.3 From 324e227ea7c952626abafe72db42ae0d70220a6e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:51 -0700 Subject: RDMA/device: Add ib_device_get_by_netdev() Several drivers need to find the ib_device from a given netdev. rxe needs this at speed in an unsleepable context, so choose to implement the translation using a RCU safe hash table. The hash table can have a many to one mapping. This is intended to support some future case where multiple IB drivers (ie iWarp and RoCE) connect to the same netdevs. driver_ids will need to be different to support this. In the process this makes the struct ib_device and ib_port_data RCU safe by deferring their kfrees. 
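For illustration only, a sketch of the expected caller pattern (the function below is hypothetical; ib_device_get_by_netdev() is added by this patch): the lookup is an RCU hash walk, so it is safe in unsleepable context, and a non-NULL result carries a 'get' the caller must drop:

static void example_use(struct net_device *ndev)
{
	struct ib_device *ibdev;

	/* RDMA_DRIVER_UNKNOWN matches any driver; rxe would pass its own id */
	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;

	/* ... use ibdev; unregistration waits until this get is dropped ... */

	ib_device_put(ibdev);
}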
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 119 +++++++++++++++++++++++++++++++++++---- include/rdma/ib_verbs.h | 10 +++- 2 files changed, 116 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7680a64a98bc..f6795ad7ca98 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,10 @@ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) +/* RCU hash table mapping netdevice pointers to struct ib_port_data */ +static DEFINE_SPINLOCK(ndev_hash_lock); +static DECLARE_HASHTABLE(ndev_hash, 5); + static void free_netdevs(struct ib_device *ib_dev); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); @@ -144,6 +149,12 @@ static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +/* Pointer to the RCU head at the start of the ib_port_data array */ +struct ib_port_data_rcu { + struct rcu_head rcu_head; + struct ib_port_data pdata[]; +}; + static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } @@ -295,9 +306,12 @@ static void ib_device_release(struct device *device) WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); - kfree(dev->port_data); xa_destroy(&dev->client_data); - kfree(dev); + if (dev->port_data) + kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, + pdata[0]), + rcu_head); + kfree_rcu(dev, rcu_head); } static int ib_device_uevent(struct device *device, @@ -468,6 +482,7 @@ static void remove_client_context(struct ib_device *device, static int alloc_port_data(struct ib_device *device) { + struct ib_port_data_rcu *pdata_rcu; unsigned int port; if (device->port_data) @@ -484,17 +499,26 @@ static int alloc_port_data(struct ib_device *device) * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ - device->port_data = kcalloc(rdma_end_port(device) + 1, - sizeof(*device->port_data), GFP_KERNEL); - if (!device->port_data) + pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, + rdma_end_port(device) + 1), + GFP_KERNEL); + if (!pdata_rcu) return -ENOMEM; + /* + * The rcu_head is put in front of the port data array and the stored + * pointer is adjusted since we never need to see that member until + * kfree_rcu. 
+ */ + device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; + pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); + INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } @@ -1042,6 +1066,29 @@ int ib_query_port(struct ib_device *device, } EXPORT_SYMBOL(ib_query_port); +static void add_ndev_hash(struct ib_port_data *pdata) +{ + unsigned long flags; + + might_sleep(); + + spin_lock_irqsave(&ndev_hash_lock, flags); + if (hash_hashed(&pdata->ndev_hash_link)) { + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock_irqrestore(&ndev_hash_lock, flags); + /* + * We cannot do hash_add_rcu after a hash_del_rcu until the + * grace period + */ + synchronize_rcu(); + spin_lock_irqsave(&ndev_hash_lock, flags); + } + if (pdata->netdev) + hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, + (uintptr_t)pdata->netdev); + spin_unlock_irqrestore(&ndev_hash_lock, flags); +} + /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify @@ -1078,17 +1125,19 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); - if (pdata->netdev == ndev) { + old_ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } - old_ndev = pdata->netdev; if (ndev) dev_hold(ndev); - pdata->netdev = ndev; + rcu_assign_pointer(pdata->netdev, ndev); spin_unlock_irqrestore(&pdata->netdev_lock, flags); + add_ndev_hash(pdata); if (old_ndev) dev_put(old_ndev); @@ -1103,11 +1152,24 @@ static void free_netdevs(struct ib_device *ib_dev) rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; + struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); - if (pdata->netdev) { - dev_put(pdata->netdev); - pdata->netdev = NULL; + ndev = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); + if (ndev) { + spin_lock(&ndev_hash_lock); + hash_del_rcu(&pdata->ndev_hash_link); + spin_unlock(&ndev_hash_lock); + + /* + * If this is the last dev_put there is still a + * synchronize_rcu before the netdev is kfreed, so we + * can continue to rely on unlocked pointer + * comparisons after the put + */ + rcu_assign_pointer(pdata->netdev, NULL); + dev_put(ndev); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } @@ -1132,7 +1194,8 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); - res = pdata->netdev; + res = rcu_dereference_protected( + pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (res) dev_hold(res); spin_unlock(&pdata->netdev_lock); @@ -1150,6 +1213,38 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, return res; } +/** + * ib_device_get_by_netdev - Find an IB device associated with a netdev + * @ndev: netdev to locate + * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) + * + * Find and hold an ib_device that is associated with a netdev via + * ib_device_set_netdev(). The caller must call ib_device_put() on the + * returned pointer. 
+ */ +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id) +{ + struct ib_device *res = NULL; + struct ib_port_data *cur; + + rcu_read_lock(); + hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, + (uintptr_t)ndev) { + if (rcu_access_pointer(cur->netdev) == ndev && + (driver_id == RDMA_DRIVER_UNKNOWN || + cur->ib_dev->driver_id == driver_id) && + ib_device_try_get(cur->ib_dev)) { + res = cur->ib_dev; + break; + } + } + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(ib_device_get_by_netdev); + /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7f81a313c01b..3aa802b65cf3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2198,6 +2198,8 @@ struct ib_port_immutable { }; struct ib_port_data { + struct ib_device *ib_dev; + struct ib_port_immutable immutable; spinlock_t pkey_list_lock; @@ -2206,7 +2208,8 @@ struct ib_port_data { struct ib_port_cache cache; spinlock_t netdev_lock; - struct net_device *netdev; + struct net_device __rcu *netdev; + struct hlist_node ndev_hash_link; }; /* rdma netdev type - specifies protocol type */ @@ -2545,6 +2548,7 @@ struct ib_device { struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; + struct rcu_head rcu_head; struct list_head event_handler_list; spinlock_t event_handler_lock; @@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev) } void ib_device_put(struct ib_device *device); +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id); +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); -- cgit v1.2.3 From d0899892edd089790eb17943ecf28254a909deae Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:53 -0700 Subject: RDMA/device: Provide APIs from the core code to help unregistration These APIs are intended to support drivers that exist outside the usual driver core probe()/remove() callbacks. Normally the driver core will prevent remove() from running concurrently with probe(); once this safety is lost, drivers need more support to get the locking and lifetimes right. ib_unregister_driver() is intended to be used during module_exit of a driver using these APIs. It unregisters all the associated ib_devices. ib_unregister_device_and_put() is to be used by a driver-specific removal function (ie removal by name, removal from a netdev notifier, removal from netlink). ib_unregister_device_queued() is to be used from netdev notifier chains where RTNL is held. The locking is tricky here since once things become async it is possible to race unregister with registration. This is largely solved by relying on the registration refcount, unregistration will only ever work on something that has a positive registration refcount - and then an unregistration mutex serializes all competing unregistrations of the same device.
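For illustration only, a sketch of how a soft-RDMA style driver is expected to combine these APIs (every mydrv_* name is hypothetical and RDMA_DRIVER_RXE is only a stand-in for the driver's own id; the ib_* and notifier calls are the real ones):

static void mydrv_dealloc(struct ib_device *ibdev)
{
	/* free driver-private state; the core then frees ibdev itself */
}

static const struct ib_device_ops mydrv_ops = {
	.dealloc_driver = mydrv_dealloc,	/* opts in to the new flow */
};

static int mydrv_netdev_event(struct notifier_block *nb, unsigned long event,
			      void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct ib_device *ibdev;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_RXE);
	if (!ibdev)
		return NOTIFY_DONE;

	/* RTNL is held here, so queue the unregistration; the work item
	 * takes its own reference before this get is dropped.
	 */
	ib_unregister_device_queued(ibdev);
	ib_device_put(ibdev);
	return NOTIFY_OK;
}

static void __exit mydrv_exit(void)
{
	/* Fence: returns only once all queued unregistrations completed */
	ib_unregister_driver(RDMA_DRIVER_RXE);
}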
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 242 +++++++++++++++++++++++++++++++++------ include/rdma/ib_verbs.h | 11 ++ 2 files changed, 217 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f6795ad7ca98..e470fa651961 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -140,6 +140,8 @@ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); +static void ib_unregister_work(struct work_struct *work); +static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); @@ -366,6 +368,7 @@ struct ib_device *_ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); + mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. @@ -374,6 +377,7 @@ struct ib_device *_ib_alloc_device(size_t size) init_rwsem(&device->client_data_rwsem); INIT_LIST_HEAD(&device->port_list); init_completion(&device->unreg_completion); + INIT_WORK(&device->unregistration_work, ib_unregister_work); return device; } @@ -387,6 +391,20 @@ EXPORT_SYMBOL(_ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + if (device->ops.dealloc_driver) + device->ops.dealloc_driver(device); + + /* + * ib_unregister_driver() requires all devices to remain in the xarray + * while their ops are callable. The last op we call is dealloc_driver + * above. This is needed to create a fence on op callbacks prior to + * allowing the driver module to unload. + */ + down_write(&devices_rwsem); + if (xa_load(&devices, device->index) == device) + xa_erase(&devices, device->index); + up_write(&devices_rwsem); + /* Expedite releasing netdev references */ free_netdevs(device); @@ -599,7 +617,8 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, } /* - * Assign the unique string device name and the unique device index. + * Assign the unique string device name and the unique device index. This is + * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { @@ -640,13 +659,6 @@ out: return ret; } -static void release_name(struct ib_device *device) -{ - down_write(&devices_rwsem); - xa_erase(&devices, device->index); - up_write(&devices_rwsem); -} - static void setup_dma_device(struct ib_device *device) { struct device *parent = device->dev.parent; @@ -740,30 +752,38 @@ static void disable_device(struct ib_device *device) /* * An enabled device is visible to all clients and to all the public facing - * APIs that return a device pointer. + * APIs that return a device pointer. This always returns with a new get, even + * if it fails. */ -static int enable_device(struct ib_device *device) +static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; - int ret; + int ret = 0; - refcount_set(&device->refcount, 1); + /* + * One ref belongs to the xa and the other belongs to this + * thread. This is needed to guard against parallel unregistration. 
+ */ + refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); - up_write(&devices_rwsem); + + /* + * By using downgrade_write() we ensure that no other thread can clear + * DEVICE_REGISTERED while we are completing the client setup. + */ + downgrade_write(&devices_rwsem); down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); - if (ret) { - up_read(&clients_rwsem); - disable_device(device); - return ret; - } + if (ret) + break; } up_read(&clients_rwsem); - return 0; + up_read(&devices_rwsem); + return ret; } /** @@ -774,6 +794,10 @@ static int enable_device(struct ib_device *device) * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). + * + * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() + * asynchronously then the device pointer may become freed as soon as this + * function returns. */ int ib_register_device(struct ib_device *device, const char *name) { @@ -785,13 +809,13 @@ int ib_register_device(struct ib_device *device, const char *name) ret = setup_device(device); if (ret) - goto out; + return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out; + return ret; } ib_device_register_rdmacg(device); @@ -807,42 +831,186 @@ int ib_register_device(struct ib_device *device, const char *name) goto dev_cleanup; } - ret = enable_device(device); - if (ret) - goto sysfs_cleanup; + ret = enable_device_and_get(device); + if (ret) { + void (*dealloc_fn)(struct ib_device *); + + /* + * If we hit this error flow then we don't want to + * automatically dealloc the device since the caller is + * expected to call ib_dealloc_device() after + * ib_register_device() fails. This is tricky due to the + * possibility for a parallel unregistration along with this + * error flow. Since we have a refcount here we know any + * parallel flow is stopped in disable_device and will see the + * NULL pointers, causing the responsibility to + * ib_dealloc_device() to revert back to this thread. + */ + dealloc_fn = device->ops.dealloc_driver; + device->ops.dealloc_driver = NULL; + ib_device_put(device); + __ib_unregister_device(device); + device->ops.dealloc_driver = dealloc_fn; + return ret; + } + ib_device_put(device); return 0; -sysfs_cleanup: - ib_device_unregister_sysfs(device); dev_cleanup: device_del(&device->dev); cg_cleanup: ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); -out: - release_name(device); return ret; } EXPORT_SYMBOL(ib_register_device); +/* Callers must hold a get on the device. */ +static void __ib_unregister_device(struct ib_device *ib_dev) +{ + /* + * We have a registration lock so that all the calls to unregister are + * fully fenced, once any unregister returns the device is truely + * unregistered even if multiple callers are unregistering it at the + * same time. This also interacts with the registration flow and + * provides sane semantics if register and unregister are racing. 
+ */ + mutex_lock(&ib_dev->unregistration_lock); + if (!refcount_read(&ib_dev->refcount)) + goto out; + + disable_device(ib_dev); + ib_device_unregister_sysfs(ib_dev); + device_del(&ib_dev->dev); + ib_device_unregister_rdmacg(ib_dev); + ib_cache_cleanup_one(ib_dev); + + /* + * Drivers using the new flow may not call ib_dealloc_device except + * in error unwind prior to registration success. + */ + if (ib_dev->ops.dealloc_driver) { + WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); + ib_dealloc_device(ib_dev); + } +out: + mutex_unlock(&ib_dev->unregistration_lock); +} + /** * ib_unregister_device - Unregister an IB device - * @device:Device to unregister + * @device: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. + * + * Callers should call this routine only once, and protect against races with + * registration. Typically it should only be called as part of a remove + * callback in an implementation of driver core's struct device_driver and + * related. + * + * If ops.dealloc_driver is used then ib_dev will be freed upon return from + * this function. */ -void ib_unregister_device(struct ib_device *device) +void ib_unregister_device(struct ib_device *ib_dev) { - disable_device(device); - ib_device_unregister_sysfs(device); - device_del(&device->dev); - ib_device_unregister_rdmacg(device); - ib_cache_cleanup_one(device); - release_name(device); + get_device(&ib_dev->dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); +/** + * ib_unregister_device_and_put - Unregister a device while holding a 'get' + * device: The device to unregister + * + * This is the same as ib_unregister_device(), except it includes an internal + * ib_device_put() that should match a 'get' obtained by the caller. + * + * It is safe to call this routine concurrently from multiple threads while + * holding the 'get'. When the function returns the device is fully + * unregistered. + * + * Drivers using this flow MUST use the driver_unregister callback to clean up + * their resources associated with the device and dealloc it. + */ +void ib_unregister_device_and_put(struct ib_device *ib_dev) +{ + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + ib_device_put(ib_dev); + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_and_put); + +/** + * ib_unregister_driver - Unregister all IB devices for a driver + * @driver_id: The driver to unregister + * + * This implements a fence for device unregistration. It only returns once all + * devices associated with the driver_id have fully completed their + * unregistration and returned from ib_unregister_device*(). + * + * If device's are not yet unregistered it goes ahead and starts unregistering + * them. + * + * This does not block creation of new devices with the given driver_id, that + * is the responsibility of the caller. 
+ */ +void ib_unregister_driver(enum rdma_driver_id driver_id) +{ + struct ib_device *ib_dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, ib_dev) { + if (ib_dev->driver_id != driver_id) + continue; + + get_device(&ib_dev->dev); + up_read(&devices_rwsem); + + WARN_ON(!ib_dev->ops.dealloc_driver); + __ib_unregister_device(ib_dev); + + put_device(&ib_dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); +} +EXPORT_SYMBOL(ib_unregister_driver); + +static void ib_unregister_work(struct work_struct *work) +{ + struct ib_device *ib_dev = + container_of(work, struct ib_device, unregistration_work); + + __ib_unregister_device(ib_dev); + put_device(&ib_dev->dev); +} + +/** + * ib_unregister_device_queued - Unregister a device using a work queue + * device: The device to unregister + * + * This schedules an asynchronous unregistration using a WQ for the device. A + * driver should use this to avoid holding locks while doing unregistration, + * such as holding the RTNL lock. + * + * Drivers using this API must use ib_unregister_driver before module unload + * to ensure that all scheduled unregistrations have completed. + */ +void ib_unregister_device_queued(struct ib_device *ib_dev) +{ + WARN_ON(!refcount_read(&ib_dev->refcount)); + WARN_ON(!ib_dev->ops.dealloc_driver); + get_device(&ib_dev->dev); + if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) + put_device(&ib_dev->dev); +} +EXPORT_SYMBOL(ib_unregister_device_queued); + static int assign_client_id(struct ib_client *client) { int ret; @@ -1558,6 +1726,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_fmr); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -1744,6 +1913,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); + flush_workqueue(system_unbound_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3aa802b65cf3..ad83f8c38dc8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2538,6 +2538,12 @@ struct ib_device_ops { int (*fill_res_entry)(struct sk_buff *msg, struct rdma_restrack_entry *entry); + /* Device lifecycle callbacks */ + /* + * This is called as part of ib_dealloc_device(). 
+ */ + void (*dealloc_driver)(struct ib_device *dev); + DECLARE_RDMA_OBJ_SIZE(ib_pd); }; @@ -2555,6 +2561,7 @@ struct ib_device { struct rw_semaphore client_data_rwsem; struct xarray client_data; + struct mutex unregistration_lock; struct ib_cache cache; /** @@ -2609,6 +2616,7 @@ struct ib_device { */ refcount_t refcount; struct completion unreg_completion; + struct work_struct unregistration_work; }; struct ib_client { @@ -2658,6 +2666,9 @@ void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name); void ib_unregister_device(struct ib_device *device); +void ib_unregister_driver(enum rdma_driver_id driver_id); +void ib_unregister_device_and_put(struct ib_device *device); +void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -- cgit v1.2.3 From ca22354b140853b8155692d5b2bc0110aa54e937 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:56 -0700 Subject: RDMA/rxe: Close a race after ib_register_device Since rxe allows unregistration from other threads the rxe pointer can become invalid any moment after ib_register_driver returns. This could cause a user triggered use after free. Add another driver callback to be called right after the device becomes registered to complete any device setup required post-registration. This callback has enough core locking to prevent the device from becoming unregistered. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 9 +++++++++ drivers/infiniband/sw/rxe/rxe_net.c | 8 ++++---- drivers/infiniband/sw/rxe/rxe_net.h | 2 +- drivers/infiniband/sw/rxe/rxe_sysfs.c | 9 ++------- drivers/infiniband/sw/rxe/rxe_verbs.c | 14 ++++++++++++++ include/rdma/ib_verbs.h | 5 +++++ 6 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2a7d54794ee3..bf2a215d94dd 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -803,6 +803,12 @@ static int enable_device_and_get(struct ib_device *device) */ downgrade_write(&devices_rwsem); + if (device->ops.enable_driver) { + ret = device->ops.enable_driver(device); + if (ret) + goto out; + } + down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); @@ -810,6 +816,8 @@ static int enable_device_and_get(struct ib_device *device) break; } up_read(&clients_rwsem); + +out: up_read(&devices_rwsem); return ret; } @@ -1775,6 +1783,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); + SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index d6dfbcf6a47e..fb792f5bc0b7 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -517,24 +517,24 @@ enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num) return IB_LINK_LAYER_ETHERNET; } -struct rxe_dev *rxe_net_add(struct net_device *ndev) +int rxe_net_add(struct net_device *ndev) { int err; struct rxe_dev *rxe = NULL; rxe = ib_alloc_device(rxe_dev, ib_dev); if (!rxe) - return 
NULL; + return -ENOMEM; rxe->ndev = ndev; err = rxe_add(rxe, ndev->mtu); if (err) { ib_dealloc_device(&rxe->ib_dev); - return NULL; + return err; } - return rxe; + return 0; } static void rxe_port_event(struct rxe_dev *rxe, diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 106c586dbb26..ad79514191bb 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -43,7 +43,7 @@ struct rxe_recv_sockets { struct socket *sk6; }; -struct rxe_dev *rxe_net_add(struct net_device *ndev); +int rxe_net_add(struct net_device *ndev); int rxe_net_init(void); void rxe_net_exit(void); diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c index d51b55b0a311..46587eb0da0e 100644 --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -60,7 +60,6 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp) char intf[32]; struct net_device *ndev; struct rxe_dev *exists; - struct rxe_dev *rxe; len = sanitize_arg(val, intf, sizeof(intf)); if (!len) { @@ -82,16 +81,12 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp) goto err; } - rxe = rxe_net_add(ndev); - if (!rxe) { + err = rxe_net_add(ndev); + if (err) { pr_err("failed to add %s\n", intf); - err = -EINVAL; goto err; } - rxe_set_port_state(rxe); - dev_info(&rxe->ib_dev.dev, "added %s\n", intf); - err: dev_put(ndev); return err; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 76da8a142bf7..79ad93b4140c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1125,6 +1125,15 @@ static const struct attribute_group rxe_attr_group = { .attrs = rxe_dev_attributes, }; +static int rxe_enable_driver(struct ib_device *ib_dev) +{ + struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); + + rxe_set_port_state(rxe); + dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); + return 0; +} + static const struct ib_device_ops rxe_dev_ops = { .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, @@ -1144,6 +1153,7 @@ static const struct ib_device_ops rxe_dev_ops = { .destroy_qp = rxe_destroy_qp, .destroy_srq = rxe_destroy_srq, .detach_mcast = rxe_detach_mcast, + .enable_driver = rxe_enable_driver, .get_dma_mr = rxe_get_dma_mr, .get_hw_stats = rxe_ib_get_hw_stats, .get_link_layer = rxe_get_link_layer, @@ -1245,5 +1255,9 @@ int rxe_register_device(struct rxe_dev *rxe) if (err) pr_warn("%s failed with error %d\n", __func__, err); + /* + * Note that rxe may be invalid at this point if another thread + * unregistered it. + */ return err; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ad83f8c38dc8..640263289ab9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2539,6 +2539,11 @@ struct ib_device_ops { struct rdma_restrack_entry *entry); /* Device lifecycle callbacks */ + /* + * Called after the device becomes registered, before clients are + * attached + */ + int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ -- cgit v1.2.3 From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 15 Feb 2019 11:03:53 -0800 Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support Add support for new LINK messages to allow adding and deleting rdma interfaces. 
This will be used initially for soft rdma drivers which instantiate device instances dynamically when the admin specifies a netdev device to use. The rdma_rxe module will be the first user of these messages. The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register with the rdma core if they provide link add/delete functions. Each driver registers with a unique "type" string that is used to dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is defined for the "type" string. User mode will pass 3 attributes in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this link. The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Michael J. Ruhl Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 122 +++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 3 + include/rdma/rdma_netlink.h | 11 ++++ include/uapi/rdma/rdma_netlink.h | 10 +++- 4 files changed, 144 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 1980ddc5f7bc..5e94dc87f04f 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -113,6 +114,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1200,6 +1203,117 @@ RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +static LIST_HEAD(link_ops); +static DECLARE_RWSEM(link_ops_rwsem); + +static const struct rdma_link_ops *link_ops_get(const char *type) +{ + const struct rdma_link_ops *ops; + + list_for_each_entry(ops, &link_ops, list) { + if (!strcmp(ops->type, type)) + goto out; + } + ops = NULL; +out: + return ops; +} + +void rdma_link_register(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + if (link_ops_get(ops->type)) { + WARN_ONCE(1, "Duplicate rdma_link_ops!
%s\n", ops->type); + goto out; + } + list_add(&ops->list, &link_ops); +out: + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_register); + +void rdma_link_unregister(struct rdma_link_ops *ops) +{ + down_write(&link_ops_rwsem); + list_del(&ops->list); + up_write(&link_ops_rwsem); +} +EXPORT_SYMBOL(rdma_link_unregister); + +static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char ibdev_name[IB_DEVICE_NAME_MAX]; + const struct rdma_link_ops *ops; + char ndev_name[IFNAMSIZ]; + struct net_device *ndev; + char type[IFNAMSIZ]; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || + !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) + return -EINVAL; + + nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + sizeof(ibdev_name)); + if (strchr(ibdev_name, '%')) + return -EINVAL; + + nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + sizeof(ndev_name)); + + ndev = dev_get_by_name(&init_net, ndev_name); + if (!ndev) + return -ENODEV; + + down_read(&link_ops_rwsem); + ops = link_ops_get(type); +#ifdef CONFIG_MODULES + if (!ops) { + up_read(&link_ops_rwsem); + request_module("rdma-link-%s", type); + down_read(&link_ops_rwsem); + ops = link_ops_get(type); + } +#endif + err = ops ? ops->newlink(ibdev_name, ndev) : -EINVAL; + up_read(&link_ops_rwsem); + dev_put(ndev); + + return err; +} + +static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + ib_device_put(device); + return -EINVAL; + } + + ib_unregister_device_and_put(device); + return 0; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1209,6 +1323,14 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_NEWLINK] = { + .doit = nldev_newlink, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_DELLINK] = { + .doit = nldev_dellink, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 640263289ab9..225cb76d469f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -238,6 +238,7 @@ enum ib_device_cap_flags { IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), /* The device supports padding incoming writes to cacheline. 
*/ IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), + IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), }; enum ib_signature_prot_cap { @@ -2622,6 +2623,8 @@ struct ib_device { refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; + + const struct rdma_link_ops *link_ops; }; struct ib_client { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 70218e6b5187..10732ab31ba2 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -99,4 +99,15 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); * Returns true on success or false if no listeners. */ bool rdma_nl_chk_listeners(unsigned int group); + +struct rdma_link_ops { + struct list_head list; + const char *type; + int (*newlink)(const char *ibdev_name, struct net_device *ndev); +}; + +void rdma_link_register(struct rdma_link_ops *ops); +void rdma_link_unregister(struct rdma_link_ops *ops); + +#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4ebbcfb2c6ef..5cc592728071 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -255,9 +255,11 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET, /* can dump */ RDMA_NLDEV_CMD_SET, - /* 3 - 4 are free to use */ + RDMA_NLDEV_CMD_NEWLINK, - RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ /* 6 - 8 are free to use */ @@ -465,6 +467,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ + /* + * Identifies the rdma driver. eg: "rxe" or "siw" + */ + RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ /* * Always the end -- cgit v1.2.3 From a2a074ef396f8738d9ee08ceefa8811381a4fe4f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 12 Feb 2019 20:39:16 +0200 Subject: RDMA: Handle ucontext allocations by IB/core Following the PD conversion patch, do the same for ucontext allocations. 
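For drivers the conversion follows the same pattern as the PD one; roughly (a sketch with illustrative foo_* names, not taken from any in-tree driver):

struct foo_ucontext {
	struct ib_ucontext ibucontext; /* kept first: the core's
					* allocation is used as the
					* struct ib_ucontext */
	/* ... driver private state ... */
};

static int foo_alloc_ucontext(struct ib_ucontext *uctx,
			      struct ib_udata *udata)
{
	struct foo_ucontext *ctx =
		container_of(uctx, struct foo_ucontext, ibucontext);

	/* The core has already zero-allocated ctx using the size
	 * declared below; only device setup remains, and failure is
	 * reported as a negative errno instead of an ERR_PTR(). */
	return 0;
}

static void foo_dealloc_ucontext(struct ib_ucontext *uctx)
{
	/* Undo device setup only; the core kfree()s the object and
	 * this callback is no longer allowed to fail. */
}

static const struct ib_device_ops foo_dev_ops = {
	.alloc_ucontext = foo_alloc_ucontext,
	.dealloc_ucontext = foo_dealloc_ucontext,
	INIT_RDMA_OBJ_SIZE(ib_ucontext, foo_ucontext, ibucontext),
};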
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/rdma_core.c | 9 +---- drivers/infiniband/core/uverbs_cmd.c | 24 +++++++----- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 32 +++++----------- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 7 ++-- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 17 ++++----- drivers/infiniband/hw/cxgb4/provider.c | 26 +++++-------- drivers/infiniband/hw/hns/hns_roce_main.c | 26 +++++-------- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 50 +++++++------------------ drivers/infiniband/hw/mlx4/main.c | 30 ++++++--------- drivers/infiniband/hw/mlx5/main.c | 35 +++++++---------- drivers/infiniband/hw/mthca/mthca_provider.c | 39 +++++++------------ drivers/infiniband/hw/nes/nes_verbs.c | 32 ++++++---------- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 38 +++++++------------ drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 5 +-- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/verbs.c | 34 +++++------------ drivers/infiniband/hw/qedr/verbs.h | 4 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 18 +++------ drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 5 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 50 ++++++++----------------- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 5 +-- drivers/infiniband/sw/rdmavt/vt.c | 22 ++++------- drivers/infiniband/sw/rxe/rxe_pool.c | 1 + drivers/infiniband/sw/rxe/rxe_verbs.c | 14 +++---- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- include/rdma/ib_verbs.h | 7 ++-- 31 files changed, 198 insertions(+), 340 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index bf2a215d94dd..a9f29156e486 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1832,6 +1832,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_ucontext); } EXPORT_SYMBOL(ib_set_device_ops); diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 96f919fe86e7..778375ff664e 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -844,7 +844,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, { struct ib_ucontext *ucontext = ufile->ucontext; struct ib_device *ib_dev = ucontext->device; - int ret; /* * If we are closing the FD then the user mmap VMAs must have @@ -862,12 +861,8 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, rdma_restrack_del(&ucontext->res); - /* - * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove - * the error return. 
- */ - ret = ib_dev->ops.dealloc_ucontext(ucontext); - WARN_ON(ret); + ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); ufile->ucontext = NULL; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e44ac718f1cd..3128821ca36e 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -224,12 +224,13 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) if (ret) goto err; - ucontext = ib_dev->ops.alloc_ucontext(ib_dev, &attrs->driver_udata); - if (IS_ERR(ucontext)) { - ret = PTR_ERR(ucontext); + ucontext = rdma_zalloc_drv_obj(ib_dev, ib_ucontext); + if (!ucontext) { + ret = -ENOMEM; goto err_alloc; } + ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->cg_obj = cg_obj; /* ufile is required when some objects are released */ @@ -240,10 +241,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) mutex_init(&ucontext->per_mm_list_lock); INIT_LIST_HEAD(&ucontext->per_mm_list); - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; - - resp.num_comp_vectors = file->device->num_comp_vectors; ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) @@ -256,15 +253,22 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) goto err_fd; } + resp.num_comp_vectors = file->device->num_comp_vectors; + ret = uverbs_response(attrs, &resp, sizeof(resp)); if (ret) goto err_file; - fd_install(resp.async_fd, filp); + ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); + if (ret) + goto err_file; + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + ucontext->invalidate_range = NULL; - ucontext->res.type = RDMA_RESTRACK_CTX; rdma_restrack_uadd(&ucontext->res); + fd_install(resp.async_fd, filp); + /* * Make sure that ib_uverbs_get_ucontext() sees the pointer update * only after all writes to setup the ucontext have completed @@ -283,7 +287,7 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - ib_dev->ops.dealloc_ucontext(ucontext); + kfree(ucontext); err_alloc: ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index f29f29aae537..24092911c2ac 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3671,13 +3671,14 @@ free_mr: return ERR_PTR(rc); } -struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) { + struct ib_device *ibdev = ctx->device; + struct bnxt_re_ucontext *uctx = + container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_re_uctx_resp resp; - struct bnxt_re_ucontext *uctx; u32 chip_met_rev_num = 0; int rc; @@ -3687,13 +3688,9 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", BNXT_RE_ABI_VERSION); - return ERR_PTR(-EPERM); + return -EPERM; } - uctx = kzalloc(sizeof(*uctx), GFP_KERNEL); - if (!uctx) - return ERR_PTR(-ENOMEM); - uctx->rdev = rdev; uctx->shpg = (void *)__get_free_page(GFP_KERNEL); @@ -3727,23 +3724,21 @@ struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, goto cfail; } - return &uctx->ib_uctx; + return 0; 
cfail: free_page((unsigned long)uctx->shpg); uctx->shpg = NULL; fail: - kfree(uctx); - return ERR_PTR(rc); + return rc; } -int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) +void bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) { struct bnxt_re_ucontext *uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = uctx->rdev; - int rc = 0; if (uctx->shpg) free_page((unsigned long)uctx->shpg); @@ -3752,17 +3747,10 @@ int bnxt_re_dealloc_ucontext(struct ib_ucontext *ib_uctx) /* Free DPI only if this is the first PD allocated by the * application and mark the context dpi as NULL */ - rc = bnxt_qplib_dealloc_dpi(&rdev->qplib_res, - &rdev->qplib_res.dpi_tbl, - &uctx->dpi); - if (rc) - dev_err(rdev_to_dev(rdev), "Deallocate HW DPI failed!"); - /* Don't fail, continue*/ + bnxt_qplib_dealloc_dpi(&rdev->qplib_res, + &rdev->qplib_res.dpi_tbl, &uctx->dpi); uctx->dpi.dbr = NULL; } - - kfree(uctx); - return 0; } /* Helper function to mmap the virtual memory from user app */ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index c7cca803cfa3..e45465ed4eee 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -135,8 +135,8 @@ struct bnxt_re_mw { }; struct bnxt_re_ucontext { + struct ib_ucontext ib_uctx; struct bnxt_re_dev *rdev; - struct ib_ucontext ib_uctx; struct bnxt_qplib_dpi dpi; void *shpg; spinlock_t sh_lock; /* protect shpg */ @@ -215,9 +215,8 @@ int bnxt_re_dealloc_mw(struct ib_mw *mw); struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); -struct ib_ucontext *bnxt_re_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata); -int bnxt_re_dealloc_ucontext(struct ib_ucontext *context); +int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); +void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); unsigned long bnxt_re_lock_cqs(struct bnxt_re_qp *qp); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 0a89ef6e5754..2bd24ac45ee4 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -638,6 +638,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), }; static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index b74fd90a22dc..4accf7b3dcf2 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -62,7 +62,7 @@ #include #include "common.h" -static int iwch_dealloc_ucontext(struct ib_ucontext *context) +static void iwch_dealloc_ucontext(struct ib_ucontext *context) { struct iwch_dev *rhp = to_iwch_dev(context->device); struct iwch_ucontext *ucontext = to_iwch_ucontext(context); @@ -72,24 +72,20 @@ static int iwch_dealloc_ucontext(struct ib_ucontext *context) list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); cxio_release_ucontext(&rhp->rdev, &ucontext->uctx); - kfree(ucontext); - return 0; } -static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int iwch_alloc_ucontext(struct 
ib_ucontext *ucontext, + struct ib_udata *udata) { - struct iwch_ucontext *context; + struct ib_device *ibdev = ucontext->device; + struct iwch_ucontext *context = to_iwch_ucontext(ucontext); struct iwch_dev *rhp = to_iwch_dev(ibdev); pr_debug("%s ibdev %p\n", __func__, ibdev); - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); cxio_init_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); - return &context->ibucontext; + return 0; } static int iwch_destroy_cq(struct ib_cq *ib_cq) @@ -1342,6 +1338,7 @@ static const struct ib_device_ops iwch_dev_ops = { .req_notify_cq = iwch_arm_cq, .resize_cq = iwch_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext), }; int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 81fcffb597ab..507c54572cc9 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -58,7 +58,7 @@ static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); -static int c4iw_dealloc_ucontext(struct ib_ucontext *context) +static void c4iw_dealloc_ucontext(struct ib_ucontext *context) { struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context); struct c4iw_dev *rhp; @@ -70,26 +70,19 @@ static int c4iw_dealloc_ucontext(struct ib_ucontext *context) list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry) kfree(mm); c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx); - kfree(ucontext); - return 0; } -static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, + struct ib_udata *udata) { - struct c4iw_ucontext *context; + struct ib_device *ibdev = ucontext->device; + struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext); struct c4iw_dev *rhp = to_c4iw_dev(ibdev); struct c4iw_alloc_ucontext_resp uresp; int ret = 0; struct c4iw_mm_entry *mm = NULL; pr_debug("ibdev %p\n", ibdev); - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) { - ret = -ENOMEM; - goto err; - } - c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx); INIT_LIST_HEAD(&context->mmaps); spin_lock_init(&context->mmap_lock); @@ -101,7 +94,7 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) { ret = -ENOMEM; - goto err_free; + goto err; } uresp.status_page_size = PAGE_SIZE; @@ -121,13 +114,11 @@ static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev, mm->len = PAGE_SIZE; insert_mmap(context, mm); } - return &context->ibucontext; + return 0; err_mm: kfree(mm); -err_free: - kfree(context); err: - return ERR_PTR(ret); + return ret; } static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) @@ -555,6 +546,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext), }; void c4iw_register_device(struct work_struct *work) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 29fb4fbba5ba..c929125da84b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -335,23 +335,19 
@@ static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask, return 0; } -static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, - struct ib_udata *udata) +static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) { int ret = 0; - struct hns_roce_ucontext *context; + struct hns_roce_ucontext *context = to_hr_ucontext(uctx); struct hns_roce_ib_alloc_ucontext_resp resp = {}; - struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct hns_roce_dev *hr_dev = to_hr_dev(uctx->device); if (!hr_dev->active) - return ERR_PTR(-EAGAIN); + return -EAGAIN; resp.qp_tab_size = hr_dev->caps.num_qps; - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - ret = hns_roce_uar_alloc(hr_dev, &context->uar); if (ret) goto error_fail_uar_alloc; @@ -365,25 +361,20 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_copy_to_udata; - return &context->ibucontext; + return 0; error_fail_copy_to_udata: hns_roce_uar_free(hr_dev, &context->uar); error_fail_uar_alloc: - kfree(context); - - return ERR_PTR(ret); + return ret; } -static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) +static void hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); hns_roce_uar_free(to_hr_dev(ibcontext->device), &context->uar); - kfree(context); - - return 0; } static int hns_roce_mmap(struct ib_ucontext *context, @@ -478,6 +469,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .query_port = hns_roce_query_port, .reg_user_mr = hns_roce_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext), }; static const struct ib_device_ops hns_roce_dev_mr_ops = { diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 76b4d1218696..a8352e3ca23d 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -121,78 +121,55 @@ static int i40iw_query_port(struct ib_device *ibdev, /** * i40iw_alloc_ucontext - Allocate the user context data structure - * @ibdev: device pointer from stack + * @uctx: Uverbs context pointer from stack * @udata: user data * * This keeps track of all objects associated with a particular * user-mode client. 
*/ -static struct ib_ucontext *i40iw_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int i40iw_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; struct i40iw_device *iwdev = to_iwdev(ibdev); struct i40iw_alloc_ucontext_req req; - struct i40iw_alloc_ucontext_resp uresp; - struct i40iw_ucontext *ucontext; + struct i40iw_alloc_ucontext_resp uresp = {}; + struct i40iw_ucontext *ucontext = to_ucontext(uctx); if (ib_copy_from_udata(&req, udata, sizeof(req))) - return ERR_PTR(-EINVAL); + return -EINVAL; if (req.userspace_ver < 4 || req.userspace_ver > I40IW_ABI_VER) { i40iw_pr_err("Unsupported provider library version %u.\n", req.userspace_ver); - return ERR_PTR(-EINVAL); + return -EINVAL; } - memset(&uresp, 0, sizeof(uresp)); uresp.max_qps = iwdev->max_qp; uresp.max_pds = iwdev->max_pd; uresp.wq_size = iwdev->max_qp_wr * 2; uresp.kernel_ver = req.userspace_ver; - ucontext = kzalloc(sizeof(*ucontext), GFP_KERNEL); - if (!ucontext) - return ERR_PTR(-ENOMEM); - ucontext->iwdev = iwdev; ucontext->abi_ver = req.userspace_ver; - if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { - kfree(ucontext); - return ERR_PTR(-EFAULT); - } + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) + return -EFAULT; INIT_LIST_HEAD(&ucontext->cq_reg_mem_list); spin_lock_init(&ucontext->cq_reg_mem_list_lock); INIT_LIST_HEAD(&ucontext->qp_reg_mem_list); spin_lock_init(&ucontext->qp_reg_mem_list_lock); - return &ucontext->ibucontext; + return 0; } /** * i40iw_dealloc_ucontext - deallocate the user context data structure * @context: user context created during alloc */ -static int i40iw_dealloc_ucontext(struct ib_ucontext *context) +static void i40iw_dealloc_ucontext(struct ib_ucontext *context) { - struct i40iw_ucontext *ucontext = to_ucontext(context); - unsigned long flags; - - spin_lock_irqsave(&ucontext->cq_reg_mem_list_lock, flags); - if (!list_empty(&ucontext->cq_reg_mem_list)) { - spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); - return -EBUSY; - } - spin_unlock_irqrestore(&ucontext->cq_reg_mem_list_lock, flags); - spin_lock_irqsave(&ucontext->qp_reg_mem_list_lock, flags); - if (!list_empty(&ucontext->qp_reg_mem_list)) { - spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); - return -EBUSY; - } - spin_unlock_irqrestore(&ucontext->qp_reg_mem_list_lock, flags); - - kfree(ucontext); - return 0; + return; } /** @@ -2740,6 +2717,7 @@ static const struct ib_device_ops i40iw_dev_ops = { .reg_user_mr = i40iw_reg_user_mr, .req_notify_cq = i40iw_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext), }; /** diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c0f6aea7ed7c..733f7bbd5901 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1076,17 +1076,18 @@ out: return err; } -static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; struct mlx4_ib_dev *dev = to_mdev(ibdev); - struct mlx4_ib_ucontext *context; + struct mlx4_ib_ucontext *context = to_mucontext(uctx); struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) - return ERR_PTR(-EAGAIN); + return -EAGAIN; if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { 
resp_v3.qp_tab_size = dev->dev->caps.num_qps; @@ -1100,15 +1101,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, resp.cqe_size = dev->dev->caps.cqe_size; } - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); - if (err) { - kfree(context); - return ERR_PTR(err); - } + if (err) + return err; INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); @@ -1123,21 +1118,17 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); - kfree(context); - return ERR_PTR(-EFAULT); + return -EFAULT; } - return &context->ibucontext; + return err; } -static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +static void mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); - kfree(context); - - return 0; } static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) @@ -2570,6 +2561,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext), }; static const struct ib_device_ops mlx4_ib_dev_wq_ops = { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 614b2acbc621..994c19d01211 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1745,14 +1745,15 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, mlx5_ib_disable_lb(dev, true, false); } -static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_ucontext *context; + struct mlx5_ib_ucontext *context = to_mucontext(uctx); struct mlx5_bfreg_info *bfregi; int ver; int err; @@ -1762,29 +1763,29 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, bool lib_uar_4k; if (!dev->ib_active) - return ERR_PTR(-EAGAIN); + return -EAGAIN; if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; else if (udata->inlen >= min_req_v2) ver = 2; else - return ERR_PTR(-EINVAL); + return -EINVAL; err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) - return ERR_PTR(err); + return err; if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; req.total_num_bfregs = ALIGN(req.total_num_bfregs, MLX5_NON_FP_BFREGS_PER_UAR); if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) - return ERR_PTR(-EINVAL); + return -EINVAL; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) @@ -1817,10 +1818,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */ } - 
context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; bfregi = &context->bfregi; @@ -1955,7 +1952,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, 1, &dev->roce[port].tx_port_affinity)); } - return &context->ibucontext; + return 0; out_mdev: mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); @@ -1973,12 +1970,10 @@ out_count: kfree(bfregi->count); out_ctx: - kfree(context); - - return ERR_PTR(err); + return err; } -static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); @@ -1998,9 +1993,6 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); - kfree(context); - - return 0; } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, @@ -5984,6 +5976,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), }; static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = { diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 80c3af217d96..d063d7a37762 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -301,17 +301,16 @@ static int mthca_query_gid(struct ib_device *ibdev, u8 port, return err; } -static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int mthca_alloc_ucontext(struct ib_ucontext *uctx, + struct ib_udata *udata) { - struct mthca_alloc_ucontext_resp uresp; - struct mthca_ucontext *context; + struct ib_device *ibdev = uctx->device; + struct mthca_alloc_ucontext_resp uresp = {}; + struct mthca_ucontext *context = to_mucontext(uctx); int err; if (!(to_mdev(ibdev)->active)) - return ERR_PTR(-EAGAIN); - - memset(&uresp, 0, sizeof uresp); + return -EAGAIN; uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps; if (mthca_is_memfree(to_mdev(ibdev))) @@ -319,44 +318,33 @@ static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev, else uresp.uarc_size = 0; - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - err = mthca_uar_alloc(to_mdev(ibdev), &context->uar); - if (err) { - kfree(context); - return ERR_PTR(err); - } + if (err) + return err; context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev)); if (IS_ERR(context->db_tab)) { err = PTR_ERR(context->db_tab); mthca_uar_free(to_mdev(ibdev), &context->uar); - kfree(context); - return ERR_PTR(err); + return err; } - if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) { mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab); mthca_uar_free(to_mdev(ibdev), &context->uar); - kfree(context); - return ERR_PTR(-EFAULT); + return -EFAULT; } context->reg_mr_warned = 0; - return &context->ibucontext; + return 0; } -static int mthca_dealloc_ucontext(struct ib_ucontext *context) +static void mthca_dealloc_ucontext(struct ib_ucontext *context) { mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar, to_mucontext(context)->db_tab); 
mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar); - kfree(to_mucontext(context)); - - return 0; } static int mthca_mmap_uar(struct ib_ucontext *context, @@ -1213,6 +1201,7 @@ static const struct ib_device_ops mthca_dev_ops = { .reg_user_mr = mthca_reg_user_mr, .resize_cq = mthca_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext), }; static const struct ib_device_ops mthca_dev_arbel_srq_ops = { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index b23956aa45b8..828e4af3f951 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -529,27 +529,27 @@ static int nes_query_gid(struct ib_device *ibdev, u8 port, * nes_alloc_ucontext - Allocate the user context data structure. This keeps track * of all objects associated with a particular user-mode client. */ -static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int nes_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; struct nes_vnic *nesvnic = to_nesvnic(ibdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; struct nes_alloc_ucontext_req req; struct nes_alloc_ucontext_resp uresp = {}; - struct nes_ucontext *nes_ucontext; + struct nes_ucontext *nes_ucontext = to_nesucontext(uctx); struct nes_ib_device *nesibdev = nesvnic->nesibdev; if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) { printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } if (req.userspace_ver != NES_ABI_USERSPACE_VER) { printk(KERN_ERR PFX "Invalid userspace driver version detected. 
Detected version %d, should be %d\n", req.userspace_ver, NES_ABI_USERSPACE_VER); - return ERR_PTR(-EINVAL); + return -EINVAL; } @@ -559,10 +559,6 @@ static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, uresp.virtwq = nesadapter->virtwq; uresp.kernel_ver = NES_ABI_KERNEL_VER; - nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL); - if (!nes_ucontext) - return ERR_PTR(-ENOMEM); - nes_ucontext->nesdev = nesdev; nes_ucontext->mmap_wq_offset = uresp.max_pds; nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset + @@ -570,29 +566,22 @@ static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev, PAGE_SIZE; - if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) { - kfree(nes_ucontext); - return ERR_PTR(-EFAULT); - } + if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) + return -EFAULT; INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list); INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list); - return &nes_ucontext->ibucontext; + return 0; } - /** * nes_dealloc_ucontext */ -static int nes_dealloc_ucontext(struct ib_ucontext *context) +static void nes_dealloc_ucontext(struct ib_ucontext *context) { - struct nes_ucontext *nes_ucontext = to_nesucontext(context); - - kfree(nes_ucontext); - return 0; + return; } - /** * nes_mmap */ @@ -3599,6 +3588,7 @@ static const struct ib_device_ops nes_dev_ops = { .reg_user_mr = nes_reg_user_mr, .req_notify_cq = nes_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext), }; /** diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 0de83c92691f..b9e10d55a58e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -180,6 +180,7 @@ static const struct ib_device_ops ocrdma_dev_ops = { .req_notify_cq = ocrdma_arm_cq, .resize_cq = ocrdma_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext), }; static const struct ib_device_ops ocrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index ed5da67b693d..b4e1777c2c97 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -440,7 +440,7 @@ err: return status; } -static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) +static void ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) { struct ocrdma_pd *pd = uctx->cntxt_pd; struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); @@ -451,8 +451,7 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) } kfree(uctx->cntxt_pd); uctx->cntxt_pd = NULL; - (void)_ocrdma_dealloc_pd(dev, pd); - return 0; + _ocrdma_dealloc_pd(dev, pd); } static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) @@ -476,33 +475,28 @@ static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx) mutex_unlock(&uctx->mm_list_lock); } -struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; int status; - struct ocrdma_ucontext *ctx; - struct ocrdma_alloc_ucontext_resp resp; + struct ocrdma_ucontext *ctx = get_ocrdma_ucontext(uctx); + struct ocrdma_alloc_ucontext_resp resp = {}; struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct pci_dev *pdev = dev->nic_info.pdev; u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE); 
if (!udata) - return ERR_PTR(-EFAULT); - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return ERR_PTR(-ENOMEM); + return -EFAULT; INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len, &ctx->ah_tbl.pa, GFP_KERNEL); - if (!ctx->ah_tbl.va) { - kfree(ctx); - return ERR_PTR(-ENOMEM); - } + if (!ctx->ah_tbl.va) + return -ENOMEM; + ctx->ah_tbl.len = map_len; - memset(&resp, 0, sizeof(resp)); resp.ah_tbl_len = ctx->ah_tbl.len; resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va); @@ -524,7 +518,7 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, status = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (status) goto cpy_err; - return &ctx->ibucontext; + return 0; cpy_err: ocrdma_dealloc_ucontext_pd(ctx); @@ -533,19 +527,17 @@ pd_err: map_err: dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va, ctx->ah_tbl.pa); - kfree(ctx); - return ERR_PTR(status); + return status; } -int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) +void ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) { - int status; struct ocrdma_mm *mm, *tmp; struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx); struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device); struct pci_dev *pdev = dev->nic_info.pdev; - status = ocrdma_dealloc_ucontext_pd(uctx); + ocrdma_dealloc_ucontext_pd(uctx); ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len); dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va, @@ -555,8 +547,6 @@ int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx) list_del(&mm->entry); kfree(mm); } - kfree(uctx); - return status; } int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 1fd66721c930..4c04ab40798e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -64,9 +64,8 @@ void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid); struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num); int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); -struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *, - struct ib_udata *); -int ocrdma_dealloc_ucontext(struct ib_ucontext *); +int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata); +void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx); int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 44ce4989dcef..996d9ecd93e0 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -240,6 +240,7 @@ static const struct ib_device_ops qedr_dev_ops = { .req_notify_cq = qedr_arm_cq, .resize_cq = qedr_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext), }; static int qedr_register_device(struct qedr_dev *dev) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index d51bc3ede9d1..59ad4202422c 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -316,28 +316,24 @@ static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr, return found; } -struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; int rc; - struct 
qedr_ucontext *ctx; - struct qedr_alloc_ucontext_resp uresp; + struct qedr_ucontext *ctx = get_qedr_ucontext(uctx); + struct qedr_alloc_ucontext_resp uresp = {}; struct qedr_dev *dev = get_qedr_dev(ibdev); struct qed_rdma_add_user_out_params oparams; if (!udata) - return ERR_PTR(-EFAULT); - - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return ERR_PTR(-ENOMEM); + return -EFAULT; rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams); if (rc) { DP_ERR(dev, "failed to allocate a DPI for a new RoCE application, rc=%d. To overcome this consider to increase the number of DPIs, increase the doorbell BAR size or just close unnecessary RoCE applications. In order to increase the number of DPIs consult the qedr readme\n", rc); - goto err; + return rc; } ctx->dpi = oparams.dpi; @@ -347,8 +343,6 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); - memset(&uresp, 0, sizeof(uresp)); - uresp.dpm_enabled = dev->user_dpm_enabled; uresp.wids_enabled = 1; uresp.wid_count = oparams.wid_count; @@ -364,28 +358,23 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rc) - goto err; + return rc; ctx->dev = dev; rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size); if (rc) - goto err; + return rc; DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n", &ctx->ibucontext); - return &ctx->ibucontext; - -err: - kfree(ctx); - return ERR_PTR(rc); + return 0; } -int qedr_dealloc_ucontext(struct ib_ucontext *ibctx) +void qedr_dealloc_ucontext(struct ib_ucontext *ibctx) { struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx); struct qedr_mm *mm, *tmp; - int status = 0; DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n", uctx); @@ -398,9 +387,6 @@ int qedr_dealloc_ucontext(struct ib_ucontext *ibctx) list_del(&mm->entry); kfree(mm); } - - kfree(uctx); - return status; } int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 97a6ff3f9afb..f0c05f4771ac 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -43,8 +43,8 @@ int qedr_iw_query_gid(struct ib_device *ibdev, u8 port, int qedr_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey); -struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *, struct ib_udata *); -int qedr_dealloc_ucontext(struct ib_ucontext *); +int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata); +void qedr_dealloc_ucontext(struct ib_ucontext *uctx); int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index be6468021a9a..d88d9f8a7f9a 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -350,6 +350,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext), }; /* Start of PF discovery section */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 7549ae23027e..bd4521b2cc5f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ 
-653,37 +653,31 @@ int usnic_ib_dereg_mr(struct ib_mr *ibmr) return 0; } -struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - struct usnic_ib_ucontext *context; + struct ib_device *ibdev = uctx->device; + struct usnic_ib_ucontext *context = to_ucontext(uctx); struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); usnic_dbg("\n"); - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&context->qp_grp_list); mutex_lock(&us_ibdev->usdev_lock); list_add_tail(&context->link, &us_ibdev->ctx_list); mutex_unlock(&us_ibdev->usdev_lock); - return &context->ibucontext; + return 0; } -int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct usnic_ib_ucontext *context = to_uucontext(ibcontext); struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device); usnic_dbg("\n"); mutex_lock(&us_ibdev->usdev_lock); - BUG_ON(!list_empty(&context->qp_grp_list)); + WARN_ON_ONCE(!list_empty(&context->qp_grp_list)); list_del(&context->link); mutex_unlock(&us_ibdev->usdev_lock); - kfree(context); - return 0; } int usnic_ib_mmap(struct ib_ucontext *context, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 2a87650949f6..c40e89b6246f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -68,9 +68,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); int usnic_ib_dereg_mr(struct ib_mr *ibmr); -struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata); +int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata); +void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); int usnic_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); #endif /* !USNIC_IB_VERBS_H */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 47e653d2495c..6d8b3e0de57a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -196,6 +196,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext), }; static const struct ib_device_ops pvrdma_dev_srq_ops = { diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index f44220f72e05..42fe821f8d58 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -306,41 +306,32 @@ out: /** * pvrdma_alloc_ucontext - allocate ucontext - * @ibdev: the IB device + * @uctx: the uverbs context + * @udata: user data * - * @return: the ib_ucontext pointer on success, otherwise errno. + * @return: zero on success, otherwise errno.
*/ -struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { + struct ib_device *ibdev = uctx->device; struct pvrdma_dev *vdev = to_vdev(ibdev); - struct pvrdma_ucontext *context; - union pvrdma_cmd_req req; - union pvrdma_cmd_resp rsp; + struct pvrdma_ucontext *context = to_vucontext(uctx); + union pvrdma_cmd_req req = {}; + union pvrdma_cmd_resp rsp = {}; struct pvrdma_cmd_create_uc *cmd = &req.create_uc; struct pvrdma_cmd_create_uc_resp *resp = &rsp.create_uc_resp; - struct pvrdma_alloc_ucontext_resp uresp = {0}; + struct pvrdma_alloc_ucontext_resp uresp = {}; int ret; - void *ptr; if (!vdev->ib_active) - return ERR_PTR(-EAGAIN); - - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); + return -EAGAIN; context->dev = vdev; ret = pvrdma_uar_alloc(vdev, &context->uar); - if (ret) { - kfree(context); - return ERR_PTR(-ENOMEM); - } + if (ret) + return -ENOMEM; /* get ctx_handle from host */ - memset(cmd, 0, sizeof(*cmd)); - if (vdev->dsr_version < PVRDMA_PPN64_VERSION) cmd->pfn = context->uar.pfn; else @@ -351,7 +342,6 @@ struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, if (ret < 0) { dev_warn(&vdev->pdev->dev, "could not create ucontext, error: %d\n", ret); - ptr = ERR_PTR(ret); goto err; } @@ -362,33 +352,28 @@ struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (ret) { pvrdma_uar_free(vdev, &context->uar); - context->ibucontext.device = ibdev; pvrdma_dealloc_ucontext(&context->ibucontext); - return ERR_PTR(-EFAULT); + return -EFAULT; } - return &context->ibucontext; + return 0; err: pvrdma_uar_free(vdev, &context->uar); - kfree(context); - return ptr; + return ret; } /** * pvrdma_dealloc_ucontext - deallocate ucontext * @ibcontext: the ucontext - * - * @return: 0 on success, otherwise errno. 
*/ -int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext) +void pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct pvrdma_ucontext *context = to_vucontext(ibcontext); - union pvrdma_cmd_req req; + union pvrdma_cmd_req req = {}; struct pvrdma_cmd_destroy_uc *cmd = &req.destroy_uc; int ret; - memset(cmd, 0, sizeof(*cmd)); cmd->hdr.cmd = PVRDMA_CMD_DESTROY_UC; cmd->ctx_handle = context->ctx_handle; @@ -399,9 +384,6 @@ int pvrdma_dealloc_ucontext(struct ib_ucontext *ibcontext) /* Free the UAR even if the device command failed */ pvrdma_uar_free(to_vdev(ibcontext->device), &context->uar); - kfree(context); - - return ret; } /** diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index ed91baad1ffa..607aa131d67c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -396,9 +396,8 @@ int pvrdma_modify_device(struct ib_device *ibdev, int mask, int pvrdma_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props); int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); -struct ib_ucontext *pvrdma_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata); -int pvrdma_dealloc_ucontext(struct ib_ucontext *context); +int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata); +void pvrdma_dealloc_ucontext(struct ib_ucontext *context); int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, struct ib_udata *udata); void pvrdma_dealloc_pd(struct ib_pd *ibpd); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index ee9c82cb3b6b..42c9d35f832d 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -292,28 +292,21 @@ static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext /** * rvt_alloc_ucontext - Allocate a user context - * @ibdev: Verbs IB dev + * @uctx: Verbs context * @udata: User data allocated */ -static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev, - struct ib_udata *udata) +static int rvt_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - struct rvt_ucontext *context; - - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return ERR_PTR(-ENOMEM); - return &context->ibucontext; + return 0; } /** - *rvt_dealloc_ucontext - Free a user context - *@context - Free this + * rvt_dealloc_ucontext - Free a user context + * @context - Free this */ -static int rvt_dealloc_ucontext(struct ib_ucontext *context) +static void rvt_dealloc_ucontext(struct ib_ucontext *context) { - kfree(to_iucontext(context)); - return 0; + return; } static int rvt_get_port_immutable(struct ib_device *ibdev, u8 port_num, @@ -433,6 +426,7 @@ static const struct ib_device_ops rvt_dev_ops = { .resize_cq = rvt_resize_cq, .unmap_fmr = rvt_unmap_fmr, INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext), }; static noinline int check_support(struct rvt_dev_info *rdi, int verb) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index c9b8b8c6bfb5..120fa9005954 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -42,6 +42,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_UC] = { .name = "rxe-uc", .size = sizeof(struct rxe_ucontext), + .flags = RXE_POOL_NO_ALLOC, }, [RXE_TYPE_PD] = { .name = "rxe-pd", diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c 
b/drivers/infiniband/sw/rxe/rxe_verbs.c index bd6a379b79d3..6ecf28570ff0 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -142,22 +142,19 @@ static enum rdma_link_layer rxe_get_link_layer(struct ib_device *dev, return rxe_link_layer(rxe, port_num); } -static struct ib_ucontext *rxe_alloc_ucontext(struct ib_device *dev, - struct ib_udata *udata) +static int rxe_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata) { - struct rxe_dev *rxe = to_rdev(dev); - struct rxe_ucontext *uc; + struct rxe_dev *rxe = to_rdev(uctx->device); + struct rxe_ucontext *uc = to_ruc(uctx); - uc = rxe_alloc(&rxe->uc_pool); - return uc ? &uc->ibuc : ERR_PTR(-ENOMEM); + return rxe_add_to_pool(&rxe->uc_pool, &uc->pelem); } -static int rxe_dealloc_ucontext(struct ib_ucontext *ibuc) +static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); rxe_drop_ref(uc); - return 0; } static int rxe_port_immutable(struct ib_device *dev, u8 port_num, @@ -1180,6 +1177,7 @@ static const struct ib_device_ops rxe_dev_ops = { .req_notify_cq = rxe_req_notify_cq, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), }; int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index d02eb75ef282..157e51aeb1e1 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -61,8 +61,8 @@ static inline int psn_compare(u32 psn_a, u32 psn_b) } struct rxe_ucontext { + struct ib_ucontext ibuc; struct rxe_pool_entry pelem; - struct ib_ucontext ibuc; }; struct rxe_pd { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 225cb76d469f..9b9e17bcc201 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2389,9 +2389,9 @@ struct ib_device_ops { int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); - struct ib_ucontext *(*alloc_ucontext)(struct ib_device *device, - struct ib_udata *udata); - int (*dealloc_ucontext)(struct ib_ucontext *context); + int (*alloc_ucontext)(struct ib_ucontext *context, + struct ib_udata *udata); + void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context, @@ -2551,6 +2551,7 @@ struct ib_device_ops { void (*dealloc_driver)(struct ib_device *dev); DECLARE_RDMA_OBJ_SIZE(ib_pd); + DECLARE_RDMA_OBJ_SIZE(ib_ucontext); }; struct rdma_restrack_root; -- cgit v1.2.3
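Taken together, the lifecycle hooks added by the first two patches above give a driver roughly the following shape. This is only a sketch; the foo_* names and the RDMA_DRIVER_FOO id are made up for illustration:

struct foo_dev {
	struct ib_device ib_dev;
	/* ... driver private state ... */
};

static void foo_dealloc_driver(struct ib_device *ibdev)
{
	/* Called from ib_dealloc_device() once the last reference is
	 * dropped; free driver state hanging off ibdev here. Providing
	 * this op is what opts the device into the new unregistration
	 * flows. */
}

static int foo_enable_driver(struct ib_device *ibdev)
{
	/* Runs after the device is registered but before clients
	 * attach, under core locking that prevents unregistration, so
	 * post-registration setup is race-free here. */
	return 0;
}

static const struct ib_device_ops foo_dev_ops = {
	.dealloc_driver = foo_dealloc_driver,
	.enable_driver = foo_enable_driver,
};

static void foo_netdev_going_down(struct foo_dev *fdev)
{
	/* May be called under RTNL: defer the unregister to the
	 * system_unbound_wq rather than doing it inline. */
	ib_unregister_device_queued(&fdev->ib_dev);
}

static void foo_cleanup(void)
{
	/* From the module exit path: reap every device, including any
	 * still-queued unregistrations, before the code goes away. */
	ib_unregister_driver(RDMA_DRIVER_FOO);
}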
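The NEWLINK/DELLINK plumbing from the third patch is consumed by a driver along these lines (again a sketch; the "foo" type is made up, and rxe's real conversion to rdma_link_ops landed in a follow-on commit):

static int foo_newlink(const char *ibdev_name, struct net_device *ndev)
{
	/* Allocate an ib_device bound to ndev, register it under
	 * ibdev_name, and set IB_DEVICE_ALLOW_USER_UNREG so that a
	 * later DELLINK is permitted. Return 0 or a negative errno. */
	return 0;
}

static struct rdma_link_ops foo_link_ops = {
	.type = "foo",
	.newlink = foo_newlink,
};

static int __init foo_init(void)
{
	rdma_link_register(&foo_link_ops);
	return 0;
}

static void __exit foo_exit(void)
{
	rdma_link_unregister(&foo_link_ops);
}

/* Allows nldev_newlink() to request_module("rdma-link-foo") on demand */
MODULE_ALIAS_RDMA_LINK("foo");

The matching user space commands in the iproute2 rdma tool are, roughly, "rdma link add foo0 type foo netdev eth0" and "rdma link delete foo0"; the tool side was added separately.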