diff options
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/fmr_ops.c | 5 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 11 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 82 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 18 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_marshal.c | 299 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 20 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 22 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 73 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 6 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 96 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 30 |
11 files changed, 296 insertions, 366 deletions
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 1ebb09e1ac4f..59e64025ed96 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) fmr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 47bed5333c7f..f81dd93176c0 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; + struct rpcrdma_mw *mw; int count, rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. */ unmap: - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + while (!list_empty(&req->rl_registered)) { + mw = rpcrdma_pop_mw(&req->rl_registered); dprintk("RPC: %s: DMA unmapping frmr %p\n", __func__, &mw->frmr); - list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) frwr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c52e0f2ffe52..a044be2d6ad7 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) /* The client can send a request inline as long as the RPCRDMA header * plus the RPC call fit under the transport's inline limit. If the * combined call message size exceeds that limit, the client must use - * the read chunk list for this operation. + * a Read chunk for this operation. + * + * A Read chunk is also required if sending the RPC call inline would + * exceed this device's max_sge limit. */ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct xdr_buf *xdr = &rqst->rq_snd_buf; + unsigned int count, remaining, offset; + + if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + return false; + + if (xdr->page_len) { + remaining = xdr->page_len; + offset = xdr->page_base & ~PAGE_MASK; + count = 0; + while (remaining) { + remaining -= min_t(unsigned int, + PAGE_SIZE - offset, remaining); + offset = 0; + if (++count > r_xprt->rx_ia.ri_max_send_sges) + return false; + } + } - return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; + return true; } /* The client can't know how large the actual reply will be. Thus it @@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) */ static int -rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, - bool reminv_expected) +rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type, + struct rpcrdma_mr_seg *seg) { int len, n, p, page_base; struct page **ppages; @@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == RPCRDMA_MAX_SEGS) goto out_overflow; - /* When encoding the read list, the tail is always sent inline */ - if (type == rpcrdma_readch) + /* When encoding a Read chunk, the tail iovec contains an + * XDR pad and may be omitted. + */ + if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) return n; - /* When encoding the Write list, some servers need to see an extra - * segment for odd-length Write chunks. The upper layer provides - * space in the tail iovec for this purpose. + /* When encoding a Write chunk, some servers need to see an + * extra segment for non-XDR-aligned Write chunks. The upper + * layer provides space in the tail iovec that may be used + * for this purpose. */ - if (type == rpcrdma_writech && reminv_expected) + if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) return n; if (xdrbuf->tail[0].iov_len) { - /* the rpcrdma protocol allows us to omit any trailing - * xdr pad bytes, saving the server an RDMA operation. */ - if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) - return n; n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); if (n == RPCRDMA_MAX_SEGS) goto out_overflow; @@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, if (rtype == rpcrdma_areadch) pos = 0; seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, + rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, false, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); @@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); @@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", @@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, &rqst->rq_snd_buf, rtype)) { iptr = ERR_PTR(-EIO); - goto out_unmap; + goto out_err; } return 0; -out_unmap: - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); +out_err: + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 288e35c2d8f4..ff1df40f0d26 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -4,6 +4,7 @@ * Support for backward direction RPCs on RPC/RDMA (server-side). */ +#include <linux/module.h> #include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" @@ -200,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + __be32 *p; int rc; /* Space in the send buffer for an RPC/RDMA header is reserved * via xprt->tsh_size. */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 0ba9887f3e22..1c4aabf0f657 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,102 +48,43 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Decodes a read chunk list. The expected format is as follows: - * descrim : xdr_one - * position : __be32 offset into XDR stream - * handle : __be32 RKEY - * . . . - * end-of-list: xdr_zero - */ -static __be32 *decode_read_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) { - struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + __be32 *next; - while (ch->rc_discrim != xdr_zero) { - if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > - (unsigned long)vaend) { - dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + while (*p++ != xdr_zero) { + next = p + rpcrdma_readchunk_maxsz - 1; + if (next > end) return NULL; - } - ch++; + p = next; } - return &ch->rc_position; + return p; } -/* - * Decodes a write chunk list. The expected format is as follows: - * descrim : xdr_one - * nchunks : <count> - * handle : __be32 RKEY ---+ - * length : __be32 <len of segment> | - * offset : remove va + <count> - * . . . | - * ---+ - */ -static __be32 *decode_write_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; + __be32 *next; - /* Check for not write-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + while (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - return &ary->wc_array[nchunks].wc_target.rs_length; + return p; } -static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; - - /* Check for no reply-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + __be32 *next; + + if (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - return (__be32 *)&ary->wc_array[nchunks]; + return p; } /** @@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) */ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - struct rpcrdma_msg *rmsgp; - __be32 *va, *vaend; - unsigned int len; - u32 hdr_len; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { - dprintk("svcrdma: header too short = %d\n", - rq_arg->len); - return -EINVAL; - } + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; - rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; - if (rmsgp->rm_vers != rpcrdma_version) { - dprintk("%s: bad version %u\n", __func__, - be32_to_cpu(rmsgp->rm_vers)); - return -EPROTONOSUPPORT; - } + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; - switch (be32_to_cpu(rmsgp->rm_type)) { - case RDMA_MSG: - case RDMA_NOMSG: + switch (*(rdma_argp + 3)) { + case rdma_msg: + case rdma_nomsg: break; - case RDMA_DONE: - /* Just drop it */ - dprintk("svcrdma: dropping RDMA_DONE message\n"); - return 0; - - case RDMA_ERROR: - /* Possible if this is a backchannel reply. - * XXX: We should cancel this XID, though. - */ - dprintk("svcrdma: dropping RDMA_ERROR message\n"); - return 0; - - case RDMA_MSGP: - /* Pull in the extra for the padded case, bump our pointer */ - rmsgp->rm_body.rm_padded.rm_align = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); - rmsgp->rm_body.rm_padded.rm_thresh = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); - - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rq_arg->head[0].iov_base = va; - len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rq_arg->head[0].iov_len -= len; - if (len > rq_arg->len) - return -EINVAL; - return len; - default: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpu(rmsgp->rm_type)); - return -EINVAL; - } + case rdma_done: + goto out_drop; - /* The chunk list may contain either a read chunk list or a write - * chunk list and a reply chunk list. - */ - va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); - va = decode_read_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode read list\n"); - return -EINVAL; - } - va = decode_write_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode write list\n"); - return -EINVAL; - } - va = decode_reply_array(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode reply chunk\n"); - return -EINVAL; + case rdma_error: + goto out_drop; + + default: + goto out_proc; } - rq_arg->head[0].iov_base = va; - hdr_len = (unsigned long)va - (unsigned long)rmsgp; + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, @@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, *va++ = rmsgp->rm_xid; *va++ = rmsgp->rm_vers; - *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = xprt->sc_fc_credits; *va++ = rdma_error; *va++ = cpu_to_be32(err); if (err == ERR_VERS) { @@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, return (int)((unsigned long)va - (unsigned long)startp); } -int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +/** + * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header + * @rdma_resp: buffer containing Reply transport header + * + * Returns length of transport header, in bytes. + */ +unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) { - struct rpcrdma_write_array *wr_ary; + unsigned int nsegs; + __be32 *p; - /* There is no read-list in a reply */ + p = rdma_resp; - /* skip write list */ - wr_ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. - wc_target.rs_length; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - /* skip reply array */ - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - return (unsigned long) wr_ary - (unsigned long) rmsgp; + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; } void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) @@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, seg->rs_offset = rs_offset; seg->rs_length = cpu_to_be32(write_len); } - -void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, - struct rpcrdma_msg *rdma_resp, - enum rpcrdma_proc rdma_type) -{ - rdma_resp->rm_xid = rdma_argp->rm_xid; - rdma_resp->rm_vers = rdma_argp->rm_vers; - rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); - rdma_resp->rm_type = cpu_to_be32(rdma_type); - - /* Encode <nul> chunks lists */ - rdma_resp->rm_body.rm_chunks[0] = xdr_zero; - rdma_resp->rm_body.rm_chunks[1] = xdr_zero; - rdma_resp->rm_body.rm_chunks[2] = xdr_zero; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 172b537f8cfc..f7b2daf72a86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); } else { atomic_inc(&rdma_stat_rq_starve); clear_bit(XPT_DATA, &xprt->xpt_flags); ctxt = NULL; } - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); if (!ctxt) { /* This is the EAGAIN path. The svc_recv routine will * return -EAGAIN, the nfsd thread will go to call into diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ad4d286a83c5..515221b16d09 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].length = + svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, ctxt->sge[0].length, DMA_TO_DEVICE); @@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; struct rpcrdma_write_array *wr_ary, *rp_ary; - enum rpcrdma_proc reply_type; int ret; int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; u32 inv_rkey; + __be32 *p; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - if (rp_ary) - reply_type = RDMA_NOMSG; - else - reply_type = RDMA_MSG; - svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, - rdma_resp, reply_type); + + p = &rdma_resp->rm_xid; + *p++ = rdma_argp->rm_xid; + *p++ = rdma_argp->rm_vers; + *p++ = rdma->sc_fc_credits; + *p++ = rp_ary ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; /* Send any write-chunk data and build resp write-list */ if (wr_ary) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ca2799af05a6..c13a5c35ce14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, ctxt = kmalloc(sizeof(*ctxt), flags); if (ctxt) { ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->free); - INIT_LIST_HEAD(&ctxt->dto_q); + INIT_LIST_HEAD(&ctxt->list); } return ctxt; } @@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) dprintk("svcrdma: No memory for RDMA ctxt\n"); return false; } - list_add(&ctxt->free, &xprt->sc_ctxts); + list_add(&ctxt->list, &xprt->sc_ctxts); } return true; } @@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used++; if (list_empty(&xprt->sc_ctxts)) goto out_empty; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del_init(&ctxt->free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); out: ctxt->count = 0; @@ -209,15 +208,15 @@ out_empty: /* Either pre-allocation missed the mark, or send * queue accounting is broken. */ - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); ctxt = alloc_ctxt(xprt, GFP_NOIO); if (ctxt) goto out; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); return NULL; } @@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - list_add(&ctxt->free, &xprt->sc_ctxts); - spin_unlock_bh(&xprt->sc_ctxt_lock); + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); } static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) @@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del(&ctxt->free); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); kfree(ctxt); } } @@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->byte_len = wc->byte_len; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); spin_unlock(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); @@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) read_hdr = ctxt->read_hdr; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->dto_q, + list_add_tail(&read_hdr->list, &xprt->sc_read_complete_q); spin_unlock(&xprt->sc_rq_dto_lock); @@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return NULL; svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); - INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); @@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. + */ + set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); + if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) { struct svc_rdma_fastreg_mr *frmr = NULL; - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); if (!list_empty(&rdma->sc_frmr_q)) { frmr = list_entry(rdma->sc_frmr_q.next, struct svc_rdma_fastreg_mr, frmr_list); list_del_init(&frmr->frmr_list); frmr->sg_nents = 0; } - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); if (frmr) return frmr; @@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock_bh(&rdma->sc_frmr_q_lock); + spin_lock(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock_bh(&rdma->sc_frmr_q_lock); + spin_unlock(&rdma->sc_frmr_q_lock); } } @@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); + newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + @@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_sq_cq)) { dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, - 0, IB_POLL_SOFTIRQ); + 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); goto errout; @@ -1201,9 +1208,9 @@ static void __svc_rdma_free(struct work_struct *work) ib_drain_qp(rdma->sc_qp); /* We should only be called from kref_put */ - if (atomic_read(&xprt->xpt_ref.refcount) != 0) + if (kref_read(&xprt->xpt_ref) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&xprt->xpt_ref.refcount)); + kref_read(&xprt->xpt_ref)); /* * Destroy queued, but not processed read completions. Note @@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work) */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; - ctxt = list_entry(rdma->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 534c178d2a7e..c717f5410776 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; - int xprt_rdma_pad_optimize = 1; + int xprt_rdma_pad_optimize = 0; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task) return 0; failed_marshal: - dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", - __func__, rc); - if (rc == -EIO) - r_xprt->rx_stats.failed_marshal_count++; if (rc != -ENOTCONN) return rc; drop_connection: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 11d07748f699..81cd31acf690 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -54,6 +54,7 @@ #include <linux/sunrpc/svc_rdma.h> #include <asm/bitops.h> #include <linux/module.h> /* try_module_get()/module_put() */ +#include <rdma/ib_cm.h> #include "xprt_rdma.h" @@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, /* Default settings for RPC-over-RDMA Version One */ r_xprt->rx_ia.ri_reminv_expected = false; + r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; @@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { r_xprt->rx_ia.ri_reminv_expected = true; + r_xprt->rx_ia.ri_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } @@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n", + sap, rpc_get_port(sap), ia->ri_device->name, + rdma_reject_msg(id, event->status)); +#endif connstate = -ECONNREFUSED; + if (event->status == IB_CM_REJ_STALE_CONN) + connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; @@ -486,18 +496,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) */ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, - struct rpcrdma_create_data_internal *cdata) + struct rpcrdma_create_data_internal *cdata) { struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + unsigned int max_qp_wr, max_sge; struct ib_cq *sendcq, *recvcq; - unsigned int max_qp_wr; int rc; - if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) { - dprintk("RPC: %s: insufficient sge's available\n", - __func__); + max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES); + if (max_sge < RPCRDMA_MIN_SEND_SGES) { + pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge); return -ENOMEM; } + ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES; if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", @@ -522,7 +533,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ - ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES; + ep->rep_attr.cap.max_send_sge = max_sge; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -640,20 +651,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); struct rdma_cm_id *id, *old; + struct sockaddr *sap; + unsigned int extras; int rc = 0; - int retry_count = 0; if (ep->rep_connected != 0) { - struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); rpcrdma_ep_disconnect(ep, ia); - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - id = rpcrdma_create_id(xprt, ia, - (struct sockaddr *)&xprt->rx_data.addr); + sap = (struct sockaddr *)&r_xprt->rx_data.addr; + id = rpcrdma_create_id(r_xprt, ia, sap); if (IS_ERR(id)) { rc = -EHOSTUNREACH; goto out; @@ -708,51 +720,18 @@ retry: } wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - - /* - * Check state. A non-peer reject indicates no listener - * (ECONNREFUSED), which may be a transient state. All - * others indicate a transport condition which has already - * undergone a best-effort. - */ - if (ep->rep_connected == -ECONNREFUSED && - ++retry_count <= RDMA_CONNECT_RETRY_MAX) { - dprintk("RPC: %s: non-peer_reject, retry\n", __func__); - goto retry; - } if (ep->rep_connected <= 0) { - /* Sometimes, the only way to reliably connect to remote - * CMs is to use same nonzero values for ORD and IRD. */ - if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 && - (ep->rep_remote_cma.responder_resources == 0 || - ep->rep_remote_cma.initiator_depth != - ep->rep_remote_cma.responder_resources)) { - if (ep->rep_remote_cma.responder_resources == 0) - ep->rep_remote_cma.responder_resources = 1; - ep->rep_remote_cma.initiator_depth = - ep->rep_remote_cma.responder_resources; + if (ep->rep_connected == -EAGAIN) goto retry; - } rc = ep->rep_connected; - } else { - struct rpcrdma_xprt *r_xprt; - unsigned int extras; - - dprintk("RPC: %s: connected\n", __func__); - - r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - extras = r_xprt->rx_buf.rb_bc_srv_max_requests; - - if (extras) { - rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) { - pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", - __func__, rc); - rc = 0; - } - } + goto out; } + dprintk("RPC: %s: connected\n", __func__); + extras = r_xprt->rx_buf.rb_bc_srv_max_requests; + if (extras) + rpcrdma_ep_post_extra_recv(r_xprt, extras); + out: if (rc) ep->rep_connected = rc; @@ -797,9 +776,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work) spin_lock(&buf->rb_recovery_lock); while (!list_empty(&buf->rb_stale_mrs)) { - mw = list_first_entry(&buf->rb_stale_mrs, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); + mw = rpcrdma_pop_mw(&buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); dprintk("RPC: %s: recovering MR %p\n", __func__, mw); @@ -817,7 +794,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_recovery_lock); - list_add(&mw->mw_list, &buf->rb_stale_mrs); + rpcrdma_push_mw(mw, &buf->rb_stale_mrs); spin_unlock(&buf->rb_recovery_lock); schedule_delayed_work(&buf->rb_recovery_worker, 0); @@ -1093,11 +1070,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) struct rpcrdma_mw *mw = NULL; spin_lock(&buf->rb_mwlock); - if (!list_empty(&buf->rb_mws)) { - mw = list_first_entry(&buf->rb_mws, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - } + if (!list_empty(&buf->rb_mws)) + mw = rpcrdma_pop_mw(&buf->rb_mws); spin_unlock(&buf->rb_mwlock); if (!mw) @@ -1120,7 +1094,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; spin_lock(&buf->rb_mwlock); - list_add_tail(&mw->mw_list, &buf->rb_mws); + rpcrdma_push_mw(mw, &buf->rb_mws); spin_unlock(&buf->rb_mwlock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index e35efd4ac1e4..171a35116de9 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -74,7 +74,9 @@ struct rpcrdma_ia { unsigned int ri_max_frmr_depth; unsigned int ri_max_inline_write; unsigned int ri_max_inline_read; + unsigned int ri_max_send_sges; bool ri_reminv_expected; + bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; @@ -303,15 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ char *mr_offset; /* kva if no page, else offset */ }; -/* Reserve enough Send SGEs to send a maximum size inline request: +/* The Send SGE array is provisioned to send a maximum size + * inline request: * - RPC-over-RDMA header * - xdr_buf head iovec - * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages + * - RPCRDMA_MAX_INLINE bytes, in pages * - xdr_buf tail iovec + * + * The actual number of array elements consumed by each RPC + * depends on the device's max_sge limit. */ enum { - RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1, - RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1, + RPCRDMA_MIN_SEND_SGES = 3, + RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT, RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1, }; @@ -348,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst) return rqst->rq_xprtdata; } +static inline void +rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list) +{ + list_add_tail(&mw->mw_list, list); +} + +static inline struct rpcrdma_mw * +rpcrdma_pop_mw(struct list_head *list) +{ + struct rpcrdma_mw *mw; + + mw = list_first_entry(list, struct rpcrdma_mw, mw_list); + list_del(&mw->mw_list); + return mw; +} + /* * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for * inline requests/replies, and client/server credits. |