diff options
Diffstat (limited to 'net')
71 files changed, 1401 insertions, 1361 deletions
diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 16e10680518c..931ea00c4fed 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -242,8 +242,9 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, "w", nwname); if (!errcode) { *wnames = - kmalloc(sizeof(char *) * *nwname, - GFP_NOFS); + kmalloc_array(*nwname, + sizeof(char *), + GFP_NOFS); if (!*wnames) errcode = -ENOMEM; } @@ -285,9 +286,9 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt, p9pdu_readf(pdu, proto_version, "w", nwqid); if (!errcode) { *wqids = - kmalloc(*nwqid * - sizeof(struct p9_qid), - GFP_NOFS); + kmalloc_array(*nwqid, + sizeof(struct p9_qid), + GFP_NOFS); if (*wqids == NULL) errcode = -ENOMEM; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 4d0372263e5d..05006cbb3361 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -360,7 +360,8 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, nr_pages = DIV_ROUND_UP((unsigned long)p + len, PAGE_SIZE) - (unsigned long)p / PAGE_SIZE; - *pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + *pages = kmalloc_array(nr_pages, sizeof(struct page *), + GFP_NOFS); if (!*pages) return -ENOMEM; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 31e0dcb970f8..75620c2f2617 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -472,7 +472,7 @@ static const uint8_t *copy_macs(struct mpoa_client *mpc, if (mpc->number_of_mps_macs != 0) kfree(mpc->mps_macs); mpc->number_of_mps_macs = 0; - mpc->mps_macs = kmalloc(num_macs * ETH_ALEN, GFP_KERNEL); + mpc->mps_macs = kmalloc_array(ETH_ALEN, num_macs, GFP_KERNEL); if (mpc->mps_macs == NULL) { pr_info("(%s) out of mem\n", mpc->dev->name); return NULL; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 1dec33790198..ee8ef1228263 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1281,7 +1281,7 @@ int hci_inquiry(void __user *arg) /* cache_dump can't sleep. 
Therefore we allocate temp buffer and then * copy it to the user space. */ - buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL); + buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9b7907ebfa01..d17a4736e47c 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -331,7 +331,7 @@ static int l2cap_seq_list_init(struct l2cap_seq_list *seq_list, u16 size) */ alloc_size = roundup_pow_of_two(size); - seq_list->list = kmalloc(sizeof(u16) * alloc_size, GFP_KERNEL); + seq_list->list = kmalloc_array(alloc_size, sizeof(u16), GFP_KERNEL); if (!seq_list->list) return -ENOMEM; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index cb4729539b82..920665dd92db 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -333,7 +333,7 @@ static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max, mdb->max = max; mdb->old = old; - mdb->mhash = kzalloc(max * sizeof(*mdb->mhash), GFP_ATOMIC); + mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC); if (!mdb->mhash) { kfree(mdb); return -ENOMEM; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index bcec377b07e7..491828713e0b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -918,12 +918,13 @@ static int translate_table(struct net *net, const char *name, * if an error occurs */ newinfo->chainstack = - vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack))); + vmalloc(array_size(nr_cpu_ids, + sizeof(*(newinfo->chainstack)))); if (!newinfo->chainstack) return -ENOMEM; for_each_possible_cpu(i) { newinfo->chainstack[i] = - vmalloc(udc_cnt * sizeof(*(newinfo->chainstack[0]))); + vmalloc(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0])))); if (!newinfo->chainstack[i]) { while (i) vfree(newinfo->chainstack[--i]); @@ -933,7 +934,7 @@ static int 
translate_table(struct net *net, const char *name, } } - cl_s = vmalloc(udc_cnt * sizeof(*cl_s)); + cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s))); if (!cl_s) return -ENOMEM; i = 0; /* the i'th udc */ @@ -1308,7 +1309,7 @@ static int do_update_counters(struct net *net, const char *name, if (num_counters == 0) return -EINVAL; - tmp = vmalloc(num_counters * sizeof(*tmp)); + tmp = vmalloc(array_size(num_counters, sizeof(*tmp))); if (!tmp) return -ENOMEM; @@ -1449,7 +1450,7 @@ static int copy_counters_to_user(struct ebt_table *t, return -EINVAL; } - counterstmp = vmalloc(nentries * sizeof(*counterstmp)); + counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp))); if (!counterstmp) return -ENOMEM; diff --git a/net/can/bcm.c b/net/can/bcm.c index 97fedff3f0c4..9393f25df08d 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -923,8 +923,9 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { - op->frames = kmalloc(msg_head->nframes * op->cfsiz, - GFP_KERNEL); + op->frames = kmalloc_array(msg_head->nframes, + op->cfsiz, + GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; @@ -1095,15 +1096,17 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ - op->frames = kmalloc(msg_head->nframes * op->cfsiz, - GFP_KERNEL); + op->frames = kmalloc_array(msg_head->nframes, + op->cfsiz, + GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ - op->last_frames = kzalloc(msg_head->nframes * op->cfsiz, + op->last_frames = kcalloc(msg_head->nframes, + op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3b3d33ea9ed8..c6413c360771 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -168,12 +168,6 @@ static char 
tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; static struct lock_class_key socket_class; #endif -/* - * When skipping (ignoring) a block of input we read it into a "skip - * buffer," which is this many bytes in size. - */ -#define SKIP_BUF_SIZE 1024 - static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); @@ -520,12 +514,18 @@ static int ceph_tcp_connect(struct ceph_connection *con) return 0; } +/* + * If @buf is NULL, discard up to @len bytes. + */ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; + if (!buf) + msg.msg_flags |= MSG_TRUNC; + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) @@ -2575,9 +2575,6 @@ static int try_write(struct ceph_connection *con) con->state != CON_STATE_OPEN) return 0; -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - /* open the socket first? */ if (con->state == CON_STATE_PREOPEN) { BUG_ON(con->sock); @@ -2598,7 +2595,8 @@ more: } } -more_kvec: +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); BUG_ON(!con->sock); /* kvec data queued? */ @@ -2623,7 +2621,7 @@ more_kvec: ret = write_partial_message_data(con); if (ret == 1) - goto more_kvec; /* we need to send the footer, too! */ + goto more; /* we need to send the footer, too! */ if (ret == 0) goto out; if (ret < 0) { @@ -2659,8 +2657,6 @@ out: return ret; } - - /* * Read what we can from the socket. */ @@ -2721,16 +2717,11 @@ more: if (con->in_base_pos < 0) { /* * skipping + discarding content. - * - * FIXME: there must be a better way to do this! 
*/ - static char buf[SKIP_BUF_SIZE]; - int skip = min((int) sizeof (buf), -con->in_base_pos); - - dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); - ret = ceph_tcp_recvmsg(con->sock, buf, skip); + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); if (ret <= 0) goto out; + dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); con->in_base_pos += ret; if (con->in_base_pos) goto more; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 69a2581ddbba..a00c74f1154e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -766,7 +766,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_dup_last); -void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, +int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method) { struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, @@ -778,7 +778,9 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, BUG_ON(opcode != CEPH_OSD_OP_CALL); pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); - BUG_ON(!pagelist); + if (!pagelist) + return -ENOMEM; + ceph_pagelist_init(pagelist); op->cls.class_name = class; @@ -798,6 +800,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); op->indata_len = payload_len; + return 0; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -1026,7 +1029,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } - req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); @@ -1054,6 +1056,38 @@ EXPORT_SYMBOL(ceph_osdc_new_request); DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) 
+/* + * Call @fn on each OSD request as long as @fn returns 0. + */ +static void for_each_request(struct ceph_osd_client *osdc, + int (*fn)(struct ceph_osd_request *req, void *arg), + void *arg) +{ + struct rb_node *n, *p; + + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + for (p = rb_first(&osd->o_requests); p; ) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + p = rb_next(p); + if (fn(req, arg)) + return; + } + } + + for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + p = rb_next(p); + if (fn(req, arg)) + return; + } +} + static bool osd_homeless(struct ceph_osd *osd) { return osd->o_osd == CEPH_HOMELESS_OSD; @@ -1395,7 +1429,6 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool recovery_deletes = ceph_osdmap_flag(osdc, CEPH_OSDMAP_RECOVERY_DELETES); enum calc_target_result ct_res; - int ret; t->epoch = osdc->osdmap->epoch; pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); @@ -1431,14 +1464,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, } } - ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, - &pgid); - if (ret) { - WARN_ON(ret != -ENOENT); - t->osd = CEPH_HOMELESS_OSD; - ct_res = CALC_TARGET_POOL_DNE; - goto out; - } + __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, &pgid); last_pgid.pool = pgid.pool; last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); @@ -2161,9 +2187,9 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd *osd; enum calc_target_result ct_res; + int err = 0; bool need_send = false; bool promoted = false; - bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -2179,7 +2205,10 @@ again: 
goto promote; } - if (osdc->osdmap->epoch < osdc->epoch_barrier) { + if (osdc->abort_err) { + dout("req %p abort_err %d\n", req, osdc->abort_err); + err = osdc->abort_err; + } else if (osdc->osdmap->epoch < osdc->epoch_barrier) { dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, osdc->epoch_barrier); req->r_t.paused = true; @@ -2200,11 +2229,13 @@ again: (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); - pr_warn_ratelimited("FULL or reached pool quota\n"); - req->r_t.paused = true; - maybe_request_map(osdc); - if (req->r_abort_on_full) - need_abort = true; + if (osdc->abort_on_full) { + err = -ENOSPC; + } else { + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -2221,11 +2252,11 @@ again: link_request(osd, req); if (need_send) send_request(req); - else if (need_abort) - complete_request(req, -ENOSPC); + else if (err) + complete_request(req, err); mutex_unlock(&osd->lock); - if (ct_res == CALC_TARGET_POOL_DNE) + if (!err && ct_res == CALC_TARGET_POOL_DNE) send_map_check(req); if (promoted) @@ -2281,11 +2312,21 @@ static void finish_request(struct ceph_osd_request *req) static void __complete_request(struct ceph_osd_request *req) { - if (req->r_callback) { - dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, - req->r_tid, req->r_callback, req->r_result); + dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, + req->r_tid, req->r_callback, req->r_result); + + if (req->r_callback) req->r_callback(req); - } + complete_all(&req->r_completion); + ceph_osdc_put_request(req); +} + +static void complete_request_workfn(struct work_struct *work) +{ + struct ceph_osd_request *req = + container_of(work, struct ceph_osd_request, r_complete_work); + + __complete_request(req); } /* @@ -2297,9 +2338,9 @@ static void complete_request(struct 
ceph_osd_request *req, int err) req->r_result = err; finish_request(req); - __complete_request(req); - complete_all(&req->r_completion); - ceph_osdc_put_request(req); + + INIT_WORK(&req->r_complete_work, complete_request_workfn); + queue_work(req->r_osdc->completion_wq, &req->r_complete_work); } static void cancel_map_check(struct ceph_osd_request *req) @@ -2336,6 +2377,28 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static int abort_fn(struct ceph_osd_request *req, void *arg) +{ + int err = *(int *)arg; + + abort_request(req, err); + return 0; /* continue iteration */ +} + +/* + * Abort all in-flight requests with @err and arrange for all future + * requests to be failed immediately. + */ +void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) +{ + dout("%s osdc %p err %d\n", __func__, osdc, err); + down_write(&osdc->lock); + for_each_request(osdc, abort_fn, &err); + osdc->abort_err = err; + up_write(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_abort_requests); + static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) { if (likely(eb > osdc->epoch_barrier)) { @@ -2363,6 +2426,30 @@ void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); /* + * We can end up releasing caps as a result of abort_request(). + * In that case, we probably want to ensure that the cap release message + * has an updated epoch barrier in it, so set the epoch barrier prior to + * aborting the first request. 
+ */ +static int abort_on_full_fn(struct ceph_osd_request *req, void *arg) +{ + struct ceph_osd_client *osdc = req->r_osdc; + bool *victims = arg; + + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.base_oloc.pool))) { + if (!*victims) { + update_epoch_barrier(osdc, osdc->osdmap->epoch); + *victims = true; + } + abort_request(req, -ENOSPC); + } + + return 0; /* continue iteration */ +} + +/* * Drop all pending requests that are stalled waiting on a full condition to * clear, and complete them with ENOSPC as the return code. Set the * osdc->epoch_barrier to the latest map epoch that we've seen if any were @@ -2370,61 +2457,11 @@ EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { - struct rb_node *n; bool victims = false; - dout("enter abort_on_full\n"); - - if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) - goto out; - - /* Scan list and see if there is anything to abort */ - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - struct rb_node *m; - - m = rb_first(&osd->o_requests); - while (m) { - struct ceph_osd_request *req = rb_entry(m, - struct ceph_osd_request, r_node); - m = rb_next(m); - - if (req->r_abort_on_full) { - victims = true; - break; - } - } - if (victims) - break; - } - - if (!victims) - goto out; - - /* - * Update the barrier to current epoch if it's behind that point, - * since we know we have some calls to be aborted in the tree. 
- */ - update_epoch_barrier(osdc, osdc->osdmap->epoch); - - for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { - struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - struct rb_node *m; - - m = rb_first(&osd->o_requests); - while (m) { - struct ceph_osd_request *req = rb_entry(m, - struct ceph_osd_request, r_node); - m = rb_next(m); - - if (req->r_abort_on_full && - (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || - pool_full(osdc, req->r_t.target_oloc.pool))) - abort_request(req, -ENOSPC); - } - } -out: - dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); + if (osdc->abort_on_full && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc))) + for_each_request(osdc, abort_on_full_fn, &victims); } static void check_pool_dne(struct ceph_osd_request *req) @@ -3541,8 +3578,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) up_read(&osdc->lock); __complete_request(req); - complete_all(&req->r_completion); - ceph_osdc_put_request(req); return; fail_request: @@ -4927,7 +4962,10 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, if (ret) goto out_put_req; - osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); + if (ret) + goto out_put_req; + if (req_page) osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len, 0, false, false); @@ -4996,6 +5034,10 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->notify_wq) goto out_msgpool_reply; + osdc->completion_wq = create_singlethread_workqueue("ceph-completion"); + if (!osdc->completion_wq) + goto out_notify_wq; + schedule_delayed_work(&osdc->timeout_work, osdc->client->options->osd_keepalive_timeout); schedule_delayed_work(&osdc->osds_timeout_work, @@ -5003,6 +5045,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) return 0; +out_notify_wq: + destroy_workqueue(osdc->notify_wq); out_msgpool_reply: 
ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: @@ -5017,7 +5061,7 @@ out: void ceph_osdc_stop(struct ceph_osd_client *osdc) { - flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->completion_wq); destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 9645ffd6acfb..98c0ff3d6441 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1299,8 +1299,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), - GFP_NOFS); + map->osd_primary_affinity = kmalloc_array(map->max_osd, + sizeof(u32), + GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; @@ -2145,10 +2146,10 @@ bool ceph_osds_changed(const struct ceph_osds *old_acting, * Should only be called with target_oid and target_oloc (as opposed to * base_oid and base_oloc), since tiering isn't taken into account. 
*/ -int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, - const struct ceph_object_id *oid, - const struct ceph_object_locator *oloc, - struct ceph_pg *raw_pgid) +void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_object_id *oid, + const struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { WARN_ON(pi->id != oloc->pool); @@ -2164,11 +2165,8 @@ int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, int nsl = oloc->pool_ns->len; size_t total = nsl + 1 + oid->name_len; - if (total > sizeof(stack_buf)) { - buf = kmalloc(total, GFP_NOIO); - if (!buf) - return -ENOMEM; - } + if (total > sizeof(stack_buf)) + buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); memcpy(buf, oloc->pool_ns->str, nsl); buf[nsl] = '\037'; memcpy(buf + nsl + 1, oid->name, oid->name_len); @@ -2180,7 +2178,6 @@ int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, oid->name, nsl, oloc->pool_ns->str, raw_pgid->pool, raw_pgid->seed); } - return 0; } int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, @@ -2194,7 +2191,8 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, if (!pi) return -ENOENT; - return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); + __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); + return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index a3d0adc828e6..e560d3975f41 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -20,7 +20,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, int got = 0; int rc = 0; - pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS); if (!pages) return ERR_PTR(-ENOMEM); @@ -74,7 +74,7 @@ struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) struct page **pages; int i; - pages = kmalloc(sizeof(*pages) * num_pages, flags); + pages = kmalloc_array(num_pages, sizeof(*pages), flags); if (!pages) return ERR_PTR(-ENOMEM); 
for (i = 0; i < num_pages; i++) { diff --git a/net/core/dev.c b/net/core/dev.c index 6e18242a1cae..57b7bab5f70b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8823,7 +8823,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) int i; struct hlist_head *hash; - hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); + hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL); if (hash != NULL) for (i = 0; i < NETDEV_HASHENTRIES; i++) INIT_HLIST_HEAD(&hash[i]); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c15075dc7572..e677a20180cf 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -911,7 +911,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; - info_buf = kzalloc(n_bits * sizeof(u32), GFP_USER); + info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); if (!info_buf) return -ENOMEM; @@ -1017,7 +1017,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) - rule_buf = kzalloc(info.rule_cnt * sizeof(u32), + rule_buf = kcalloc(info.rule_cnt, sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; @@ -1816,7 +1816,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) return -EFAULT; test.len = test_len; - data = kmalloc(test_len * sizeof(u64), GFP_USER); + data = kmalloc_array(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; @@ -1852,7 +1852,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) WARN_ON_ONCE(!ret); gstrings.len = ret; - data = vzalloc(gstrings.len * ETH_GSTRING_LEN); + data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); if (gstrings.len && !data) return -ENOMEM; @@ -1952,7 +1952,7 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) return -EFAULT; stats.n_stats = n_stats; - 
data = vzalloc(n_stats * sizeof(u64)); + data = vzalloc(array_size(n_stats, sizeof(u64))); if (n_stats && !data) return -ENOMEM; @@ -1996,7 +1996,7 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return -EFAULT; stats.n_stats = n_stats; - data = vzalloc(n_stats * sizeof(u64)); + data = vzalloc(array_size(n_stats, sizeof(u64))); if (n_stats && !data) return -ENOMEM; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e4ede34cc52..49368e21d228 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3603,7 +3603,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) return -ENOMEM; strcpy(pkt_dev->odevname, ifname); - pkt_dev->flows = vzalloc_node(MAX_CFLOWS * sizeof(struct flow_state), + pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, + sizeof(struct flow_state)), node); if (pkt_dev->flows == NULL) { kfree(pkt_dev); diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index d2f4e0c1faaf..2589a6b78aa1 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -984,7 +984,8 @@ static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { - table = kmalloc(sizeof(struct dcb_app) * app_count, GFP_KERNEL); + table = kmalloc_array(app_count, sizeof(struct dcb_app), + GFP_KERNEL); if (!table) return -ENOMEM; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 385f153fe031..2b75df469220 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -46,7 +46,8 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) return -ENOMEM; /* allocate buffer and initialize linked list */ - seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any()); + seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), + gfp_any()); if (seqp == NULL) return -ENOMEM; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index dc2960be51e0..b231e40f006a 100644 --- 
a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -38,7 +38,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, { void *hdr; int i, pages = 0; - uint32_t *buf = kzalloc(32 * sizeof(uint32_t), GFP_KERNEL); + uint32_t *buf = kcalloc(32, sizeof(uint32_t), GFP_KERNEL); pr_debug("%s\n", __func__); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 63aa39b3af03..b21833651394 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -567,7 +567,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, struct nlattr *mx; int len = 0; - mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL); + mx = kcalloc(3, nla_total_size(4), GFP_KERNEL); if (!mx) return -ENOMEM; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bf4e4adc2d00..1df6e97106d7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -649,7 +649,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, hash = rcu_dereference(nh->nh_exceptions); if (!hash) { - hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC); + hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nh->nh_exceptions, hash); @@ -3146,7 +3146,8 @@ int __init ip_rt_init(void) { int cpu; - ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); + ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), + GFP_KERNEL); if (!ip_idents) panic("IP: failed to allocate ip_idents\n"); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index d8c4b6374377..be491bf6ab6e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -956,7 +956,7 @@ static int __net_init icmpv6_sk_init(struct net *net) int err, i, j; net->ipv6.icmp_sk = - kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); + kcalloc(nr_cpu_ids, sizeof(struct sock *), GFP_KERNEL); if (!net->ipv6.icmp_sk) return -ENOMEM; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 44c39c5f0638..10ae13560b40 100644 
--- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -42,7 +42,8 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { - ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); + ilan->locks = kvmalloc_array(size, sizeof(spinlock_t), + GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 89178b46b32f..d9558ffb8acf 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1186,7 +1186,7 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, lockdep_assert_held(&local->mtx); lockdep_assert_held(&local->chanctx_mtx); - vif_chsw = kzalloc(sizeof(vif_chsw[0]) * n_vifs, GFP_KERNEL); + vif_chsw = kcalloc(n_vifs, sizeof(vif_chsw[0]), GFP_KERNEL); if (!vif_chsw) return -ENOMEM; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 722f3d9fb416..fb73451ed85e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -772,7 +772,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) if (have_mfp) n_suites += 4; - suites = kmalloc(sizeof(u32) * n_suites, GFP_KERNEL); + suites = kmalloc_array(n_suites, sizeof(u32), GFP_KERNEL); if (!suites) return -ENOMEM; diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 8221bc5582ab..76048b53c5b2 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -592,11 +592,11 @@ minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) max_rates = sband->n_bitrates; } - mi->r = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp); + mi->r = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp); if (!mi->r) goto error; - mi->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp); + mi->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp); if (!mi->sample_table) goto error1; diff --git a/net/mac80211/rc80211_minstrel_ht.c 
b/net/mac80211/rc80211_minstrel_ht.c index fb586b6e5d49..67ebdeaffbbc 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1313,11 +1313,11 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) if (!msp) return NULL; - msp->ratelist = kzalloc(sizeof(struct minstrel_rate) * max_rates, gfp); + msp->ratelist = kcalloc(max_rates, sizeof(struct minstrel_rate), gfp); if (!msp->ratelist) goto error; - msp->sample_table = kmalloc(SAMPLE_COLUMNS * max_rates, gfp); + msp->sample_table = kmalloc_array(max_rates, SAMPLE_COLUMNS, gfp); if (!msp->sample_table) goto error1; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index a3b1bcc2b461..2e917a6d239d 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -1157,7 +1157,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, } } - ie = kzalloc(num_bands * iebufsz, GFP_KERNEL); + ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 2d82c88efd0b..5e2e511c4a6f 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1803,8 +1803,9 @@ static int ieee80211_reconfig_nan(struct ieee80211_sub_if_data *sdata) if (WARN_ON(res)) return res; - funcs = kzalloc((sdata->local->hw.max_nan_de_entries + 1) * - sizeof(*funcs), GFP_KERNEL); + funcs = kcalloc(sdata->local->hw.max_nan_de_entries + 1, + sizeof(*funcs), + GFP_KERNEL); if (!funcs) return -ENOMEM; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 61c3a389da89..99e0aa350dc5 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1380,7 +1380,8 @@ int __init ip_vs_conn_init(void) /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); + ip_vs_conn_tab = vmalloc(array_size(ip_vs_conn_tab_size, + sizeof(*ip_vs_conn_tab))); if 
(!ip_vs_conn_tab) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index afdeca53e88b..d88841fbc560 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -402,7 +402,8 @@ int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *l4proto) struct nf_conntrack_l4proto __rcu **proto_array; int i; - proto_array = kmalloc(MAX_NF_CT_PROTO * + proto_array = + kmalloc_array(MAX_NF_CT_PROTO, sizeof(struct nf_conntrack_l4proto *), GFP_KERNEL); if (proto_array == NULL) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b7df32a56e7e..46f9df99d276 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -691,8 +691,9 @@ int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto) mutex_lock(&nf_nat_proto_mutex); if (nf_nat_l4protos[l3proto] == NULL) { - l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *), - GFP_KERNEL); + l4protos = kmalloc_array(IPPROTO_MAX, + sizeof(struct nf_nat_l4proto *), + GFP_KERNEL); if (l4protos == NULL) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d23a5c269c44..896d4a36081d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5304,7 +5304,7 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, if (err < 0) return err; - ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL); + ops = kcalloc(n, sizeof(struct nf_hook_ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -7178,8 +7178,8 @@ static int __init nf_tables_module_init(void) nft_chain_filter_init(); - info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, - GFP_KERNEL); + info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info), + GFP_KERNEL); if (info == NULL) { err = -ENOMEM; goto err1; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 
cb5b5f207777..e5d27b2e4eba 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -190,8 +190,9 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; - expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * - class_max, GFP_KERNEL); + expect_policy = kcalloc(class_max, + sizeof(struct nf_conntrack_expect_policy), + GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index df9ab71b0ed9..d0d8397c9588 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1904,7 +1904,7 @@ static int __init xt_init(void) seqcount_init(&per_cpu(xt_recseq, i)); } - xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); + xt = kmalloc_array(NFPROTO_NUMPROTO, sizeof(struct xt_af), GFP_KERNEL); if (!xt) return -ENOMEM; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index b9ce82c9440f..25eeb6d2a75a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -352,8 +352,9 @@ int genl_register_family(struct genl_family *family) } if (family->maxattr && !family->parallel_ops) { - family->attrbuf = kmalloc((family->maxattr+1) * - sizeof(struct nlattr *), GFP_KERNEL); + family->attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), + GFP_KERNEL); if (family->attrbuf == NULL) { err = -ENOMEM; goto errout_locked; @@ -566,8 +567,9 @@ static int genl_family_rcv_msg(const struct genl_family *family, return -EOPNOTSUPP; if (family->maxattr && family->parallel_ops) { - attrbuf = kmalloc((family->maxattr+1) * - sizeof(struct nlattr *), GFP_KERNEL); + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), + GFP_KERNEL); if (attrbuf == NULL) return -ENOMEM; } else diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b97eb766a1d5..93fbcafbf388 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c 
@@ -1395,7 +1395,7 @@ static int __init nr_proto_init(void) return -1; } - dev_nr = kzalloc(nr_ndevs * sizeof(struct net_device *), GFP_KERNEL); + dev_nr = kcalloc(nr_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (dev_nr == NULL) { printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n"); return -1; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index a61818e94396..0f5ce77460d4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1578,8 +1578,9 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_destroy_table; } - dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), - GFP_KERNEL); + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); if (!dp->ports) { err = -ENOMEM; goto err_destroy_percpu; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index f81c1d0ddff4..19f6765566e7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -47,7 +47,7 @@ static struct hlist_head *dev_table; */ int ovs_vport_init(void) { - dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ee018564b2b4..50809748c127 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4161,7 +4161,7 @@ static char *alloc_one_pg_vec_page(unsigned long order) return buffer; /* __get_free_pages failed, fall back to vmalloc */ - buffer = vzalloc((1 << order) * PAGE_SIZE); + buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; diff --git a/net/rds/ib.c b/net/rds/ib.c index 02deee29e7f1..b6ad38e48f62 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -163,7 +163,8 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; 
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; - rds_ibdev->vector_load = kzalloc(sizeof(int) * device->num_comp_vectors, + rds_ibdev->vector_load = kcalloc(device->num_comp_vectors, + sizeof(int), GFP_KERNEL); if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 13b38ad0fa4a..f1684ae6abfd 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -526,7 +526,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto recv_hdrs_dma_out; } - ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ic->i_sends = vzalloc_node(array_size(sizeof(struct rds_ib_send_work), + ic->i_send_ring.w_nr), ibdev_to_node(dev)); if (!ic->i_sends) { ret = -ENOMEM; @@ -534,7 +535,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto ack_dma_out; } - ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ic->i_recvs = vzalloc_node(array_size(sizeof(struct rds_ib_recv_work), + ic->i_recv_ring.w_nr), ibdev_to_node(dev)); if (!ic->i_recvs) { ret = -ENOMEM; diff --git a/net/rds/info.c b/net/rds/info.c index 140a44a5f7b7..e367a97a18c8 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) >> PAGE_SHIFT; - pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5b73fea849df..ebe42e7eb456 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1514,7 +1514,8 @@ static int __init rose_proto_init(void) rose_callsign = null_ax25_address; - dev_rose = kzalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL); + dev_rose = kcalloc(rose_ndevs, sizeof(struct net_device *), + GFP_KERNEL); if 
(dev_rose == NULL) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); rc = -ENOMEM; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6c0ae27fff84..278ac0807a60 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -432,7 +432,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, sg = _sg; if (unlikely(nsg > 4)) { - sg = kmalloc(sizeof(*sg) * nsg, GFP_NOIO); + sg = kmalloc_array(nsg, sizeof(*sg), GFP_NOIO); if (!sg) goto nomem; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 22fa13cf5d8b..cd2e0e342fb6 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -489,11 +489,12 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, return err; if (!q->flows) { - q->flows = kvzalloc(q->flows_cnt * - sizeof(struct fq_codel_flow), GFP_KERNEL); + q->flows = kvcalloc(q->flows_cnt, + sizeof(struct fq_codel_flow), + GFP_KERNEL); if (!q->flows) return -ENOMEM; - q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); + q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); if (!q->backlogs) return -ENOMEM; for (i = 0; i < q->flows_cnt; i++) { diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index bce2632212d3..c3a8388dcdf6 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -599,8 +599,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt, if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = kvzalloc(HH_FLOWS_CNT * - sizeof(struct list_head), GFP_KERNEL); + q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), + GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) @@ -614,8 +614,9 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt, /* Initialize heavy-hitter filter arrays. 
*/ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN * - sizeof(u32), GFP_KERNEL); + q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, + sizeof(u32), + GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. diff --git a/net/sctp/auth.c b/net/sctp/auth.c index e64630cd3331..5b537613946f 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -482,8 +482,9 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) return 0; /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kzalloc(sizeof(struct crypto_shash *) * - SCTP_AUTH_NUM_HMACS, gfp); + ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, + sizeof(struct crypto_shash *), + gfp); if (!ep->auth_hmacs) return -ENOMEM; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 11d93377ba5e..5dffbc493008 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1438,7 +1438,7 @@ static __init int sctp_init(void) /* Allocate and initialize the endpoint hash table. 
*/ sctp_ep_hashsize = 64; sctp_ep_hashtable = - kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); + kmalloc_array(64, sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index cc7c1bb60fe8..dbd2605d1962 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -584,9 +584,9 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = kzalloc( - BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask), - GFP_KERNEL); + link->wr_tx_mask = kcalloc(BITS_TO_LONGS(SMC_WR_BUF_CNT), + sizeof(*link->wr_tx_mask), + GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 9463af4b32e8..be8f103d22fd 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1753,7 +1753,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp) last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT; rqstp->rq_enc_pages_num = last - first + 1 + 1; rqstp->rq_enc_pages - = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), + = kmalloc_array(rqstp->rq_enc_pages_num, + sizeof(struct page *), GFP_NOFS); if (!rqstp->rq_enc_pages) goto out; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 46b295e4f2b8..1c7c49dbf8ba 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -224,7 +224,7 @@ static void gssp_free_receive_pages(struct gssx_arg_accept_sec_context *arg) static int gssp_alloc_receive_pages(struct gssx_arg_accept_sec_context *arg) { arg->npages = DIV_ROUND_UP(NGROUPS_MAX * 4, PAGE_SIZE); - arg->pages = kzalloc(arg->npages * sizeof(struct page *), GFP_KERNEL); + arg->pages = kcalloc(arg->npages, sizeof(struct page *), GFP_KERNEL); /* * XXX: actual pages 
are allocated by xdr layer in * xdr_partial_copy_from_skb. @@ -298,9 +298,11 @@ int gssp_accept_sec_context_upcall(struct net *net, if (res.context_handle) { data->out_handle = rctxh.exported_context_token; data->mech_oid.len = rctxh.mech.len; - if (rctxh.mech.data) + if (rctxh.mech.data) { memcpy(data->mech_oid.data, rctxh.mech.data, data->mech_oid.len); + kfree(rctxh.mech.data); + } client_name = rctxh.src_name.display_name; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index cdda4744c9b1..109fbe591e7b 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1683,7 +1683,7 @@ struct cache_detail *cache_create_net(const struct cache_detail *tmpl, struct ne if (cd == NULL) return ERR_PTR(-ENOMEM); - cd->hash_table = kzalloc(cd->hash_size * sizeof(struct hlist_head), + cd->hash_table = kcalloc(cd->hash_size, sizeof(struct hlist_head), GFP_KERNEL); if (cd->hash_table == NULL) { kfree(cd); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c2266f387213..d839c33ae7d9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1546,6 +1546,7 @@ call_reserveresult(struct rpc_task *task) task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { + xprt_request_init(task); task->tk_action = call_refresh; return; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 70f005044f06..3c85af058227 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -66,7 +66,7 @@ * Local functions */ static void xprt_init(struct rpc_xprt *xprt, struct net *net); -static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); @@ -987,6 +987,8 @@ bool xprt_prepare_transmit(struct rpc_task *task) task->tk_status = -EAGAIN; goto out_unlock; } + if (!bc_prealloc(req) && !req->rq_xmit_bytes_sent) + req->rq_xid = 
xprt_alloc_xid(xprt); ret = true; out_unlock: spin_unlock_bh(&xprt->transport_lock); @@ -1163,10 +1165,10 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) out_init_req: xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, xprt->num_reqs); + spin_unlock(&xprt->reserve_lock); + task->tk_status = 0; task->tk_rqstp = req; - xprt_request_init(task, xprt); - spin_unlock(&xprt->reserve_lock); } EXPORT_SYMBOL_GPL(xprt_alloc_slot); @@ -1184,7 +1186,7 @@ void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); -static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) +void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); if (!xprt_dynamic_free_slot(xprt, req)) { @@ -1194,6 +1196,7 @@ static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) xprt_wake_up_backlog(xprt); spin_unlock(&xprt->reserve_lock); } +EXPORT_SYMBOL_GPL(xprt_free_slot); static void xprt_free_all_slots(struct rpc_xprt *xprt) { @@ -1303,8 +1306,9 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt) xprt->xid = prandom_u32(); } -static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +void xprt_request_init(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; INIT_LIST_HEAD(&req->rq_list); @@ -1312,7 +1316,6 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; - req->rq_xid = xprt_alloc_xid(xprt); req->rq_connect_cookie = xprt->connect_cookie - 1; req->rq_bytes_sent = 0; req->rq_snd_buf.len = 0; @@ -1373,7 +1376,7 @@ void xprt_release(struct rpc_task *task) dprintk("RPC: %5u release request %p\n", task->tk_pid, req); if (likely(!bc_prealloc(req))) - xprt_free_slot(xprt, req); + xprt->ops->free_slot(xprt, req); else xprt_free_bc_request(req); } diff --git 
a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 47ebac949769..90adeff4c06b 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -9,8 +9,10 @@ #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> +#include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS @@ -29,29 +31,41 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, spin_unlock(&buf->rb_reqslock); rpcrdma_destroy_req(req); - - kfree(rqst); } -static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, - struct rpc_rqst *rqst) +static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, + unsigned int count) { - struct rpcrdma_regbuf *rb; - struct rpcrdma_req *req; - size_t size; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpc_rqst *rqst; + unsigned int i; + + for (i = 0; i < (count << 1); i++) { + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + size_t size; + + req = rpcrdma_create_req(r_xprt); + if (IS_ERR(req)) + return PTR_ERR(req); + rqst = &req->rl_slot; + + rqst->rq_xprt = xprt; + INIT_LIST_HEAD(&rqst->rq_list); + INIT_LIST_HEAD(&rqst->rq_bc_list); + __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); - req = rpcrdma_create_req(r_xprt); - if (IS_ERR(req)) - return PTR_ERR(req); - - size = r_xprt->rx_data.inline_rsize; - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) - goto out_fail; - req->rl_sendbuf = rb; - xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, - min_t(size_t, size, PAGE_SIZE)); - rpcrdma_set_xprtdata(rqst, req); + size = r_xprt->rx_data.inline_rsize; + rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + req->rl_sendbuf = rb; + 
xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, + min_t(size_t, size, PAGE_SIZE)); + } return 0; out_fail: @@ -59,23 +73,6 @@ out_fail: return -ENOMEM; } -/* Allocate and add receive buffers to the rpcrdma_buffer's - * existing list of rep's. These are released when the - * transport is destroyed. - */ -static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, - unsigned int count) -{ - int rc = 0; - - while (count--) { - rc = rpcrdma_create_rep(r_xprt); - if (rc) - break; - } - return rc; -} - /** * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests * @xprt: transport associated with these backchannel resources @@ -86,9 +83,6 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; - struct rpc_rqst *rqst; - unsigned int i; int rc; /* The backchannel reply path returns each rpc_rqst to the @@ -103,35 +97,11 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) if (reqs > RPCRDMA_BACKWARD_WRS >> 1) goto out_err; - for (i = 0; i < (reqs << 1); i++) { - rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); - if (!rqst) - goto out_free; - - dprintk("RPC: %s: new rqst %p\n", __func__, rqst); - - rqst->rq_xprt = &r_xprt->rx_xprt; - INIT_LIST_HEAD(&rqst->rq_list); - INIT_LIST_HEAD(&rqst->rq_bc_list); - __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - - if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) - goto out_free; - - spin_lock_bh(&xprt->bc_pa_lock); - list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); - } - - rc = rpcrdma_bc_setup_reps(r_xprt, reqs); + rc = rpcrdma_bc_setup_reqs(r_xprt, reqs); if (rc) goto out_free; - rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs); - if (rc) - goto out_free; - - buffer->rb_bc_srv_max_requests = reqs; + r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; request_module("svcrdma"); 
trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; @@ -235,6 +205,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; + rpcrdma_post_recvs(r_xprt, true); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; return 0; @@ -275,10 +246,14 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) */ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpc_xprt *xprt = rqst->rq_xprt; dprintk("RPC: %s: freeing rqst %p (req %p)\n", - __func__, rqst, rpcr_to_rdmar(rqst)); + __func__, rqst, req); + + rpcrdma_recv_buffer_put(req->rl_reply); + req->rl_reply = NULL; spin_lock_bh(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f2f63959fddd..17fb1e025654 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -20,7 +20,10 @@ * verb (fmr_op_unmap). 
*/ +#include <linux/sunrpc/svc_rdma.h> + #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS @@ -156,10 +159,32 @@ out_release: fmr_op_release_mr(mr); } +/* On success, sets: + * ep->rep_attr.cap.max_send_wr + * ep->rep_attr.cap.max_recv_wr + * cdata->max_requests + * ia->ri_max_segs + */ static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + int max_qp_wr; + + max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr -= RPCRDMA_BACKWARD_WRS; + max_qp_wr -= 1; + if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) + return -ENOMEM; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); return 0; @@ -219,6 +244,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; + trace_xprtrdma_dma_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c59c5c788db0..c040de196e13 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -71,8 +71,10 @@ */ #include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS @@ -202,12 +204,22 @@ out_release: frwr_op_release_mr(mr); } +/* On success, sets: + * 
ep->rep_attr.cap.max_send_wr + * ep->rep_attr.cap.max_recv_wr + * cdata->max_requests + * ia->ri_max_segs + * + * And these FRWR-related fields: + * ia->ri_max_frwr_depth + * ia->ri_mrtype + */ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr *attrs = &ia->ri_device->attrs; - int depth, delta; + int max_qp_wr, depth, delta; ia->ri_mrtype = IB_MR_TYPE_MEM_REG; if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) @@ -241,14 +253,26 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } while (delta > 0); } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > attrs->max_qp_wr) { - cdata->max_requests = attrs->max_qp_wr / depth; + max_qp_wr = ia->ri_device->attrs.max_qp_wr; + max_qp_wr -= RPCRDMA_BACKWARD_WRS; + max_qp_wr -= 1; + if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) + return -ENOMEM; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; + if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { + cdata->max_requests = max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * depth; } + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); @@ -393,6 +417,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; + trace_xprtrdma_dma_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); diff --git a/net/sunrpc/xprtrdma/module.c 
b/net/sunrpc/xprtrdma/module.c index a762d192372b..620327c01302 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2015, 2017 Oracle. All rights reserved. */ @@ -13,9 +14,11 @@ #include <asm/swab.h> -#define CREATE_TRACE_POINTS #include "xprt_rdma.h" +#define CREATE_TRACE_POINTS +#include <trace/events/rpcrdma.h> + MODULE_AUTHOR("Open Grid Computing and Network Appliance, Inc."); MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index e8adad33d0bb..c8ae983c6cc0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. @@ -46,22 +47,17 @@ * to the Linux RPC framework lives. 
*/ -#include "xprt_rdma.h" - #include <linux/highmem.h> +#include <linux/sunrpc/svc_rdma.h> + +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static const char transfertypes[][12] = { - "inline", /* no chunks */ - "read list", /* some argument via rdma read */ - "*read list", /* entire request via rdma read */ - "write list", /* some result via rdma write */ - "reply chunk" /* entire reply via rdma write */ -}; - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a @@ -230,7 +226,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, */ *ppages = alloc_page(GFP_ATOMIC); if (!*ppages) - return -EAGAIN; + return -ENOBUFS; } seg->mr_page = *ppages; seg->mr_offset = (char *)page_base; @@ -365,7 +361,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false, &mr); if (IS_ERR(seg)) - goto out_maperr; + return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) @@ -377,11 +373,6 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } while (nsegs); return 0; - -out_maperr: - if (PTR_ERR(seg) == -EAGAIN) - xprt_wait_for_buffer_space(rqst->rq_task, NULL); - return PTR_ERR(seg); } /* Register and XDR encode the Write list. 
Supports encoding a list @@ -428,7 +419,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - goto out_maperr; + return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -445,11 +436,6 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; - -out_maperr: - if (PTR_ERR(seg) == -EAGAIN) - xprt_wait_for_buffer_space(rqst->rq_task, NULL); - return PTR_ERR(seg); } /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -491,7 +477,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - goto out_maperr; + return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -508,11 +494,6 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; - -out_maperr: - if (PTR_ERR(seg) == -EAGAIN) - xprt_wait_for_buffer_space(rqst->rq_task, NULL); - return PTR_ERR(seg); } /** @@ -709,7 +690,7 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, { req->rl_sendctx = rpcrdma_sendctx_get_locked(&r_xprt->rx_buf); if (!req->rl_sendctx) - return -ENOBUFS; + return -EAGAIN; req->rl_sendctx->sc_wr.num_sge = 0; req->rl_sendctx->sc_unmap_count = 0; req->rl_sendctx->sc_req = req; @@ -883,7 +864,15 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: - r_xprt->rx_stats.failed_marshal_count++; + switch (ret) { + case -EAGAIN: + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + break; + case -ENOBUFS: + break; + default: + r_xprt->rx_stats.failed_marshal_count++; + } return ret; } @@ -1026,8 +1015,6 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 
out_short: pr_warn("RPC/RDMA short backward direction call\n"); - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - xprt_disconnect_done(&r_xprt->rx_xprt); return true; } #else /* CONFIG_SUNRPC_BACKCHANNEL */ @@ -1333,13 +1320,14 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; + --buf->rb_posted_receives; + if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; + /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); - - /* Fixed transport header fields */ p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p)); if (unlikely(!p)) goto out_shortreply; @@ -1378,17 +1366,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); + rpcrdma_post_recvs(r_xprt, false); queue_work(rpcrdma_receive_wq, &rep->rr_work); return; -out_badstatus: - rpcrdma_recv_buffer_put(rep); - if (r_xprt->rx_ep.rep_connected == 1) { - r_xprt->rx_ep.rep_connected = -EIO; - rpcrdma_conn_func(&r_xprt->rx_ep); - } - return; - out_badversion: trace_xprtrdma_reply_vers(rep); goto repost; @@ -1408,7 +1389,7 @@ out_shortreply: * receive buffer before returning. */ repost: - r_xprt->rx_stats.bad_reply_count++; - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - rpcrdma_recv_buffer_put(rep); + rpcrdma_post_recvs(r_xprt, false); +out_badstatus: + rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index dd8a431dc2ae..357ba90c382d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -1,4 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* + * Copyright (c) 2015-2018 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 
* * This software is available to you under a choice of one of two @@ -46,7 +48,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/svc_rdma.h> -#include "xprt_rdma.h" #define RPCDBG_FACILITY RPCDBG_SVCXPRT diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a73632ca9048..a68180090554 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -1,13 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2015-2018 Oracle. All rights reserved. * * Support for backward direction RPCs on RPC/RDMA (server-side). */ #include <linux/module.h> + #include <linux/sunrpc/svc_rdma.h> + #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -112,39 +115,21 @@ out_notfound: * the adapter has a small maximum SQ depth. */ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, - struct rpc_rqst *rqst) + struct rpc_rqst *rqst, + struct svc_rdma_send_ctxt *ctxt) { - struct svc_rdma_op_ctxt *ctxt; int ret; - ctxt = svc_rdma_get_context(rdma); - - /* rpcrdma_bc_send_request builds the transport header and - * the backchannel RPC message in the same buffer. Thus only - * one SGE is needed to send both. - */ - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, - rqst->rq_snd_buf.len); + ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); if (ret < 0) - goto out_err; + return -EIO; /* Bump page refcnt so Send completion doesn't release * the rq_buffer before all retransmits are complete. 
*/ get_page(virt_to_page(rqst->rq_buffer)); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); - if (ret) - goto out_unmap; - -out_err: - dprintk("svcrdma: %s returns %d\n", __func__, ret); - return ret; - -out_unmap: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - ret = -EIO; - goto out_err; + ctxt->sc_send_wr.opcode = IB_WR_SEND; + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /* Server-side transport endpoint wants a whole page for its send @@ -191,13 +176,15 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct svc_rdma_send_ctxt *ctxt; __be32 *p; int rc; - /* Space in the send buffer for an RPC/RDMA header is reserved - * via xprt->tsh_size. - */ - p = rqst->rq_buffer; + ctxt = svc_rdma_send_ctxt_get(rdma); + if (!ctxt) + goto drop_connection; + + p = ctxt->sc_xprt_buf; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); @@ -205,14 +192,17 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); #endif - rc = svc_rdma_bc_sendto(rdma, rqst); - if (rc) + rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); + if (rc) { + svc_rdma_send_ctxt_put(rdma, ctxt); goto drop_connection; + } return rc; drop_connection: @@ -273,6 +263,7 @@ static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, + .free_slot = xprt_free_slot, .release_request = xprt_release_rqst_cong, .buf_alloc = xprt_rdma_bc_allocate, .buf_free = xprt_rdma_bc_free, @@ -320,7 +311,7 @@ xprt_setup_rdma_bc(struct xprt_create *args) xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; xprt->prot = XPRT_TRANSPORT_BC_RDMA; - 
xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->tsh_size = 0; xprt->ops = &xprt_rdma_bc_procs; memcpy(&xprt->addr, args->dstaddr, args->addrlen); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3d45015dca97..841fca143804 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2016, 2017 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -60,7 +61,7 @@ * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's * data payload from the client. svc_rdma_recvfrom sets up the * RDMA Reads using pages in svc_rqst::rq_pages, which are - * transferred to an svc_rdma_op_ctxt for the duration of the + * transferred to an svc_rdma_recv_ctxt for the duration of the * I/O. svc_rdma_recvfrom then returns zero, since the RPC message * is still not yet ready. * @@ -69,18 +70,18 @@ * svc_rdma_recvfrom again. This second call may use a different * svc_rqst than the first one, thus any information that needs * to be preserved across these two calls is kept in an - * svc_rdma_op_ctxt. + * svc_rdma_recv_ctxt. * * The second call to svc_rdma_recvfrom performs final assembly * of the RPC Call message, using the RDMA Read sink pages kept in - * the svc_rdma_op_ctxt. The xdr_buf is copied from the - * svc_rdma_op_ctxt to the second svc_rqst. The second call returns + * the svc_rdma_recv_ctxt. The xdr_buf is copied from the + * svc_rdma_recv_ctxt to the second svc_rqst. The second call returns * the length of the completed RPC Call message. * * Page Management * * Pages under I/O must be transferred from the first svc_rqst to an - * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. 
+ * svc_rdma_recv_ctxt before the first svc_rdma_recvfrom call returns. * * The first svc_rqst supplies pages for RDMA Reads. These are moved * from rqstp::rq_pages into ctxt::pages. The consumed elements of @@ -88,78 +89,286 @@ * svc_rdma_recvfrom call returns. * * During the second svc_rdma_recvfrom call, RDMA Read sink pages - * are transferred from the svc_rdma_op_ctxt to the second svc_rqst + * are transferred from the svc_rdma_recv_ctxt to the second svc_rqst * (see rdma_read_complete() below). */ +#include <linux/spinlock.h> #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> -#include <linux/spinlock.h> - #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/debug.h> #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Replace the pages in the rq_argpages array with the pages from the SGE in - * the RDMA_RECV completion. The SGL should contain full pages up until the - * last one. 
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_recv_ctxt * +svc_rdma_next_recv_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_recv_ctxt, + rc_list); +} + +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + dma_addr_t addr; + void *buffer; + + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; + + ctxt->rc_recv_wr.next = NULL; + ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; + ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; + ctxt->rc_recv_wr.num_sge = 1; + ctxt->rc_cqe.done = svc_rdma_wc_receive; + ctxt->rc_recv_sge.addr = addr; + ctxt->rc_recv_sge.length = rdma->sc_max_req_size; + ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; + ctxt->rc_recv_buf = buffer; + ctxt->rc_temp = false; + return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; +} + +static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + ib_dma_unmap_single(rdma->sc_pd->device, ctxt->rc_recv_sge.addr, + ctxt->rc_recv_sge.length, DMA_FROM_DEVICE); + kfree(ctxt->rc_recv_buf); + kfree(ctxt); +} + +/** + * svc_rdma_recv_ctxts_destroy - Release all recv_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * */ -static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *ctxt) +void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) { - struct page *page; - int sge_no; - u32 len; + struct svc_rdma_recv_ctxt *ctxt; - /* The reply path assumes the Call's transport header resides - * in rqstp->rq_pages[0]. 
- */ - page = ctxt->pages[0]; - put_page(rqstp->rq_pages[0]); - rqstp->rq_pages[0] = page; - - /* Set up the XDR head */ - rqstp->rq_arg.head[0].iov_base = page_address(page); - rqstp->rq_arg.head[0].iov_len = - min_t(size_t, ctxt->byte_len, ctxt->sge[0].length); - rqstp->rq_arg.len = ctxt->byte_len; - rqstp->rq_arg.buflen = ctxt->byte_len; - - /* Compute bytes past head in the SGL */ - len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len; - - /* If data remains, store it in the pagelist */ - rqstp->rq_arg.page_len = len; - rqstp->rq_arg.page_base = 0; - - sge_no = 1; - while (len && sge_no < ctxt->count) { - page = ctxt->pages[sge_no]; - put_page(rqstp->rq_pages[sge_no]); - rqstp->rq_pages[sge_no] = page; - len -= min_t(u32, len, ctxt->sge[sge_no].length); - sge_no++; + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_destroy(rdma, ctxt); } - rqstp->rq_respages = &rqstp->rq_pages[sge_no]; - rqstp->rq_next_page = rqstp->rq_respages + 1; +} + +static struct svc_rdma_recv_ctxt * +svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + spin_lock(&rdma->sc_recv_lock); + ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->rc_list); + spin_unlock(&rdma->sc_recv_lock); + +out: + ctxt->rc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_recv_lock); + + ctxt = svc_rdma_recv_ctxt_alloc(rdma); + if (!ctxt) + return NULL; + goto out; +} + +/** + * svc_rdma_recv_ctxt_put - Return recv_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + */ +void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + unsigned int i; + + for (i = 0; i < ctxt->rc_page_count; i++) + put_page(ctxt->rc_pages[i]); + + if (!ctxt->rc_temp) { + spin_lock(&rdma->sc_recv_lock); + list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts); + 
spin_unlock(&rdma->sc_recv_lock); + } else + svc_rdma_recv_ctxt_destroy(rdma, ctxt); +} + +static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct ib_recv_wr *bad_recv_wr; + int ret; + + svc_xprt_get(&rdma->sc_xprt); + ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, &bad_recv_wr); + trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); + if (ret) + goto err_post; + return 0; + +err_post: + svc_rdma_recv_ctxt_put(rdma, ctxt); + svc_xprt_put(&rdma->sc_xprt); + return ret; +} - /* If not all pages were used from the SGL, free the remaining ones */ - len = sge_no; - while (sge_no < ctxt->count) { - page = ctxt->pages[sge_no++]; - put_page(page); +static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return -ENOMEM; + return __svc_rdma_post_recv(rdma, ctxt); +} + +/** + * svc_rdma_post_recvs - Post initial set of Recv WRs + * @rdma: fresh svcxprt_rdma + * + * Returns true if successful, otherwise false. + */ +bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + unsigned int i; + int ret; + + for (i = 0; i < rdma->sc_max_requests; i++) { + ctxt = svc_rdma_recv_ctxt_get(rdma); + if (!ctxt) + return false; + ctxt->rc_temp = true; + ret = __svc_rdma_post_recv(rdma, ctxt); + if (ret) { + pr_err("svcrdma: failure posting recv buffers: %d\n", + ret); + return false; + } } - ctxt->count = len; + return true; +} - /* Set up tail */ - rqstp->rq_arg.tail[0].iov_base = NULL; - rqstp->rq_arg.tail[0].iov_len = 0; +/** + * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Receive completion handler could be running. 
+ */ +static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_recv_ctxt *ctxt; + + trace_svcrdma_wc_receive(wc); + + /* WARNING: Only wc->wr_cqe and wc->status are reliable */ + ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); + + if (wc->status != IB_WC_SUCCESS) + goto flushed; + + if (svc_rdma_post_recv(rdma)) + goto post_err; + + /* All wc fields are now known to be valid */ + ctxt->rc_byte_len = wc->byte_len; + ib_dma_sync_single_for_cpu(rdma->sc_pd->device, + ctxt->rc_recv_sge.addr, + wc->byte_len, DMA_FROM_DEVICE); + + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&ctxt->rc_list, &rdma->sc_rq_dto_q); + spin_unlock(&rdma->sc_rq_dto_lock); + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) + svc_xprt_enqueue(&rdma->sc_xprt); + goto out; + +flushed: + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Recv: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); +post_err: + svc_rdma_recv_ctxt_put(rdma, ctxt); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); +out: + svc_xprt_put(&rdma->sc_xprt); +} + +/** + * svc_rdma_flush_recv_queues - Drain pending Receive work + * @rdma: svcxprt_rdma being shut down + * + */ +void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_recv_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_read_complete_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } + while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_rq_dto_q))) { + list_del(&ctxt->rc_list); + svc_rdma_recv_ctxt_put(rdma, ctxt); + } +} + +static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_recv_ctxt *ctxt) +{ + struct xdr_buf *arg = &rqstp->rq_arg; + + arg->head[0].iov_base = ctxt->rc_recv_buf; + arg->head[0].iov_len = ctxt->rc_byte_len; + 
arg->tail[0].iov_base = NULL; + arg->tail[0].iov_len = 0; + arg->page_len = 0; + arg->page_base = 0; + arg->buflen = ctxt->rc_byte_len; + arg->len = ctxt->rc_byte_len; + + rqstp->rq_respages = &rqstp->rq_pages[0]; + rqstp->rq_next_page = rqstp->rq_respages + 1; } /* This accommodates the largest possible Write chunk, @@ -294,7 +503,6 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { __be32 *p, *end, *rdma_argp; unsigned int hdr_len; - char *proc; /* Verify that there's enough bytes for header + something */ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) @@ -306,10 +514,8 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) switch (*(rdma_argp + 3)) { case rdma_msg: - proc = "RDMA_MSG"; break; case rdma_nomsg: - proc = "RDMA_NOMSG"; break; case rdma_done: @@ -339,103 +545,94 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; - dprintk("svcrdma: received %s request for XID 0x%08x, hdr_len=%u\n", - proc, be32_to_cpup(rdma_argp), hdr_len); + trace_svcrdma_decode_rqst(rdma_argp, hdr_len); return hdr_len; out_short: - dprintk("svcrdma: header too short = %d\n", rq_arg->len); + trace_svcrdma_decode_short(rq_arg->len); return -EINVAL; out_version: - dprintk("svcrdma: bad xprt version: %u\n", - be32_to_cpup(rdma_argp + 1)); + trace_svcrdma_decode_badvers(rdma_argp); return -EPROTONOSUPPORT; out_drop: - dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + trace_svcrdma_decode_drop(rdma_argp); return 0; out_proc: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpup(rdma_argp + 3)); + trace_svcrdma_decode_badproc(rdma_argp); return -EINVAL; out_inval: - dprintk("svcrdma: failed to parse transport header\n"); + trace_svcrdma_decode_parse(rdma_argp); return -EINVAL; } static void rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head) + struct svc_rdma_recv_ctxt *head) { int page_no; - /* Copy RPC pages */ 
- for (page_no = 0; page_no < head->count; page_no++) { + /* Move Read chunk pages to rqstp so that they will be released + * when svc_process is done with them. + */ + for (page_no = 0; page_no < head->rc_page_count; page_no++) { put_page(rqstp->rq_pages[page_no]); - rqstp->rq_pages[page_no] = head->pages[page_no]; + rqstp->rq_pages[page_no] = head->rc_pages[page_no]; } + head->rc_page_count = 0; /* Point rq_arg.pages past header */ - rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; - rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.pages = &rqstp->rq_pages[head->rc_hdr_count]; + rqstp->rq_arg.page_len = head->rc_arg.page_len; /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_pages[page_no]; rqstp->rq_next_page = rqstp->rq_respages + 1; /* Rebuild rq_arg head and tail. */ - rqstp->rq_arg.head[0] = head->arg.head[0]; - rqstp->rq_arg.tail[0] = head->arg.tail[0]; - rqstp->rq_arg.len = head->arg.len; - rqstp->rq_arg.buflen = head->arg.buflen; + rqstp->rq_arg.head[0] = head->rc_arg.head[0]; + rqstp->rq_arg.tail[0] = head->rc_arg.tail[0]; + rqstp->rq_arg.len = head->rc_arg.len; + rqstp->rq_arg.buflen = head->rc_arg.buflen; } static void svc_rdma_send_error(struct svcxprt_rdma *xprt, __be32 *rdma_argp, int status) { - struct svc_rdma_op_ctxt *ctxt; - __be32 *p, *err_msgp; + struct svc_rdma_send_ctxt *ctxt; unsigned int length; - struct page *page; + __be32 *p; int ret; - page = alloc_page(GFP_KERNEL); - if (!page) + ctxt = svc_rdma_send_ctxt_get(xprt); + if (!ctxt) return; - err_msgp = page_address(page); - p = err_msgp; + p = ctxt->sc_xprt_buf; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = xprt->sc_fc_credits; *p++ = rdma_error; - if (status == -EPROTONOSUPPORT) { + switch (status) { + case -EPROTONOSUPPORT: *p++ = err_vers; *p++ = rpcrdma_version; *p++ = rpcrdma_version; - } else { + trace_svcrdma_err_vers(*rdma_argp); + break; + default: *p++ = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); } - length = 
(unsigned long)p - (unsigned long)err_msgp; - - /* Map transport header; no RPC message payload */ - ctxt = svc_rdma_get_context(xprt); - ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); - if (ret) { - dprintk("svcrdma: Error %d mapping send for protocol error\n", - ret); - return; - } + length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; + svc_rdma_sync_reply_hdr(xprt, ctxt, length); - ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); + if (ret) + svc_rdma_send_ctxt_put(xprt, ctxt); } /* By convention, backchannel calls arrive via rdma_msg type @@ -507,32 +704,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_recv_ctxt *ctxt; __be32 *p; int ret; spin_lock(&rdma_xprt->sc_rq_dto_lock); - if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_read_complete_q); + if (ctxt) { + list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; - } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - } else { + } + ctxt = svc_rdma_next_recv_ctxt(&rdma_xprt->sc_rq_dto_q); + if (!ctxt) { /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); spin_unlock(&rdma_xprt->sc_rq_dto_lock); return 0; } + list_del(&ctxt->rc_list); spin_unlock(&rdma_xprt->sc_rq_dto_lock); - dprintk("svcrdma: recvfrom: ctxt=%p on 
xprt=%p, rqstp=%p\n", - ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); svc_rdma_build_arg_xdr(rqstp, ctxt); @@ -548,7 +741,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) if (svc_rdma_is_backchannel_reply(xprt, p)) { ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, &rqstp->rq_arg); - svc_rdma_put_context(ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; } @@ -557,9 +750,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_readchunk; complete: - svc_rdma_put_context(ctxt, 0); - dprintk("svcrdma: recvfrom: xprt=%p, rqstp=%p, rq_arg.len=%u\n", - rdma_xprt, rqstp, rqstp->rq_arg.len); + rqstp->rq_xprt_ctxt = ctxt; rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); return rqstp->rq_arg.len; @@ -572,16 +763,16 @@ out_readchunk: out_err: svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_put_context(ctxt, 0); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; out_postfail: if (ret == -EINVAL) svc_rdma_send_error(rdma_xprt, p, ret); - svc_rdma_put_context(ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; out_drop: - svc_rdma_put_context(ctxt, 1); + svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 12b9a7e0b6d2..ce3ea8419704 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -1,15 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2016 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. 
*/ +#include <rdma/rw.h> + #include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #include <linux/sunrpc/debug.h> -#include <rdma/rw.h> +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -205,6 +208,8 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); + trace_svcrdma_wc_write(wc); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -222,7 +227,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) /* State for pulling a Read chunk. */ struct svc_rdma_read_info { - struct svc_rdma_op_ctxt *ri_readctxt; + struct svc_rdma_recv_ctxt *ri_readctxt; unsigned int ri_position; unsigned int ri_pageno; unsigned int ri_pageoff; @@ -266,6 +271,8 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_read_info *info = container_of(cc, struct svc_rdma_read_info, ri_cc); + trace_svcrdma_wc_read(wc); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -275,10 +282,10 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - svc_rdma_put_context(info->ri_readctxt, 1); + svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt); } else { spin_lock(&rdma->sc_rq_dto_lock); - list_add_tail(&info->ri_readctxt->list, + list_add_tail(&info->ri_readctxt->rc_list, &rdma->sc_read_complete_q); spin_unlock(&rdma->sc_rq_dto_lock); @@ -323,18 +330,20 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + trace_svcrdma_post_rw(&cc->cc_cqe, + cc->cc_sqecount, ret); if (ret) break; return 0; } - atomic_inc(&rdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma); 
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wait_event(rdma->sc_send_wait, atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + trace_svcrdma_sq_retry(rdma); } while (1); - pr_err("svcrdma: ib_post_send failed (%d)\n", ret); set_bit(XPT_CLOSE, &xprt->xpt_flags); /* If even one was posted, there will be a completion. */ @@ -437,6 +446,7 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, if (ret < 0) goto out_initerr; + trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; if (write_len == seg_length - info->wi_seg_off) { @@ -462,7 +472,7 @@ out_noctx: out_initerr: svc_rdma_put_rw_ctxt(rdma, ctxt); - pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + trace_svcrdma_dma_map_rwctx(rdma, ret); return -EIO; } @@ -526,6 +536,8 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); if (ret < 0) goto out_err; + + trace_svcrdma_encode_write(xdr->page_len); return xdr->page_len; out_err: @@ -582,6 +594,8 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); if (ret < 0) goto out_err; + + trace_svcrdma_encode_reply(consumed); return consumed; out_err: @@ -593,7 +607,7 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, struct svc_rqst *rqstp, u32 rkey, u32 len, u64 offset) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; struct svc_rdma_rw_ctxt *ctxt; unsigned int sge_no, seg_len; @@ -606,18 +620,15 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, goto out_noctx; ctxt->rw_nents = sge_no; - dprintk("svcrdma: reading segment %u@0x%016llx:0x%08x (%u sges)\n", - len, offset, rkey, sge_no); - sg = ctxt->rw_sg_table.sgl; for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { seg_len = min_t(unsigned int, 
len, PAGE_SIZE - info->ri_pageoff); - head->arg.pages[info->ri_pageno] = + head->rc_arg.pages[info->ri_pageno] = rqstp->rq_pages[info->ri_pageno]; if (!info->ri_pageoff) - head->count++; + head->rc_page_count++; sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], seg_len, info->ri_pageoff); @@ -656,8 +667,8 @@ out_overrun: return -EINVAL; out_initerr: + trace_svcrdma_dma_map_rwctx(cc->cc_rdma, ret); svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); - pr_err("svcrdma: failed to map pagelist (%d)\n", ret); return -EIO; } @@ -686,6 +697,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, if (ret < 0) break; + trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); info->ri_chunklen += rs_length; } @@ -693,9 +705,9 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, } /* Construct RDMA Reads to pull over a normal Read chunk. The chunk - * data lands in the page list of head->arg.pages. + * data lands in the page list of head->rc_arg.pages. * - * Currently NFSD does not look at the head->arg.tail[0] iovec. + * Currently NFSD does not look at the head->rc_arg.tail[0] iovec. * Therefore, XDR round-up of the Read chunk and trailing * inline content must both be added at the end of the pagelist. */ @@ -703,29 +715,27 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - dprintk("svcrdma: Reading Read chunk at position %u\n", - info->ri_position); - - info->ri_pageno = head->hdr_count; - info->ri_pageoff = 0; - ret = svc_rdma_build_read_chunk(rqstp, info, p); if (ret < 0) goto out; + trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); + + head->rc_hdr_count = 0; + /* Split the Receive buffer between the head and tail * buffers at Read chunk's position. XDR roundup of the * chunk is not included in either the pagelist or in * the tail. 
*/ - head->arg.tail[0].iov_base = - head->arg.head[0].iov_base + info->ri_position; - head->arg.tail[0].iov_len = - head->arg.head[0].iov_len - info->ri_position; - head->arg.head[0].iov_len = info->ri_position; + head->rc_arg.tail[0].iov_base = + head->rc_arg.head[0].iov_base + info->ri_position; + head->rc_arg.tail[0].iov_len = + head->rc_arg.head[0].iov_len - info->ri_position; + head->rc_arg.head[0].iov_len = info->ri_position; /* Read chunk may need XDR roundup (see RFC 8166, s. 3.4.5.2). * @@ -738,9 +748,9 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, */ info->ri_chunklen = XDR_QUADLEN(info->ri_chunklen) << 2; - head->arg.page_len = info->ri_chunklen; - head->arg.len += info->ri_chunklen; - head->arg.buflen += info->ri_chunklen; + head->rc_arg.page_len = info->ri_chunklen; + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; out: return ret; @@ -749,7 +759,7 @@ out: /* Construct RDMA Reads to pull over a Position Zero Read chunk. * The start of the data lands in the first page just after * the Transport header, and the rest lands in the page list of - * head->arg.pages. + * head->rc_arg.pages. * * Assumptions: * - A PZRC has an XDR-aligned length (no implicit round-up). @@ -761,35 +771,25 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_recv_ctxt *head = info->ri_readctxt; int ret; - dprintk("svcrdma: Reading Position Zero Read chunk\n"); - - info->ri_pageno = head->hdr_count - 1; - info->ri_pageoff = offset_in_page(head->byte_len); - ret = svc_rdma_build_read_chunk(rqstp, info, p); if (ret < 0) goto out; - head->arg.len += info->ri_chunklen; - head->arg.buflen += info->ri_chunklen; + trace_svcrdma_encode_pzr(info->ri_chunklen); - if (head->arg.buflen <= head->sge[0].length) { - /* Transport header and RPC message fit entirely - * in page where head iovec resides. 
- */ - head->arg.head[0].iov_len = info->ri_chunklen; - } else { - /* Transport header and part of RPC message reside - * in the head iovec's page. - */ - head->arg.head[0].iov_len = - head->sge[0].length - head->byte_len; - head->arg.page_len = - info->ri_chunklen - head->arg.head[0].iov_len; - } + head->rc_arg.len += info->ri_chunklen; + head->rc_arg.buflen += info->ri_chunklen; + + head->rc_hdr_count = 1; + head->rc_arg.head[0].iov_base = page_address(head->rc_pages[0]); + head->rc_arg.head[0].iov_len = min_t(size_t, PAGE_SIZE, + info->ri_chunklen); + + head->rc_arg.page_len = info->ri_chunklen - + head->rc_arg.head[0].iov_len; out: return ret; @@ -813,29 +813,30 @@ out: * - All Read segments in @p have the same Position value. */ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, __be32 *p) + struct svc_rdma_recv_ctxt *head, __be32 *p) { struct svc_rdma_read_info *info; struct page **page; int ret; /* The request (with page list) is constructed in - * head->arg. Pages involved with RDMA Read I/O are + * head->rc_arg. Pages involved with RDMA Read I/O are * transferred there. 
*/ - head->hdr_count = head->count; - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = head->pages; - head->arg.page_base = 0; - head->arg.page_len = 0; - head->arg.len = rqstp->rq_arg.len; - head->arg.buflen = rqstp->rq_arg.buflen; + head->rc_arg.head[0] = rqstp->rq_arg.head[0]; + head->rc_arg.tail[0] = rqstp->rq_arg.tail[0]; + head->rc_arg.pages = head->rc_pages; + head->rc_arg.page_base = 0; + head->rc_arg.page_len = 0; + head->rc_arg.len = rqstp->rq_arg.len; + head->rc_arg.buflen = rqstp->rq_arg.buflen; info = svc_rdma_read_info_alloc(rdma); if (!info) return -ENOMEM; info->ri_readctxt = head; + info->ri_pageno = 0; + info->ri_pageoff = 0; info->ri_position = be32_to_cpup(p + 1); if (info->ri_position) @@ -856,7 +857,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, out: /* Read sink pages have been moved from rqstp->rq_pages to - * head->arg.pages. Force svc_recv to refill those slots + * head->rc_arg.pages. Force svc_recv to refill those slots * in rq_pages. */ for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 649441d5087d..4a3efaea277c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (c) 2016 Oracle. All rights reserved. + * Copyright (c) 2016-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -74,11 +75,11 @@ * DMA-unmap the pages under I/O for that Write segment. The Write * completion handler does not release any pages. * - * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * When the Send WR is constructed, it also gets its own svc_rdma_send_ctxt. 
* The ownership of all of the Reply's pages are transferred into that * ctxt, the Send WR is posted, and sendto returns. * - * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * The svc_rdma_send_ctxt is presented when the Send WR completes. The * Send completion handler finally releases the Reply's pages. * * This mechanism also assumes that completions on the transport's Send @@ -98,16 +99,230 @@ * where two different Write segments send portions of the same page. */ -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/spinlock.h> #include <asm/unaligned.h> + #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> + +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> + #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); + +static inline struct svc_rdma_send_ctxt * +svc_rdma_next_send_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, + sc_list); +} + +static struct svc_rdma_send_ctxt * +svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + dma_addr_t addr; + void *buffer; + size_t size; + int i; + + size = sizeof(*ctxt); + size += rdma->sc_max_send_sges * sizeof(struct ib_sge); + ctxt = kmalloc(size, GFP_KERNEL); + if (!ctxt) + goto fail0; + buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + if (!buffer) + goto fail1; + addr = ib_dma_map_single(rdma->sc_pd->device, buffer, + rdma->sc_max_req_size, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) + goto fail2; + + ctxt->sc_send_wr.next = NULL; + ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; + ctxt->sc_send_wr.sg_list = ctxt->sc_sges; + ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; + ctxt->sc_cqe.done = svc_rdma_wc_send; + ctxt->sc_xprt_buf = buffer; + ctxt->sc_sges[0].addr = addr; + + for (i = 0; i 
< rdma->sc_max_send_sges; i++) + ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; + return ctxt; + +fail2: + kfree(buffer); +fail1: + kfree(ctxt); +fail0: + return NULL; +} + +/** + * svc_rdma_send_ctxts_destroy - Release all send_ctxt's for an xprt + * @rdma: svcxprt_rdma being torn down + * + */ +void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { + list_del(&ctxt->sc_list); + ib_dma_unmap_single(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, + rdma->sc_max_req_size, + DMA_TO_DEVICE); + kfree(ctxt->sc_xprt_buf); + kfree(ctxt); + } +} + +/** + * svc_rdma_send_ctxt_get - Get a free send_ctxt + * @rdma: controlling svcxprt_rdma + * + * Returns a ready-to-use send_ctxt, or NULL if none are + * available and a fresh one cannot be allocated. + */ +struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_send_ctxt *ctxt; + + spin_lock(&rdma->sc_send_lock); + ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); + if (!ctxt) + goto out_empty; + list_del(&ctxt->sc_list); + spin_unlock(&rdma->sc_send_lock); + +out: + ctxt->sc_send_wr.num_sge = 0; + ctxt->sc_cur_sge_no = 0; + ctxt->sc_page_count = 0; + return ctxt; + +out_empty: + spin_unlock(&rdma->sc_send_lock); + ctxt = svc_rdma_send_ctxt_alloc(rdma); + if (!ctxt) + return NULL; + goto out; +} + +/** + * svc_rdma_send_ctxt_put - Return send_ctxt to free list + * @rdma: controlling svcxprt_rdma + * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. + */ +void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct ib_device *device = rdma->sc_cm_id->device; + unsigned int i; + + /* The first SGE contains the transport header, which + * remains mapped until @ctxt is destroyed. 
+ */ + for (i = 1; i < ctxt->sc_send_wr.num_sge; i++) + ib_dma_unmap_page(device, + ctxt->sc_sges[i].addr, + ctxt->sc_sges[i].length, + DMA_TO_DEVICE); + + for (i = 0; i < ctxt->sc_page_count; ++i) + put_page(ctxt->sc_pages[i]); + + spin_lock(&rdma->sc_send_lock); + list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); + spin_unlock(&rdma->sc_send_lock); +} + +/** + * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC + * @cq: Completion Queue context + * @wc: Work Completion object + * + * NB: The svc_xprt/svcxprt_rdma is pinned whenever it's possible that + * the Send completion handler could be running. + */ +static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +{ + struct svcxprt_rdma *rdma = cq->cq_context; + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_send_ctxt *ctxt; + + trace_svcrdma_wc_send(wc); + + atomic_inc(&rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); + svc_rdma_send_ctxt_put(rdma, ctxt); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_xprt_put(&rdma->sc_xprt); +} + +/** + * svc_rdma_send - Post a single Send WR + * @rdma: transport on which to post the WR + * @wr: prepared Send WR to post + * + * Returns zero if the Send WR was posted successfully. Otherwise, a + * negative errno is returned. 
+ */ +int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +{ + struct ib_send_wr *bad_wr; + int ret; + + might_sleep(); + + /* If the SQ is full, wait until an SQ entry is available */ + while (1) { + if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + atomic_inc(&rdma_stat_sq_starve); + trace_svcrdma_sq_full(rdma); + atomic_inc(&rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > 1); + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return -ENOTCONN; + trace_svcrdma_sq_retry(rdma); + continue; + } + + svc_xprt_get(&rdma->sc_xprt); + ret = ib_post_send(rdma->sc_qp, wr, &bad_wr); + trace_svcrdma_post_send(wr, ret); + if (ret) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_xprt_put(&rdma->sc_xprt); + wake_up(&rdma->sc_send_wait); + } + break; + } + return ret; +} + static u32 xdr_padsize(u32 len) { return (len & 3) ? (4 - (len & 3)) : 0; @@ -296,41 +511,10 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, return be32_to_cpup(p); } -/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() - * is used during completion to DMA-unmap this memory, and - * it uses ib_dma_unmap_page() exclusively. 
- */ -static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, - unsigned int sge_no, - unsigned char *base, - unsigned int len) -{ - unsigned long offset = (unsigned long)base & ~PAGE_MASK; - struct ib_device *dev = rdma->sc_cm_id->device; - dma_addr_t dma_addr; - - dma_addr = ib_dma_map_page(dev, virt_to_page(base), - offset, len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(dev, dma_addr)) - goto out_maperr; - - ctxt->sge[sge_no].addr = dma_addr; - ctxt->sge[sge_no].length = len; - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - svc_rdma_count_mappings(rdma, ctxt); - return 0; - -out_maperr: - pr_err("svcrdma: failed to map buffer\n"); - return -EIO; -} - static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, - unsigned int sge_no, + struct svc_rdma_send_ctxt *ctxt, struct page *page, - unsigned int offset, + unsigned long offset, unsigned int len) { struct ib_device *dev = rdma->sc_cm_id->device; @@ -340,58 +524,71 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, if (ib_dma_mapping_error(dev, dma_addr)) goto out_maperr; - ctxt->sge[sge_no].addr = dma_addr; - ctxt->sge[sge_no].length = len; - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - svc_rdma_count_mappings(rdma, ctxt); + ctxt->sc_sges[ctxt->sc_cur_sge_no].addr = dma_addr; + ctxt->sc_sges[ctxt->sc_cur_sge_no].length = len; + ctxt->sc_send_wr.num_sge++; return 0; out_maperr: - pr_err("svcrdma: failed to map page\n"); + trace_svcrdma_dma_map_page(rdma, page); return -EIO; } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * handles DMA-unmap and it uses ib_dma_unmap_page() exclusively. 
+ */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned char *base, + unsigned int len) +{ + return svc_rdma_dma_map_page(rdma, ctxt, virt_to_page(base), + offset_in_page(base), len); +} + /** - * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer * @rdma: controlling transport - * @ctxt: op_ctxt for the Send WR - * @rdma_resp: buffer containing transport header + * @ctxt: send_ctxt for the Send WR * @len: length of transport header * - * Returns: - * %0 if the header is DMA mapped, - * %-EIO if DMA mapping failed. */ -int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, - __be32 *rdma_resp, - unsigned int len) +void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + unsigned int len) { - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = virt_to_page(rdma_resp); - ctxt->count = 1; - return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); + ctxt->sc_sges[0].length = len; + ctxt->sc_send_wr.num_sge++; + ib_dma_sync_single_for_device(rdma->sc_pd->device, + ctxt->sc_sges[0].addr, len, + DMA_TO_DEVICE); } -/* Load the xdr_buf into the ctxt's sge array, and DMA map each +/* svc_rdma_map_reply_msg - Map the buffer holding RPC message + * @rdma: controlling transport + * @ctxt: send_ctxt for the Send WR + * @xdr: prepared xdr_buf containing RPC message + * @wr_lst: pointer to Call header's Write list, or NULL + * + * Load the xdr_buf into the ctxt's sge array, and DMA map each * element as it is added. * - * Returns the number of sge elements loaded on success, or - * a negative errno on failure. + * Returns zero on success, or a negative errno on failure. 
*/ -static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) +int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { - unsigned int len, sge_no, remaining, page_off; + unsigned int len, remaining; + unsigned long page_off; struct page **ppages; unsigned char *base; u32 xdr_pad; int ret; - sge_no = 1; - - ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_buf(rdma, ctxt, xdr->head[0].iov_base, xdr->head[0].iov_len); if (ret < 0) @@ -421,8 +618,10 @@ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, while (remaining) { len = min_t(u32, PAGE_SIZE - page_off, remaining); - ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, - *ppages++, page_off, len); + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, + page_off, len); if (ret < 0) return ret; @@ -434,12 +633,14 @@ static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, len = xdr->tail[0].iov_len; tail: if (len) { - ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges) + return -EIO; + ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); if (ret < 0) return ret; } - return sge_no - 1; + return 0; } /* The svc_rqst and all resources it owns are released as soon as @@ -447,62 +648,25 @@ tail: * so they are released by the Send completion handler. 
*/ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *ctxt) + struct svc_rdma_send_ctxt *ctxt) { int i, pages = rqstp->rq_next_page - rqstp->rq_respages; - ctxt->count += pages; + ctxt->sc_page_count += pages; for (i = 0; i < pages; i++) { - ctxt->pages[i + 1] = rqstp->rq_respages[i]; + ctxt->sc_pages[i] = rqstp->rq_respages[i]; rqstp->rq_respages[i] = NULL; } rqstp->rq_next_page = rqstp->rq_respages + 1; } -/** - * svc_rdma_post_send_wr - Set up and post one Send Work Request - * @rdma: controlling transport - * @ctxt: op_ctxt for transmitting the Send WR - * @num_sge: number of SGEs to send - * @inv_rkey: R_key argument to Send With Invalidate, or zero - * - * Returns: - * %0 if the Send* was posted successfully, - * %-ENOTCONN if the connection was lost or dropped, - * %-EINVAL if there was a problem with the Send we built, - * %-ENOMEM if ib_post_send failed. - */ -int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, - struct svc_rdma_op_ctxt *ctxt, int num_sge, - u32 inv_rkey) -{ - struct ib_send_wr *send_wr = &ctxt->send_wr; - - dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); - - send_wr->next = NULL; - ctxt->cqe.done = svc_rdma_wc_send; - send_wr->wr_cqe = &ctxt->cqe; - send_wr->sg_list = ctxt->sge; - send_wr->num_sge = num_sge; - send_wr->send_flags = IB_SEND_SIGNALED; - if (inv_rkey) { - send_wr->opcode = IB_WR_SEND_WITH_INV; - send_wr->ex.invalidate_rkey = inv_rkey; - } else { - send_wr->opcode = IB_WR_SEND; - } - - return svc_rdma_send(rdma, send_wr); -} - /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared - * in sge[0], and the RPC xdr_buf is prepared in following sges. + * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, * the server may send all, a portion of, or none of the xdr_buf. 
- * In the latter case, only the transport header (sge[0]) is + * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * * RDMA Send is the last step of transmitting an RPC reply. Pages @@ -515,49 +679,32 @@ int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, * - The Reply's transport header will never be larger than a page. */ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, - __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rdma_send_ctxt *ctxt, + __be32 *rdma_argp, struct svc_rqst *rqstp, __be32 *wr_lst, __be32 *rp_ch) { - struct svc_rdma_op_ctxt *ctxt; - u32 inv_rkey; int ret; - dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", - (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"), - rqstp->rq_res.head[0].iov_len, - rqstp->rq_res.page_len, - rqstp->rq_res.tail[0].iov_len); - - ctxt = svc_rdma_get_context(rdma); - - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, - svc_rdma_reply_hdr_len(rdma_resp)); - if (ret < 0) - goto err; - if (!rp_ch) { ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqstp->rq_res, wr_lst); if (ret < 0) - goto err; + return ret; } svc_rdma_save_io_pages(rqstp, ctxt); - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); - if (ret) - goto err; - - return 0; - -err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - return ret; + ctxt->sc_send_wr.opcode = IB_WR_SEND; + if (rdma->sc_snd_w_inv) { + ctxt->sc_send_wr.ex.invalidate_rkey = + svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + if (ctxt->sc_send_wr.ex.invalidate_rkey) + ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + } + dprintk("svcrdma: posting Send WR with %u sge(s)\n", + ctxt->sc_send_wr.num_sge); + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /* Given the client-provided Write and Reply chunks, the server was not @@ -568,38 +715,29 @@ err: * Remote Invalidation is skipped for simplicity. 
*/ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - __be32 *rdma_resp, struct svc_rqst *rqstp) + struct svc_rdma_send_ctxt *ctxt, + struct svc_rqst *rqstp) { - struct svc_rdma_op_ctxt *ctxt; __be32 *p; int ret; - ctxt = svc_rdma_get_context(rdma); - - /* Replace the original transport header with an - * RDMA_ERROR response. XID etc are preserved. - */ - p = rdma_resp + 3; + p = ctxt->sc_xprt_buf; + trace_svcrdma_err_chunk(*p); + p += 3; *p++ = rdma_error; *p = err_chunk; - - ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20); - if (ret < 0) - goto err; + svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); svc_rdma_save_io_pages(rqstp, ctxt); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0); - if (ret) - goto err; + ctxt->sc_send_wr.opcode = IB_WR_SEND; + ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); + if (ret) { + svc_rdma_send_ctxt_put(rdma, ctxt); + return ret; + } return 0; - -err: - pr_err("svcrdma: failed to post Send WR (%d)\n", ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - return ret; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -623,20 +761,15 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; struct xdr_buf *xdr = &rqstp->rq_res; - struct page *res_page; + struct svc_rdma_send_ctxt *sctxt; int ret; - /* Find the call's chunk lists to decide how to send the reply. - * Receive places the Call's xprt header at the start of page 0. - */ - rdma_argp = page_address(rqstp->rq_pages[0]); + rdma_argp = rctxt->rc_recv_buf; svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - dprintk("svcrdma: preparing response for XID 0x%08x\n", - be32_to_cpup(rdma_argp)); - /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. 
The * code path below that inserts the credit grant value @@ -644,10 +777,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * critical section. */ ret = -ENOMEM; - res_page = alloc_page(GFP_KERNEL); - if (!res_page) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) goto err0; - rdma_resp = page_address(res_page); + rdma_resp = sctxt->sc_xprt_buf; p = rdma_resp; *p++ = *rdma_argp; @@ -674,26 +807,33 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp, wr_lst, rp_ch); if (ret < 0) - goto err0; - return 0; + goto err1; + ret = 0; + +out: + rqstp->rq_xprt_ctxt = NULL; + svc_rdma_recv_ctxt_put(rdma, rctxt); + return ret; err2: if (ret != -E2BIG && ret != -EINVAL) goto err1; - ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp); + ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); if (ret < 0) - goto err0; - return 0; + goto err1; + ret = 0; + goto out; err1: - put_page(res_page); + svc_rdma_send_ctxt_put(rdma, sctxt); err0: - pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", - ret); + trace_svcrdma_send_failed(rqstp, ret); set_bit(XPT_CLOSE, &xprt->xpt_flags); - return -ENOTCONN; + ret = -ENOTCONN; + goto out; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 96cc8f6597d3..e9535a66bab0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1,4 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* + * Copyright (c) 2015-2018 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. 
* @@ -40,26 +42,30 @@ * Author: Tom Tucker <tom@opengridcomputing.com> */ -#include <linux/sunrpc/svc_xprt.h> -#include <linux/sunrpc/addr.h> -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> +#include <linux/export.h> + #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> + +#include <linux/sunrpc/addr.h> +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svc_rdma.h> -#include <linux/export.h> + #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int svc_rdma_post_recv(struct svcxprt_rdma *xprt); -static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int); +static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, + struct net *net); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -123,7 +129,7 @@ static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, struct svcxprt_rdma *cma_xprt; struct svc_xprt *xprt; - cma_xprt = rdma_create_xprt(serv, 0); + cma_xprt = svc_rdma_create_xprt(serv, net); if (!cma_xprt) return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; @@ -152,133 +158,20 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, - gfp_t flags) -{ - struct svc_rdma_op_ctxt *ctxt; - - ctxt = kmalloc(sizeof(*ctxt), flags); - if (ctxt) { - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->list); - } - return ctxt; -} - -static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - /* Each RPC/RDMA credit can consume one Receive and - * one Send WQE at the same time. 
- */ - i = xprt->sc_sq_depth + xprt->sc_rq_depth; - - while (i--) { - struct svc_rdma_op_ctxt *ctxt; - - ctxt = alloc_ctxt(xprt, GFP_KERNEL); - if (!ctxt) { - dprintk("svcrdma: No memory for RDMA ctxt\n"); - return false; - } - list_add(&ctxt->list, &xprt->sc_ctxts); - } - return true; -} - -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_op_ctxt *ctxt = NULL; - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used++; - if (list_empty(&xprt->sc_ctxts)) - goto out_empty; - - ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - spin_unlock(&xprt->sc_ctxt_lock); - -out: - ctxt->count = 0; - ctxt->mapped_sges = 0; - return ctxt; - -out_empty: - /* Either pre-allocation missed the mark, or send - * queue accounting is broken. - */ - spin_unlock(&xprt->sc_ctxt_lock); - - ctxt = alloc_ctxt(xprt, GFP_NOIO); - if (ctxt) - goto out; - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used--; - spin_unlock(&xprt->sc_ctxt_lock); - WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); - return NULL; -} - -void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) -{ - struct svcxprt_rdma *xprt = ctxt->xprt; - struct ib_device *device = xprt->sc_cm_id->device; - unsigned int i; - - for (i = 0; i < ctxt->mapped_sges; i++) - ib_dma_unmap_page(device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - ctxt->mapped_sges = 0; -} - -void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) -{ - struct svcxprt_rdma *xprt = ctxt->xprt; - int i; - - if (free_pages) - for (i = 0; i < ctxt->count; i++) - put_page(ctxt->pages[i]); - - spin_lock(&xprt->sc_ctxt_lock); - xprt->sc_ctxt_used--; - list_add(&ctxt->list, &xprt->sc_ctxts); - spin_unlock(&xprt->sc_ctxt_lock); -} - -static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_ctxts)) { - struct svc_rdma_op_ctxt *ctxt; - - ctxt = list_first_entry(&xprt->sc_ctxts, - struct 
svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - kfree(ctxt); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { struct svc_xprt *xprt = context; + trace_svcrdma_qp_error(event, (struct sockaddr *)&xprt->xpt_remote); switch (event->event) { /* These are considered benign events */ case IB_EVENT_PATH_MIG: case IB_EVENT_COMM_EST: case IB_EVENT_SQ_DRAINED: case IB_EVENT_QP_LAST_WQE_REACHED: - dprintk("svcrdma: QP event %s (%d) received for QP=%p\n", - ib_event_msg(event->event), event->event, - event->element.qp); break; + /* These are considered fatal events */ case IB_EVENT_PATH_MIG_ERR: case IB_EVENT_QP_FATAL: @@ -286,111 +179,34 @@ static void qp_event_handler(struct ib_event *event, void *context) case IB_EVENT_QP_ACCESS_ERR: case IB_EVENT_DEVICE_FATAL: default: - dprintk("svcrdma: QP ERROR event %s (%d) received for QP=%p, " - "closing transport\n", - ib_event_msg(event->event), event->event, - event->element.qp); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); break; } } -/** - * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue - * @wc: completed WR - * - */ -static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - /* WARNING: Only wc->wr_cqe and wc->status are reliable */ - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - - if (wc->status != IB_WC_SUCCESS) - goto flushed; - - /* All wc fields are now known to be valid */ - ctxt->byte_len = wc->byte_len; - spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); - spin_unlock(&xprt->sc_rq_dto_lock); - - svc_rdma_post_recv(xprt); - - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - goto out; - goto out_enqueue; - -flushed: - if (wc->status != 
IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: Recv: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_rdma_put_context(ctxt, 1); - -out_enqueue: - svc_xprt_enqueue(&xprt->sc_xprt); -out: - svc_xprt_put(&xprt->sc_xprt); -} - -/** - * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - atomic_inc(&xprt->sc_sq_avail); - wake_up(&xprt->sc_send_wait); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - - if (unlikely(wc->status != IB_WC_SUCCESS)) { - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_xprt_enqueue(&xprt->sc_xprt); - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: Send: %s (%u/0x%x)\n", - ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - } - - svc_xprt_put(&xprt->sc_xprt); -} - -static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, - int listener) +static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, + struct net *net) { struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); - if (!cma_xprt) + if (!cma_xprt) { + dprintk("svcrdma: failed to create new transport\n"); return NULL; - svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); + } + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); 
spin_lock_init(&cma_xprt->sc_rq_dto_lock); - spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_send_lock); + spin_lock_init(&cma_xprt->sc_recv_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* @@ -401,70 +217,9 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, */ set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags); - if (listener) { - strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); - } - return cma_xprt; } -static int -svc_rdma_post_recv(struct svcxprt_rdma *xprt) -{ - struct ib_recv_wr recv_wr, *bad_recv_wr; - struct svc_rdma_op_ctxt *ctxt; - struct page *page; - dma_addr_t pa; - int sge_no; - int buflen; - int ret; - - ctxt = svc_rdma_get_context(xprt); - buflen = 0; - ctxt->direction = DMA_FROM_DEVICE; - ctxt->cqe.done = svc_rdma_wc_receive; - for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { - if (sge_no >= xprt->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); - goto err_put_ctxt; - } - page = alloc_page(GFP_KERNEL); - if (!page) - goto err_put_ctxt; - ctxt->pages[sge_no] = page; - pa = ib_dma_map_page(xprt->sc_cm_id->device, - page, 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) - goto err_put_ctxt; - svc_rdma_count_mappings(xprt, ctxt); - ctxt->sge[sge_no].addr = pa; - ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count = sge_no + 1; - buflen += PAGE_SIZE; - } - recv_wr.next = NULL; - recv_wr.sg_list = &ctxt->sge[0]; - recv_wr.num_sge = ctxt->count; - recv_wr.wr_cqe = &ctxt->cqe; - - svc_xprt_get(&xprt->sc_xprt); - ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); - if (ret) { - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - svc_xprt_put(&xprt->sc_xprt); - } - return ret; - - err_put_ctxt: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - return -ENOMEM; -} - static void 
svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt, struct rdma_conn_param *param) @@ -504,15 +259,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, struct sockaddr *sa; /* Create a new transport */ - newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); - if (!newxprt) { - dprintk("svcrdma: failed to create new transport\n"); + newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, + listen_xprt->sc_xprt.xpt_net); + if (!newxprt) return; - } newxprt->sc_cm_id = new_cma_id; new_cma_id->context = newxprt; - dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", - newxprt, newxprt->sc_cm_id, listen_xprt); svc_rdma_parse_connect_private(newxprt, param); /* Save client advertised inbound read limit for use later in accept. */ @@ -543,9 +295,11 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, static int rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct svcxprt_rdma *xprt = cma_id->context; + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; int ret = 0; + trace_svcrdma_cm_event(event, sap); + switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " @@ -553,23 +307,8 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, &event->param.conn); break; - - case RDMA_CM_EVENT_ESTABLISHED: - /* Accept complete */ - dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " - "cm_id=%p\n", xprt, cma_id); - break; - - case RDMA_CM_EVENT_DEVICE_REMOVAL: - dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", - xprt, cma_id); - if (xprt) { - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - svc_xprt_enqueue(&xprt->sc_xprt); - } - break; - default: + /* NB: No device removal upcall for INADDR_ANY listeners */ dprintk("svcrdma: Unexpected event on listening endpoint %p, " "event = %s (%d)\n", cma_id, 
rdma_event_msg(event->event), event->event); @@ -582,9 +321,12 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, static int rdma_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct svc_xprt *xprt = cma_id->context; - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; + struct svcxprt_rdma *rdma = cma_id->context; + struct svc_xprt *xprt = &rdma->sc_xprt; + + trace_svcrdma_cm_event(event, sap); + switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Accept complete */ @@ -597,21 +339,17 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, case RDMA_CM_EVENT_DISCONNECTED: dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", xprt, cma_id); - if (xprt) { - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - } + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " "event = %s (%d)\n", cma_id, xprt, rdma_event_msg(event->event), event->event); - if (xprt) { - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - } + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); break; default: dprintk("svcrdma: Unexpected event on DTO endpoint %p, " @@ -634,16 +372,18 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct svcxprt_rdma *cma_xprt; int ret; - dprintk("svcrdma: Creating RDMA socket\n"); + dprintk("svcrdma: Creating RDMA listener\n"); if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); return ERR_PTR(-EAFNOSUPPORT); } - cma_xprt = rdma_create_xprt(serv, 1); + cma_xprt = svc_rdma_create_xprt(serv, net); if (!cma_xprt) return ERR_PTR(-ENOMEM); + set_bit(XPT_LISTENER, 
&cma_xprt->sc_xprt.xpt_flags); + strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt, + listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); @@ -708,9 +448,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; + unsigned int ctxts, rq_depth; struct ib_device *dev; struct sockaddr *sap; - unsigned int i, ctxts; int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); @@ -736,24 +476,28 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, - (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_send_sges = dev->attrs.max_sge; + /* transport hdr, head iovec, one page list entry, tail iovec */ + if (newxprt->sc_max_send_sges < 4) { + pr_err("svcrdma: too few Send SGEs available (%d)\n", + newxprt->sc_max_send_sges); + goto errout; + } newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = svcrdma_max_requests; newxprt->sc_max_bc_requests = svcrdma_max_bc_requests; - newxprt->sc_rq_depth = newxprt->sc_max_requests + - newxprt->sc_max_bc_requests; - if (newxprt->sc_rq_depth > dev->attrs.max_qp_wr) { + rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; + if (rq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing receive depth to %d\n", dev->attrs.max_qp_wr); - newxprt->sc_rq_depth = dev->attrs.max_qp_wr; - newxprt->sc_max_requests = newxprt->sc_rq_depth - 2; + rq_depth = dev->attrs.max_qp_wr; + newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests); ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, 
RPCSVC_MAXPAGES); ctxts *= newxprt->sc_max_requests; - newxprt->sc_sq_depth = newxprt->sc_rq_depth + ctxts; + newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) { pr_warn("svcrdma: reducing send depth to %d\n", dev->attrs.max_qp_wr); @@ -761,9 +505,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); - if (!svc_rdma_prealloc_ctxts(newxprt)) - goto errout; - newxprt->sc_pd = ib_alloc_pd(dev, 0); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); @@ -775,7 +516,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth, + newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth, 0, IB_POLL_WORKQUEUE); if (IS_ERR(newxprt->sc_rq_cq)) { dprintk("svcrdma: error creating RQ CQ for connect request\n"); @@ -788,9 +529,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.port_num = newxprt->sc_port_num; qp_attr.cap.max_rdma_ctxs = ctxts; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth - ctxts; - qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; - qp_attr.cap.max_send_sge = newxprt->sc_max_sge; - qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_wr = rq_depth; + qp_attr.cap.max_send_sge = newxprt->sc_max_send_sges; + qp_attr.cap.max_recv_sge = 1; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; @@ -815,14 +556,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) !rdma_ib_or_roce(dev, newxprt->sc_port_num)) goto errout; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); - if (ret) { - dprintk("svcrdma: failure posting receive buffers\n"); - goto errout; - } - } + if (!svc_rdma_post_recvs(newxprt)) + 
goto errout; /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; @@ -856,16 +591,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); - dprintk(" max_sge : %d\n", newxprt->sc_max_sge); + dprintk(" max_sge : %d\n", newxprt->sc_max_send_sges); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); dprintk(" rdma_rw_ctxs : %d\n", ctxts); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); dprintk(" ord : %d\n", conn_param.initiator_depth); + trace_svcrdma_xprt_accept(&newxprt->sc_xprt); return &newxprt->sc_xprt; errout: dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + trace_svcrdma_xprt_fail(&newxprt->sc_xprt); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) @@ -896,7 +633,6 @@ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - dprintk("svc: svc_rdma_detach(%p)\n", xprt); /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); @@ -908,7 +644,7 @@ static void __svc_rdma_free(struct work_struct *work) container_of(work, struct svcxprt_rdma, sc_work); struct svc_xprt *xprt = &rdma->sc_xprt; - dprintk("svcrdma: %s(%p)\n", __func__, rdma); + trace_svcrdma_xprt_free(xprt); if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); @@ -918,25 +654,7 @@ static void __svc_rdma_free(struct work_struct *work) pr_err("svcrdma: sc_xprt still in use? 
(%d)\n", kref_read(&xprt->xpt_ref)); - while (!list_empty(&rdma->sc_read_complete_q)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_first_entry(&rdma->sc_read_complete_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - svc_rdma_put_context(ctxt, 1); - } - while (!list_empty(&rdma->sc_rq_dto_q)) { - struct svc_rdma_op_ctxt *ctxt; - ctxt = list_first_entry(&rdma->sc_rq_dto_q, - struct svc_rdma_op_ctxt, list); - list_del(&ctxt->list); - svc_rdma_put_context(ctxt, 1); - } - - /* Warn if we leaked a resource or under-referenced */ - if (rdma->sc_ctxt_used != 0) - pr_err("svcrdma: ctxt still in use? (%d)\n", - rdma->sc_ctxt_used); + svc_rdma_flush_recv_queues(rdma); /* Final put of backchannel client transport */ if (xprt->xpt_bc_xprt) { @@ -945,7 +663,8 @@ static void __svc_rdma_free(struct work_struct *work) } svc_rdma_destroy_rw_ctxts(rdma); - svc_rdma_destroy_ctxts(rdma); + svc_rdma_send_ctxts_destroy(rdma); + svc_rdma_recv_ctxts_destroy(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -998,51 +717,3 @@ static void svc_rdma_secure_port(struct svc_rqst *rqstp) static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) { } - -int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) -{ - struct ib_send_wr *bad_wr, *n_wr; - int wr_count; - int i; - int ret; - - if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return -ENOTCONN; - - wr_count = 1; - for (n_wr = wr->next; n_wr; n_wr = n_wr->next) - wr_count++; - - /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { - atomic_inc(&rdma_stat_sq_starve); - - /* Wait until SQ WR available if SQ still full */ - atomic_add(wr_count, &xprt->sc_sq_avail); - wait_event(xprt->sc_send_wait, - atomic_read(&xprt->sc_sq_avail) > wr_count); - if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return -ENOTCONN; - continue; - } - /* Take a transport ref for each WR posted */ 
- for (i = 0; i < wr_count; i++) - svc_xprt_get(&xprt->sc_xprt); - - /* Bump used SQ WR count and post */ - ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - if (ret) { - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - for (i = 0; i < wr_count; i ++) - svc_xprt_put(&xprt->sc_xprt); - dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); - dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", - atomic_read(&xprt->sc_sq_avail), - xprt->sc_sq_depth); - wake_up(&xprt->sc_send_wait); - } - break; - } - return ret; -} diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index cc1aad325496..143ce2579ba9 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. @@ -51,9 +52,13 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/seq_file.h> +#include <linux/smp.h> + #include <linux/sunrpc/addr.h> +#include <linux/sunrpc/svc_rdma.h> #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS @@ -330,9 +335,7 @@ xprt_setup_rdma(struct xprt_create *args) return ERR_PTR(-EBADF); } - xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), - xprt_rdma_slot_table_entries, - xprt_rdma_slot_table_entries); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0); if (xprt == NULL) { dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", __func__); @@ -364,7 +367,7 @@ xprt_setup_rdma(struct xprt_create *args) xprt_set_bound(xprt); xprt_rdma_format_addresses(xprt, sap); - cdata.max_requests = xprt->max_reqs; + cdata.max_requests = xprt_rdma_slot_table_entries; cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ @@ -537,6 +540,47 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct 
rpc_task *task) } } +/** + * xprt_rdma_alloc_slot - allocate an rpc_rqst + * @xprt: controlling RPC transport + * @task: RPC task requesting a fresh rpc_rqst + * + * tk_status values: + * %0 if task->tk_rqstp points to a fresh rpc_rqst + * %-EAGAIN if no rpc_rqst is available; queued on backlog + */ +static void +xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req; + + req = rpcrdma_buffer_get(&r_xprt->rx_buf); + if (!req) + goto out_sleep; + task->tk_rqstp = &req->rl_slot; + task->tk_status = 0; + return; + +out_sleep: + rpc_sleep_on(&xprt->backlog, task, NULL); + task->tk_status = -EAGAIN; +} + +/** + * xprt_rdma_free_slot - release an rpc_rqst + * @xprt: controlling RPC transport + * @rqst: rpc_rqst to release + * + */ +static void +xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst) +{ + memset(rqst, 0, sizeof(*rqst)); + rpcrdma_buffer_put(rpcr_to_rdmar(rqst)); + rpc_wake_up_next(&xprt->backlog); +} + static bool rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, size_t size, gfp_t flags) @@ -607,13 +651,9 @@ xprt_rdma_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - struct rpcrdma_req *req; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); gfp_t flags; - req = rpcrdma_buffer_get(&r_xprt->rx_buf); - if (req == NULL) - goto out_get; - flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -623,15 +663,12 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; trace_xprtrdma_allocate(task, req); return 0; out_fail: - rpcrdma_buffer_put(req); -out_get: trace_xprtrdma_allocate(task, NULL); return -ENOMEM; } @@ 
-652,7 +689,6 @@ xprt_rdma_free(struct rpc_task *task) if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); trace_xprtrdma_rpc_done(task, req); - rpcrdma_buffer_put(req); } /** @@ -690,9 +726,6 @@ xprt_rdma_send_request(struct rpc_task *task) if (rc < 0) goto failed_marshal; - if (req->rl_reply == NULL) /* e.g. reconnection */ - rpcrdma_recv_buffer_get(req); - /* Must suppress retransmit to maintain credits */ if (rqst->rq_connect_cookie == xprt->connect_cookie) goto drop_connection; @@ -779,7 +812,8 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt) static const struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ - .alloc_slot = xprt_alloc_slot, + .alloc_slot = xprt_rdma_alloc_slot, + .free_slot = xprt_rdma_free_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ .timer = xprt_rdma_timer, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c345d365af88..16161a36dc73 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
@@ -59,6 +60,7 @@ #include <rdma/ib_cm.h> #include "xprt_rdma.h" +#include <trace/events/rpcrdma.h> /* * Globals/Macros @@ -71,8 +73,10 @@ /* * internal functions */ +static void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); +static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); struct workqueue_struct *rpcrdma_receive_wq __read_mostly; @@ -159,7 +163,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rr_cqe); /* WARNING: Only wr_id and status are reliable at this point */ - trace_xprtrdma_wc_receive(rep, wc); + trace_xprtrdma_wc_receive(wc); if (wc->status != IB_WC_SUCCESS) goto out_fail; @@ -231,7 +235,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) complete(&ia->ri_done); break; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EHOSTUNREACH; + ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); break; case RDMA_CM_EVENT_ROUTE_ERROR: @@ -262,7 +266,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENOTCONN; goto connected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETDOWN; + connstate = -ENETUNREACH; goto connected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", @@ -305,8 +309,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, - IB_QPT_RC); + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); dprintk("RPC: %s: rdma_create_id() failed %i\n", @@ -500,8 +504,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { struct 
rpcrdma_connect_private *pmsg = &ep->rep_cm_private; - unsigned int max_qp_wr, max_sge; struct ib_cq *sendcq, *recvcq; + unsigned int max_sge; int rc; max_sge = min_t(unsigned int, ia->ri_device->attrs.max_sge, @@ -512,29 +516,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ia->ri_max_send_sges = max_sge; - if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { - dprintk("RPC: %s: insufficient wqe's available\n", - __func__); - return -ENOMEM; - } - max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1; - - /* check provider's send/recv wr limits */ - if (cdata->max_requests > max_qp_wr) - cdata->max_requests = max_qp_wr; + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */ - rc = ia->ri_ops->ro_open(ia, ep, cdata); - if (rc) - return rc; - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ ep->rep_attr.cap.max_send_sge = max_sge; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; @@ -741,7 +729,6 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - unsigned int extras; int rc; retry: @@ -785,9 +772,8 @@ retry: } dprintk("RPC: %s: connected\n", __func__); - extras = r_xprt->rx_buf.rb_bc_srv_max_requests; - if (extras) - rpcrdma_ep_post_extra_recv(r_xprt, extras); + + rpcrdma_post_recvs(r_xprt, true); out: if (rc) @@ -893,6 +879,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } + buf->rb_flags = 0; return 0; @@ -950,7 +937,7 @@ out_emptyq: * 
completions recently. This is a sign the Send Queue is * backing up. Cause the caller to pause and try again. */ - dprintk("RPC: %s: empty sendctx queue\n", __func__); + set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags); r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); r_xprt->rx_stats.empty_sendctx_q++; return NULL; @@ -965,7 +952,8 @@ out_emptyq: * * The caller serializes calls to this function (per rpcrdma_buffer). */ -void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) +static void +rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) { struct rpcrdma_buffer *buf = &sc->sc_xprt->rx_buf; unsigned long next_tail; @@ -984,6 +972,11 @@ void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) /* Paired with READ_ONCE */ smp_store_release(&buf->rb_sc_tail, next_tail); + + if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) { + smp_mb__after_atomic(); + xprt_write_space(&sc->sc_xprt->rx_xprt); + } } static void @@ -1097,14 +1090,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) return req; } -/** - * rpcrdma_create_rep - Allocate an rpcrdma_rep object - * @r_xprt: controlling transport - * - * Returns 0 on success or a negative errno on failure. 
- */ -int -rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) +static int +rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1132,6 +1119,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; + rep->rr_temp = temp; spin_lock(&buf->rb_lock); list_add(&rep->rr_list, &buf->rb_recv_bufs); @@ -1183,12 +1171,8 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) list_add(&req->rl_list, &buf->rb_send_bufs); } + buf->rb_posted_receives = 0; INIT_LIST_HEAD(&buf->rb_recv_bufs); - for (i = 0; i <= buf->rb_max_requests; i++) { - rc = rpcrdma_create_rep(r_xprt); - if (rc) - goto out; - } rc = rpcrdma_sendctxs_create(r_xprt); if (rc) @@ -1200,28 +1184,6 @@ out: return rc; } -static struct rpcrdma_req * -rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_req *req; - - req = list_first_entry(&buf->rb_send_bufs, - struct rpcrdma_req, rl_list); - list_del_init(&req->rl_list); - return req; -} - -static struct rpcrdma_rep * -rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_rep *rep; - - rep = list_first_entry(&buf->rb_recv_bufs, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - return rep; -} - static void rpcrdma_destroy_rep(struct rpcrdma_rep *rep) { @@ -1280,10 +1242,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; - rep = rpcrdma_buffer_get_rep_locked(buf); + rep = list_first_entry(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); rpcrdma_destroy_rep(rep); } - buf->rb_send_count = 0; spin_lock(&buf->rb_reqslock); while (!list_empty(&buf->rb_allreqs)) { @@ -1298,7 +1261,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) spin_lock(&buf->rb_reqslock); } 
spin_unlock(&buf->rb_reqslock); - buf->rb_recv_count = 0; rpcrdma_mrs_destroy(buf); } @@ -1371,27 +1333,11 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } -static struct rpcrdma_rep * -rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers) -{ - /* If an RPC previously completed without a reply (say, a - * credential problem or a soft timeout occurs) then hold off - * on supplying more Receive buffers until the number of new - * pending RPCs catches up to the number of posted Receives. - */ - if (unlikely(buffers->rb_send_count < buffers->rb_recv_count)) - return NULL; - - if (unlikely(list_empty(&buffers->rb_recv_bufs))) - return NULL; - buffers->rb_recv_count++; - return rpcrdma_buffer_get_rep_locked(buffers); -} - -/* - * Get a set of request/reply buffers. +/** + * rpcrdma_buffer_get - Get a request buffer + * @buffers: Buffer pool from which to obtain a buffer * - * Reply buffer (if available) is attached to send buffer upon return. + * Returns a fresh rpcrdma_req, or NULL if none are available. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) @@ -1399,23 +1345,18 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) struct rpcrdma_req *req; spin_lock(&buffers->rb_lock); - if (list_empty(&buffers->rb_send_bufs)) - goto out_reqbuf; - buffers->rb_send_count++; - req = rpcrdma_buffer_get_req_locked(buffers); - req->rl_reply = rpcrdma_buffer_get_rep(buffers); + req = list_first_entry_or_null(&buffers->rb_send_bufs, + struct rpcrdma_req, rl_list); + if (req) + list_del_init(&req->rl_list); spin_unlock(&buffers->rb_lock); - return req; - -out_reqbuf: - spin_unlock(&buffers->rb_lock); - return NULL; } -/* - * Put request/reply buffers back into pool. - * Pre-decrement counter/array index. 
+/** + * rpcrdma_buffer_put - Put request/reply buffers back into pool + * @req: object to return + * */ void rpcrdma_buffer_put(struct rpcrdma_req *req) @@ -1426,27 +1367,16 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) req->rl_reply = NULL; spin_lock(&buffers->rb_lock); - buffers->rb_send_count--; - list_add_tail(&req->rl_list, &buffers->rb_send_bufs); + list_add(&req->rl_list, &buffers->rb_send_bufs); if (rep) { - buffers->rb_recv_count--; - list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + if (!rep->rr_temp) { + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + rep = NULL; + } } spin_unlock(&buffers->rb_lock); -} - -/* - * Recover reply buffers from pool. - * This happens when recovering from disconnect. - */ -void -rpcrdma_recv_buffer_get(struct rpcrdma_req *req) -{ - struct rpcrdma_buffer *buffers = req->rl_buffer; - - spin_lock(&buffers->rb_lock); - req->rl_reply = rpcrdma_buffer_get_rep(buffers); - spin_unlock(&buffers->rb_lock); + if (rep) + rpcrdma_destroy_rep(rep); } /* @@ -1458,10 +1388,13 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - spin_lock(&buffers->rb_lock); - buffers->rb_recv_count--; - list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock(&buffers->rb_lock); + if (!rep->rr_temp) { + spin_lock(&buffers->rb_lock); + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); + } else { + rpcrdma_destroy_rep(rep); + } } /** @@ -1557,13 +1490,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; int rc; - if (req->rl_reply) { - rc = rpcrdma_ep_post_recv(ia, req->rl_reply); - if (rc) - return rc; - req->rl_reply = NULL; - } - if (!ep->rep_send_count || test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) { send_wr->send_flags |= IB_SEND_SIGNALED; @@ -1580,61 +1506,69 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, return 0; } -int -rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, - struct rpcrdma_rep *rep) 
-{ - struct ib_recv_wr *recv_wr_fail; - int rc; - - if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf)) - goto out_map; - rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail); - trace_xprtrdma_post_recv(rep, rc); - if (rc) - return -ENOTCONN; - return 0; - -out_map: - pr_err("rpcrdma: failed to DMA map the Receive buffer\n"); - return -EIO; -} - /** - * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests - * @r_xprt: transport associated with these backchannel resources - * @count: minimum number of incoming requests expected + * rpcrdma_post_recvs - Maybe post some Receive buffers + * @r_xprt: controlling transport + * @temp: when true, allocate temp rpcrdma_rep objects * - * Returns zero if all requested buffers were posted, or a negative errno. */ -int -rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) +void +rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_rep *rep; - int rc; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_recv_wr *wr, *bad_wr; + int needed, count, rc; - while (count--) { - spin_lock(&buffers->rb_lock); - if (list_empty(&buffers->rb_recv_bufs)) - goto out_reqbuf; - rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock(&buffers->rb_lock); + needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); + if (buf->rb_posted_receives > needed) + return; + needed -= buf->rb_posted_receives; - rc = rpcrdma_ep_post_recv(ia, rep); - if (rc) - goto out_rc; - } + count = 0; + wr = NULL; + while (needed) { + struct rpcrdma_regbuf *rb; + struct rpcrdma_rep *rep; - return 0; + spin_lock(&buf->rb_lock); + rep = list_first_entry_or_null(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + if (likely(rep)) + list_del(&rep->rr_list); + spin_unlock(&buf->rb_lock); + if (!rep) { + if (rpcrdma_create_rep(r_xprt, temp)) + break; + continue; + } -out_reqbuf: - 
spin_unlock(&buffers->rb_lock); - trace_xprtrdma_noreps(r_xprt); - return -ENOMEM; + rb = rep->rr_rdmabuf; + if (!rpcrdma_regbuf_is_mapped(rb)) { + if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) { + rpcrdma_recv_buffer_put(rep); + break; + } + } -out_rc: - rpcrdma_recv_buffer_put(rep); - return rc; + trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe); + rep->rr_recv_wr.next = wr; + wr = &rep->rr_recv_wr; + ++count; + --needed; + } + if (!count) + return; + + rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, &bad_wr); + if (rc) { + for (wr = bad_wr; wr; wr = wr->next) { + struct rpcrdma_rep *rep; + + rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); + rpcrdma_recv_buffer_put(rep); + --count; + } + } + buf->rb_posted_receives += count; + trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cb41b12a3bf8..2ca14f7c2d51 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* * Copyright (c) 2014-2017 Oracle. All rights reserved. * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 
@@ -196,6 +197,7 @@ struct rpcrdma_rep { __be32 rr_proc; int rr_wc_flags; u32 rr_inv_rkey; + bool rr_temp; struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; @@ -334,6 +336,7 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; + struct rpc_rqst rl_slot; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; @@ -356,16 +359,10 @@ enum { RPCRDMA_REQ_F_TX_RESOURCES, }; -static inline void -rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req) -{ - rqst->rq_xprtdata = req; -} - static inline struct rpcrdma_req * rpcr_to_rdmar(const struct rpc_rqst *rqst) { - return rqst->rq_xprtdata; + return container_of(rqst, struct rpcrdma_req, rl_slot); } static inline void @@ -401,11 +398,12 @@ struct rpcrdma_buffer { struct rpcrdma_sendctx **rb_sc_ctxs; spinlock_t rb_lock; /* protect buf lists */ - int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; + unsigned long rb_flags; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ + int rb_posted_receives; u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ @@ -420,6 +418,11 @@ struct rpcrdma_buffer { }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) +/* rb_flags */ +enum { + RPCRDMA_BUF_F_EMPTY_SCQ = 0, +}; + /* * Internal structure for transport instance creation. This * exists primarily for modularity. 
@@ -561,18 +564,16 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); void rpcrdma_destroy_req(struct rpcrdma_req *); -int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); -void rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); @@ -581,7 +582,6 @@ void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); -void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction, @@ -603,8 +603,6 @@ rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) return __rpcrdma_dma_map_regbuf(ia, rb); } -int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); - int rpcrdma_alloc_wq(void); void rpcrdma_destroy_wq(void); @@ -675,5 +673,3 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ - -#include <trace/events/rpcrdma.h> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c8902f11efdd..9e1c5024aba9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2763,6 +2763,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = 
xs_tcp_release_xprt, .alloc_slot = xprt_alloc_slot, + .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_local_connect, @@ -2782,6 +2783,7 @@ static const struct rpc_xprt_ops xs_udp_ops = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, + .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2803,6 +2805,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_lock_and_alloc_slot, + .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2834,6 +2837,7 @@ static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, + .free_slot = xprt_free_slot, .buf_alloc = bc_malloc, .buf_free = bc_free, .send_request = bc_send_request, diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 4492cda45566..a2f76743c73a 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -285,8 +285,9 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, if (!trans_buf) return -ENOMEM; - attrbuf = kmalloc((tipc_genl_family.maxattr + 1) * - sizeof(struct nlattr *), GFP_KERNEL); + attrbuf = kmalloc_array(tipc_genl_family.maxattr + 1, + sizeof(struct nlattr *), + GFP_KERNEL); if (!attrbuf) { err = -ENOMEM; goto trans_out; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 07514ca011b2..c7bbe5f0aae8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10833,7 +10833,7 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, struct nlattr **tb; int err; - tb = kzalloc(NUM_NL80211_ATTR * sizeof(*tb), GFP_KERNEL); + tb = kcalloc(NUM_NL80211_ATTR, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; 
@@ -11793,7 +11793,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, func->srf_num_macs = n_entries; func->srf_macs = - kzalloc(sizeof(*func->srf_macs) * n_entries, + kcalloc(n_entries, sizeof(*func->srf_macs), GFP_KERNEL); if (!func->srf_macs) { err = -ENOMEM; |