summaryrefslogtreecommitdiffstats
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/Kconfig2
-rw-r--r--net/sunrpc/backchannel_rqst.c40
-rw-r--r--net/sunrpc/clnt.c95
-rw-r--r--net/sunrpc/debugfs.c52
-rw-r--r--net/sunrpc/sched.c81
-rw-r--r--net/sunrpc/stats.c23
-rw-r--r--net/sunrpc/svc.c2
-rw-r--r--net/sunrpc/xprt.c101
-rw-r--r--net/sunrpc/xprtmultipath.c89
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c7
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c327
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c152
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c4
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c8
-rw-r--r--net/sunrpc/xprtrdma/transport.c84
-rw-r--r--net/sunrpc/xprtrdma/verbs.c115
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h45
-rw-r--r--net/sunrpc/xprtsock.c126
18 files changed, 845 insertions, 508 deletions
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index aa307505ca54..3bcf985507be 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -35,7 +35,7 @@ config RPCSEC_GSS_KRB5
If unsure, say Y.
-config CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES
+config SUNRPC_DISABLE_INSECURE_ENCTYPES
bool "Secure RPC: Disable insecure Kerberos encryption types"
depends on RPCSEC_GSS_KRB5
default n
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index c47d82622fd1..339e8c077c2d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -31,25 +31,20 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define RPCDBG_FACILITY RPCDBG_TRANS
#endif
+#define BC_MAX_SLOTS 64U
+
+unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt)
+{
+ return BC_MAX_SLOTS;
+}
+
/*
* Helper routines that track the number of preallocation elements
* on the transport.
*/
static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
{
- return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
-}
-
-static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
- atomic_add(n, &xprt->bc_free_slots);
- xprt->bc_alloc_count += n;
-}
-
-static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
-{
- atomic_sub(n, &xprt->bc_free_slots);
- return xprt->bc_alloc_count -= n;
+ return xprt->bc_alloc_count < xprt->bc_alloc_max;
}
/*
@@ -145,6 +140,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
dprintk("RPC: setup backchannel transport\n");
+ if (min_reqs > BC_MAX_SLOTS)
+ min_reqs = BC_MAX_SLOTS;
+
/*
* We use a temporary list to keep track of the preallocated
* buffers. Once we're done building the list we splice it
@@ -172,7 +170,9 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
*/
spin_lock(&xprt->bc_pa_lock);
list_splice(&tmp_list, &xprt->bc_pa_list);
- xprt_inc_alloc_count(xprt, min_reqs);
+ xprt->bc_alloc_count += min_reqs;
+ xprt->bc_alloc_max += min_reqs;
+ atomic_add(min_reqs, &xprt->bc_slot_count);
spin_unlock(&xprt->bc_pa_lock);
dprintk("RPC: setup backchannel transport done\n");
@@ -220,11 +220,13 @@ void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
goto out;
spin_lock_bh(&xprt->bc_pa_lock);
- xprt_dec_alloc_count(xprt, max_reqs);
+ xprt->bc_alloc_max -= max_reqs;
list_for_each_entry_safe(req, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
dprintk("RPC: req=%p\n", req);
list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
+ xprt->bc_alloc_count--;
+ atomic_dec(&xprt->bc_slot_count);
if (--max_reqs == 0)
break;
}
@@ -241,13 +243,14 @@ static struct rpc_rqst *xprt_get_bc_request(struct rpc_xprt *xprt, __be32 xid,
struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
- if (atomic_read(&xprt->bc_free_slots) <= 0)
- goto not_found;
if (list_empty(&xprt->bc_pa_list)) {
if (!new)
goto not_found;
+ if (atomic_read(&xprt->bc_slot_count) >= BC_MAX_SLOTS)
+ goto not_found;
list_add_tail(&new->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
+ atomic_inc(&xprt->bc_slot_count);
}
req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
rq_bc_pa_list);
@@ -291,6 +294,7 @@ void xprt_free_bc_rqst(struct rpc_rqst *req)
if (xprt_need_to_requeue(xprt)) {
list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
+ atomic_inc(&xprt->bc_slot_count);
req = NULL;
}
spin_unlock_bh(&xprt->bc_pa_lock);
@@ -357,7 +361,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
spin_lock(&xprt->bc_pa_lock);
list_del(&req->rq_bc_pa_list);
- xprt_dec_alloc_count(xprt, 1);
+ xprt->bc_alloc_count--;
spin_unlock(&xprt->bc_pa_lock);
req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index b03bfa055c08..d8679b6027e9 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -528,6 +528,8 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
.bc_xprt = args->bc_xprt,
};
char servername[48];
+ struct rpc_clnt *clnt;
+ int i;
if (args->bc_xprt) {
WARN_ON_ONCE(!(args->protocol & XPRT_TRANSPORT_BC));
@@ -590,7 +592,15 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
xprt->resvport = 0;
- return rpc_create_xprt(args, xprt);
+ clnt = rpc_create_xprt(args, xprt);
+ if (IS_ERR(clnt) || args->nconnect <= 1)
+ return clnt;
+
+ for (i = 0; i < args->nconnect - 1; i++) {
+ if (rpc_clnt_add_xprt(clnt, &xprtargs, NULL, NULL) < 0)
+ break;
+ }
+ return clnt;
}
EXPORT_SYMBOL_GPL(rpc_create);
@@ -968,13 +978,46 @@ out:
}
EXPORT_SYMBOL_GPL(rpc_bind_new_program);
+struct rpc_xprt *
+rpc_task_get_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ if (!xprt)
+ return NULL;
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ atomic_long_inc(&xps->xps_queuelen);
+ rcu_read_unlock();
+ atomic_long_inc(&xprt->queuelen);
+
+ return xprt;
+}
+
+static void
+rpc_task_release_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+ struct rpc_xprt_switch *xps;
+
+ atomic_long_dec(&xprt->queuelen);
+ rcu_read_lock();
+ xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+ atomic_long_dec(&xps->xps_queuelen);
+ rcu_read_unlock();
+
+ xprt_put(xprt);
+}
+
void rpc_task_release_transport(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
if (xprt) {
task->tk_xprt = NULL;
- xprt_put(xprt);
+ if (task->tk_client)
+ rpc_task_release_xprt(task->tk_client, xprt);
+ else
+ xprt_put(xprt);
}
}
EXPORT_SYMBOL_GPL(rpc_task_release_transport);
@@ -983,6 +1026,7 @@ void rpc_task_release_client(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
+ rpc_task_release_transport(task);
if (clnt != NULL) {
/* Remove from client task list */
spin_lock(&clnt->cl_lock);
@@ -992,14 +1036,34 @@ void rpc_task_release_client(struct rpc_task *task)
rpc_release_client(clnt);
}
- rpc_task_release_transport(task);
+}
+
+static struct rpc_xprt *
+rpc_task_get_first_xprt(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ rcu_read_unlock();
+ return rpc_task_get_xprt(clnt, xprt);
+}
+
+static struct rpc_xprt *
+rpc_task_get_next_xprt(struct rpc_clnt *clnt)
+{
+ return rpc_task_get_xprt(clnt, xprt_iter_get_next(&clnt->cl_xpi));
}
static
void rpc_task_set_transport(struct rpc_task *task, struct rpc_clnt *clnt)
{
- if (!task->tk_xprt)
- task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
+ if (task->tk_xprt)
+ return;
+ if (task->tk_flags & RPC_TASK_NO_ROUND_ROBIN)
+ task->tk_xprt = rpc_task_get_first_xprt(clnt);
+ else
+ task->tk_xprt = rpc_task_get_next_xprt(clnt);
}
static
@@ -1462,6 +1526,19 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
}
EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
+unsigned int rpc_num_bc_slots(struct rpc_clnt *clnt)
+{
+ struct rpc_xprt *xprt;
+ unsigned int ret;
+
+ rcu_read_lock();
+ xprt = rcu_dereference(clnt->cl_xprt);
+ ret = xprt->ops->bc_num_slots(xprt);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_num_bc_slots);
+
/**
* rpc_force_rebind - force transport to check that remote port is unchanged
* @clnt: client to rebind
@@ -1788,6 +1865,7 @@ rpc_xdr_encode(struct rpc_task *task)
req->rq_snd_buf.head[0].iov_len = 0;
xdr_init_encode(&xdr, &req->rq_snd_buf,
req->rq_snd_buf.head[0].iov_base, req);
+ xdr_free_bvec(&req->rq_snd_buf);
if (rpc_encode_header(task, &xdr))
return;
@@ -1827,8 +1905,6 @@ call_encode(struct rpc_task *task)
rpc_call_rpcerror(task, task->tk_status);
}
return;
- } else {
- xprt_request_prepare(task->tk_rqstp);
}
/* Add task to reply queue before transmission to avoid races */
@@ -2696,6 +2772,10 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
return -ENOMEM;
data->xps = xprt_switch_get(xps);
data->xprt = xprt_get(xprt);
+ if (rpc_xprt_switch_has_addr(data->xps, (struct sockaddr *)&xprt->addr)) {
+ rpc_cb_add_xprt_release(data);
+ goto success;
+ }
task = rpc_call_null_helper(clnt, xprt, NULL,
RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS,
@@ -2703,6 +2783,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+success:
return 1;
}
EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 707d7aab1546..fd9bca242724 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* debugfs interface for sunrpc
*
* (c) 2014 Jeff Layton <jlayton@primarydata.com>
@@ -117,12 +117,37 @@ static const struct file_operations tasks_fops = {
.release = tasks_release,
};
+static int do_xprt_debugfs(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *numv)
+{
+ int len;
+ char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
+ char link[9]; /* enough for 8 hex digits + NULL */
+ int *nump = numv;
+
+ if (IS_ERR_OR_NULL(xprt->debugfs))
+ return 0;
+ len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
+ xprt->debugfs->d_name.name);
+ if (len > sizeof(name))
+ return -1;
+ if (*nump == 0)
+ strcpy(link, "xprt");
+ else {
+ len = snprintf(link, sizeof(link), "xprt%d", *nump);
+ if (len > sizeof(link))
+ return -1;
+ }
+ debugfs_create_symlink(link, clnt->cl_debugfs, name);
+ (*nump)++;
+ return 0;
+}
+
void
rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
{
int len;
- char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */
- struct rpc_xprt *xprt;
+ char name[9]; /* enough for 8 hex digits + NULL */
+ int xprtnum = 0;
len = snprintf(name, sizeof(name), "%x", clnt->cl_clid);
if (len >= sizeof(name))
@@ -135,26 +160,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt,
&tasks_fops);
- rcu_read_lock();
- xprt = rcu_dereference(clnt->cl_xprt);
- /* no "debugfs" dentry? Don't bother with the symlink. */
- if (IS_ERR_OR_NULL(xprt->debugfs)) {
- rcu_read_unlock();
- return;
- }
- len = snprintf(name, sizeof(name), "../../rpc_xprt/%s",
- xprt->debugfs->d_name.name);
- rcu_read_unlock();
-
- if (len >= sizeof(name))
- goto out_err;
-
- debugfs_create_symlink("xprt", clnt->cl_debugfs, name);
-
- return;
-out_err:
- debugfs_remove_recursive(clnt->cl_debugfs);
- clnt->cl_debugfs = NULL;
+ rpc_clnt_iterate_for_each_xprt(clnt, do_xprt_debugfs, &xprtnum);
}
void
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index a2c114812717..1f275aba786f 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -23,6 +23,7 @@
#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/metrics.h>
#include "sunrpc.h"
@@ -46,7 +47,7 @@ static mempool_t *rpc_buffer_mempool __read_mostly;
static void rpc_async_schedule(struct work_struct *);
static void rpc_release_task(struct rpc_task *task);
-static void __rpc_queue_timer_fn(struct timer_list *t);
+static void __rpc_queue_timer_fn(struct work_struct *);
/*
* RPC tasks sit here while waiting for conditions to improve.
@@ -58,6 +59,7 @@ static struct rpc_wait_queue delay_queue;
*/
struct workqueue_struct *rpciod_workqueue __read_mostly;
struct workqueue_struct *xprtiod_workqueue __read_mostly;
+EXPORT_SYMBOL_GPL(xprtiod_workqueue);
unsigned long
rpc_task_timeout(const struct rpc_task *task)
@@ -87,13 +89,19 @@ __rpc_disable_timer(struct rpc_wait_queue *queue, struct rpc_task *task)
task->tk_timeout = 0;
list_del(&task->u.tk_wait.timer_list);
if (list_empty(&queue->timer_list.list))
- del_timer(&queue->timer_list.timer);
+ cancel_delayed_work(&queue->timer_list.dwork);
}
static void
rpc_set_queue_timer(struct rpc_wait_queue *queue, unsigned long expires)
{
- timer_reduce(&queue->timer_list.timer, expires);
+ unsigned long now = jiffies;
+ queue->timer_list.expires = expires;
+ if (time_before_eq(expires, now))
+ expires = 0;
+ else
+ expires -= now;
+ mod_delayed_work(rpciod_workqueue, &queue->timer_list.dwork, expires);
}
/*
@@ -107,7 +115,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task,
task->tk_pid, jiffies_to_msecs(timeout - jiffies));
task->tk_timeout = timeout;
- rpc_set_queue_timer(queue, timeout);
+ if (list_empty(&queue->timer_list.list) || time_before(timeout, queue->timer_list.expires))
+ rpc_set_queue_timer(queue, timeout);
list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list);
}
@@ -250,7 +259,8 @@ static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const c
queue->maxpriority = nr_queues - 1;
rpc_reset_waitqueue_priority(queue);
queue->qlen = 0;
- timer_setup(&queue->timer_list.timer, __rpc_queue_timer_fn, 0);
+ queue->timer_list.expires = 0;
+ INIT_DEFERRABLE_WORK(&queue->timer_list.dwork, __rpc_queue_timer_fn);
INIT_LIST_HEAD(&queue->timer_list.list);
rpc_assign_waitqueue_name(queue, qname);
}
@@ -269,7 +279,7 @@ EXPORT_SYMBOL_GPL(rpc_init_wait_queue);
void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
{
- del_timer_sync(&queue->timer_list.timer);
+ cancel_delayed_work_sync(&queue->timer_list.dwork);
}
EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
@@ -424,9 +434,9 @@ void rpc_sleep_on_timeout(struct rpc_wait_queue *q, struct rpc_task *task,
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
+ spin_lock(&q->lock);
__rpc_sleep_on_priority_timeout(q, task, timeout, task->tk_priority);
- spin_unlock_bh(&q->lock);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on_timeout);
@@ -442,9 +452,9 @@ void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task,
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
+ spin_lock(&q->lock);
__rpc_sleep_on_priority(q, task, task->tk_priority);
- spin_unlock_bh(&q->lock);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on);
@@ -458,9 +468,9 @@ void rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
+ spin_lock(&q->lock);
__rpc_sleep_on_priority_timeout(q, task, timeout, priority);
- spin_unlock_bh(&q->lock);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority_timeout);
@@ -475,9 +485,9 @@ void rpc_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task,
/*
* Protect the queue operations.
*/
- spin_lock_bh(&q->lock);
+ spin_lock(&q->lock);
__rpc_sleep_on_priority(q, task, priority);
- spin_unlock_bh(&q->lock);
+ spin_unlock(&q->lock);
}
EXPORT_SYMBOL_GPL(rpc_sleep_on_priority);
@@ -555,9 +565,9 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq,
{
if (!RPC_IS_QUEUED(task))
return;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
rpc_wake_up_task_on_wq_queue_locked(wq, queue, task);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
/*
@@ -567,9 +577,9 @@ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task
{
if (!RPC_IS_QUEUED(task))
return;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
rpc_wake_up_task_queue_locked(queue, task);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task);
@@ -602,9 +612,9 @@ rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue,
{
if (!RPC_IS_QUEUED(task))
return;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
rpc_wake_up_task_queue_set_status_locked(queue, task, status);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
/*
@@ -667,12 +677,12 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq,
dprintk("RPC: wake_up_first(%p \"%s\")\n",
queue, rpc_qname(queue));
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
task = __rpc_find_next_queued(queue);
if (task != NULL)
task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue,
task, func, data);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
return task;
}
@@ -711,7 +721,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
{
struct list_head *head;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
head = &queue->tasks[queue->maxpriority];
for (;;) {
while (!list_empty(head)) {
@@ -725,7 +735,7 @@ void rpc_wake_up(struct rpc_wait_queue *queue)
break;
head--;
}
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up);
@@ -740,7 +750,7 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
{
struct list_head *head;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
head = &queue->tasks[queue->maxpriority];
for (;;) {
while (!list_empty(head)) {
@@ -755,13 +765,15 @@ void rpc_wake_up_status(struct rpc_wait_queue *queue, int status)
break;
head--;
}
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
}
EXPORT_SYMBOL_GPL(rpc_wake_up_status);
-static void __rpc_queue_timer_fn(struct timer_list *t)
+static void __rpc_queue_timer_fn(struct work_struct *work)
{
- struct rpc_wait_queue *queue = from_timer(queue, t, timer_list.timer);
+ struct rpc_wait_queue *queue = container_of(work,
+ struct rpc_wait_queue,
+ timer_list.dwork.work);
struct rpc_task *task, *n;
unsigned long expires, now, timeo;
@@ -832,6 +844,10 @@ rpc_reset_task_statistics(struct rpc_task *task)
void rpc_exit_task(struct rpc_task *task)
{
task->tk_action = NULL;
+ if (task->tk_ops->rpc_count_stats)
+ task->tk_ops->rpc_count_stats(task, task->tk_calldata);
+ else if (task->tk_client)
+ rpc_count_iostats(task, task->tk_client->cl_metrics);
if (task->tk_ops->rpc_call_done != NULL) {
task->tk_ops->rpc_call_done(task, task->tk_calldata);
if (task->tk_action != NULL) {
@@ -927,13 +943,13 @@ static void __rpc_execute(struct rpc_task *task)
* rpc_task pointer may still be dereferenced.
*/
queue = task->tk_waitqueue;
- spin_lock_bh(&queue->lock);
+ spin_lock(&queue->lock);
if (!RPC_IS_QUEUED(task)) {
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
continue;
}
rpc_clear_running(task);
- spin_unlock_bh(&queue->lock);
+ spin_unlock(&queue->lock);
if (task_is_async)
return;
@@ -1076,7 +1092,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
/* Initialize workqueue for async tasks */
task->tk_workqueue = task_setup_data->workqueue;
- task->tk_xprt = xprt_get(task_setup_data->rpc_xprt);
+ task->tk_xprt = rpc_task_get_xprt(task_setup_data->rpc_client,
+ xprt_get(task_setup_data->rpc_xprt));
task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred);
diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 2b6dc7e5f74f..7c74197c2ecf 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -177,6 +177,8 @@ void rpc_count_iostats_metrics(const struct rpc_task *task,
execute = ktime_sub(now, task->tk_start);
op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute);
+ if (task->tk_status < 0)
+ op_metrics->om_error_status++;
spin_unlock(&op_metrics->om_lock);
@@ -219,13 +221,14 @@ static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b)
a->om_queue = ktime_add(a->om_queue, b->om_queue);
a->om_rtt = ktime_add(a->om_rtt, b->om_rtt);
a->om_execute = ktime_add(a->om_execute, b->om_execute);
+ a->om_error_status += b->om_error_status;
}
static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
int op, const struct rpc_procinfo *procs)
{
_print_name(seq, op, procs);
- seq_printf(seq, "%lu %lu %lu %Lu %Lu %Lu %Lu %Lu\n",
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %llu %lu\n",
stats->om_ops,
stats->om_ntrans,
stats->om_timeouts,
@@ -233,12 +236,20 @@ static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats,
stats->om_bytes_recv,
ktime_to_ms(stats->om_queue),
ktime_to_ms(stats->om_rtt),
- ktime_to_ms(stats->om_execute));
+ ktime_to_ms(stats->om_execute),
+ stats->om_error_status);
+}
+
+static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv)
+{
+ struct seq_file *seq = seqv;
+
+ xprt->ops->print_stats(xprt, seq);
+ return 0;
}
void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
{
- struct rpc_xprt *xprt;
unsigned int op, maxproc = clnt->cl_maxproc;
if (!clnt->cl_metrics)
@@ -248,11 +259,7 @@ void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt)
seq_printf(seq, "p/v: %u/%u (%s)\n",
clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name);
- rcu_read_lock();
- xprt = rcu_dereference(clnt->cl_xprt);
- if (xprt)
- xprt->ops->print_stats(xprt, seq);
- rcu_read_unlock();
+ rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq);
seq_printf(seq, "\tper-op statistics\n");
for (op = 0; op < maxproc; op++) {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e15cb704453e..220b79988000 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1595,7 +1595,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
/* Parse and execute the bc call */
proc_error = svc_process_common(rqstp, argv, resv);
- atomic_inc(&req->rq_xprt->bc_free_slots);
+ atomic_dec(&req->rq_xprt->bc_slot_count);
if (!proc_error) {
/* Processing error: drop the request */
xprt_free_bc_request(req);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f6c82b1651e7..783748dc5e6f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -302,9 +302,9 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task)
return 1;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
retval = xprt->ops->reserve_xprt(xprt, task);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return retval;
}
@@ -381,9 +381,9 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta
{
if (xprt->snd_task != task)
return;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
/*
@@ -435,9 +435,9 @@ xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req)
if (req->rq_cong)
return true;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
ret = __xprt_get_cong(xprt, req) != 0;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return ret;
}
EXPORT_SYMBOL_GPL(xprt_request_get_cong);
@@ -464,9 +464,9 @@ static void
xprt_clear_congestion_window_wait(struct rpc_xprt *xprt)
{
if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) {
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
__xprt_lock_write_next_cong(xprt);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
}
@@ -563,9 +563,9 @@ bool xprt_write_space(struct rpc_xprt *xprt)
if (!test_bit(XPRT_WRITE_SPACE, &xprt->state))
return false;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
ret = xprt_clear_write_space_locked(xprt);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return ret;
}
EXPORT_SYMBOL_GPL(xprt_write_space);
@@ -634,9 +634,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req)
req->rq_retries = 0;
xprt_reset_majortimeo(req);
/* Reset the RTT counters == "slow start" */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
status = -ETIMEDOUT;
}
@@ -668,11 +668,11 @@ static void xprt_autoclose(struct work_struct *work)
void xprt_disconnect_done(struct rpc_xprt *xprt)
{
dprintk("RPC: disconnected transport %p\n", xprt);
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_clear_connected(xprt);
xprt_clear_write_space_locked(xprt);
xprt_wake_pending_tasks(xprt, -ENOTCONN);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_disconnect_done);
@@ -684,7 +684,7 @@ EXPORT_SYMBOL_GPL(xprt_disconnect_done);
void xprt_force_disconnect(struct rpc_xprt *xprt)
{
/* Don't race with the test_bit() in xprt_clear_locked() */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
set_bit(XPRT_CLOSE_WAIT, &xprt->state);
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
@@ -692,7 +692,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
else if (xprt->snd_task)
rpc_wake_up_queued_task_set_status(&xprt->pending,
xprt->snd_task, -ENOTCONN);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_force_disconnect);
@@ -726,7 +726,7 @@ xprt_request_retransmit_after_disconnect(struct rpc_task *task)
void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
{
/* Don't race with the test_bit() in xprt_clear_locked() */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (cookie != xprt->connect_cookie)
goto out;
if (test_bit(XPRT_CLOSING, &xprt->state))
@@ -737,7 +737,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
static bool
@@ -750,6 +750,7 @@ static void
xprt_schedule_autodisconnect(struct rpc_xprt *xprt)
__must_hold(&xprt->transport_lock)
{
+ xprt->last_used = jiffies;
if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt))
mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout);
}
@@ -759,18 +760,13 @@ xprt_init_autodisconnect(struct timer_list *t)
{
struct rpc_xprt *xprt = from_timer(xprt, t, timer);
- spin_lock(&xprt->transport_lock);
if (!RB_EMPTY_ROOT(&xprt->recv_queue))
- goto out_abort;
+ return;
/* Reset xprt->last_used to avoid connect/autodisconnect cycling */
xprt->last_used = jiffies;
if (test_and_set_bit(XPRT_LOCKED, &xprt->state))
- goto out_abort;
- spin_unlock(&xprt->transport_lock);
+ return;
queue_work(xprtiod_workqueue, &xprt->task_cleanup);
- return;
-out_abort:
- spin_unlock(&xprt->transport_lock);
}
bool xprt_lock_connect(struct rpc_xprt *xprt,
@@ -779,7 +775,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
{
bool ret = false;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (!test_bit(XPRT_LOCKED, &xprt->state))
goto out;
if (xprt->snd_task != task)
@@ -787,13 +783,13 @@ bool xprt_lock_connect(struct rpc_xprt *xprt,
xprt->snd_task = cookie;
ret = true;
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
return ret;
}
void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
{
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (xprt->snd_task != cookie)
goto out;
if (!test_bit(XPRT_LOCKED, &xprt->state))
@@ -802,7 +798,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)
xprt->ops->release_xprt(xprt, NULL);
xprt_schedule_autodisconnect(xprt);
out:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
wake_up_bit(&xprt->state, XPRT_LOCKED);
}
@@ -850,6 +846,38 @@ void xprt_connect(struct rpc_task *task)
xprt_release_write(xprt, task);
}
+/**
+ * xprt_reconnect_delay - compute the wait before scheduling a connect
+ * @xprt: transport instance
+ *
+ */
+unsigned long xprt_reconnect_delay(const struct rpc_xprt *xprt)
+{
+ unsigned long start, now = jiffies;
+
+ start = xprt->stat.connect_start + xprt->reestablish_timeout;
+ if (time_after(start, now))
+ return start - now;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_delay);
+
+/**
+ * xprt_reconnect_backoff - compute the new re-establish timeout
+ * @xprt: transport instance
+ * @init_to: initial reestablish timeout
+ *
+ */
+void xprt_reconnect_backoff(struct rpc_xprt *xprt, unsigned long init_to)
+{
+ xprt->reestablish_timeout <<= 1;
+ if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
+ xprt->reestablish_timeout = xprt->max_reconnect_timeout;
+ if (xprt->reestablish_timeout < init_to)
+ xprt->reestablish_timeout = init_to;
+}
+EXPORT_SYMBOL_GPL(xprt_reconnect_backoff);
+
enum xprt_xid_rb_cmp {
XID_RB_EQUAL,
XID_RB_LEFT,
@@ -1013,6 +1041,8 @@ xprt_request_enqueue_receive(struct rpc_task *task)
if (!xprt_request_need_enqueue_receive(task, req))
return;
+
+ xprt_request_prepare(task->tk_rqstp);
spin_lock(&xprt->queue_lock);
/* Update the softirq receive buffer */
@@ -1412,14 +1442,14 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
xprt_inject_disconnect(xprt);
task->tk_flags |= RPC_TASK_SENT;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt->stat.sends++;
xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
xprt->stat.bklog_u += xprt->backlog.qlen;
xprt->stat.sending_u += xprt->sending.qlen;
xprt->stat.pending_u += xprt->pending.qlen;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
req->rq_connect_cookie = connect_cookie;
out_dequeue:
@@ -1765,18 +1795,13 @@ void xprt_release(struct rpc_task *task)
}
xprt = req->rq_xprt;
- if (task->tk_ops->rpc_count_stats != NULL)
- task->tk_ops->rpc_count_stats(task, task->tk_calldata);
- else if (task->tk_client)
- rpc_count_iostats(task, task->tk_client->cl_metrics);
xprt_request_dequeue_all(task, req);
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
xprt->ops->release_request(task);
- xprt->last_used = jiffies;
xprt_schedule_autodisconnect(xprt);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(task);
xprt_inject_disconnect(xprt);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 8394124126f8..78c075a68c04 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -19,7 +19,7 @@
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/xprtmultipath.h>
-typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
+typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps,
const struct rpc_xprt *cur);
static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
@@ -36,6 +36,7 @@ static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
if (xps->xps_nxprts == 0)
xps->xps_net = xprt->xprt_net;
xps->xps_nxprts++;
+ xps->xps_nactive++;
}
/**
@@ -51,8 +52,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
if (xprt == NULL)
return;
spin_lock(&xps->xps_lock);
- if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
- !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+ if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
xprt_switch_add_xprt_locked(xps, xprt);
spin_unlock(&xps->xps_lock);
}
@@ -62,6 +62,7 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
{
if (unlikely(xprt == NULL))
return;
+ xps->xps_nactive--;
xps->xps_nxprts--;
if (xps->xps_nxprts == 0)
xps->xps_net = NULL;
@@ -102,7 +103,9 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
if (xps != NULL) {
spin_lock_init(&xps->xps_lock);
kref_init(&xps->xps_kref);
- xps->xps_nxprts = 0;
+ xps->xps_nxprts = xps->xps_nactive = 0;
+ atomic_long_set(&xps->xps_queuelen, 0);
+ xps->xps_net = NULL;
INIT_LIST_HEAD(&xps->xps_xprt_list);
xps->xps_iter_ops = &rpc_xprt_iter_singular;
xprt_switch_add_xprt_locked(xps, xprt);
@@ -193,9 +196,21 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi)
}
static
+bool xprt_is_active(const struct rpc_xprt *xprt)
+{
+ return kref_read(&xprt->kref) != 0;
+}
+
+static
struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
{
- return list_first_or_null_rcu(head, struct rpc_xprt, xprt_switch);
+ struct rpc_xprt *pos;
+
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (xprt_is_active(pos))
+ return pos;
+ }
+ return NULL;
}
static
@@ -213,9 +228,12 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
const struct rpc_xprt *cur)
{
struct rpc_xprt *pos;
+ bool found = false;
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == pos)
+ found = true;
+ if (found && xprt_is_active(pos))
return pos;
}
return NULL;
@@ -260,9 +278,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
const struct rpc_xprt *cur)
{
struct rpc_xprt *pos, *prev = NULL;
+ bool found = false;
list_for_each_entry_rcu(pos, head, xprt_switch) {
if (cur == prev)
+ found = true;
+ if (found && xprt_is_active(pos))
return pos;
prev = pos;
}
@@ -270,22 +291,15 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
}
static
-struct rpc_xprt *xprt_switch_set_next_cursor(struct list_head *head,
+struct rpc_xprt *xprt_switch_set_next_cursor(struct rpc_xprt_switch *xps,
struct rpc_xprt **cursor,
xprt_switch_find_xprt_t find_next)
{
- struct rpc_xprt *cur, *pos, *old;
+ struct rpc_xprt *pos, *old;
- cur = READ_ONCE(*cursor);
- for (;;) {
- old = cur;
- pos = find_next(head, old);
- if (pos == NULL)
- break;
- cur = cmpxchg_relaxed(cursor, old, pos);
- if (cur == old)
- break;
- }
+ old = smp_load_acquire(cursor);
+ pos = find_next(xps, old);
+ smp_store_release(cursor, pos);
return pos;
}
@@ -297,13 +311,11 @@ struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
if (xps == NULL)
return NULL;
- return xprt_switch_set_next_cursor(&xps->xps_xprt_list,
- &xpi->xpi_cursor,
- find_next);
+ return xprt_switch_set_next_cursor(xps, &xpi->xpi_cursor, find_next);
}
static
-struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
+struct rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head,
const struct rpc_xprt *cur)
{
struct rpc_xprt *ret;
@@ -315,6 +327,31 @@ struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
}
static
+struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct rpc_xprt_switch *xps,
+ const struct rpc_xprt *cur)
+{
+ struct list_head *head = &xps->xps_xprt_list;
+ struct rpc_xprt *xprt;
+ unsigned int nactive;
+
+ for (;;) {
+ unsigned long xprt_queuelen, xps_queuelen;
+
+ xprt = __xprt_switch_find_next_entry_roundrobin(head, cur);
+ if (!xprt)
+ break;
+ xprt_queuelen = atomic_long_read(&xprt->queuelen);
+ xps_queuelen = atomic_long_read(&xps->xps_queuelen);
+ nactive = READ_ONCE(xps->xps_nactive);
+ /* Exit loop if xprt_queuelen <= average queue length */
+ if (xprt_queuelen * nactive <= xps_queuelen)
+ break;
+ cur = xprt;
+ }
+ return xprt;
+}
+
+static
struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
{
return xprt_iter_next_entry_multiple(xpi,
@@ -322,9 +359,17 @@ struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
}
static
+struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps,
+ const struct rpc_xprt *cur)
+{
+ return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur);
+}
+
+static
struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
{
- return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry);
+ return xprt_iter_next_entry_multiple(xpi,
+ xprt_switch_find_next_entry_all);
}
/*
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index ce986591f213..59e624b1d7a0 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -52,6 +52,13 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *xprt)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ return r_xprt->rx_buf.rb_bc_srv_max_requests;
+}
+
static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 794ba4ca0994..0b6dad7580a1 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -144,6 +144,26 @@ frwr_mr_recycle_worker(struct work_struct *work)
frwr_release_mr(mr);
}
+/* frwr_reset - Place MRs back on the free list
+ * @req: request to reset
+ *
+ * Used after a failed marshal. For FRWR, this means the MRs
+ * don't have to be fully released and recreated.
+ *
+ * NB: This is safe only as long as none of @req's MRs are
+ * involved with an ongoing asynchronous FAST_REG or LOCAL_INV
+ * Work Request.
+ */
+void frwr_reset(struct rpcrdma_req *req)
+{
+ while (!list_empty(&req->rl_registered)) {
+ struct rpcrdma_mr *mr;
+
+ mr = rpcrdma_mr_pop(&req->rl_registered);
+ rpcrdma_mr_unmap_and_put(mr);
+ }
+}
+
/**
* frwr_init_mr - Initialize one MR
* @ia: interface adapter
@@ -168,7 +188,6 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
goto out_list_err;
mr->frwr.fr_mr = frmr;
- mr->frwr.fr_state = FRWR_IS_INVALID;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker);
@@ -298,65 +317,6 @@ size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt)
}
/**
- * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
- *
- */
-static void
-frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr =
- container_of(cqe, struct rpcrdma_frwr, fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS)
- frwr->fr_state = FRWR_FLUSHED_FR;
- trace_xprtrdma_wc_fastreg(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
- *
- */
-static void
-frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
- fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS)
- frwr->fr_state = FRWR_FLUSHED_LI;
- trace_xprtrdma_wc_li(wc, frwr);
-}
-
-/**
- * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
- *
- * Awaken anyone waiting for an MR to finish being fenced.
- */
-static void
-frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
-{
- struct ib_cqe *cqe = wc->wr_cqe;
- struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr,
- fr_cqe);
-
- /* WARNING: Only wr_cqe and status are reliable at this point */
- if (wc->status != IB_WC_SUCCESS)
- frwr->fr_state = FRWR_FLUSHED_LI;
- trace_xprtrdma_wc_li_wake(wc, frwr);
- complete(&frwr->fr_linv_done);
-}
-
-/**
* frwr_map - Register a memory region
* @r_xprt: controlling transport
* @seg: memory region co-ordinates
@@ -378,23 +338,15 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS;
- struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
struct ib_mr *ibmr;
struct ib_reg_wr *reg_wr;
int i, n;
u8 key;
- mr = NULL;
- do {
- if (mr)
- rpcrdma_mr_recycle(mr);
- mr = rpcrdma_mr_get(r_xprt);
- if (!mr)
- return ERR_PTR(-EAGAIN);
- } while (mr->frwr.fr_state != FRWR_IS_INVALID);
- frwr = &mr->frwr;
- frwr->fr_state = FRWR_IS_VALID;
+ mr = rpcrdma_mr_get(r_xprt);
+ if (!mr)
+ goto out_getmr_err;
if (nsegs > ia->ri_max_frwr_depth)
nsegs = ia->ri_max_frwr_depth;
@@ -423,7 +375,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
if (!mr->mr_nents)
goto out_dmamap_err;
- ibmr = frwr->fr_mr;
+ ibmr = mr->frwr.fr_mr;
n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE);
if (unlikely(n != mr->mr_nents))
goto out_mapmr_err;
@@ -433,7 +385,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
key = (u8)(ibmr->rkey & 0x000000FF);
ib_update_fast_reg_key(ibmr, ++key);
- reg_wr = &frwr->fr_regwr;
+ reg_wr = &mr->frwr.fr_regwr;
reg_wr->mr = ibmr;
reg_wr->key = ibmr->rkey;
reg_wr->access = writing ?
@@ -448,6 +400,10 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
*out = mr;
return seg;
+out_getmr_err:
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
+ return ERR_PTR(-EAGAIN);
+
out_dmamap_err:
mr->mr_dir = DMA_NONE;
trace_xprtrdma_frwr_sgerr(mr, i);
@@ -461,6 +417,23 @@ out_mapmr_err:
}
/**
+ * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_fastreg(wc, frwr);
+ /* The MR will get recycled when the associated req is retransmitted */
+}
+
+/**
* frwr_send - post Send WR containing the RPC Call message
* @ia: interface adapter
* @req: Prepared RPC Call
@@ -512,31 +485,75 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
trace_xprtrdma_mr_remoteinv(mr);
- mr->frwr.fr_state = FRWR_IS_INVALID;
rpcrdma_mr_unmap_and_put(mr);
break; /* only one invalidated MR per RPC */
}
}
+static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
+{
+ if (wc->status != IB_WC_SUCCESS)
+ rpcrdma_mr_recycle(mr);
+ else
+ rpcrdma_mr_unmap_and_put(mr);
+}
+
/**
- * frwr_unmap_sync - invalidate memory regions that were registered for @req
- * @r_xprt: controlling transport
- * @mrs: list of MRs to process
+ * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li(wc, frwr);
+ __frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
*
- * Sleeps until it is safe for the host CPU to access the
- * previously mapped memory regions.
+ * Awaken anyone waiting for an MR to finish being fenced.
+ */
+static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li_wake(wc, frwr);
+ complete(&frwr->fr_linv_done);
+ __frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_unmap_sync - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
*
- * Caller ensures that @mrs is not empty before the call. This
- * function empties the list.
+ * Sleeps until it is safe for the host CPU to access the previously mapped
+ * memory regions. This guarantees that registered MRs are properly fenced
+ * from the server before the RPC consumer accesses the data in them. It
+ * also ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
*/
-void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *first, **prev, *last;
const struct ib_send_wr *bad_wr;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_frwr *frwr;
struct rpcrdma_mr *mr;
- int count, rc;
+ int rc;
/* ORDER: Invalidate all of the MRs first
*
@@ -544,33 +561,32 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
* a single ib_post_send() call.
*/
frwr = NULL;
- count = 0;
prev = &first;
- list_for_each_entry(mr, mrs, mr_list) {
- mr->frwr.fr_state = FRWR_IS_INVALID;
+ while (!list_empty(&req->rl_registered)) {
+ mr = rpcrdma_mr_pop(&req->rl_registered);
- frwr = &mr->frwr;
trace_xprtrdma_mr_localinv(mr);
+ r_xprt->rx_stats.local_inv_needed++;
+ frwr = &mr->frwr;
frwr->fr_cqe.done = frwr_wc_localinv;
last = &frwr->fr_invwr;
- memset(last, 0, sizeof(*last));
+ last->next = NULL;
last->wr_cqe = &frwr->fr_cqe;
+ last->sg_list = NULL;
+ last->num_sge = 0;
last->opcode = IB_WR_LOCAL_INV;
+ last->send_flags = IB_SEND_SIGNALED;
last->ex.invalidate_rkey = mr->mr_handle;
- count++;
*prev = last;
prev = &last->next;
}
- if (!frwr)
- goto unmap;
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
* are complete.
*/
- last->send_flags = IB_SEND_SIGNALED;
frwr->fr_cqe.done = frwr_wc_localinv_wake;
reinit_completion(&frwr->fr_linv_done);
@@ -578,37 +594,126 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
* replaces the QP. The RPC reply handler won't call us
* unless ri_id->qp is a valid pointer.
*/
- r_xprt->rx_stats.local_inv_needed++;
bad_wr = NULL;
- rc = ib_post_send(ia->ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ trace_xprtrdma_post_send(req, rc);
+
+ /* The final LOCAL_INV WR in the chain is supposed to
+ * do the wake. If it was never posted, the wake will
+ * not happen, so don't wait in that case.
+ */
if (bad_wr != first)
wait_for_completion(&frwr->fr_linv_done);
- if (rc)
- goto out_release;
+ if (!rc)
+ return;
- /* ORDER: Now DMA unmap all of the MRs, and return
- * them to the free MR list.
+ /* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
-unmap:
- while (!list_empty(mrs)) {
- mr = rpcrdma_mr_pop(mrs);
- rpcrdma_mr_unmap_and_put(mr);
+ while (bad_wr) {
+ frwr = container_of(bad_wr, struct rpcrdma_frwr,
+ fr_invwr);
+ mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ bad_wr = bad_wr->next;
+
+ list_del_init(&mr->mr_list);
+ rpcrdma_mr_recycle(mr);
}
- return;
+}
-out_release:
- pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc);
+/**
+ * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_frwr *frwr =
+ container_of(cqe, struct rpcrdma_frwr, fr_cqe);
+ struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr);
- /* Unmap and release the MRs in the LOCAL_INV WRs that did not
- * get posted.
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ trace_xprtrdma_wc_li_done(wc, frwr);
+ rpcrdma_complete_rqst(frwr->fr_req->rl_reply);
+ __frwr_release_mr(wc, mr);
+}
+
+/**
+ * frwr_unmap_async - invalidate memory regions that were registered for @req
+ * @r_xprt: controlling transport instance
+ * @req: rpcrdma_req with a non-empty list of MRs to process
+ *
+ * This guarantees that registered MRs are properly fenced from the
+ * server before the RPC consumer accesses the data in them. It also
+ * ensures proper Send flow control: waking the next RPC waits until
+ * this RPC has relinquished all its Send Queue entries.
+ */
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *first, *last, **prev;
+ const struct ib_send_wr *bad_wr;
+ struct rpcrdma_frwr *frwr;
+ struct rpcrdma_mr *mr;
+ int rc;
+
+ /* Chain the LOCAL_INV Work Requests and post them with
+ * a single ib_post_send() call.
+ */
+ frwr = NULL;
+ prev = &first;
+ while (!list_empty(&req->rl_registered)) {
+ mr = rpcrdma_mr_pop(&req->rl_registered);
+
+ trace_xprtrdma_mr_localinv(mr);
+ r_xprt->rx_stats.local_inv_needed++;
+
+ frwr = &mr->frwr;
+ frwr->fr_cqe.done = frwr_wc_localinv;
+ frwr->fr_req = req;
+ last = &frwr->fr_invwr;
+ last->next = NULL;
+ last->wr_cqe = &frwr->fr_cqe;
+ last->sg_list = NULL;
+ last->num_sge = 0;
+ last->opcode = IB_WR_LOCAL_INV;
+ last->send_flags = IB_SEND_SIGNALED;
+ last->ex.invalidate_rkey = mr->mr_handle;
+
+ *prev = last;
+ prev = &last->next;
+ }
+
+ /* Strong send queue ordering guarantees that when the
+ * last WR in the chain completes, all WRs in the chain
+ * are complete. The last completion will wake up the
+ * RPC waiter.
+ */
+ frwr->fr_cqe.done = frwr_wc_localinv_done;
+
+ /* Transport disconnect drains the receive CQ before it
+ * replaces the QP. The RPC reply handler won't call us
+ * unless ri_id->qp is a valid pointer.
+ */
+ bad_wr = NULL;
+ rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ trace_xprtrdma_post_send(req, rc);
+ if (!rc)
+ return;
+
+ /* Recycle MRs in the LOCAL_INV chain that did not get posted.
*/
while (bad_wr) {
- frwr = container_of(bad_wr, struct rpcrdma_frwr,
- fr_invwr);
+ frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr);
mr = container_of(frwr, struct rpcrdma_mr, frwr);
bad_wr = bad_wr->next;
- list_del_init(&mr->mr_list);
rpcrdma_mr_recycle(mr);
}
+
+ /* The final LOCAL_INV WR in the chain is supposed to
+ * do the wake. If it was never posted, the wake will
+ * not happen, so wake here in that case.
+ */
+ rpcrdma_complete_rqst(req->rl_reply);
}
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 85115a2e2639..4345e6912392 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -366,6 +366,9 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
unsigned int pos;
int nsegs;
+ if (rtype == rpcrdma_noch)
+ goto done;
+
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
pos = 0;
@@ -389,7 +392,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
nsegs -= mr->mr_nents;
} while (nsegs);
- return 0;
+done:
+ return encode_item_not_present(xdr);
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -417,6 +421,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
int nsegs, nchunks;
__be32 *segcount;
+ if (wtype != rpcrdma_writech)
+ goto done;
+
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
@@ -451,7 +458,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
- return 0;
+done:
+ return encode_item_not_present(xdr);
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -476,6 +484,9 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
int nsegs, nchunks;
__be32 *segcount;
+ if (wtype != rpcrdma_replych)
+ return encode_item_not_present(xdr);
+
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
@@ -511,6 +522,16 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
return 0;
}
+static void rpcrdma_sendctx_done(struct kref *kref)
+{
+ struct rpcrdma_req *req =
+ container_of(kref, struct rpcrdma_req, rl_kref);
+ struct rpcrdma_rep *rep = req->rl_reply;
+
+ rpcrdma_complete_rqst(rep);
+ rep->rr_rxprt->rx_stats.reply_waits_for_send++;
+}
+
/**
* rpcrdma_sendctx_unmap - DMA-unmap Send buffer
* @sc: sendctx containing SGEs to unmap
@@ -520,6 +541,9 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
struct ib_sge *sge;
+ if (!sc->sc_unmap_count)
+ return;
+
/* The first two SGEs contain the transport header and
* the inline buffer. These are always left mapped so
* they can be cheaply re-used.
@@ -529,9 +553,7 @@ void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
ib_dma_unmap_page(sc->sc_device, sge->addr, sge->length,
DMA_TO_DEVICE);
- if (test_and_clear_bit(RPCRDMA_REQ_F_TX_RESOURCES,
- &sc->sc_req->rl_flags))
- wake_up_bit(&sc->sc_req->rl_flags, RPCRDMA_REQ_F_TX_RESOURCES);
+ kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
}
/* Prepare an SGE for the RPC-over-RDMA transport header.
@@ -666,7 +688,7 @@ map_tail:
out:
sc->sc_wr.num_sge += sge_no;
if (sc->sc_unmap_count)
- __set_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
+ kref_get(&req->rl_kref);
return true;
out_regbuf:
@@ -699,22 +721,28 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_req *req, u32 hdrlen,
struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
{
+ int ret;
+
+ ret = -EAGAIN;
req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
if (!req->rl_sendctx)
- return -EAGAIN;
+ goto err;
req->rl_sendctx->sc_wr.num_sge = 0;
req->rl_sendctx->sc_unmap_count = 0;
req->rl_sendctx->sc_req = req;
- __clear_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags);
+ kref_init(&req->rl_kref);
+ ret = -EIO;
if (!rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen))
- return -EIO;
-
+ goto err;
if (rtype != rpcrdma_areadch)
if (!rpcrdma_prepare_msg_sges(r_xprt, req, xdr, rtype))
- return -EIO;
-
+ goto err;
return 0;
+
+err:
+ trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
+ return ret;
}
/**
@@ -842,50 +870,28 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- if (rtype != rpcrdma_noch) {
- ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
- if (ret)
- goto out_err;
- }
- ret = encode_item_not_present(xdr);
+ ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
if (ret)
goto out_err;
-
- if (wtype == rpcrdma_writech) {
- ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
- if (ret)
- goto out_err;
- }
- ret = encode_item_not_present(xdr);
+ ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
if (ret)
goto out_err;
-
- if (wtype != rpcrdma_replych)
- ret = encode_item_not_present(xdr);
- else
- ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
+ ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
if (ret)
goto out_err;
- trace_xprtrdma_marshal(rqst, xdr_stream_pos(xdr), rtype, wtype);
-
- ret = rpcrdma_prepare_send_sges(r_xprt, req, xdr_stream_pos(xdr),
+ ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
&rqst->rq_snd_buf, rtype);
if (ret)
goto out_err;
+
+ trace_xprtrdma_marshal(req, rtype, wtype);
return 0;
out_err:
trace_xprtrdma_marshal_failed(rqst, ret);
- switch (ret) {
- case -EAGAIN:
- xprt_wait_for_buffer_space(rqst->rq_xprt);
- break;
- case -ENOBUFS:
- break;
- default:
- r_xprt->rx_stats.failed_marshal_count++;
- }
+ r_xprt->rx_stats.failed_marshal_count++;
+ frwr_reset(req);
return ret;
}
@@ -1269,51 +1275,17 @@ out_badheader:
goto out;
}
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
-{
- /* Invalidate and unmap the data payloads before waking
- * the waiting application. This guarantees the memory
- * regions are properly fenced from the server before the
- * application accesses the data. It also ensures proper
- * send flow control: waking the next RPC waits until this
- * RPC has relinquished all its Send Queue entries.
- */
- if (!list_empty(&req->rl_registered))
- frwr_unmap_sync(r_xprt, &req->rl_registered);
-
- /* Ensure that any DMA mapped pages associated with
- * the Send of the RPC Call have been unmapped before
- * allowing the RPC to complete. This protects argument
- * memory not controlled by the RPC client from being
- * re-used before we're done with it.
- */
- if (test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
- r_xprt->rx_stats.reply_waits_for_send++;
- out_of_line_wait_on_bit(&req->rl_flags,
- RPCRDMA_REQ_F_TX_RESOURCES,
- bit_wait,
- TASK_UNINTERRUPTIBLE);
- }
-}
-
-/* Reply handling runs in the poll worker thread. Anything that
- * might wait is deferred to a separate workqueue.
- */
-void rpcrdma_deferred_completion(struct work_struct *work)
+static void rpcrdma_reply_done(struct kref *kref)
{
- struct rpcrdma_rep *rep =
- container_of(work, struct rpcrdma_rep, rr_work);
- struct rpcrdma_req *req = rpcr_to_rdmar(rep->rr_rqst);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_req *req =
+ container_of(kref, struct rpcrdma_req, rl_kref);
- trace_xprtrdma_defer_cmp(rep);
- if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
- frwr_reminv(rep, &req->rl_registered);
- rpcrdma_release_rqst(r_xprt, req);
- rpcrdma_complete_rqst(rep);
+ rpcrdma_complete_rqst(req->rl_reply);
}
-/* Process received RPC/RDMA messages.
+/**
+ * rpcrdma_reply_handler - Process received RPC/RDMA messages
+ * @rep: Incoming rpcrdma_rep object to process
*
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
@@ -1360,10 +1332,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
else if (credits > buf->rb_max_requests)
credits = buf->rb_max_requests;
if (buf->rb_credits != credits) {
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
buf->rb_credits = credits;
xprt->cwnd = credits << RPC_CWNDSHIFT;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
req = rpcr_to_rdmar(rqst);
@@ -1373,10 +1345,16 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
}
req->rl_reply = rep;
rep->rr_rqst = rqst;
- clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
trace_xprtrdma_reply(rqst->rq_task, rep, req, credits);
- queue_work(buf->rb_completion_wq, &rep->rr_work);
+
+ if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
+ frwr_reminv(rep, &req->rl_registered);
+ if (!list_empty(&req->rl_registered))
+ frwr_unmap_async(r_xprt, req);
+ /* LocalInv completion will complete the RPC */
+ else
+ kref_put(&req->rl_kref, rpcrdma_reply_done);
return;
out_badversion:
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index bed57d8b5c19..d1fcc41d5eb5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -72,9 +72,9 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt->cwnd = credits << RPC_CWNDSHIFT;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
spin_lock(&xprt->queue_lock);
ret = 0;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 0004535c0188..3fe665152d95 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -226,9 +226,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
* Enqueue the new transport on the accept queue of the listening
* transport
*/
- spin_lock_bh(&listen_xprt->sc_lock);
+ spin_lock(&listen_xprt->sc_lock);
list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q);
- spin_unlock_bh(&listen_xprt->sc_lock);
+ spin_unlock(&listen_xprt->sc_lock);
set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags);
svc_xprt_enqueue(&listen_xprt->sc_xprt);
@@ -401,7 +401,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
/* Get the next entry off the accept list */
- spin_lock_bh(&listen_rdma->sc_lock);
+ spin_lock(&listen_rdma->sc_lock);
if (!list_empty(&listen_rdma->sc_accept_q)) {
newxprt = list_entry(listen_rdma->sc_accept_q.next,
struct svcxprt_rdma, sc_accept_q);
@@ -409,7 +409,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
}
if (!list_empty(&listen_rdma->sc_accept_q))
set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags);
- spin_unlock_bh(&listen_rdma->sc_lock);
+ spin_unlock(&listen_rdma->sc_lock);
if (!newxprt)
return NULL;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ffb1684c4573..2ec349ed4770 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -297,6 +297,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
+/* 60 second timeout, no retries */
static const struct rpc_timeout xprt_rdma_default_timeout = {
.to_initval = 60 * HZ,
.to_maxval = 60 * HZ,
@@ -322,8 +323,9 @@ xprt_setup_rdma(struct xprt_create *args)
if (!xprt)
return ERR_PTR(-ENOMEM);
- /* 60 second timeout, no retries */
xprt->timeout = &xprt_rdma_default_timeout;
+ xprt->connect_timeout = xprt->timeout->to_initval;
+ xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
xprt->bind_timeout = RPCRDMA_BIND_TO;
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
@@ -486,31 +488,64 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
}
/**
- * xprt_rdma_connect - try to establish a transport connection
+ * xprt_rdma_set_connect_timeout - set timeouts for establishing a connection
+ * @xprt: controlling transport instance
+ * @connect_timeout: reconnect timeout after client disconnects
+ * @reconnect_timeout: reconnect timeout after server disconnects
+ *
+ */
+static void xprt_rdma_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+
+ trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
+
+ spin_lock(&xprt->transport_lock);
+
+ if (connect_timeout < xprt->connect_timeout) {
+ struct rpc_timeout to;
+ unsigned long initval;
+
+ to = *xprt->timeout;
+ initval = connect_timeout;
+ if (initval < RPCRDMA_INIT_REEST_TO << 1)
+ initval = RPCRDMA_INIT_REEST_TO << 1;
+ to.to_initval = initval;
+ to.to_maxval = initval;
+ r_xprt->rx_timeout = to;
+ xprt->timeout = &r_xprt->rx_timeout;
+ xprt->connect_timeout = connect_timeout;
+ }
+
+ if (reconnect_timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = reconnect_timeout;
+
+ spin_unlock(&xprt->transport_lock);
+}
+
+/**
+ * xprt_rdma_connect - schedule an attempt to reconnect
* @xprt: transport state
- * @task: RPC scheduler context
+ * @task: RPC scheduler context (unused)
*
*/
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ unsigned long delay;
trace_xprtrdma_op_connect(r_xprt);
+
+ delay = 0;
if (r_xprt->rx_ep.rep_connected != 0) {
- /* Reconnect */
- schedule_delayed_work(&r_xprt->rx_connect_worker,
- xprt->reestablish_timeout);
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
- xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
- else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
- xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- } else {
- schedule_delayed_work(&r_xprt->rx_connect_worker, 0);
- if (!RPC_IS_ASYNC(task))
- flush_delayed_work(&r_xprt->rx_connect_worker);
+ delay = xprt_reconnect_delay(xprt);
+ xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
+ queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker,
+ delay);
}
/**
@@ -549,8 +584,11 @@ out_sleep:
static void
xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
{
+ struct rpcrdma_xprt *r_xprt =
+ container_of(xprt, struct rpcrdma_xprt, rx_xprt);
+
memset(rqst, 0, sizeof(*rqst));
- rpcrdma_buffer_put(rpcr_to_rdmar(rqst));
+ rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
rpc_wake_up_next(&xprt->backlog);
}
@@ -617,9 +655,16 @@ xprt_rdma_free(struct rpc_task *task)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags))
- rpcrdma_release_rqst(r_xprt, req);
trace_xprtrdma_op_free(task, req);
+
+ if (!list_empty(&req->rl_registered))
+ frwr_unmap_sync(r_xprt, req);
+
+ /* XXX: If the RPC is completing because of a signal and
+ * not because a reply was received, we ought to ensure
+ * that the Send completion has fired, so that memory
+ * involved with the Send is not still visible to the NIC.
+ */
}
/**
@@ -666,7 +711,6 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags);
if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
goto drop_connection;
@@ -759,6 +803,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
+ .set_connect_timeout = xprt_rdma_tcp_set_connect_timeout,
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
@@ -766,6 +811,7 @@ static const struct rpc_xprt_ops xprt_rdma_procs = {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_maxpayload = xprt_rdma_bc_maxpayload,
+ .bc_num_slots = xprt_rdma_bc_max_slots,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 84bb37924540..805b1f35e1ca 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -89,14 +89,12 @@ static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
ib_drain_rq(ia->ri_id->qp);
- drain_workqueue(buf->rb_completion_wq);
/* Deferred Reply processing might have scheduled
* local invalidations.
@@ -901,7 +899,7 @@ out_emptyq:
* completions recently. This is a sign the Send Queue is
* backing up. Cause the caller to pause and try again.
*/
- set_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags);
+ xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
r_xprt->rx_stats.empty_sendctx_q++;
return NULL;
}
@@ -936,10 +934,7 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc)
/* Paired with READ_ONCE */
smp_store_release(&buf->rb_sc_tail, next_tail);
- if (test_and_clear_bit(RPCRDMA_BUF_F_EMPTY_SCQ, &buf->rb_flags)) {
- smp_mb__after_atomic();
- xprt_write_space(&sc->sc_xprt->rx_xprt);
- }
+ xprt_write_space(&sc->sc_xprt->rx_xprt);
}
static void
@@ -977,8 +972,6 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
r_xprt->rx_stats.mrs_allocated += count;
spin_unlock(&buf->rb_mrlock);
trace_xprtrdma_createmrs(r_xprt, count);
-
- xprt_write_space(&r_xprt->rx_xprt);
}
static void
@@ -990,6 +983,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
rx_buf);
rpcrdma_mrs_create(r_xprt);
+ xprt_write_space(&r_xprt->rx_xprt);
}
/**
@@ -1025,7 +1019,6 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size,
if (!req->rl_recvbuf)
goto out4;
- req->rl_buffer = buffer;
INIT_LIST_HEAD(&req->rl_registered);
spin_lock(&buffer->rb_lock);
list_add(&req->rl_all, &buffer->rb_allreqs);
@@ -1042,9 +1035,9 @@ out1:
return NULL;
}
-static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp)
+static struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
+ bool temp)
{
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_rep *rep;
rep = kzalloc(sizeof(*rep), GFP_KERNEL);
@@ -1055,27 +1048,22 @@ static bool rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp)
DMA_FROM_DEVICE, GFP_KERNEL);
if (!rep->rr_rdmabuf)
goto out_free;
+
xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf));
-
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
- INIT_WORK(&rep->rr_work, rpcrdma_deferred_completion);
rep->rr_recv_wr.next = NULL;
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
rep->rr_temp = temp;
-
- spin_lock(&buf->rb_lock);
- list_add(&rep->rr_list, &buf->rb_recv_bufs);
- spin_unlock(&buf->rb_lock);
- return true;
+ return rep;
out_free:
kfree(rep);
out:
- return false;
+ return NULL;
}
/**
@@ -1089,7 +1077,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
int i, rc;
- buf->rb_flags = 0;
buf->rb_max_requests = r_xprt->rx_ep.rep_max_requests;
buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_mrlock);
@@ -1122,15 +1109,6 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
if (rc)
goto out;
- buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s",
- WQ_MEM_RECLAIM | WQ_HIGHPRI,
- 0,
- r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]);
- if (!buf->rb_completion_wq) {
- rc = -ENOMEM;
- goto out;
- }
-
return 0;
out:
rpcrdma_buffer_destroy(buf);
@@ -1204,11 +1182,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
cancel_delayed_work_sync(&buf->rb_refresh_worker);
- if (buf->rb_completion_wq) {
- destroy_workqueue(buf->rb_completion_wq);
- buf->rb_completion_wq = NULL;
- }
-
rpcrdma_sendctxs_destroy(buf);
while (!list_empty(&buf->rb_recv_bufs)) {
@@ -1325,13 +1298,12 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
/**
* rpcrdma_buffer_put - Put request/reply buffers back into pool
+ * @buffers: buffer pool
* @req: object to return
*
*/
-void
-rpcrdma_buffer_put(struct rpcrdma_req *req)
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
{
- struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_rep *rep = req->rl_reply;
req->rl_reply = NULL;
@@ -1484,8 +1456,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
int rc;
- if (!ep->rep_send_count ||
- test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
+ if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
ep->rep_send_count = ep->rep_send_batch;
} else {
@@ -1505,11 +1476,13 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct ib_recv_wr *wr, *bad_wr;
+ struct ib_recv_wr *i, *wr, *bad_wr;
+ struct rpcrdma_rep *rep;
int needed, count, rc;
rc = 0;
count = 0;
+
needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
if (ep->rep_receive_count > needed)
goto out;
@@ -1517,51 +1490,65 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
- count = 0;
+ /* fast path: all needed reps can be found on the free list */
wr = NULL;
+ spin_lock(&buf->rb_lock);
while (needed) {
- struct rpcrdma_regbuf *rb;
- struct rpcrdma_rep *rep;
-
- spin_lock(&buf->rb_lock);
rep = list_first_entry_or_null(&buf->rb_recv_bufs,
struct rpcrdma_rep, rr_list);
- if (likely(rep))
- list_del(&rep->rr_list);
- spin_unlock(&buf->rb_lock);
- if (!rep) {
- if (!rpcrdma_rep_create(r_xprt, temp))
- break;
- continue;
- }
+ if (!rep)
+ break;
- rb = rep->rr_rdmabuf;
- if (!rpcrdma_regbuf_dma_map(r_xprt, rb)) {
- rpcrdma_recv_buffer_put(rep);
+ list_del(&rep->rr_list);
+ rep->rr_recv_wr.next = wr;
+ wr = &rep->rr_recv_wr;
+ --needed;
+ }
+ spin_unlock(&buf->rb_lock);
+
+ while (needed) {
+ rep = rpcrdma_rep_create(r_xprt, temp);
+ if (!rep)
break;
- }
- trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
rep->rr_recv_wr.next = wr;
wr = &rep->rr_recv_wr;
- ++count;
--needed;
}
- if (!count)
+ if (!wr)
goto out;
+ for (i = wr; i; i = i->next) {
+ rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
+
+ if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf))
+ goto release_wrs;
+
+ trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
+ ++count;
+ }
+
rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
+out:
+ trace_xprtrdma_post_recvs(r_xprt, count, rc);
if (rc) {
- for (wr = bad_wr; wr; wr = wr->next) {
+ for (wr = bad_wr; wr;) {
struct rpcrdma_rep *rep;
rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
+ wr = wr->next;
rpcrdma_recv_buffer_put(rep);
--count;
}
}
ep->rep_receive_count += count;
-out:
- trace_xprtrdma_post_recvs(r_xprt, count, rc);
+ return;
+
+release_wrs:
+ for (i = wr; i;) {
+ rep = container_of(i, struct rpcrdma_rep, rr_recv_wr);
+ i = i->next;
+ rpcrdma_recv_buffer_put(rep);
+ }
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index d1e0749bcbc4..92ce09fcea74 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -44,7 +44,8 @@
#include <linux/wait.h> /* wait_queue_head_t, etc */
#include <linux/spinlock.h> /* spinlock_t, etc */
-#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/atomic.h> /* atomic_t, etc */
+#include <linux/kref.h> /* struct kref */
#include <linux/workqueue.h> /* struct work_struct */
#include <rdma/rdma_cm.h> /* RDMA connection api */
@@ -202,10 +203,9 @@ struct rpcrdma_rep {
bool rr_temp;
struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
- struct work_struct rr_work;
+ struct rpc_rqst *rr_rqst;
struct xdr_buf rr_hdrbuf;
struct xdr_stream rr_stream;
- struct rpc_rqst *rr_rqst;
struct list_head rr_list;
struct ib_recv_wr rr_recv_wr;
};
@@ -240,18 +240,12 @@ struct rpcrdma_sendctx {
* An external memory region is any buffer or page that is registered
* on the fly (ie, not pre-registered).
*/
-enum rpcrdma_frwr_state {
- FRWR_IS_INVALID, /* ready to be used */
- FRWR_IS_VALID, /* in use */
- FRWR_FLUSHED_FR, /* flushed FASTREG WR */
- FRWR_FLUSHED_LI, /* flushed LOCALINV WR */
-};
-
+struct rpcrdma_req;
struct rpcrdma_frwr {
struct ib_mr *fr_mr;
struct ib_cqe fr_cqe;
- enum rpcrdma_frwr_state fr_state;
struct completion fr_linv_done;
+ struct rpcrdma_req *fr_req;
union {
struct ib_reg_wr fr_regwr;
struct ib_send_wr fr_invwr;
@@ -326,7 +320,6 @@ struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
struct rpc_rqst rl_slot;
- struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
struct xdr_stream rl_stream;
struct xdr_buf rl_hdrbuf;
@@ -336,18 +329,12 @@ struct rpcrdma_req {
struct rpcrdma_regbuf *rl_recvbuf; /* rq_rcv_buf */
struct list_head rl_all;
- unsigned long rl_flags;
+ struct kref rl_kref;
struct list_head rl_registered; /* registered segments */
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
-/* rl_flags */
-enum {
- RPCRDMA_REQ_F_PENDING = 0,
- RPCRDMA_REQ_F_TX_RESOURCES,
-};
-
static inline struct rpcrdma_req *
rpcr_to_rdmar(const struct rpc_rqst *rqst)
{
@@ -391,22 +378,15 @@ struct rpcrdma_buffer {
struct list_head rb_recv_bufs;
struct list_head rb_allreqs;
- unsigned long rb_flags;
u32 rb_max_requests;
u32 rb_credits; /* most recent credit grant */
u32 rb_bc_srv_max_requests;
u32 rb_bc_max_requests;
- struct workqueue_struct *rb_completion_wq;
struct delayed_work rb_refresh_worker;
};
-/* rb_flags */
-enum {
- RPCRDMA_BUF_F_EMPTY_SCQ = 0,
-};
-
/*
* Statistics for RPCRDMA
*/
@@ -452,6 +432,7 @@ struct rpcrdma_xprt {
struct rpcrdma_ep rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
+ struct rpc_timeout rx_timeout;
struct rpcrdma_stats rx_stats;
};
@@ -518,7 +499,8 @@ rpcrdma_mr_recycle(struct rpcrdma_mr *mr)
}
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
-void rpcrdma_buffer_put(struct rpcrdma_req *);
+void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers,
+ struct rpcrdma_req *req);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size,
@@ -564,6 +546,7 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
bool frwr_is_supported(struct ib_device *device);
+void frwr_reset(struct rpcrdma_req *req);
int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep);
int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
@@ -574,8 +557,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr **mr);
int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
-void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt,
- struct list_head *mrs);
+void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
+void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
/*
* RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
@@ -598,9 +581,6 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
-void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req);
-void rpcrdma_deferred_completion(struct work_struct *work);
static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
{
@@ -625,6 +605,7 @@ void xprt_rdma_cleanup(void);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *);
+unsigned int xprt_rdma_bc_max_slots(struct rpc_xprt *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 36652352a38c..e2176c167a57 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -880,7 +880,7 @@ static int xs_nospace(struct rpc_rqst *req)
req->rq_slen);
/* Protect against races with write_space */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
@@ -890,7 +890,7 @@ static int xs_nospace(struct rpc_rqst *req)
} else
ret = -ENOTCONN;
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
/* Race breaker in case memory is freed before above code is called */
if (ret == -EAGAIN) {
@@ -909,6 +909,7 @@ static int xs_nospace(struct rpc_rqst *req)
static void
xs_stream_prepare_request(struct rpc_rqst *req)
{
+ xdr_free_bvec(&req->rq_rcv_buf);
req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_KERNEL);
}
@@ -1211,6 +1212,15 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt)
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state);
+ clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state);
+}
+
+static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr)
+{
+ set_bit(nr, &transport->sock_state);
+ queue_work(xprtiod_workqueue, &transport->error_worker);
}
static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
@@ -1231,6 +1241,7 @@ static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt)
*/
static void xs_error_report(struct sock *sk)
{
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
int err;
@@ -1238,13 +1249,14 @@ static void xs_error_report(struct sock *sk)
if (!(xprt = xprt_from_sock(sk)))
goto out;
+ transport = container_of(xprt, struct sock_xprt, xprt);
err = -sk->sk_err;
if (err == 0)
goto out;
dprintk("RPC: xs_error_report client %p, error=%d...\n",
xprt, -err);
trace_rpc_socket_error(xprt, sk->sk_socket, err);
- xprt_wake_pending_tasks(xprt, err);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_ERROR);
out:
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1333,6 +1345,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&transport->connect_worker);
xs_close(xprt);
cancel_work_sync(&transport->recv_worker);
+ cancel_work_sync(&transport->error_worker);
xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
@@ -1386,9 +1399,9 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
}
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, copied);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
spin_lock(&xprt->queue_lock);
xprt_complete_rqst(task, copied);
__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
@@ -1498,7 +1511,6 @@ static void xs_tcp_state_change(struct sock *sk)
trace_rpc_socket_state_change(xprt, sk->sk_socket);
switch (sk->sk_state) {
case TCP_ESTABLISHED:
- spin_lock(&xprt->transport_lock);
if (!xprt_test_and_set_connected(xprt)) {
xprt->connect_cookie++;
clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);
@@ -1507,9 +1519,8 @@ static void xs_tcp_state_change(struct sock *sk)
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
- xprt_wake_pending_tasks(xprt, -EAGAIN);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_PENDING);
}
- spin_unlock(&xprt->transport_lock);
break;
case TCP_FIN_WAIT1:
/* The client initiated a shutdown of the socket */
@@ -1525,7 +1536,7 @@ static void xs_tcp_state_change(struct sock *sk)
/* The server initiated a shutdown of the socket */
xprt->connect_cookie++;
clear_bit(XPRT_CONNECTED, &xprt->state);
- xs_tcp_force_close(xprt);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
/* fall through */
case TCP_CLOSING:
/*
@@ -1547,7 +1558,7 @@ static void xs_tcp_state_change(struct sock *sk)
xprt_clear_connecting(xprt);
clear_bit(XPRT_CLOSING, &xprt->state);
/* Trigger the socket release */
- xs_tcp_force_close(xprt);
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_DISCONNECT);
}
out:
read_unlock_bh(&sk->sk_callback_lock);
@@ -1556,6 +1567,7 @@ static void xs_tcp_state_change(struct sock *sk)
static void xs_write_space(struct sock *sk)
{
struct socket_wq *wq;
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
if (!sk->sk_socket)
@@ -1564,13 +1576,14 @@ static void xs_write_space(struct sock *sk)
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
+ transport = container_of(xprt, struct sock_xprt, xprt);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
goto out;
- if (xprt_write_space(xprt))
- sk->sk_write_pending--;
+ xs_run_error_worker(transport, XPRT_SOCK_WAKE_WRITE);
+ sk->sk_write_pending--;
out:
rcu_read_unlock();
}
@@ -1664,9 +1677,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
*/
static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
static int xs_get_random_port(void)
@@ -2201,13 +2214,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
unsigned int opt_on = 1;
unsigned int timeo;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
keepcnt = xprt->timeout->to_retries + 1;
timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
(xprt->timeout->to_retries + 1);
clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2232,7 +2245,7 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
struct rpc_timeout to;
unsigned long initval;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->transport_lock);
if (reconnect_timeout < xprt->max_reconnect_timeout)
xprt->max_reconnect_timeout = reconnect_timeout;
if (connect_timeout < xprt->connect_timeout) {
@@ -2249,7 +2262,7 @@ static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
xprt->connect_timeout = connect_timeout;
}
set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->transport_lock);
}
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2402,25 +2415,6 @@ out:
xprt_wake_pending_tasks(xprt, status);
}
-static unsigned long xs_reconnect_delay(const struct rpc_xprt *xprt)
-{
- unsigned long start, now = jiffies;
-
- start = xprt->stat.connect_start + xprt->reestablish_timeout;
- if (time_after(start, now))
- return start - now;
- return 0;
-}
-
-static void xs_reconnect_backoff(struct rpc_xprt *xprt)
-{
- xprt->reestablish_timeout <<= 1;
- if (xprt->reestablish_timeout > xprt->max_reconnect_timeout)
- xprt->reestablish_timeout = xprt->max_reconnect_timeout;
- if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO)
- xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
-}
-
/**
* xs_connect - connect a socket to a remote endpoint
* @xprt: pointer to transport structure
@@ -2450,8 +2444,8 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
/* Start by resetting any existing state */
xs_reset_transport(transport);
- delay = xs_reconnect_delay(xprt);
- xs_reconnect_backoff(xprt);
+ delay = xprt_reconnect_delay(xprt);
+ xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO);
} else
dprintk("RPC: xs_connect scheduled xprt %p\n", xprt);
@@ -2461,6 +2455,56 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task)
delay);
}
+static void xs_wake_disconnect(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state))
+ xs_tcp_force_close(&transport->xprt);
+}
+
+static void xs_wake_write(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state))
+ xprt_write_space(&transport->xprt);
+}
+
+static void xs_wake_error(struct sock_xprt *transport)
+{
+ int sockerr;
+ int sockerr_len = sizeof(sockerr);
+
+ if (!test_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+ return;
+ mutex_lock(&transport->recv_mutex);
+ if (transport->sock == NULL)
+ goto out;
+ if (!test_and_clear_bit(XPRT_SOCK_WAKE_ERROR, &transport->sock_state))
+ goto out;
+ if (kernel_getsockopt(transport->sock, SOL_SOCKET, SO_ERROR,
+ (char *)&sockerr, &sockerr_len) != 0)
+ goto out;
+ if (sockerr < 0)
+ xprt_wake_pending_tasks(&transport->xprt, sockerr);
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_wake_pending(struct sock_xprt *transport)
+{
+ if (test_and_clear_bit(XPRT_SOCK_WAKE_PENDING, &transport->sock_state))
+ xprt_wake_pending_tasks(&transport->xprt, -EAGAIN);
+}
+
+static void xs_error_handle(struct work_struct *work)
+{
+ struct sock_xprt *transport = container_of(work,
+ struct sock_xprt, error_worker);
+
+ xs_wake_disconnect(transport);
+ xs_wake_write(transport);
+ xs_wake_error(transport);
+ xs_wake_pending(transport);
+}
+
/**
* xs_local_print_stats - display AF_LOCAL socket-specifc stats
* @xprt: rpc_xprt struct containing statistics
@@ -2745,6 +2789,7 @@ static const struct rpc_xprt_ops xs_tcp_ops = {
#ifdef CONFIG_SUNRPC_BACKCHANNEL
.bc_setup = xprt_setup_bc,
.bc_maxpayload = xs_tcp_bc_maxpayload,
+ .bc_num_slots = xprt_bc_max_slots,
.bc_free_rqst = xprt_free_bc_rqst,
.bc_destroy = xprt_destroy_bc,
#endif
@@ -2873,6 +2918,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
xprt->timeout = &xs_local_default_timeout;
INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket);
switch (sun->sun_family) {
@@ -2943,6 +2989,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
xprt->timeout = &xs_udp_default_timeout;
INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
switch (addr->sa_family) {
@@ -3024,6 +3071,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
(xprt->timeout->to_retries + 1);
INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn);
+ INIT_WORK(&transport->error_worker, xs_error_handle);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
switch (addr->sa_family) {
OpenPOWER on IntegriCloud