From a246b0105bbd9a70a698f69baae2042996f2a0e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:23 -0400 Subject: [PATCH] RPC: introduce client-side transport switch Move the bulk of client-side socket-specific code into a separate source file, net/sunrpc/xprtsock.c. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Destructive testing (unplugging the network temporarily, server reboots). Connectathon with v2, v3, and v4. Version: Thu, 11 Aug 2005 16:03:38 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index e618c1649814..d82b47ab73cb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -59,7 +59,13 @@ extern unsigned int xprt_tcp_slot_table_entries; */ #define RPC_REESTABLISH_TIMEOUT (15*HZ) -/* RPC call and reply header size as number of 32bit words (verifier +/* + * RPC transport idle timeout. 
+ */ +#define RPC_IDLE_DISCONNECT_TIMEOUT (5*60*HZ) + +/* + * RPC call and reply header size as number of 32bit words (verifier * size computed separately) */ #define RPC_CALLHDRSIZE 6 @@ -121,12 +127,19 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len -#define XPRT_LAST_FRAG (1 << 0) -#define XPRT_COPY_RECM (1 << 1) -#define XPRT_COPY_XID (1 << 2) -#define XPRT_COPY_DATA (1 << 3) +struct rpc_task; +struct rpc_xprt; + +struct rpc_xprt_ops { + void (*set_buffer_size)(struct rpc_xprt *xprt); + void (*connect)(struct rpc_task *task); + int (*send_request)(struct rpc_task *task); + void (*close)(struct rpc_xprt *xprt); + void (*destroy)(struct rpc_xprt *xprt); +}; struct rpc_xprt { + struct rpc_xprt_ops * ops; /* transport methods */ struct socket * sock; /* BSD socket layer */ struct sock * inet; /* INET layer */ @@ -199,14 +212,22 @@ struct rpc_xprt { wait_queue_head_t cong_wait; }; +#define XPRT_LAST_FRAG (1 << 0) +#define XPRT_COPY_RECM (1 << 1) +#define XPRT_COPY_XID (1 << 2) +#define XPRT_COPY_DATA (1 << 3) + #ifdef __KERNEL__ struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *toparms); +void xprt_disconnect(struct rpc_xprt *); int xprt_destroy(struct rpc_xprt *); void xprt_set_timeout(struct rpc_timeout *, unsigned int, unsigned long); - +struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *, u32); +void xprt_complete_rqst(struct rpc_xprt *, + struct rpc_rqst *, int); void xprt_reserve(struct rpc_task *); int xprt_prepare_transmit(struct rpc_task *); void xprt_transmit(struct rpc_task *); @@ -214,7 +235,10 @@ void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *); void xprt_connect(struct rpc_task *); -void xprt_sock_setbufsize(struct rpc_xprt *); +int xs_setup_udp(struct rpc_xprt *, + struct rpc_timeout *); +int xs_setup_tcp(struct rpc_xprt *, + struct rpc_timeout *); #define XPRT_LOCKED 0 #define XPRT_CONNECT 1 -- cgit 
v1.2.3 From 4a0f8c04f2ece949d54a0c4fd7490259cf23a58a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:32 -0400 Subject: [PATCH] RPC: Rename sock_lock Clean-up: replace a name reference to sockets in the generic parts of the RPC client by renaming sock_lock in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:00 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 44 ++++++++++++++++++++++---------------------- net/sunrpc/xprtsock.c | 22 +++++++++++----------- 3 files changed, 34 insertions(+), 34 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d82b47ab73cb..c4f903f0e17c 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -198,7 +198,7 @@ struct rpc_xprt { /* * Send stuff */ - spinlock_t sock_lock; /* lock socket info */ + spinlock_t transport_lock; /* lock transport info */ spinlock_t xprt_lock; /* lock xprt info */ struct rpc_task * snd_task; /* Task blocked in send */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 589195e630ef..1f0da8c1a3b0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -106,9 +106,9 @@ xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); retval = __xprt_lock_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return retval; } @@ -161,9 +161,9 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } /* @@ -266,9 +266,9 @@ int xprt_adjust_timeout(struct 
rpc_rqst *req) req->rq_retries = 0; xprt_reset_majortimeo(req); /* Reset the RTT counters == "slow start" */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); pprintk("RPC: %lu timeout\n", jiffies); status = -ETIMEDOUT; } @@ -298,10 +298,10 @@ xprt_socket_autoclose(void *args) void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); rpc_wake_up_status(&xprt->pending, -ENOTCONN); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static void @@ -309,12 +309,12 @@ xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) goto out_abort; - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) xprt_release_write(xprt, NULL); @@ -322,7 +322,7 @@ xprt_init_autodisconnect(unsigned long data) schedule_work(&xprt->task_cleanup); return; out_abort: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -482,7 +482,7 @@ xprt_timer(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (req->rq_received) goto out; @@ -496,7 +496,7 @@ xprt_timer(struct rpc_task *task) out: task->tk_timeout = 0; rpc_wake_up_task(task); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -515,7 +515,7 @@ int xprt_prepare_transmit(struct rpc_task *task) if (xprt->shutdown) return -EIO; - 
spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (req->rq_received && !req->rq_bytes_sent) { err = req->rq_received; goto out_unlock; @@ -530,7 +530,7 @@ int xprt_prepare_transmit(struct rpc_task *task) goto out_unlock; } out_unlock: - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return err; } @@ -552,13 +552,13 @@ void xprt_transmit(struct rpc_task *task) smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -592,7 +592,7 @@ void xprt_transmit(struct rpc_task *task) out_receive: dprintk("RPC: %4d xmit complete\n", task->tk_pid); /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt->nocong) { int timer = task->tk_msg.rpc_proc->p_timer; task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); @@ -607,7 +607,7 @@ void xprt_transmit(struct rpc_task *task) else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) @@ -683,7 +683,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) @@ -692,7 +692,7 @@ void xprt_release(struct rpc_task *task) if (list_empty(&xprt->recv) && !xprt->shutdown) 
mod_timer(&xprt->timer, xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ @@ -750,7 +750,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc return ERR_PTR(result); } - spin_lock_init(&xprt->sock_lock); + spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a5a04203a6b0..bc90caab6088 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -307,7 +307,7 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { /* Protect against races with xs_write_space */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; @@ -315,7 +315,7 @@ static int xs_send_request(struct rpc_task *task) task->tk_timeout = req->rq_timeout; rpc_sleep_on(&xprt->pending, task, NULL, NULL); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return status; } /* Keep holding the socket if it is blocked */ @@ -415,7 +415,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto dropit; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -436,7 +436,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) xprt_complete_rqst(xprt, rovr, copied); out_unlock: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); dropit: skb_free_datagram(sk, skb); out: @@ -531,13 +531,13 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc ssize_t r; /* Find and lock the request corresponding to 
this xid */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); req = xprt_lookup_rqst(xprt, xprt->tcp_xid); if (!req) { xprt->tcp_flags &= ~XPRT_COPY_DATA; dprintk("RPC: XID %08x request not found!\n", ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); return; } @@ -597,7 +597,7 @@ out: req->rq_task->tk_pid); xprt_complete_rqst(xprt, req, xprt->tcp_copied); } - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } @@ -696,7 +696,7 @@ static void xs_tcp_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { /* Reset TCP record info */ xprt->tcp_offset = 0; @@ -705,7 +705,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; rpc_wake_up(&xprt->pending); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); break; case TCP_SYN_SENT: case TCP_SYN_RECV: @@ -753,10 +753,10 @@ static void xs_write_space(struct sock *sk) if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) goto out; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); out: read_unlock(&sk->sk_callback_lock); } -- cgit v1.2.3 From 5dc07727f86b25851e95193a0c484ea21b531c47 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:35 -0400 Subject: [PATCH] RPC: Rename xprt_lock Clean-up: Replace the xprt_lock with something more aptly named. This lock single-threads the XID and request slot reservation process. Test-plan: Compile kernel with CONFIG_NFS enabled. 
Version: Thu, 11 Aug 2005 16:05:26 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c4f903f0e17c..41ce296dded1 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -199,7 +199,7 @@ struct rpc_xprt { * Send stuff */ spinlock_t transport_lock; /* lock transport info */ - spinlock_t xprt_lock; /* lock xprt info */ + spinlock_t reserve_lock; /* lock slot table */ struct rpc_task * snd_task; /* Task blocked in send */ struct list_head recv; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1f0da8c1a3b0..9c45c522e3ef 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -643,9 +643,9 @@ void xprt_reserve(struct rpc_task *task) task->tk_status = -EIO; if (!xprt->shutdown) { - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); do_xprt_reserve(task); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } } @@ -698,10 +698,10 @@ void xprt_release(struct rpc_task *task) dprintk("RPC: %4d release request %p\n", task->tk_pid, req); - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); xprt_clear_backlog(xprt); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } /** @@ -751,7 +751,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc } spin_lock_init(&xprt->transport_lock); - spin_lock_init(&xprt->xprt_lock); + spin_lock_init(&xprt->reserve_lock); init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); -- cgit v1.2.3 From 2226feb6bcd0e5e117a9be3ea3dd3ffc14f3e41e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:38 -0400 Subject: [PATCH] RPC: rename the sockstate field Clean-up: get rid of a name reference to sockets in the generic 
parts of the RPC client by renaming the sockstate field in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:53 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 60 ++++++++++++++++++++++++++++++++++++--------- net/sunrpc/xprt.c | 14 +++++------ net/sunrpc/xprtsock.c | 6 ++--- 3 files changed, 58 insertions(+), 22 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 41ce296dded1..009a3bb4f997 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -163,7 +163,7 @@ struct rpc_xprt { struct list_head free; /* free slots */ struct rpc_rqst * slot; /* slot table storage */ unsigned int max_reqs; /* total slots */ - unsigned long sockstate; /* Socket state */ + unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ resvport : 1, /* use a reserved port */ @@ -240,16 +240,54 @@ int xs_setup_udp(struct rpc_xprt *, int xs_setup_tcp(struct rpc_xprt *, struct rpc_timeout *); -#define XPRT_LOCKED 0 -#define XPRT_CONNECT 1 -#define XPRT_CONNECTING 2 - -#define xprt_connected(xp) (test_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_set_connected(xp) (set_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_test_and_clear_connected(xp) \ - (test_and_clear_bit(XPRT_CONNECT, &(xp)->sockstate)) -#define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +/* + * Reserved bit positions in xprt->state + */ +#define XPRT_LOCKED (0) +#define XPRT_CONNECTED (1) +#define XPRT_CONNECTING (2) + +static inline void xprt_set_connected(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connected(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_CONNECTED, 
&xprt->state); +} + +static inline int xprt_connected(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_set_connected(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline int xprt_test_and_clear_connected(struct rpc_xprt *xprt) +{ + return test_and_clear_bit(XPRT_CONNECTED, &xprt->state); +} + +static inline void xprt_clear_connecting(struct rpc_xprt *xprt) +{ + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->state); + smp_mb__after_clear_bit(); +} + +static inline int xprt_connecting(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_CONNECTING, &xprt->state); +} + +static inline int xprt_test_and_set_connecting(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_CONNECTING, &xprt->state); +} #endif /* __KERNEL__*/ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9c45c522e3ef..57c5e77b155e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -74,7 +74,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; @@ -88,7 +88,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; } smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); @@ -118,7 +118,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) goto out_unlock; @@ -139,7 +139,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) } out_unlock: smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, 
&xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); } @@ -152,7 +152,7 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) if (xprt->snd_task == task) { xprt->snd_task = NULL; smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); __xprt_lock_write_next(xprt); } @@ -312,11 +312,11 @@ xprt_init_autodisconnect(unsigned long data) spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ - if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else schedule_work(&xprt->task_cleanup); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bc90caab6088..76a33b54f436 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -925,9 +925,7 @@ out: else rpc_wake_up(&xprt->pending); out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); + xprt_clear_connecting(xprt); } /** @@ -940,7 +938,7 @@ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); schedule_delayed_work(&xprt->sock_connect, -- cgit v1.2.3 From 44fbac2288dfed6f1963ac00bf922c3bcd779cd1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:44 -0400 Subject: [PATCH] RPC: Add helper for waking tasks pending on a transport Clean-up: remove only reference to xprt->pending from the socket transport implementation. 
This makes a cleaner interface for other transport implementations as well. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:06:52 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 18 ++++++++++++++++-- net/sunrpc/xprtsock.c | 7 ++----- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 009a3bb4f997..d5223993fca9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -232,6 +232,7 @@ void xprt_reserve(struct rpc_task *); int xprt_prepare_transmit(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); +void xprt_wake_pending_tasks(struct rpc_xprt *, int); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *); void xprt_connect(struct rpc_task *); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 57c5e77b155e..2f9cd468b953 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -227,6 +227,20 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) xprt->cwnd = cwnd; } +/** + * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue + * @xprt: transport with waiting tasks + * @status: result code to plant in each task before waking it + * + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) +{ + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -300,7 +314,7 @@ void xprt_disconnect(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - rpc_wake_up_status(&xprt->pending, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } @@ 
-803,7 +817,7 @@ static void xprt_shutdown(struct rpc_xprt *xprt) xprt->shutdown = 1; rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->resend); - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 76a33b54f436..182da2edf61c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -703,7 +703,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); break; @@ -920,10 +920,7 @@ static void xs_connect_worker(void *args) } } out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); } -- cgit v1.2.3 From 55aa4f58aa43dc9a51fb80010630d94b96053a2e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:47 -0400 Subject: [PATCH] RPC: client-side transport switch cleanup Clean-up: change some comments to reflect the realities of the new RPC transport switch mechanism. Get rid of unused xprt_receive() prototype. Also, organize function prototypes in xprt.h by usage and scope. Test-plan: Compile kernel with CONFIG_NFS enabled. 
Version: Thu, 11 Aug 2005 16:07:21 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 61 ++++++++++++++++++++++++++------------------- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 26 +++++++++---------- net/sunrpc/xprtsock.c | 12 +++++---- 4 files changed, 55 insertions(+), 46 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d5223993fca9..bfbc492ae36d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -1,5 +1,5 @@ /* - * linux/include/linux/sunrpc/clnt_xprt.h + * linux/include/linux/sunrpc/xprt.h * * Declarations for the RPC transport interface. * @@ -150,8 +150,8 @@ struct rpc_xprt { unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ - unsigned int rcvsize, /* socket receive buffer size */ - sndsize; /* socket send buffer size */ + unsigned int rcvsize, /* transport rcv buffer size */ + sndsize; /* transport send buffer size */ size_t max_payload; /* largest RPC payload size, in bytes */ @@ -184,12 +184,12 @@ struct rpc_xprt { unsigned long tcp_copied, /* copied to request */ tcp_flags; /* - * Connection of sockets + * Connection of transports */ - struct work_struct sock_connect; + struct work_struct connect_worker; unsigned short port; /* - * Disconnection of idle sockets + * Disconnection of idle transports */ struct work_struct task_cleanup; struct timer_list timer; @@ -219,27 +219,36 @@ struct rpc_xprt { #ifdef __KERNEL__ -struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, - struct rpc_timeout *toparms); -void xprt_disconnect(struct rpc_xprt *); -int xprt_destroy(struct rpc_xprt *); -void xprt_set_timeout(struct rpc_timeout *, unsigned int, - unsigned long); -struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *, u32); -void xprt_complete_rqst(struct rpc_xprt *, - struct rpc_rqst *, int); -void xprt_reserve(struct rpc_task *); -int 
xprt_prepare_transmit(struct rpc_task *); -void xprt_transmit(struct rpc_task *); -void xprt_receive(struct rpc_task *); -void xprt_wake_pending_tasks(struct rpc_xprt *, int); +/* + * Transport operations used by ULPs + */ +struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *to); +void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr); + +/* + * Generic internal transport functions + */ +void xprt_connect(struct rpc_task *task); +void xprt_reserve(struct rpc_task *task); +int xprt_prepare_transmit(struct rpc_task *task); +void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); -void xprt_release(struct rpc_task *); -void xprt_connect(struct rpc_task *); -int xs_setup_udp(struct rpc_xprt *, - struct rpc_timeout *); -int xs_setup_tcp(struct rpc_xprt *, - struct rpc_timeout *); +void xprt_release(struct rpc_task *task); +int xprt_destroy(struct rpc_xprt *xprt); + +/* + * Transport switch helper functions + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); +struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); +void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); +void xprt_disconnect(struct rpc_xprt *xprt); + +/* + * Socket transport setup operations + */ +int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to); +int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to); /* * Reserved bit positions in xprt->state diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0d1b010a4a01..4677959d2834 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/rpcclnt.c + * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. 
* It is modeled as a finite state machine to support both synchronous diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2f9cd468b953..247fa1ec870c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -10,12 +10,12 @@ * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into - * the request struct, and calls xprt_call(). - * - xprt_call transmits the message and installs the caller on the - * socket's wait list. At the same time, it installs a timer that + * the request struct, and calls xprt_transmit(). + * - xprt_transmit sends the message and installs the caller on the + * transport's wait list. At the same time, it installs a timer that * is run after the packet's timeout has expired. * - When a packet arrives, the data_ready handler walks the list of - * pending requests for that socket. If a matching XID is found, the + * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the @@ -32,6 +32,8 @@ * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch + * + * Transport switch API copyright (C) 2005, Chuck Lever */ #include @@ -52,8 +54,6 @@ # define RPCDBG_FACILITY RPCDBG_XPRT #endif -#define XPRT_MAX_BACKOFF (8) - /* * Local functions */ @@ -65,9 +65,9 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); /* - * Serialize write access to sockets, in order to prevent different + * Serialize write access to transports, in order to prevent different * requests from interfering with each other. - * Also prevents TCP socket connects from colliding with writes. + * Also prevents transport connects from colliding with writes. 
*/ static int __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -91,7 +91,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: - dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) @@ -144,7 +144,7 @@ out_unlock: } /* - * Releases the socket for use by other requests. + * Releases the transport for use by other requests. */ static void __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -294,8 +294,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -static void -xprt_socket_autoclose(void *args) +static void xprt_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; @@ -329,7 +328,6 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - /* Let keventd close the socket */ if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else @@ -770,7 +768,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 182da2edf61c..7f0b9f7f167b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -11,6 +11,8 @@ * Rewrite of larges part of the code in order to stabilize TCP stuff. * Fix behaviour when socket buffer is full. 
* (C) 1999 Trond Myklebust + * + * IP socket transport implementation, (C) 2005 Chuck Lever */ #include @@ -363,7 +365,7 @@ static void xs_destroy(struct rpc_xprt *xprt) { dprintk("RPC: xs_destroy xprt %p\n", xprt); - cancel_delayed_work(&xprt->sock_connect); + cancel_delayed_work(&xprt->connect_worker); flush_scheduled_work(); xprt_disconnect(xprt); @@ -938,11 +940,11 @@ static void xs_connect(struct rpc_task *task) if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->sock_connect, + schedule_delayed_work(&xprt->connect_worker, RPC_REESTABLISH_TIMEOUT); } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->sock_connect); + schedule_work(&xprt->connect_worker); /* flush_scheduled_work can sleep... */ if (!RPC_IS_ASYNC(task)) flush_scheduled_work(); @@ -989,7 +991,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; @@ -1028,7 +1030,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; -- cgit v1.2.3 From c7b2cae8a634015b72941ba2fc6c4bc9b8d3a129 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:50 -0400 Subject: [PATCH] RPC: separate TCP and UDP write space callbacks Split the socket write space callback function into a TCP version and UDP version, eliminating one dependence on the "xprt->stream" variable. Keep the common pieces of this path in xprt.c so other transports can use it too. 
Test-plan: Write-intensive workload on a single mount point. Version: Thu, 11 Aug 2005 16:07:51 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 34 ++++++++++++++++++ net/sunrpc/xprtsock.c | 84 ++++++++++++++++++++++++++++----------------- 3 files changed, 89 insertions(+), 31 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bfbc492ae36d..e73174c7e450 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -240,6 +240,8 @@ int xprt_destroy(struct rpc_xprt *xprt); * Transport switch helper functions */ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); +void xprt_wait_for_buffer_space(struct rpc_task *task); +void xprt_write_space(struct rpc_xprt *xprt); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); void xprt_disconnect(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 247fa1ec870c..31ef7dc7eed6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -241,6 +241,40 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) rpc_wake_up(&xprt->pending); } +/** + * xprt_wait_for_buffer_space - wait for transport output buffer to clear + * @task: task to be put to sleep + * + */ +void xprt_wait_for_buffer_space(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); +} + +/** + * xprt_write_space - wake the task waiting for transport output buffer space + * @xprt: transport with waiting tasks + * + * Can be called in a soft IRQ context, so xprt_write_space never sleeps. 
+ */ +void xprt_write_space(struct rpc_xprt *xprt) +{ + if (unlikely(xprt->shutdown)) + return; + + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task) { + dprintk("RPC: write space: waking waiting task on xprt %p\n", + xprt); + rpc_wake_up_task(xprt->snd_task); + } + spin_unlock_bh(&xprt->transport_lock); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7f0b9f7f167b..70a772d7a796 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -308,15 +308,13 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xs_write_space */ + /* Protect against races with write_space */ spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); spin_unlock_bh(&xprt->transport_lock); return status; } @@ -721,45 +719,68 @@ static void xs_tcp_state_change(struct sock *sk) } /** - * xs_write_space - callback invoked when socket buffer space becomes - * available + * xs_udp_write_space - callback invoked when socket buffer space + * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg + * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. 
*/ -static void xs_write_space(struct sock *sk) +static void xs_udp_write_space(struct sock *sk) { - struct rpc_xprt *xprt; - struct socket *sock; - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + /* from net/core/sock.c:sock_def_write_space */ + if (sock_writeable(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) goto out; + + xprt_write_space(xprt); } - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; + out: + read_unlock(&sk->sk_callback_lock); +} - spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->transport_lock); -out: +/** + * xs_tcp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. 
+ */ +static void xs_tcp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + goto out; + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); + } + + out: read_unlock(&sk->sk_callback_lock); } @@ -855,15 +876,16 @@ static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) xprt->old_write_space = sk->sk_write_space; if (xprt->prot == IPPROTO_UDP) { sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; xprt_set_connected(xprt); } else { tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; xprt_clear_connected(xprt); } - sk->sk_write_space = xs_write_space; /* Reset to new socket */ xprt->sock = sock; -- cgit v1.2.3 From 808012fbb23a52ec59352445d2076d175ad4ab26 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH] RPC: skip over transport-specific heads automatically Add a generic mechanism for skipping over transport-specific headers when constructing an RPC request. This removes another "xprt->stream" dependency. Test-plan: Write-intensive workload on a single mount point (try both UDP and TCP). 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/msg_prot.h | 25 +++++++++++++++++++++++++ include/linux/sunrpc/xprt.h | 7 +++++++ net/sunrpc/auth_gss/auth_gss.c | 6 ++---- net/sunrpc/clnt.c | 5 ++--- net/sunrpc/xprtsock.c | 24 +++++++++++++++++------- 5 files changed, 53 insertions(+), 14 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index 15f115332389..f43f237360ae 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -76,5 +76,30 @@ enum rpc_auth_stat { #define RPC_MAXNETNAMELEN 256 +/* + * From RFC 1831: + * + * "A record is composed of one or more record fragments. A record + * fragment is a four-byte header followed by 0 to (2**31) - 1 bytes of + * fragment data. The bytes encode an unsigned binary number; as with + * XDR integers, the byte order is from highest to lowest. The number + * encodes two values -- a boolean which indicates whether the fragment + * is the last fragment of the record (bit value 1 implies the fragment + * is the last fragment) and a 31-bit unsigned binary value which is the + * length in bytes of the fragment's data. The boolean value is the + * highest-order bit of the header; the length is the 31 low-order bits. + * (Note that this record specification is NOT in XDR standard form!)" + * + * The Linux RPC client always sends its requests in a single record + * fragment, limiting the maximum payload size for stream transports to + * 2GB. 
+ */ + +typedef u32 rpc_fraghdr; + +#define RPC_LAST_STREAM_FRAGMENT (1U << 31) +#define RPC_FRAGMENT_SIZE_MASK (~RPC_LAST_STREAM_FRAGMENT) +#define RPC_MAX_FRAGMENT_SIZE ((1U << 31) - 1) + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_MSGPROT_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index e73174c7e450..966c456a0f6d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -155,6 +155,8 @@ struct rpc_xprt { size_t max_payload; /* largest RPC payload size, in bytes */ + unsigned int tsh_size; /* size of transport specific + header */ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue resend; /* requests waiting to resend */ @@ -236,6 +238,11 @@ int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release(struct rpc_task *task); int xprt_destroy(struct rpc_xprt *xprt); +static inline u32 *xprt_skip_transport_header(struct rpc_xprt *xprt, u32 *p) +{ + return p + xprt->tsh_size; +} + /* * Transport switch helper functions */ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 53a030acdf75..d2b08f16c257 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -844,10 +844,8 @@ gss_marshal(struct rpc_task *task, u32 *p) /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ - iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + req->rq_snd_buf.head[0].iov_base); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4677959d2834..cc1b773a79d3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1075,13 +1075,12 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - 
struct rpc_xprt *xprt = clnt->cl_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + + p = xprt_skip_transport_header(task->tk_xprt, p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 57988300640a..aaf053b1a0c4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -282,6 +282,13 @@ static int xs_udp_send_request(struct rpc_task *task) return status; } +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + /** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request @@ -301,11 +308,9 @@ static int xs_tcp_send_request(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct xdr_buf *xdr = &req->rq_snd_buf; - u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; - /* Write the record marker */ - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + xs_encode_tcp_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -503,16 +508,19 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc xprt->tcp_offset += used; if (used != len) return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) xprt->tcp_flags |= XPRT_LAST_FRAG; else xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + xprt->tcp_flags &= ~XPRT_COPY_RECM; xprt->tcp_offset = 0; + /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { + if 
(unlikely(xprt->tcp_reclen < 4)) { dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); return; @@ -1065,6 +1073,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = 0; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -1105,11 +1114,12 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - xprt->max_payload = (1U << 31) - 1; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); -- cgit v1.2.3 From 43118c29dea2b23798bd42a147015cceee7fa885 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH] RPC: get rid of xprt->stream Now we can fix up the last few places that use the "xprt->stream" variable, and get rid of it from the rpc_xprt structure. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 3 +-- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 3 +-- net/sunrpc/xprtsock.c | 28 ++++++++++++++++++---------- 4 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 82c77df81c5f..7901f5b8092c 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -173,11 +173,10 @@ nlm_bind_host(struct nlm_host *host) /* If we've already created an RPC client, check whether * RPC rebind is required - * Note: why keep rebinding if we're on a tcp connection? 
*/ if ((clnt = host->h_rpcclnt) != NULL) { xprt = clnt->cl_xprt; - if (!xprt->stream && time_after_eq(jiffies, host->h_nextrebind)) { + if (time_after_eq(jiffies, host->h_nextrebind)) { clnt->cl_port = 0; host->h_nextrebind = jiffies + NLM_HOST_REBIND; dprintk("lockd: next rebind in %ld jiffies\n", diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 966c456a0f6d..c9477f022efb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -168,8 +168,7 @@ struct rpc_xprt { unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ - resvport : 1, /* use a reserved port */ - stream : 1; /* TCP */ + resvport : 1; /* use a reserved port */ /* * XID diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 31ef7dc7eed6..43fef7626442 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -630,8 +630,7 @@ void xprt_transmit(struct rpc_task *task) case -ENOTCONN: return; default: - if (xprt->stream) - xprt_disconnect(xprt); + break; } xprt_release_write(xprt, task); return; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index aaf053b1a0c4..5bb6fed3df34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -356,6 +356,7 @@ static int xs_tcp_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + xprt_disconnect(xprt); break; } @@ -826,19 +827,17 @@ static void xs_tcp_write_space(struct sock *sk) } /** - * xs_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport * * Set socket send and receive limits based on the * sndsize and rcvsize fields in the generic transport - * structure. This applies only to UDP sockets. + * structure. 
*/ -static void xs_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; - if (xprt->stream) - return; if (xprt->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; @@ -850,6 +849,17 @@ static void xs_set_buffer_size(struct rpc_xprt *xprt) } } +/** + * xs_tcp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * + * Nothing to do for TCP. + */ +static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +{ + return; +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -928,7 +938,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_set_buffer_size(xprt); + xs_udp_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1034,7 +1044,7 @@ static void xs_connect(struct rpc_task *task) } static struct rpc_xprt_ops xs_udp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, .close = xs_close, @@ -1042,7 +1052,7 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, .close = xs_close, @@ -1074,7 +1084,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 
1 : 0; @@ -1115,7 +1124,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; -- cgit v1.2.3 From fe3aca290f17ae4978bd73d02aa4029f1c9c024c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:50 -0700 Subject: [PATCH] RPC: add API to set transport-specific timeouts Prepare the way to remove the "xprt->nocong" variable by adding a callout to the RPC client transport switch API to handle setting RPC retransmit timeouts. Add a pair of generic helper functions that provide the ability to set a simple fixed timeout, or to set a timeout based on the state of a round- trip estimator. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++ net/sunrpc/xprt.c | 67 ++++++++++++++++++++++++++++++--------------- net/sunrpc/xprtsock.c | 2 ++ 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c9477f022efb..ac08e99a81cb 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -134,6 +134,7 @@ struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); + void (*set_retrans_timeout)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -245,6 +246,8 @@ static inline u32 *xprt_skip_transport_header(struct rpc_xprt *xprt, u32 *p) /* * Transport switch helper functions */ +void xprt_set_retrans_timeout_def(struct rpc_task *task); +void 
xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 43fef7626442..1ac2fbe05102 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -275,6 +275,38 @@ void xprt_write_space(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->transport_lock); } +/** + * xprt_set_retrans_timeout_def - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation. + */ +void xprt_set_retrans_timeout_def(struct rpc_task *task) +{ + task->tk_timeout = task->tk_rqstp->rq_timeout; +} + +/* + * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout using the RTT estimator. 
+ */ +void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = req->rq_xprt->timeout.to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -588,7 +620,6 @@ out_unlock: */ void xprt_transmit(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int status; @@ -613,8 +644,19 @@ void xprt_transmit(struct rpc_task *task) return; status = xprt->ops->send_request(task); - if (!status) - goto out_receive; + if (status == 0) { + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->transport_lock); + return; + } /* Note: at this point, task->tk_sleeping has not yet been set, * hence there is no danger of the waking up task being put on @@ -634,25 +676,6 @@ void xprt_transmit(struct rpc_task *task) } xprt_release_write(xprt, task); return; - out_receive: - dprintk("RPC: %4d xmit complete\n", task->tk_pid); - /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->transport_lock); - if (!xprt->nocong) { - int timer = task->tk_msg.rpc_proc->p_timer; - task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); - task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; - if (task->tk_timeout > 
xprt->timeout.to_maxval || task->tk_timeout == 0) - task->tk_timeout = xprt->timeout.to_maxval; - } else - task->tk_timeout = req->rq_timeout; - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5bb6fed3df34..79433ffd1df0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1047,6 +1047,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .close = xs_close, .destroy = xs_destroy, }; @@ -1055,6 +1056,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From 12a804698b29d040b7cdd92e8a44b0e75164dae9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH] RPC: expose API for serializing access to RPC transports The next several patches introduce an API that allows transports to choose whether the RPC client provides congestion control or whether the transport itself provides it. The first method we abstract is the one that serializes access to the RPC transport to prevent the bytes from different requests from mingling together. This method provides proper request serialization and the opportunity to prevent new requests from being started because the transport is congested. The normal situation is for the transport to handle congestion control itself. 
Although NFS over UDP was first, it has been recognized after years of experience that having the transport provide congestion control is much better than doing it in the RPC client. Thus TCP, and probably every future transport implementation, will use the default method, xprt_reserve_xprt, provided in xprt.c, which does not provide any kind of congestion control. UDP can continue using the xprt.c-provided Van Jacobson congestion avoidance implementation. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +++ net/sunrpc/xprt.c | 64 ++++++++++++++++++++++++++++++++++++--------- net/sunrpc/xprtsock.c | 2 ++ 3 files changed, 57 insertions(+), 12 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index ac08e99a81cb..eee1c6877851 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -132,6 +132,7 @@ struct rpc_xprt; struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); + int (*reserve_xprt)(struct rpc_task *task); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); @@ -232,6 +233,8 @@ void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long */ void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); +int xprt_reserve_xprt(struct rpc_task *task); +int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ac2fbe05102..2d1e8b83dd68 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -64,14 +64,56 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task
*); static int xprt_clear_backlog(struct rpc_xprt *xprt); +/** + * xprt_reserve_xprt - serialize write access to transports + * @task: task that is requesting access to the transport + * + * This prevents mixing the payload of separate requests, and prevents + * transport connects from colliding with writes. No congestion control + * is provided. + */ +int xprt_reserve_xprt(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + +out_sleep: + dprintk("RPC: %4d failed to lock transport %p\n", + task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; +} + /* - * Serialize write access to transports, in order to prevent different - * requests from interfering with each other. - * Also prevents transport connects from colliding with writes. + * xprt_reserve_xprt_cong - serialize write access to transports + * @task: task that is requesting access to the transport + * + * Same as xprt_reserve_xprt, but Van Jacobson congestion control is + * integrated into the decision of whether a request is allowed to be + * woken up and given access to the transport. 
*/ -static int -__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +int xprt_reserve_xprt_cong(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { @@ -79,7 +121,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; goto out_sleep; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; if (req) { req->rq_bytes_sent = 0; @@ -101,20 +143,18 @@ out_sleep: return 0; } -static inline int -xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; spin_lock_bh(&xprt->transport_lock); - retval = __xprt_lock_write(xprt, task); + retval = xprt->ops->reserve_xprt(task); spin_unlock_bh(&xprt->transport_lock); return retval; } -static void -__xprt_lock_write_next(struct rpc_xprt *xprt) +static void __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; @@ -598,7 +638,7 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!__xprt_lock_write(xprt, task)) { + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; goto out_unlock; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 79433ffd1df0..fc4fbe8ea346 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1045,6 +1045,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1054,6 +1055,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt, .connect = xs_connect, .send_request = 
xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, -- cgit v1.2.3 From 49e9a89086b3cae784a4868ca852863e4f4ea3fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH] RPC: expose API for serializing access to RPC transports The next method we abstract is the one that releases a transport, allowing another task to have access to the transport. Again, one generic version of this is provided for transports that don't need the RPC client to perform congestion control, and one version is for transports that can use the original Van Jacobson implementation in xprt.c. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++ net/sunrpc/xprt.c | 77 ++++++++++++++++++++++++++++++++++++--------- net/sunrpc/xprtsock.c | 2 ++ 3 files changed, 68 insertions(+), 14 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index eee1c6877851..86833b725bb5 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -133,6 +133,7 @@ struct rpc_xprt; struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); int (*reserve_xprt)(struct rpc_task *task); + void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); @@ -238,6 +239,8 @@ int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release(struct rpc_task *task); int 
xprt_destroy(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2d1e8b83dd68..e92ea99dd318 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -153,14 +153,42 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return retval; } - static void __xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + struct rpc_rqst *req; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + + req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); +} + +static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { struct rpc_task *task; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; task = rpc_wake_up_next(&xprt->resend); if (!task) { @@ -168,7 +196,7 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (!task) goto out_unlock; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { struct rpc_rqst *req = task->tk_rqstp; xprt->snd_task = task; if (req) { @@ -183,11 +211,14 @@ out_unlock: smp_mb__after_clear_bit(); } -/* - * Releases the transport for use by other requests. +/** + * xprt_release_xprt - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. No congestion control is provided. 
*/ -static void -__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt->snd_task = NULL; @@ -198,11 +229,29 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) } } -static inline void -xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +/** + * xprt_release_xprt_cong - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. Another task is awoken to use the + * transport if the transport's congestion window allows it. + */ +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + __xprt_lock_write_next_cong(xprt); + } +} + +static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); } @@ -237,7 +286,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } /* @@ -256,7 +305,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) @@ -693,7 +742,7 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = -ENOTCONN; else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); + 
xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); return; } @@ -792,7 +841,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fc4fbe8ea346..8589c1ad55e3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1046,6 +1046,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1056,6 +1057,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, .connect = xs_connect, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, -- cgit v1.2.3 From 46c0ee8bc4ad3743de05e8b8b20201df44dcb6d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH] RPC: separate xprt_timer implementations Allow transports to hook the retransmit timer interrupt. Some transports calculate their congestion window here so that a retransmit timeout has immediate effect on the congestion window. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 45 ++++++++++++++++++++------------------------- net/sunrpc/xprtsock.c | 12 ++++++++++++ 3 files changed, 34 insertions(+), 25 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 86833b725bb5..443c3f984cf9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -137,6 +137,7 @@ struct rpc_xprt_ops { void (*connect)(struct rpc_task *task); int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); + void (*timer)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -257,6 +258,7 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); +void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); void xprt_disconnect(struct rpc_xprt *xprt); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e92ea99dd318..ffc595592af3 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,16 +289,19 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } -/* - * Adjust RPC congestion window +/** + * xprt_adjust_cwnd - adjust transport congestion window + * @task: recently completed RPC request used to adjust window + * @result: result code of completed RPC request + * * We use a time-smoothed congestion estimator to avoid heavy oscillation. 
*/ -static void -xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +void xprt_adjust_cwnd(struct rpc_task *task, int result) { - unsigned long cwnd; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + unsigned long cwnd = xprt->cwnd; - cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ @@ -314,6 +317,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; + __xprt_put_cong(xprt, req); } /** @@ -602,8 +606,7 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) /* Adjust congestion window */ if (!xprt->nocong) { unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(xprt, copied); - __xprt_put_cong(xprt, req); + xprt_adjust_cwnd(task, copied); if (timer) { if (req->rq_ntrans == 1) rpc_update_rtt(clnt->cl_rtt, timer, @@ -640,27 +643,19 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -/* - * RPC receive timeout handler. - */ -static void -xprt_timer(struct rpc_task *task) +static void xprt_timer(struct rpc_task *task) { - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->transport_lock); - if (req->rq_received) - goto out; - - xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); - __xprt_put_cong(xprt, req); + dprintk("RPC: %4d xprt_timer\n", task->tk_pid); - dprintk("RPC: %4d xprt_timer (%s request)\n", - task->tk_pid, req ? 
"pending" : "backlogged"); - - task->tk_status = -ETIMEDOUT; -out: + spin_lock(&xprt->transport_lock); + if (!req->rq_received) { + if (xprt->ops->timer) + xprt->ops->timer(task); + task->tk_status = -ETIMEDOUT; + } task->tk_timeout = 0; rpc_wake_up_task(task); spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8589c1ad55e3..c3658ff027a6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -860,6 +860,17 @@ static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) return; } +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. + */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -1050,6 +1061,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From 1570c1e41eabf6b7031f3e4322a2cf1cbe319fee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH] RPC: add generic interface for adjusting the congestion window A new interface that allows transports to adjust their congestion window using the Van Jacobson implementation in xprt.c is provided. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++- net/sunrpc/xprt.c | 66 ++++++++++++++++++--------------------------- net/sunrpc/xprtsock.c | 13 ++++----- 3 files changed, 33 insertions(+), 49 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 443c3f984cf9..2e48752d55d9 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -258,9 +258,10 @@ void xprt_set_retrans_timeout_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); void xprt_wait_for_buffer_space(struct rpc_task *task); void xprt_write_space(struct rpc_xprt *xprt); +void xprt_update_rtt(struct rpc_task *task); void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); -void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied); +void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_disconnect(struct rpc_xprt *xprt); /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ffc595592af3..707806fe1a23 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -591,56 +591,42 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) return req; } +/** + * xprt_update_rtt - update an RPC client's RTT state after receiving a reply + * @task: RPC request that recently completed + * + */ +void xprt_update_rtt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; + + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } +} + /** * xprt_complete_rqst - called when reply processing is complete - * @xprt: controlling transport - * @req: RPC request that just completed + * @task: RPC request that 
recently completed * @copied: actual number of bytes received from the transport * + * Caller holds transport lock. */ -void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) -{ - struct rpc_task *task = req->rq_task; - struct rpc_clnt *clnt = task->tk_client; - - /* Adjust congestion window */ - if (!xprt->nocong) { - unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(task, copied); - if (timer) { - if (req->rq_ntrans == 1) - rpc_update_rtt(clnt->cl_rtt, timer, - (long)jiffies - req->rq_xtime); - rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); - } - } +void xprt_complete_rqst(struct rpc_task *task, int copied) +{ + struct rpc_rqst *req = task->tk_rqstp; -#ifdef RPC_PROFILE - /* Profile only reads for now */ - if (copied > 1024) { - static unsigned long nextstat; - static unsigned long pkt_rtt, pkt_len, pkt_cnt; - - pkt_cnt++; - pkt_len += req->rq_slen + copied; - pkt_rtt += jiffies - req->rq_xtime; - if (time_before(nextstat, jiffies)) { - printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); - printk("RPC: %ld %ld %ld %ld stat\n", - jiffies, pkt_cnt, pkt_len, pkt_rtt); - pkt_rtt = pkt_len = pkt_cnt = 0; - nextstat = jiffies + 5 * HZ; - } - } -#endif + dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", + task->tk_pid, ntohl(req->rq_xid), copied); - dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); list_del_init(&req->rq_list); req->rq_received = req->rq_private_buf.len = copied; - - /* ... and wake up the process. 
*/ rpc_wake_up_task(task); - return; } static void xprt_timer(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c3658ff027a6..980f26504f48 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -460,8 +460,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto out_unlock; task = rovr->rq_task; - dprintk("RPC: %4d received reply\n", task->tk_pid); - if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; @@ -472,7 +470,9 @@ static void xs_udp_data_ready(struct sock *sk, int len) /* Something worked... */ dst_confirm(skb->dst); - xprt_complete_rqst(xprt, rovr, copied); + xprt_adjust_cwnd(task, copied); + xprt_update_rtt(task); + xprt_complete_rqst(task, copied); out_unlock: spin_unlock(&xprt->transport_lock); @@ -634,11 +634,8 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc } out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) + xprt_complete_rqst(req->rq_task, xprt->tcp_copied); spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } -- cgit v1.2.3 From a58dd398f5db4f73d5c581069fd70a4304cc4f0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH] RPC: add a release_rqst callout to the RPC transport switch The final place where congestion control state is adjusted is in xprt_release, where each request is finally released. Add a callout there to allow transports to perform additional processing when a request is about to be released. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprt.c | 14 +++++++++++++- net/sunrpc/xprtsock.c | 1 + 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2e48752d55d9..64e77658fa30 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -138,6 +138,7 @@ struct rpc_xprt_ops { int (*send_request)(struct rpc_task *task); void (*set_retrans_timeout)(struct rpc_task *task); void (*timer)(struct rpc_task *task); + void (*release_request)(struct rpc_task *task); void (*close)(struct rpc_xprt *xprt); void (*destroy)(struct rpc_xprt *xprt); }; @@ -262,6 +263,7 @@ void xprt_update_rtt(struct rpc_task *task); void xprt_adjust_cwnd(struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid); void xprt_complete_rqst(struct rpc_task *task, int copied); +void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect(struct rpc_xprt *xprt); /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 707806fe1a23..e8d11bd6158e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,6 +289,17 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } +/** + * xprt_release_rqst_cong - housekeeping when request is complete + * @task: RPC request that recently completed + * + * Useful for transports that require congestion control. 
+ */ +void xprt_release_rqst_cong(struct rpc_task *task) +{ + __xprt_put_cong(task->tk_xprt, task->tk_rqstp); +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @task: recently completed RPC request used to adjust window @@ -823,7 +834,8 @@ void xprt_release(struct rpc_task *task) return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); - __xprt_put_cong(xprt, req); + if (xprt->ops->release_request) + xprt->ops->release_request(task); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 980f26504f48..6c2f5dcea416 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1059,6 +1059,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From ed63c003701a314c4893c11eceb9d68f8f46c662 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH] RPC: remove xprt->nocong Get rid of the "xprt->nocong" variable. Test-plan: Use WAN simulation to cause sporadic bursty packet loss with UDP mounts. Look for significant regression in performance or client stability. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/host.c | 1 - include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprtsock.c | 2 -- 3 files changed, 4 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7901f5b8092c..c4c8601096e0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -188,7 +188,6 @@ nlm_bind_host(struct nlm_host *host) goto forgetit; xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); - xprt->nocong = 1; /* No congestion control for NLM */ xprt->resvport = 1; /* NLM requires a reserved port */ /* Existing NLM servers accept AUTH_UNIX only */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 64e77658fa30..559fb471f6f2 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -172,7 +172,6 @@ struct rpc_xprt { unsigned int max_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ - nocong : 1, /* no congestion control */ resvport : 1; /* use a reserved port */ /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6c2f5dcea416..7e5e020fe78d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1140,7 +1139,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 
1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 555ee3af161b037865793bd4bebc06b58daafde6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH] RPC: clean up after nocong was removed Clean-up: Move some macros that are specific to the Van Jacobson implementation into xprt.c. Get rid of the cong_wait field in rpc_xprt, which is no longer used. Get rid of xprt_clear_backlog. Test-plan: Compile with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 22 ---------------------- net/sunrpc/xprt.c | 29 +++++++++++++++++++---------- net/sunrpc/xprtsock.c | 2 -- 3 files changed, 19 insertions(+), 34 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 559fb471f6f2..dcf0326bda01 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -15,20 +15,6 @@ #include #include -/* - * The transport code maintains an estimate on the maximum number of out- - * standing RPC requests, using a smoothed version of the congestion - * avoidance implemented in 44BSD. This is basically the Van Jacobson - * congestion algorithm: If a retransmit occurs, the congestion window is - * halved; otherwise, it is incremented by 1/cwnd when - * - * - a reply is received and - * - a full number of requests are outstanding and - * - the congestion window hasn't been updated recently. - * - * Upper procedures may check whether a request would block waiting for - * a free RPC slot by using the RPC_CONGESTED() macro. 
- */ extern unsigned int xprt_udp_slot_table_entries; extern unsigned int xprt_tcp_slot_table_entries; @@ -36,12 +22,6 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE (128U) -#define RPC_CWNDSHIFT (8U) -#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) -#define RPC_INITCWND RPC_CWNDSCALE -#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) -#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) - /* Default timeout values */ #define RPC_MAX_UDP_TIMEOUT (60*HZ) #define RPC_MAX_TCP_TIMEOUT (600*HZ) @@ -213,8 +193,6 @@ struct rpc_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); - - wait_queue_head_t cong_wait; }; #define XPRT_LAST_FRAG (1 << 0) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e8d11bd6158e..0458319a1bdd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -62,7 +62,23 @@ static inline void do_xprt_reserve(struct rpc_task *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static int xprt_clear_backlog(struct rpc_xprt *xprt); +/* + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. 
+ */ +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) + +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) /** * xprt_reserve_xprt - serialize write access to transports @@ -850,7 +866,7 @@ void xprt_release(struct rpc_task *task) spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); - xprt_clear_backlog(xprt); + rpc_wake_up_next(&xprt->backlog); spin_unlock(&xprt->reserve_lock); } @@ -902,7 +918,6 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); @@ -911,6 +926,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; + xprt->cwnd = RPC_INITCWND; rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -955,16 +971,9 @@ static void xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->resend); xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); - wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); } -static int xprt_clear_backlog(struct rpc_xprt *xprt) { - rpc_wake_up_next(&xprt->backlog); - wake_up(&xprt->cong_wait); - return 1; -} - /** * xprt_destroy - destroy an RPC transport, killing off all requests. 
* @xprt: transport to destroy diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e5e020fe78d..26402c063f00 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); @@ -1139,7 +1138,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 529b33c6db0120126b1381faa51406dc463acdc9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH] RPC: allow RPC client's port range to be adjustable Select an RPC client source port between 650 and 1023 instead of between 1 and 800. The old range conflicts with a number of network services. Provide sysctls to allow admins to select a different port range. Note that this doesn't affect user-level RPC library behavior, which still uses 1 to 800. Based on a suggestion by Olaf Kirch. Test-plan: Repeated mount and unmount. Destructive testing. Idle timeouts.
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/debug.h | 2 ++ include/linux/sunrpc/xprt.h | 17 ++++++++++++++--- net/sunrpc/sysctl.c | 29 +++++++++++++++++++++++++++++ net/sunrpc/xprtsock.c | 23 ++++++++--------------- 4 files changed, 53 insertions(+), 18 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 42d299747956..1a42d902bc11 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -95,6 +95,8 @@ enum { CTL_NLMDEBUG, CTL_SLOTTABLE_UDP, CTL_SLOTTABLE_TCP, + CTL_MIN_RESVPORT, + CTL_MAX_RESVPORT, }; #endif /* _LINUX_SUNRPC_DEBUG_H_ */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index dcf0326bda01..9d9266cf8a36 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -51,6 +51,17 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_CALLHDRSIZE 6 #define RPC_REPHDRSIZE 4 +/* + * Parameters for choosing a free port + */ +extern unsigned int xprt_min_resvport; +extern unsigned int xprt_max_resvport; + +#define RPC_MIN_RESVPORT (1U) +#define RPC_MAX_RESVPORT (65535U) +#define RPC_DEF_MIN_RESVPORT (650U) +#define RPC_DEF_MAX_RESVPORT (1023U) + /* * This describes a timeout strategy */ @@ -62,6 +73,9 @@ struct rpc_timeout { unsigned char to_exponential; }; +struct rpc_task; +struct rpc_xprt; + /* * This describes a complete RPC request */ @@ -107,9 +121,6 @@ struct rpc_rqst { #define rq_svec rq_snd_buf.head #define rq_slen rq_snd_buf.len -struct rpc_task; -struct rpc_xprt; - struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt); int (*reserve_xprt)(struct rpc_task *task); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index ef483262f17f..d0c9f460e411 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -121,9 +121,16 @@ done: unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_tcp_slot_table_entries = 
RPC_DEF_SLOT_TABLE; +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +EXPORT_SYMBOL(xprt_min_resvport); +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +EXPORT_SYMBOL(xprt_max_resvport); + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static ctl_table debug_table[] = { { @@ -180,6 +187,28 @@ static ctl_table debug_table[] = { .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, + { + .ctl_name = CTL_MIN_RESVPORT, + .procname = "min_resvport", + .data = &xprt_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .ctl_name = CTL_MAX_RESVPORT, + .procname = "max_resvport", + .data = &xprt_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, { .ctl_name = 0 } }; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 26402c063f00..62c2e7caa345 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -35,11 +35,6 @@ #include #include -/* - * Maximum port number to use when requesting a reserved port. - */ -#define XS_MAX_RESVPORT (800U) - /* * How many times to try sending a request on a socket before waiting * for the socket buffer to clear. @@ -873,10 +868,9 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err; + unsigned short port = xprt->port; - /* Were we already bound to a given port? 
Try to reuse it */ - port = xprt->port; do { myaddr.sin_port = htons(port); err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, @@ -887,8 +881,10 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) port); return 0; } - if (--port == 0) - port = XS_MAX_RESVPORT; + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; } while (err == -EADDRINUSE && port != xprt->port); dprintk("RPC: can't bind to reserved port (%d).\n", -err); @@ -1075,9 +1071,6 @@ static struct rpc_xprt_ops xs_tcp_ops = { .destroy = xs_destroy, }; -extern unsigned int xprt_udp_slot_table_entries; -extern unsigned int xprt_tcp_slot_table_entries; - /** * xs_setup_udp - Set up transport to use a UDP socket * @xprt: transport to set up @@ -1098,7 +1091,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_UDP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = 0; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1136,7 +1129,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_TCP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 03bf4b707eee06706c9db343dd5c905b7ee47ed2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:55 -0700 Subject: [PATCH] RPC: parametrize various transport connect timeouts Each transport implementation can now set unique bind, connect, reestablishment, and idle timeout values. These are variables, allowing the values to be modified dynamically. This permits exponential backoff of any of these values, for instance. 
As an example, we implement exponential backoff for the connection reestablishment timeout. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 10 +++---- include/linux/nfs_fs.h | 4 +++ include/linux/sunrpc/xprt.h | 29 ++++--------------- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 5 ++-- net/sunrpc/xprtsock.c | 68 +++++++++++++++++++++++++++++++++++++++++++-- 6 files changed, 84 insertions(+), 34 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b6a1ca508e60..062911e7ceb5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -369,8 +369,8 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned case IPPROTO_TCP: if (!to->to_initval) to->to_initval = 60 * HZ; - if (to->to_initval > RPC_MAX_TCP_TIMEOUT) - to->to_initval = RPC_MAX_TCP_TIMEOUT; + if (to->to_initval > NFS_MAX_TCP_TIMEOUT) + to->to_initval = NFS_MAX_TCP_TIMEOUT; to->to_increment = to->to_initval; to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); to->to_exponential = 0; @@ -379,9 +379,9 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto, unsigned default: if (!to->to_initval) to->to_initval = 11 * HZ / 10; - if (to->to_initval > RPC_MAX_UDP_TIMEOUT) - to->to_initval = RPC_MAX_UDP_TIMEOUT; - to->to_maxval = RPC_MAX_UDP_TIMEOUT; + if (to->to_initval > NFS_MAX_UDP_TIMEOUT) + to->to_initval = NFS_MAX_UDP_TIMEOUT; + to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9a6047ff1b25..7bac2785c6e4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -41,6 +41,10 @@ #define NFS_MAX_FILE_IO_BUFFER_SIZE 32768 #define NFS_DEF_FILE_IO_BUFFER_SIZE 4096 +/* Default timeout values */ +#define NFS_MAX_UDP_TIMEOUT (60*HZ) +#define NFS_MAX_TCP_TIMEOUT (600*HZ) + /* * superblock 
magic number for NFS */ diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 9d9266cf8a36..2543adf18551 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -22,28 +22,6 @@ extern unsigned int xprt_tcp_slot_table_entries; #define RPC_DEF_SLOT_TABLE (16U) #define RPC_MAX_SLOT_TABLE (128U) -/* Default timeout values */ -#define RPC_MAX_UDP_TIMEOUT (60*HZ) -#define RPC_MAX_TCP_TIMEOUT (600*HZ) - -/* - * Wait duration for an RPC TCP connection to be established. Solaris - * NFS over TCP uses 60 seconds, for example, which is in line with how - * long a server takes to reboot. - */ -#define RPC_CONNECT_TIMEOUT (60*HZ) - -/* - * Delay an arbitrary number of seconds before attempting to reconnect - * after an error. - */ -#define RPC_REESTABLISH_TIMEOUT (15*HZ) - -/* - * RPC transport idle timeout. - */ -#define RPC_IDLE_DISCONNECT_TIMEOUT (5*60*HZ) - /* * RPC call and reply header size as number of 32bit words (verifier * size computed separately) @@ -182,14 +160,19 @@ struct rpc_xprt { /* * Connection of transports */ + unsigned long connect_timeout, + bind_timeout, + reestablish_timeout; struct work_struct connect_worker; unsigned short port; + /* * Disconnection of idle transports */ struct work_struct task_cleanup; struct timer_list timer; - unsigned long last_used; + unsigned long last_used, + idle_timeout; /* * Send stuff diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cc1b773a79d3..24b44e73f391 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -740,7 +740,7 @@ call_bind(struct rpc_task *task) task->tk_action = call_connect; if (!clnt->cl_port) { task->tk_action = call_bind_status; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = task->tk_xprt->bind_timeout; rpc_getport(task, clnt); } } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0458319a1bdd..215be0d0ef6b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -551,7 +551,7 @@ void xprt_connect(struct rpc_task *task) if 
(task->tk_rqstp) task->tk_rqstp->rq_bytes_sent = 0; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = xprt->connect_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); xprt->ops->connect(task); } @@ -763,7 +763,6 @@ void xprt_transmit(struct rpc_task *task) switch (status) { case -ECONNREFUSED: - task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); case -EAGAIN: case -ENOTCONN: @@ -857,7 +856,7 @@ void xprt_release(struct rpc_task *task) xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) mod_timer(&xprt->timer, - xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); + xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 88ac71fcd335..06c2d95484e0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -41,6 +41,50 @@ */ #define XS_SENDMSG_RETRY (10U) +/* + * Time out for an RPC UDP socket connect. UDP socket connects are + * synchronous, but we set a timeout anyway in case of resource + * exhaustion on the local host. + */ +#define XS_UDP_CONN_TO (5U * HZ) + +/* + * Wait duration for an RPC TCP connection to be established. Solaris + * NFS over TCP uses 60 seconds, for example, which is in line with how + * long a server takes to reboot. + */ +#define XS_TCP_CONN_TO (60U * HZ) + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. 
Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. + */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -739,6 +783,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); @@ -1066,6 +1111,13 @@ out_clear: * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). 
*/ static void xs_connect(struct rpc_task *task) { @@ -1075,9 +1127,13 @@ static void xs_connect(struct rpc_task *task) return; if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); + dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n", + xprt, xprt->reestablish_timeout / HZ); schedule_delayed_work(&xprt->connect_worker, - RPC_REESTABLISH_TIMEOUT); + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); schedule_work(&xprt->connect_worker); @@ -1139,6 +1195,10 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_udp_ops; @@ -1176,6 +1236,10 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; -- cgit v1.2.3 From 470056c288334eb0b37be26c9ff8aee37ed1cc7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:56 -0700 Subject: [PATCH] RPC: rationalize set_buffer_size In fact, ->set_buffer_size should be completely functionless for non-UDP. Test-plan: Check socket buffer size on UDP sockets over time. 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 4 ++-- net/sunrpc/clnt.c | 10 ++-------- net/sunrpc/xprtsock.c | 30 +++++++++++++++--------------- 3 files changed, 19 insertions(+), 25 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 2543adf18551..99cad3ead81d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -100,7 +100,7 @@ struct rpc_rqst { #define rq_slen rq_snd_buf.len struct rpc_xprt_ops { - void (*set_buffer_size)(struct rpc_xprt *xprt); + void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); int (*reserve_xprt)(struct rpc_task *task); void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*connect)(struct rpc_task *task); @@ -124,7 +124,7 @@ struct rpc_xprt { unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ - unsigned int rcvsize, /* transport rcv buffer size */ + size_t rcvsize, /* transport rcv buffer size */ sndsize; /* transport send buffer size */ size_t max_payload; /* largest RPC payload size, diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 24b44e73f391..5a8f01d726e9 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -517,14 +517,8 @@ void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt = clnt->cl_xprt; - - xprt->sndsize = 0; - if (sndsize) - xprt->sndsize = sndsize + RPC_SLACK_SPACE; - xprt->rcvsize = 0; - if (rcvsize) - xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - xprt->ops->set_buffer_size(xprt); + if (xprt->ops->set_buffer_size) + xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 06c2d95484e0..2e1529217e65 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -865,15 +865,7 @@ static void xs_tcp_write_space(struct sock *sk) 
read_unlock(&sk->sk_callback_lock); } -/** - * xs_udp_set_buffer_size - set send and receive limits - * @xprt: generic transport - * - * Set socket send and receive limits based on the - * sndsize and rcvsize fields in the generic transport - * structure. - */ -static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; @@ -889,14 +881,23 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) } /** - * xs_tcp_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes * - * Nothing to do for TCP. + * Set socket send and receive buffer size limits. */ -static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) { - return; + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + 1024; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); } /** @@ -989,7 +990,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_udp_set_buffer_size(xprt); + xs_udp_do_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1158,7 +1159,6 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .connect = xs_connect, -- cgit v1.2.3 From 5e5ce5be6f0161d2a069a4f8a1154fe639c5c02f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:11 -0700 Subject: RPC: allow call_encode() to delay transmission of an RPC call. Currently, call_encode will cause the entire RPC call to abort if it returns an error. 
This is unnecessarily rigid, and gets in the way of attempts to allow the NFSv4 layer to order RPC calls that carry sequence ids. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 23 ++++++++++++----------- net/sunrpc/xprt.c | 8 ++++++++ 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 99cad3ead81d..068e1fb0868b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -211,6 +211,7 @@ int xprt_reserve_xprt(struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); +void xprt_abort_transmit(struct rpc_task *task); int xprt_adjust_timeout(struct rpc_rqst *req); void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a5f7029b1daa..534274056329 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -678,13 +678,11 @@ call_allocate(struct rpc_task *task) static void call_encode(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_buf *sndbuf = &req->rq_snd_buf; struct xdr_buf *rcvbuf = &req->rq_rcv_buf; unsigned int bufsiz; kxdrproc_t encode; - int status; u32 *p; dprintk("RPC: %4d call_encode (status %d)\n", @@ -712,12 +710,9 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode && (status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp)) < 0) { - printk(KERN_WARNING "%s: can't encode arguments: %d\n", - clnt->cl_protname, -status); - rpc_exit(task, status); - } + if (encode != NULL) + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); } /* @@ -865,10 +860,12 @@ call_transmit(struct rpc_task 
*task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (!task->tk_rqstp->rq_bytes_sent) + if (task->tk_rqstp->rq_bytes_sent == 0) { call_encode(task); - if (task->tk_status < 0) - return; + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) + goto out_nosend; + } xprt_transmit(task); if (task->tk_status < 0) return; @@ -876,6 +873,10 @@ call_transmit(struct rpc_task *task) task->tk_action = NULL; rpc_wake_up_task(task); } + return; +out_nosend: + /* release socket write lock before attempting to handle error */ + xprt_abort_transmit(task); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 215be0d0ef6b..1ba55dc38b7a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -709,6 +709,14 @@ out_unlock: return err; } +void +xprt_abort_transmit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + xprt_release_write(xprt, task); +} + /** * xprt_transmit - send an RPC request on a transport * @task: controlling RPC task -- cgit v1.2.3 From ead5e1c26fdcd969cf40c49cb0589d56879d240d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:43 -0400 Subject: SUNRPC: Provide a callback to allow free pages allocated during xdr encoding For privacy, we need to allocate pages to store the encrypted data (passed in pages can't be used without the risk of corrupting data in the page cache). So we need a way to free that memory after the request has been transmitted. Signed-off-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 5 ++++- net/sunrpc/xprt.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/sunrpc/xprt.h') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 068e1fb0868b..3b8b6e823c70 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -73,7 +73,10 @@ struct rpc_rqst { int rq_cong; /* has incremented xprt->cong */ int rq_received; /* receive completed */ u32 rq_seqno; /* gss seq no. used on req. */ - + int rq_enc_pages_num; + struct page **rq_enc_pages; /* scratch pages for use by + gss privacy code */ + void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */ struct list_head rq_list; struct xdr_buf rq_private_buf; /* The receive buffer diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ba55dc38b7a..6dda3860351f 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -839,6 +839,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_xid = xprt_alloc_xid(xprt); + req->rq_release_snd_buf = NULL; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } @@ -867,6 +868,8 @@ void xprt_release(struct rpc_task *task) xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); memset(req, 0, sizeof(*req)); /* mark unused */ dprintk("RPC: %4d release request %p\n", task->tk_pid, req); -- cgit v1.2.3