From 90a9b1473df72a8b356e7ad6c9b9c9608927b103 Mon Sep 17 00:00:00 2001
From: James Ettle <james@ettle.org.uk>
Date: Sun, 28 Jan 2018 20:34:16 +0000
Subject: sunrpc: Fix unaligned access on sparc64

Fix unaligned access in gss_{get,verify}_mic_v2() on sparc64

Signed-off-by: James Ettle <james@ettle.org.uk>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/auth_gss/gss_krb5_seal.c   | 5 ++++-
 net/sunrpc/auth_gss/gss_krb5_unseal.c | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c
index 1d74d653e6c0..94a2b3f082a8 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seal.c
@@ -177,6 +177,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
 	u64 seq_send;
 	u8 *cksumkey;
 	unsigned int cksum_usage;
+	__be64 seq_send_be64;
 
 	dprintk("RPC:       %s\n", __func__);
 
@@ -187,7 +188,9 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text,
 	spin_lock(&krb5_seq_lock);
 	seq_send = ctx->seq_send64++;
 	spin_unlock(&krb5_seq_lock);
-	*((__be64 *)(krb5_hdr + 8)) = cpu_to_be64(seq_send);
+
+	seq_send_be64 = cpu_to_be64(seq_send);
+	memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8);
 
 	if (ctx->initiate) {
 		cksumkey = ctx->initiator_sign;
diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c
index dcf9515d9aef..b601a73cc9db 100644
--- a/net/sunrpc/auth_gss/gss_krb5_unseal.c
+++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c
@@ -155,10 +155,12 @@ gss_verify_mic_v2(struct krb5_ctx *ctx,
 	u8 flags;
 	int i;
 	unsigned int cksum_usage;
+	__be16 be16_ptr;
 
 	dprintk("RPC:       %s\n", __func__);
 
-	if (be16_to_cpu(*((__be16 *)ptr)) != KG2_TOK_MIC)
+	memcpy(&be16_ptr, (char *) ptr, 2);
+	if (be16_to_cpu(be16_ptr) != KG2_TOK_MIC)
 		return GSS_S_DEFECTIVE_TOKEN;
 
 	flags = ptr[2];
-- 
cgit v1.2.1


From 3b68e6ee3cbd4a474bcc7d2ac26812f86cdf333d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 14 Feb 2018 12:15:06 +1100
Subject: SUNRPC: cache: ignore timestamp written to 'flush' file.

The interface for flushing the sunrpc auth cache was poorly
designed and has caused problems a number of times.

The design is that you write a timestamp, and all entries
created before that time are discarded.
The most obvious problem is that this is not what people
actually want.  They want to just flush the whole cache.
The 1-second granularity can be a problem, as can the use
of wall-clock time.

A current problem is that code will write the current time to
this file - expecting it to clear everything - and if the
seconds number ticks over before this timestamp is checked,
the test "then >= now" fails, and a full flush isn't forced.

So lets just drop the subtleties and always flush the whole
cache.  The worst this could do is impose an extra cost
refilling it, but that would require someone to be using
non-standard tools.

We still report an error if the string written is not a number,
but we cause any valid number to flush the whole cache.

Reported-by: "Wang, Alan 1. (NSB - CN/Hangzhou)" <alan.1.wang@nokia-sbell.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/cache.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8a7e1c774f9c..26582e27be6a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1450,8 +1450,8 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 			   struct cache_detail *cd)
 {
 	char tbuf[20];
-	char *bp, *ep;
-	time_t then, now;
+	char *ep;
+	time_t now;
 
 	if (*ppos || count > sizeof(tbuf)-1)
 		return -EINVAL;
@@ -1461,24 +1461,24 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 	simple_strtoul(tbuf, &ep, 0);
 	if (*ep && *ep != '\n')
 		return -EINVAL;
+	/* Note that while we check that 'buf' holds a valid number,
+	 * we always ignore the value and just flush everything.
+	 * Making use of the number leads to races.
+	 */
 
-	bp = tbuf;
-	then = get_expiry(&bp);
 	now = seconds_since_boot();
-	cd->nextcheck = now;
-	/* Can only set flush_time to 1 second beyond "now", or
-	 * possibly 1 second beyond flushtime.  This is because
-	 * flush_time never goes backwards so it mustn't get too far
-	 * ahead of time.
+	/* Always flush everything, so behave like cache_purge()
+	 * Do this by advancing flush_time to the current time,
+	 * or by one second if it has already reached the current time.
+	 * Newly added cache entries will always have ->last_refresh greater
+	 * that ->flush_time, so they don't get flushed prematurely.
 	 */
-	if (then >= now) {
-		/* Want to flush everything, so behave like cache_purge() */
-		if (cd->flush_time >= now)
-			now = cd->flush_time + 1;
-		then = now;
-	}
 
-	cd->flush_time = then;
+	if (cd->flush_time >= now)
+		now = cd->flush_time + 1;
+
+	cd->flush_time = now;
+	cd->nextcheck = now;
 	cache_flush();
 
 	*ppos += count;
-- 
cgit v1.2.1


From bd2decac5e8ab7b326f54b40e3ac8b3b1ac935ed Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 15 Feb 2018 15:16:18 -0500
Subject: nfsd4: send the special close_stateid in v4.0 replies as well

We already send it for v4.1, but RFC7530 also notes that the stateid in
the close reply is bogus.

Always send the special close stateid, even in v4.0 responses. No client
should put any meaning on it whatsoever. For now, we continue to
increment the stateid value, though that might not be necessary either.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 61b770e39809..34e6915831d8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5521,15 +5521,26 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out; 
 
 	stp->st_stid.sc_type = NFS4_CLOSED_STID;
+
+	/*
+	 * Technically we don't _really_ have to increment or copy it, since
+	 * it should just be gone after this operation and we clobber the
+	 * copied value below, but we continue to do so here just to ensure
+	 * that racing ops see that there was a state change.
+	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
 	nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
 
-	/* See RFC5661 sectionm 18.2.4 */
-	if (stp->st_stid.sc_client->cl_minorversion)
-		memcpy(&close->cl_stateid, &close_stateid,
-				sizeof(close->cl_stateid));
+	/* v4.1+ suggests that we send a special stateid in here, since the
+	 * clients should just ignore this anyway. Since this is not useful
+	 * for v4.0 clients either, we set it to the special close_stateid
+	 * universally.
+	 *
+	 * See RFC5661 section 18.2.4, and RFC7530 section 16.2.5
+	 */
+	memcpy(&close->cl_stateid, &close_stateid, sizeof(close->cl_stateid));
 
 	/* put reference from nfs4_preprocess_seqid_op */
 	nfs4_put_stid(&stp->st_stid);
-- 
cgit v1.2.1


From 554faf28198803cb58ff79a2c4891bc42d523438 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 7 Feb 2018 10:29:47 +0000
Subject: lockd: make nlm_ntf_refcnt and nlm_ntf_wq static

The variables nlm_ntf_refcnt and nlm_ntf_wq are local to the source and
do not need to be in global scope, so make them static.

Cleans up sparse warnings:
fs/lockd/svc.c:60:10: warning: symbol 'nlm_ntf_refcnt' was not declared.
Should it be static?
fs/lockd/svc.c:61:1: warning: symbol 'nlm_ntf_wq' was not declared.
Should it be static?

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9c36d614bf89..346ed161756d 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -57,8 +57,8 @@ static struct task_struct	*nlmsvc_task;
 static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
-atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
-DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
+static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
 
 unsigned int lockd_net_id;
 
-- 
cgit v1.2.1


From edcc8452a05ffd2d2c8574905660c9c2572af075 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 7 Mar 2018 15:37:35 -0500
Subject: nfsd: remove unsused "cp_consecutive" field

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 1 -
 fs/nfsd/nfs4xdr.c  | 4 ++--
 fs/nfsd/xdr4.h     | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a0bed2b2004d..40dfb759d38d 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1106,7 +1106,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	else {
 		copy->cp_res.wr_bytes_written = bytes;
 		copy->cp_res.wr_stable_how = NFS_UNSTABLE;
-		copy->cp_consecutive = 1;
 		copy->cp_synchronous = 1;
 		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
 		status = nfs_ok;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e502fd16246b..9f8ef2c6e508 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1759,7 +1759,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 	p = xdr_decode_hyper(p, &copy->cp_src_pos);
 	p = xdr_decode_hyper(p, &copy->cp_dst_pos);
 	p = xdr_decode_hyper(p, &copy->cp_count);
-	copy->cp_consecutive = be32_to_cpup(p++);
+	p++; /* ca_consecutive: we always do consecutive copies */
 	copy->cp_synchronous = be32_to_cpup(p++);
 	tmp = be32_to_cpup(p); /* Source server list not supported */
 
@@ -4214,7 +4214,7 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
 		return nfserr;
 
 	p = xdr_reserve_space(&resp->xdr, 4 + 4);
-	*p++ = cpu_to_be32(copy->cp_consecutive);
+	*p++ = xdr_one; /* cr_consecutive */
 	*p++ = cpu_to_be32(copy->cp_synchronous);
 	return 0;
 }
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index bc29511b6405..7cbc129092fe 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -518,7 +518,6 @@ struct nfsd4_copy {
 	u64		cp_count;
 
 	/* both */
-	bool		cp_consecutive;
 	bool		cp_synchronous;
 
 	/* response */
-- 
cgit v1.2.1


From 9d7ed1355db5b00b9f9f4c333fc2b1825b0db25a Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 8 Mar 2018 15:49:48 -0500
Subject: nfsd: don't require low ports for gss requests

In a traditional NFS deployment using auth_unix, the clients are trusted
to correctly report the credentials of their logged-in users.  The
server assumes that only root on client machines is allowed to send
requests from low-numbered ports, so it can use the originating port
number to distinguish "real" NFS clients from NFS clients run by
ordinary users, to prevent ordinary users from spoofing credentials.

The originating port number on a gss-authenticated request is less
important.  The authentication ties the request to a user, and we take
it as proof that that user authorized the request.  The low port number
check no longer adds much.

So, don't enforce low port numbers in the auth_gss case.

Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfsfh.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8aa011820c4a..a008e7634181 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -87,13 +87,23 @@ nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
 	return nfserr_inval;
 }
 
+static bool nfsd_originating_port_ok(struct svc_rqst *rqstp, int flags)
+{
+	if (flags & NFSEXP_INSECURE_PORT)
+		return true;
+	/* We don't require gss requests to use low ports: */
+	if (rqstp->rq_cred.cr_flavor >= RPC_AUTH_GSS)
+		return true;
+	return test_bit(RQ_SECURE, &rqstp->rq_flags);
+}
+
 static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
 					  struct svc_export *exp)
 {
 	int flags = nfsexp_flags(rqstp, exp);
 
 	/* Check if the request originated from a secure port. */
-	if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) {
+	if (!nfsd_originating_port_ok(rqstp, flags)) {
 		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 		dprintk("nfsd: request from insecure port %s!\n",
 		        svc_print_addr(rqstp, buf, sizeof(buf)));
-- 
cgit v1.2.1


From 9258a2d5cd9577ce3175fbb64d6ccd0a77cf6fa7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 16 Mar 2018 09:47:22 -0400
Subject: nfsd: move nfs4_client allocation to dedicated slabcache

On x86_64, it's 1152 bytes, so we can avoid wasting 896 bytes each.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 34e6915831d8..16efbe2b1853 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,6 +98,7 @@ enum nfsd4_st_mutex_lock_subclass {
  */
 static DECLARE_WAIT_QUEUE_HEAD(close_wq);
 
+static struct kmem_cache *client_slab;
 static struct kmem_cache *openowner_slab;
 static struct kmem_cache *lockowner_slab;
 static struct kmem_cache *file_slab;
@@ -1794,7 +1795,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 	struct nfs4_client *clp;
 	int i;
 
-	clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
+	clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
 	if (clp == NULL)
 		return NULL;
 	clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
@@ -1825,7 +1826,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 err_no_hashtbl:
 	kfree(clp->cl_name.data);
 err_no_name:
-	kfree(clp);
+	kmem_cache_free(client_slab, clp);
 	return NULL;
 }
 
@@ -1845,7 +1846,7 @@ free_client(struct nfs4_client *clp)
 	kfree(clp->cl_ownerstr_hashtbl);
 	kfree(clp->cl_name.data);
 	idr_destroy(&clp->cl_stateids);
-	kfree(clp);
+	kmem_cache_free(client_slab, clp);
 }
 
 /* must be called under the client_lock */
@@ -3471,21 +3472,26 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
 void
 nfsd4_free_slabs(void)
 {
-	kmem_cache_destroy(odstate_slab);
+	kmem_cache_destroy(client_slab);
 	kmem_cache_destroy(openowner_slab);
 	kmem_cache_destroy(lockowner_slab);
 	kmem_cache_destroy(file_slab);
 	kmem_cache_destroy(stateid_slab);
 	kmem_cache_destroy(deleg_slab);
+	kmem_cache_destroy(odstate_slab);
 }
 
 int
 nfsd4_init_slabs(void)
 {
+	client_slab = kmem_cache_create("nfsd4_clients",
+			sizeof(struct nfs4_client), 0, 0, NULL);
+	if (client_slab == NULL)
+		goto out;
 	openowner_slab = kmem_cache_create("nfsd4_openowners",
 			sizeof(struct nfs4_openowner), 0, 0, NULL);
 	if (openowner_slab == NULL)
-		goto out;
+		goto out_free_client_slab;
 	lockowner_slab = kmem_cache_create("nfsd4_lockowners",
 			sizeof(struct nfs4_lockowner), 0, 0, NULL);
 	if (lockowner_slab == NULL)
@@ -3518,6 +3524,8 @@ out_free_lockowner_slab:
 	kmem_cache_destroy(lockowner_slab);
 out_free_openowner_slab:
 	kmem_cache_destroy(openowner_slab);
+out_free_client_slab:
+	kmem_cache_destroy(client_slab);
 out:
 	dprintk("nfsd4: out of memory while initializing nfsv4\n");
 	return -ENOMEM;
-- 
cgit v1.2.1


From 0c4398ff8b581b68ee02c5194654134acc31f4a7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 20 Mar 2018 17:05:09 -0400
Subject: svcrdma: Use pr_err to report Receive errors

Clean up: Other completion handlers use pr_err, not pr_warn.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9ad12a215b51..135ae175ca8e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -330,9 +330,9 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 
 flushed:
 	if (wc->status != IB_WC_WR_FLUSH_ERR)
-		pr_warn("svcrdma: receive: %s (%u/0x%x)\n",
-			ib_wc_status_msg(wc->status),
-			wc->status, wc->vendor_err);
+		pr_err("svcrdma: Recv: %s (%u/0x%x)\n",
+		       ib_wc_status_msg(wc->status),
+		       wc->status, wc->vendor_err);
 	set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
 	svc_rdma_put_context(ctxt, 1);
 
-- 
cgit v1.2.1


From 97cc3264508f33783ba21573204d7e0bf5b197e7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 20 Mar 2018 17:05:15 -0400
Subject: svcrdma: Consult max_qp_init_rd_atom when accepting connections

The target needs to return the lesser of the client's Inbound RDMA
Read Queue Depth (IRD), provided in the connection parameters, and
the local device's Outbound RDMA Read Queue Depth (ORD). The latter
limit is max_qp_init_rd_atom, not max_qp_rd_atom.

The svcrdma_ord value caps the ORD value for iWARP transports, which
do not exchange ORD/IRD values at connection time. Since no other
Linux kernel RDMA-enabled storage target sees fit to provide this
cap, I'm removing it here too.

initiator_depth is a u8, so ensure the computed ORD value does not
overflow that field.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  3 ---
 net/sunrpc/xprtrdma/svc_rdma.c           |  4 ++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 22 +++++++++-------------
 3 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 4b731b046bcd..7337e1221590 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -132,9 +132,6 @@ struct svcxprt_rdma {
 #define RDMAXPRT_CONN_PENDING	3
 
 #define RPCRDMA_LISTEN_BACKLOG  10
-/* The default ORD value is based on two outstanding full-size writes with a
- * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ.  */
-#define RPCRDMA_ORD             (64/4)
 #define RPCRDMA_MAX_REQUESTS    32
 
 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index a4a8f6989ee7..dd8a431dc2ae 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -51,9 +51,9 @@
 #define RPCDBG_FACILITY	RPCDBG_SVCXPRT
 
 /* RPC/RDMA parameters */
-unsigned int svcrdma_ord = RPCRDMA_ORD;
+unsigned int svcrdma_ord = 16;	/* historical default */
 static unsigned int min_ord = 1;
-static unsigned int max_ord = 4096;
+static unsigned int max_ord = 255;
 unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
 unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
 static unsigned int min_max_requests = 4;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 135ae175ca8e..7b2f4d3a2543 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -762,13 +762,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	if (!svc_rdma_prealloc_ctxts(newxprt))
 		goto errout;
 
-	/*
-	 * Limit ORD based on client limit, local device limit, and
-	 * configured svcrdma limit.
-	 */
-	newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
-	newxprt->sc_ord = min_t(size_t,	svcrdma_ord, newxprt->sc_ord);
-
 	newxprt->sc_pd = ib_alloc_pd(dev, 0);
 	if (IS_ERR(newxprt->sc_pd)) {
 		dprintk("svcrdma: error creating PD for connect request\n");
@@ -843,15 +836,18 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 0;
-	conn_param.initiator_depth = newxprt->sc_ord;
+	conn_param.initiator_depth = min_t(int, newxprt->sc_ord,
+					   dev->attrs.max_qp_init_rd_atom);
+	if (!conn_param.initiator_depth) {
+		dprintk("svcrdma: invalid ORD setting\n");
+		ret = -EINVAL;
+		goto errout;
+	}
 	conn_param.private_data = &pmsg;
 	conn_param.private_data_len = sizeof(pmsg);
 	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
-	if (ret) {
-		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
-		       ret);
+	if (ret)
 		goto errout;
-	}
 
 	dprintk("svcrdma: new connection %p accepted:\n", newxprt);
 	sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr;
@@ -862,7 +858,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	dprintk("    sq_depth        : %d\n", newxprt->sc_sq_depth);
 	dprintk("    rdma_rw_ctxs    : %d\n", ctxts);
 	dprintk("    max_requests    : %d\n", newxprt->sc_max_requests);
-	dprintk("    ord             : %d\n", newxprt->sc_ord);
+	dprintk("    ord             : %d\n", conn_param.initiator_depth);
 
 	return &newxprt->sc_xprt;
 
-- 
cgit v1.2.1


From 6f29d07ca4b999c36d42f9cf8d1dddf9ddca3250 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 20 Mar 2018 17:05:20 -0400
Subject: svcrdma: Clean up rdma_build_arg_xdr

Clean up: The value of the byte_count parameter is already passed
to rdma_build_arg_xdr as part of the svc_rdma_op_ctxt structure.

Further, without the parameter called "byte_count" there is no need
to have the abbreviated "bc" automatic variable. "bc" can now be
called something more intuitive.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 19e9c6b33042..3d45015dca97 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -110,15 +110,16 @@
  * the RDMA_RECV completion. The SGL should contain full pages up until the
  * last one.
  */
-static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
-			       struct svc_rdma_op_ctxt *ctxt,
-			       u32 byte_count)
+static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
+				   struct svc_rdma_op_ctxt *ctxt)
 {
 	struct page *page;
-	u32 bc;
 	int sge_no;
+	u32 len;
 
-	/* Swap the page in the SGE with the page in argpages */
+	/* The reply path assumes the Call's transport header resides
+	 * in rqstp->rq_pages[0].
+	 */
 	page = ctxt->pages[0];
 	put_page(rqstp->rq_pages[0]);
 	rqstp->rq_pages[0] = page;
@@ -126,35 +127,35 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
 	/* Set up the XDR head */
 	rqstp->rq_arg.head[0].iov_base = page_address(page);
 	rqstp->rq_arg.head[0].iov_len =
-		min_t(size_t, byte_count, ctxt->sge[0].length);
-	rqstp->rq_arg.len = byte_count;
-	rqstp->rq_arg.buflen = byte_count;
+		min_t(size_t, ctxt->byte_len, ctxt->sge[0].length);
+	rqstp->rq_arg.len = ctxt->byte_len;
+	rqstp->rq_arg.buflen = ctxt->byte_len;
 
 	/* Compute bytes past head in the SGL */
-	bc = byte_count - rqstp->rq_arg.head[0].iov_len;
+	len = ctxt->byte_len - rqstp->rq_arg.head[0].iov_len;
 
 	/* If data remains, store it in the pagelist */
-	rqstp->rq_arg.page_len = bc;
+	rqstp->rq_arg.page_len = len;
 	rqstp->rq_arg.page_base = 0;
 
 	sge_no = 1;
-	while (bc && sge_no < ctxt->count) {
+	while (len && sge_no < ctxt->count) {
 		page = ctxt->pages[sge_no];
 		put_page(rqstp->rq_pages[sge_no]);
 		rqstp->rq_pages[sge_no] = page;
-		bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
+		len -= min_t(u32, len, ctxt->sge[sge_no].length);
 		sge_no++;
 	}
 	rqstp->rq_respages = &rqstp->rq_pages[sge_no];
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
 	/* If not all pages were used from the SGL, free the remaining ones */
-	bc = sge_no;
+	len = sge_no;
 	while (sge_no < ctxt->count) {
 		page = ctxt->pages[sge_no++];
 		put_page(page);
 	}
-	ctxt->count = bc;
+	ctxt->count = len;
 
 	/* Set up tail */
 	rqstp->rq_arg.tail[0].iov_base = NULL;
@@ -534,10 +535,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
 		ctxt, rdma_xprt, rqstp);
 	atomic_inc(&rdma_stat_recv);
 
-	/* Build up the XDR from the receive buffers. */
-	rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+	svc_rdma_build_arg_xdr(rqstp, ctxt);
 
-	/* Decode the RDMA header. */
 	p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
 	ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
 	if (ret < 0)
-- 
cgit v1.2.1


From b8232d3315ec746e3b5d4c9252541567e57a509b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Feb 2018 15:29:15 -0500
Subject: nfsd: simplify put of fi_deleg_file

fi_delegees is basically just a reference count on users of
fi_deleg_file, which is cleared when fi_delegees goes to zero.  The
fi_deleg_file check here is redundant.  Also add an assertion to make
sure we don't have unbalanced puts.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 16efbe2b1853..9072d0f8340b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -879,9 +879,12 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
 {
 	struct file *filp = NULL;
 
+	WARN_ON_ONCE(!fp->fi_delegees);
+
 	spin_lock(&fp->fi_lock);
-	if (fp->fi_deleg_file && --fp->fi_delegees == 0)
+	if (--fp->fi_delegees == 0) {
 		swap(filp, fp->fi_deleg_file);
+	}
 	spin_unlock(&fp->fi_lock);
 
 	if (filp) {
-- 
cgit v1.2.1


From cba7b3d15071fa0deb5b6e7a1c97c29d2904e6a3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Feb 2018 15:50:49 -0500
Subject: nfsd: simplify nfs4_put_deleg_lease calls

Every single caller gets the file out of the delegation, so let's do
that once in nfs4_put_deleg_lease.

Plus we'll need it there for other reasons.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9072d0f8340b..39a4de64ce93 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -875,9 +875,10 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
 	spin_unlock(&stid->sc_lock);
 }
 
-static void nfs4_put_deleg_lease(struct nfs4_file *fp)
+static void nfs4_put_deleg_lease(struct nfs4_delegation *dp)
 {
 	struct file *filp = NULL;
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
 	WARN_ON_ONCE(!fp->fi_delegees);
 
@@ -992,7 +993,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	spin_unlock(&state_lock);
 	if (unhashed) {
 		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_deleg_lease(dp);
 		nfs4_put_stid(&dp->dl_stid);
 	}
 }
@@ -1004,7 +1005,7 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 	WARN_ON(!list_empty(&dp->dl_recall_lru));
 
 	put_clnt_odstate(dp->dl_clnt_odstate);
-	nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+	nfs4_put_deleg_lease(dp);
 
 	if (clp->cl_minorversion == 0)
 		nfs4_put_stid(&dp->dl_stid);
@@ -1916,7 +1917,7 @@ __destroy_client(struct nfs4_client *clp)
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
 		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_deleg_lease(dp);
 		nfs4_put_stid(&dp->dl_stid);
 	}
 	while (!list_empty(&clp->cl_revoked)) {
@@ -7287,7 +7288,7 @@ nfs4_state_shutdown_net(struct net *net)
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
 		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+		nfs4_put_deleg_lease(dp);
 		nfs4_put_stid(&dp->dl_stid);
 	}
 
-- 
cgit v1.2.1


From 653e514e9ebcb1d6ab71c2c8bb28a3b65ffbfe15 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Feb 2018 22:08:45 -0500
Subject: nfsd4: set fl_owner to delegation, not file pointer

For now this makes no difference, as for files having delegations,
there's a one-to-one relationship between an nfs4_file and its
nfs4_delegation.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 39a4de64ce93..33538b98720a 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -889,7 +889,7 @@ static void nfs4_put_deleg_lease(struct nfs4_delegation *dp)
 	spin_unlock(&fp->fi_lock);
 
 	if (filp) {
-		vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
+		vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
 		fput(filp);
 	}
 }
@@ -3957,13 +3957,9 @@ static bool
 nfsd_break_deleg_cb(struct file_lock *fl)
 {
 	bool ret = false;
-	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
-	struct nfs4_delegation *dp;
+	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
-	if (!fp) {
-		WARN(1, "(%p)->fl_owner NULL\n", fl);
-		return ret;
-	}
 	if (fp->fi_had_conflict) {
 		WARN(1, "duplicate break on %p\n", fp);
 		return ret;
@@ -4309,7 +4305,8 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
 	return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
 }
 
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
+static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+						int flag)
 {
 	struct file_lock *fl;
 
@@ -4320,7 +4317,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 	fl->fl_flags = FL_DELEG;
 	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
 	fl->fl_end = OFFSET_MAX;
-	fl->fl_owner = (fl_owner_t)fp;
+	fl->fl_owner = (fl_owner_t)dp;
 	fl->fl_pid = current->tgid;
 	return fl;
 }
@@ -4344,7 +4341,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
 	struct file *filp;
 	int status = 0;
 
-	fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
+	fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
 	if (!fl)
 		return -ENOMEM;
 	filp = find_readable_file(fp);
-- 
cgit v1.2.1


From 0c911f5408e49a313dd17f2f331c62c3db515f34 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 16 Feb 2018 11:49:11 -0500
Subject: nfsd4: dp->dl_stid.sc_file doesn't need locking

The delegation isn't visible to anyone yet.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 33538b98720a..9e9f32f03bf4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4411,9 +4411,10 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		return ERR_PTR(-ENOMEM);
 
 	get_nfs4_file(fp);
+	dp->dl_stid.sc_file = fp;
+
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
-	dp->dl_stid.sc_file = fp;
 	if (!fp->fi_deleg_file) {
 		spin_unlock(&fp->fi_lock);
 		spin_unlock(&state_lock);
-- 
cgit v1.2.1


From 68b18f52947bbe77bc9ddb23626f30ed5cf70641 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 19 Feb 2018 11:38:33 -0500
Subject: nfsd: make nfs4_get_existing_delegation less confusing

This doesn't "get" anything.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9e9f32f03bf4..9ec485980bf0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -900,20 +900,16 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
 }
 
 /**
- * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * nfs4_delegation_exists - Discover if this delegation already exists
  * @clp:     a pointer to the nfs4_client we're granting a delegation to
  * @fp:      a pointer to the nfs4_file we're granting a delegation on
  *
  * Return:
- *      On success: NULL if an existing delegation was not found.
- *
- *      On error: -EAGAIN if one was previously granted to this nfs4_client
- *                 for this nfs4_file.
- *
+ *      On success: true iff an existing delegation is found
  */
 
-static int
-nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+static bool
+nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp)
 {
 	struct nfs4_delegation *searchdp = NULL;
 	struct nfs4_client *searchclp = NULL;
@@ -946,15 +942,13 @@ nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
 static int
 hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 {
-	int status;
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	lockdep_assert_held(&state_lock);
 	lockdep_assert_held(&fp->fi_lock);
 
-	status = nfs4_get_existing_delegation(clp, fp);
-	if (status)
-		return status;
+	if (nfs4_delegation_exists(clp, fp))
+		return -EAGAIN;
 	++fp->fi_delegees;
 	refcount_inc(&dp->dl_stid.sc_count);
 	dp->dl_stid.sc_type = NFS4_DELEG_STID;
@@ -4391,7 +4385,7 @@ static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
 {
-	int status;
+	int status = 0;
 	struct nfs4_delegation *dp;
 
 	if (fp->fi_had_conflict)
@@ -4399,7 +4393,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
-	status = nfs4_get_existing_delegation(clp, fp);
+	if (nfs4_delegation_exists(clp, fp))
+		status = -EAGAIN;
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
 
-- 
cgit v1.2.1


From 0af6e690f0d4e8196f6cfa10bad5c9ffff565d9b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 21 Feb 2018 15:11:03 -0500
Subject: nfsd: factor out common delegation-destruction code

Pull some duplicated code into a common helper.

This changes the order in destroy_delegation a little, but it looks to
me like that shouldn't matter.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ec485980bf0..5bbe86c6f1cb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -894,6 +894,13 @@ static void nfs4_put_deleg_lease(struct nfs4_delegation *dp)
 	}
 }
 
+static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
+{
+	put_clnt_odstate(dp->dl_clnt_odstate);
+	nfs4_put_deleg_lease(dp);
+	nfs4_put_stid(&dp->dl_stid);
+}
+
 void nfs4_unhash_stid(struct nfs4_stid *s)
 {
 	s->sc_type = 0;
@@ -985,11 +992,8 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	spin_lock(&state_lock);
 	unhashed = unhash_delegation_locked(dp);
 	spin_unlock(&state_lock);
-	if (unhashed) {
-		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp);
-		nfs4_put_stid(&dp->dl_stid);
-	}
+	if (unhashed)
+		destroy_unhashed_deleg(dp);
 }
 
 static void revoke_delegation(struct nfs4_delegation *dp)
@@ -998,17 +1002,14 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	WARN_ON(!list_empty(&dp->dl_recall_lru));
 
-	put_clnt_odstate(dp->dl_clnt_odstate);
-	nfs4_put_deleg_lease(dp);
-
-	if (clp->cl_minorversion == 0)
-		nfs4_put_stid(&dp->dl_stid);
-	else {
+	if (clp->cl_minorversion) {
 		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+		refcount_inc(&dp->dl_stid.sc_count);
 		spin_lock(&clp->cl_lock);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
 		spin_unlock(&clp->cl_lock);
 	}
+	destroy_unhashed_deleg(dp);
 }
 
 /* 
@@ -1910,9 +1911,7 @@ __destroy_client(struct nfs4_client *clp)
 	while (!list_empty(&reaplist)) {
 		dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
-		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp);
-		nfs4_put_stid(&dp->dl_stid);
+		destroy_unhashed_deleg(dp);
 	}
 	while (!list_empty(&clp->cl_revoked)) {
 		dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
@@ -7280,9 +7279,7 @@ nfs4_state_shutdown_net(struct net *net)
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		list_del_init(&dp->dl_recall_lru);
-		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_deleg_lease(dp);
-		nfs4_put_stid(&dp->dl_stid);
+		destroy_unhashed_deleg(dp);
 	}
 
 	nfsd4_client_tracking_exit(net);
-- 
cgit v1.2.1


From 86d29b10eb666f72da70bd5cf9c80b4446270c8c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 21 Feb 2018 15:27:28 -0500
Subject: nfsd: move sc_file assignment into alloc_init_deleg

Take an easy chance to simplify the caller a little.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5bbe86c6f1cb..fdcebfce5fdb 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -807,7 +807,8 @@ static void block_delegations(struct knfsd_fh *fh)
 }
 
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+		 struct svc_fh *current_fh,
 		 struct nfs4_clnt_odstate *odstate)
 {
 	struct nfs4_delegation *dp;
@@ -838,6 +839,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
 	dp->dl_retries = 1;
 	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
 		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+	get_nfs4_file(fp);
+	dp->dl_stid.sc_file = fp;
 	return dp;
 out_dec:
 	atomic_long_dec(&num_delegations);
@@ -4400,13 +4403,10 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 	if (status)
 		return ERR_PTR(status);
 
-	dp = alloc_init_deleg(clp, fh, odstate);
+	dp = alloc_init_deleg(clp, fp, fh, odstate);
 	if (!dp)
 		return ERR_PTR(-ENOMEM);
 
-	get_nfs4_file(fp);
-	dp->dl_stid.sc_file = fp;
-
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
 	if (!fp->fi_deleg_file) {
-- 
cgit v1.2.1


From 353601e7d323c700d891d0ede2bbf13c050741a5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 16 Feb 2018 14:29:42 -0500
Subject: nfsd: create a separate lease for each delegation

Currently we only take one vfs-level delegation (lease) for each file,
no matter how many clients hold delegations on that file.

Let's instead keep a one-to-one mapping between NFSv4 delegations and
VFS delegations.  This turns out to be simpler.

There is still a many-to-one mapping of NFS opens to NFS files, and the
delegations on one file are all associated with one struct file.  The
VFS can still distinguish between these delegations since we're setting
fl_owner to the struct nfs4_delegation now, not to the shared file.

I'm replacing at least one complicated function wholesale, which I don't
like to do, but I haven't figured out how to do this more incrementally.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 173 ++++++++++++++++++++--------------------------------
 1 file changed, 65 insertions(+), 108 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fdcebfce5fdb..a64ee08aadb2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -878,29 +878,34 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
 	spin_unlock(&stid->sc_lock);
 }
 
-static void nfs4_put_deleg_lease(struct nfs4_delegation *dp)
+static void put_deleg_file(struct nfs4_file *fp)
 {
 	struct file *filp = NULL;
-	struct nfs4_file *fp = dp->dl_stid.sc_file;
-
-	WARN_ON_ONCE(!fp->fi_delegees);
 
 	spin_lock(&fp->fi_lock);
-	if (--fp->fi_delegees == 0) {
+	if (--fp->fi_delegees == 0)
 		swap(filp, fp->fi_deleg_file);
-	}
 	spin_unlock(&fp->fi_lock);
 
-	if (filp) {
-		vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+	if (filp)
 		fput(filp);
-	}
+}
+
+static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
+{
+	struct nfs4_file *fp = dp->dl_stid.sc_file;
+	struct file *filp = fp->fi_deleg_file;
+
+	WARN_ON_ONCE(!fp->fi_delegees);
+
+	vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp);
+	put_deleg_file(fp);
 }
 
 static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
 {
 	put_clnt_odstate(dp->dl_clnt_odstate);
-	nfs4_put_deleg_lease(dp);
+	nfs4_unlock_deleg_lease(dp);
 	nfs4_put_stid(&dp->dl_stid);
 }
 
@@ -959,7 +964,6 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 
 	if (nfs4_delegation_exists(clp, fp))
 		return -EAGAIN;
-	++fp->fi_delegees;
 	refcount_inc(&dp->dl_stid.sc_count);
 	dp->dl_stid.sc_type = NFS4_DELEG_STID;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -3956,10 +3960,6 @@ nfsd_break_deleg_cb(struct file_lock *fl)
 	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
-	if (fp->fi_had_conflict) {
-		WARN(1, "duplicate break on %p\n", fp);
-		return ret;
-	}
 	/*
 	 * We don't want the locks code to timeout the lease for us;
 	 * we'll remove it ourself if a delegation isn't returned
@@ -3969,15 +3969,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
 
 	spin_lock(&fp->fi_lock);
 	fp->fi_had_conflict = true;
-	/*
-	 * If there are no delegations on the list, then return true
-	 * so that the lease code will go ahead and delete it.
-	 */
-	if (list_empty(&fp->fi_delegations))
-		ret = true;
-	else
-		list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
-			nfsd_break_one_deleg(dp);
+	nfsd_break_one_deleg(dp);
 	spin_unlock(&fp->fi_lock);
 	return ret;
 }
@@ -4315,121 +4307,86 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
 	fl->fl_end = OFFSET_MAX;
 	fl->fl_owner = (fl_owner_t)dp;
 	fl->fl_pid = current->tgid;
+	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file;
 	return fl;
 }
 
-/**
- * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
- * @dp:   a pointer to the nfs4_delegation we're adding.
- *
- * Return:
- *      On success: Return code will be 0 on success.
- *
- *      On error: -EAGAIN if there was an existing delegation.
- *                 nonzero if there is an error in other cases.
- *
- */
-
-static int nfs4_setlease(struct nfs4_delegation *dp)
-{
-	struct nfs4_file *fp = dp->dl_stid.sc_file;
-	struct file_lock *fl;
-	struct file *filp;
-	int status = 0;
-
-	fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
-	if (!fl)
-		return -ENOMEM;
-	filp = find_readable_file(fp);
-	if (!filp) {
-		/* We should always have a readable file here */
-		WARN_ON_ONCE(1);
-		locks_free_lock(fl);
-		return -EBADF;
-	}
-	fl->fl_file = filp;
-	status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
-	if (fl)
-		locks_free_lock(fl);
-	if (status)
-		goto out_fput;
-	spin_lock(&state_lock);
-	spin_lock(&fp->fi_lock);
-	/* Did the lease get broken before we took the lock? */
-	status = -EAGAIN;
-	if (fp->fi_had_conflict)
-		goto out_unlock;
-	/* Race breaker */
-	if (fp->fi_deleg_file) {
-		status = hash_delegation_locked(dp, fp);
-		goto out_unlock;
-	}
-	fp->fi_deleg_file = filp;
-	fp->fi_delegees = 0;
-	status = hash_delegation_locked(dp, fp);
-	spin_unlock(&fp->fi_lock);
-	spin_unlock(&state_lock);
-	if (status) {
-		/* Should never happen, this is a new fi_deleg_file  */
-		WARN_ON_ONCE(1);
-		goto out_fput;
-	}
-	return 0;
-out_unlock:
-	spin_unlock(&fp->fi_lock);
-	spin_unlock(&state_lock);
-out_fput:
-	fput(filp);
-	return status;
-}
-
 static struct nfs4_delegation *
 nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
 		    struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
 {
 	int status = 0;
 	struct nfs4_delegation *dp;
+	struct file *filp;
+	struct file_lock *fl;
 
+	/*
+	 * The fi_had_conflict and nfs_get_existing_delegation checks
+	 * here are just optimizations; we'll need to recheck them at
+	 * the end:
+	 */
 	if (fp->fi_had_conflict)
 		return ERR_PTR(-EAGAIN);
 
+	filp = find_readable_file(fp);
+	if (!filp) {
+		/* We should always have a readable file here */
+		WARN_ON_ONCE(1);
+		return ERR_PTR(-EBADF);
+	}
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
 	if (nfs4_delegation_exists(clp, fp))
 		status = -EAGAIN;
+	else if (!fp->fi_deleg_file) {
+		fp->fi_deleg_file = filp;
+		/* increment early to prevent fi_deleg_file from being
+		 * cleared */
+		fp->fi_delegees = 1;
+		filp = NULL;
+	} else
+		fp->fi_delegees++;
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-
+	if (filp)
+		fput(filp);
 	if (status)
 		return ERR_PTR(status);
 
+	status = -ENOMEM;
 	dp = alloc_init_deleg(clp, fp, fh, odstate);
 	if (!dp)
-		return ERR_PTR(-ENOMEM);
+		goto out_delegees;
+
+	fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
+	if (!fl)
+		goto out_stid;
+
+	status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL);
+	if (fl)
+		locks_free_lock(fl);
+	if (status)
+		goto out_clnt_odstate;
 
 	spin_lock(&state_lock);
 	spin_lock(&fp->fi_lock);
-	if (!fp->fi_deleg_file) {
-		spin_unlock(&fp->fi_lock);
-		spin_unlock(&state_lock);
-		status = nfs4_setlease(dp);
-		goto out;
-	}
-	if (fp->fi_had_conflict) {
+	if (fp->fi_had_conflict)
 		status = -EAGAIN;
-		goto out_unlock;
-	}
-	status = hash_delegation_locked(dp, fp);
-out_unlock:
+	else
+		status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
 	spin_unlock(&state_lock);
-out:
-	if (status) {
-		put_clnt_odstate(dp->dl_clnt_odstate);
-		nfs4_put_stid(&dp->dl_stid);
-		return ERR_PTR(status);
-	}
+
+	if (status)
+		destroy_unhashed_deleg(dp);
 	return dp;
+out_clnt_odstate:
+	put_clnt_odstate(dp->dl_clnt_odstate);
+out_stid:
+	nfs4_put_stid(&dp->dl_stid);
+out_delegees:
+	put_deleg_file(fp);
+	return ERR_PTR(status);
 }
 
 static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
-- 
cgit v1.2.1


From 51d87bc2bfa7ff5381941654738540fb23cc5f6a Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Thu, 22 Mar 2018 13:37:20 +0800
Subject: nfsd: fix boolreturn.cocci warnings

fs/nfsd/nfs4state.c:926:8-9: WARNING: return of 0/1 in function 'nfs4_delegation_exists' with return type bool
fs/nfsd/nfs4state.c:2955:9-10: WARNING: return of 0/1 in function 'nfsd4_compound_in_session' with return type bool

 Return statements in functions returning bool should use
 true/false instead of 1/0.
Generated by: scripts/coccinelle/misc/boolreturn.cocci

Fixes: 68b18f52947b ("nfsd: make nfs4_get_existing_delegation less confusing")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
[bfields: also fix -EAGAIN]
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a64ee08aadb2..fc74d6f46bd5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -935,10 +935,10 @@ nfs4_delegation_exists(struct nfs4_client *clp, struct nfs4_file *fp)
 	list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
 		searchclp = searchdp->dl_stid.sc_client;
 		if (clp == searchclp) {
-			return -EAGAIN;
+			return true;
 		}
 	}
-	return 0;
+	return false;
 }
 
 /**
@@ -2958,7 +2958,7 @@ out_no_session:
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
 {
 	if (!session)
-		return 0;
+		return false;
 	return !memcmp(sid, &session->se_sessionid, sizeof(*sid));
 }
 
-- 
cgit v1.2.1


From 47299f79ea4742b191931ffc33cde2401a9792e4 Mon Sep 17 00:00:00 2001
From: Stefan Agner <stefan@agner.ch>
Date: Mon, 19 Mar 2018 23:37:05 +0100
Subject: nfsd: use correct enum type in decode_cb_op_status

Use enum nfs_cb_opnum4 in decode_cb_op_status. This fixes warnings
seen with clang:
  fs/nfsd/nfs4callback.c:451:36: warning: implicit conversion from
      enumeration type 'enum nfs_cb_opnum4' to different enumeration
      type 'enum nfs_opnum4' [-Wenum-conversion]
        status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
                 ~~~~~~~~~~~~~~~~~~~      ^~~~~~~~~~~~~~

Signed-off-by: Stefan Agner <stefan@agner.ch>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 49b0a9e7ff18..1f04d2a70d25 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -223,8 +223,8 @@ static int nfs_cb_stat_to_errno(int status)
 	return -status;
 }
 
-static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected,
-			       int *status)
+static int decode_cb_op_status(struct xdr_stream *xdr,
+			       enum nfs_cb_opnum4 expected, int *status)
 {
 	__be32 *p;
 	u32 op;
-- 
cgit v1.2.1


From 63a1b1569372860fdef9e25edfc2320766b2f4c2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:49:22 -0400
Subject: sunrpc: Remove unneeded pointer dereference

Clean up: Noticed during code inspection that there is already a
local automatic variable "xprt" so dereferencing rqst->rq_xprt
again is unnecessary.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f9307bd6644b..6dca0f513c95 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -492,7 +492,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
 {
 	struct svc_xprt	*xprt = rqstp->rq_xprt;
 
-	rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+	xprt->xpt_ops->xpo_release_rqst(rqstp);
 
 	kfree(rqstp->rq_deferred);
 	rqstp->rq_deferred = NULL;
@@ -889,7 +889,7 @@ int svc_send(struct svc_rqst *rqstp)
 		goto out;
 
 	/* release the receive skb before sending the reply */
-	rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp);
+	xprt->xpt_ops->xpo_release_rqst(rqstp);
 
 	/* calculate over-all length */
 	xb = &rqstp->rq_res;
-- 
cgit v1.2.1


From 989f881ebf77d70e883dd0fbcfa04a058d97f771 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:49:38 -0400
Subject: svc: Simplify ->xpo_secure_port

Clean up: Instead of returning a value that is used to set or clear
a bit, just make ->xpo_secure_port mangle that bit, and return void.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          | 2 +-
 net/sunrpc/svc_xprt.c                    | 5 +----
 net/sunrpc/svcsock.c                     | 7 +++++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 1caf7bc83306..19475acb68ea 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -25,7 +25,7 @@ struct svc_xprt_ops {
 	void		(*xpo_release_rqst)(struct svc_rqst *);
 	void		(*xpo_detach)(struct svc_xprt *);
 	void		(*xpo_free)(struct svc_xprt *);
-	int		(*xpo_secure_port)(struct svc_rqst *);
+	void		(*xpo_secure_port)(struct svc_rqst *rqstp);
 	void		(*xpo_kill_temp_xprt)(struct svc_xprt *);
 };
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 6dca0f513c95..4e3b4c596bae 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -844,10 +844,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 
 	clear_bit(XPT_OLD, &xprt->xpt_flags);
 
-	if (xprt->xpt_ops->xpo_secure_port(rqstp))
-		set_bit(RQ_SECURE, &rqstp->rq_flags);
-	else
-		clear_bit(RQ_SECURE, &rqstp->rq_flags);
+	xprt->xpt_ops->xpo_secure_port(rqstp);
 	rqstp->rq_chandle.defer = svc_defer;
 	rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]);
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 943f2a745cd5..9b6703588e35 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -391,9 +391,12 @@ static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
 	release_sock(sock->sk);
 }
 
-static int svc_sock_secure_port(struct svc_rqst *rqstp)
+static void svc_sock_secure_port(struct svc_rqst *rqstp)
 {
-	return svc_port_is_privileged(svc_addr(rqstp));
+	if (svc_port_is_privileged(svc_addr(rqstp)))
+		set_bit(RQ_SECURE, &rqstp->rq_flags);
+	else
+		clear_bit(RQ_SECURE, &rqstp->rq_flags);
 }
 
 /*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 7b2f4d3a2543..17da06d6b8e5 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -69,7 +69,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *);
 static void svc_rdma_detach(struct svc_xprt *xprt);
 static void svc_rdma_free(struct svc_xprt *xprt);
 static int svc_rdma_has_wspace(struct svc_xprt *xprt);
-static int svc_rdma_secure_port(struct svc_rqst *);
+static void svc_rdma_secure_port(struct svc_rqst *);
 static void svc_rdma_kill_temp_xprt(struct svc_xprt *);
 
 static const struct svc_xprt_ops svc_rdma_ops = {
@@ -988,9 +988,9 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
 	return 1;
 }
 
-static int svc_rdma_secure_port(struct svc_rqst *rqstp)
+static void svc_rdma_secure_port(struct svc_rqst *rqstp)
 {
-	return 1;
+	set_bit(RQ_SECURE, &rqstp->rq_flags);
 }
 
 static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt)
-- 
cgit v1.2.1


From 03edb90f57a5f7edaa13db3174e269331ef9a527 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:49:53 -0400
Subject: sunrpc: Update show_svc_xprt_flags() to include recently added flags

XPT_KILL_TEMP was added by commit 546125d16142 ("sunrpc: don't call
sleeping functions from the notifier block callbacks"), and
XPT_CONG_CTRL was added by commit 362142b25843 ("sunrpc: flag
transports as having congestion control") .

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/trace/events/sunrpc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 970c91a83173..a23144471b6b 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -591,7 +591,9 @@ DEFINE_EVENT(svc_rqst_status, svc_send,
 		{ (1UL << XPT_OLD),		"XPT_OLD"},		\
 		{ (1UL << XPT_LISTENER),	"XPT_LISTENER"},	\
 		{ (1UL << XPT_CACHE_AUTH),	"XPT_CACHE_AUTH"},	\
-		{ (1UL << XPT_LOCAL),		"XPT_LOCAL"})
+		{ (1UL << XPT_LOCAL),		"XPT_LOCAL"},		\
+		{ (1UL << XPT_KILL_TEMP),	"XPT_KILL_TEMP"},	\
+		{ (1UL << XPT_CONG_CTRL),	"XPT_CONG_CTRL"})
 
 TRACE_EVENT(svc_xprt_do_enqueue,
 	TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst),
-- 
cgit v1.2.1


From caa3e106dc623eb41542e6221abecf9956e8a0e6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:50:07 -0400
Subject: sunrpc: Move trace_svc_xprt_dequeue()

Reduce the amount of noise generated by trace_svc_xprt_dequeue by
moving it to the end of svc_get_next_xprt. This generates exactly
one trace event when a ready xprt is found, rather than spurious
events when there is no work to do. The empty events contain no
information that can't be obtained simply by tracing function calls
to svc_xprt_dequeue.

A small additional benefit is simplification of the svc_xprt_event
trace class, which no longer has to handle the case when the @xprt
parameter is NULL.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/trace/events/sunrpc.h | 16 +++++-----------
 net/sunrpc/svc_xprt.c         |  5 +----
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index a23144471b6b..9bba3070f873 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -634,24 +634,18 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
 	TP_STRUCT__entry(
 		__field(struct svc_xprt *, xprt)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, xprt != NULL ?
-			xprt->xpt_remotelen : 0)
+		__dynamic_array(unsigned char, addr, xprt->xpt_remotelen)
 	),
 
 	TP_fast_assign(
 		__entry->xprt = xprt;
-		if (xprt) {
-			memcpy(__get_dynamic_array(addr),
-					&xprt->xpt_remote,
-					xprt->xpt_remotelen);
-			__entry->flags = xprt->xpt_flags;
-		} else
-			__entry->flags = 0;
+		__entry->flags = xprt->xpt_flags;
+		memcpy(__get_dynamic_array(addr),
+		       &xprt->xpt_remote, xprt->xpt_remotelen);
 	),
 
 	TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
-		__get_dynamic_array_len(addr) != 0 ?
-			(struct sockaddr *)__get_dynamic_array(addr) : NULL,
+		(struct sockaddr *)__get_dynamic_array(addr),
 		show_svc_xprt_flags(__entry->flags))
 );
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 4e3b4c596bae..71f47187b4ec 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -454,13 +454,9 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool)
 					struct svc_xprt, xpt_ready);
 		list_del_init(&xprt->xpt_ready);
 		svc_xprt_get(xprt);
-
-		dprintk("svc: transport %p dequeued, inuse=%d\n",
-			xprt, kref_read(&xprt->xpt_ref));
 	}
 	spin_unlock_bh(&pool->sp_lock);
 out:
-	trace_svc_xprt_dequeue(xprt);
 	return xprt;
 }
 
@@ -734,6 +730,7 @@ out_found:
 		rqstp->rq_chandle.thread_wait = 5*HZ;
 	else
 		rqstp->rq_chandle.thread_wait = 1*HZ;
+	trace_svc_xprt_dequeue(rqstp->rq_xprt);
 	return rqstp->rq_xprt;
 }
 
-- 
cgit v1.2.1


From 7dbb53baed3c3969dea43e3cee261a75adde123c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:50:27 -0400
Subject: sunrpc: Simplify do_enqueue tracing

There are three cases where svc_xprt_do_enqueue() returns without
waking an nfsd thread:

1. There is no work to do

2. The transport is already busy

3. There are no available nfsd threads

Only 3. is truly interesting. Move the trace point so it records
that there was work to do and either an nfsd thread was awoken, or
a free one could not found.

As an additional clean up, remove a redundant comment and a couple
of dprintk call sites.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/svc_xprt.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 71f47187b4ec..5fe150c78d0a 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -382,25 +382,21 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 	int cpu;
 
 	if (!svc_xprt_has_something_to_do(xprt))
-		goto out;
+		return;
 
 	/* Mark transport as busy. It will remain in this state until
 	 * the provider calls svc_xprt_received. We update XPT_BUSY
 	 * atomically because it also guards against trying to enqueue
 	 * the transport twice.
 	 */
-	if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) {
-		/* Don't enqueue transport while already enqueued */
-		dprintk("svc: transport %p busy, not enqueued\n", xprt);
-		goto out;
-	}
+	if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags))
+		return;
 
 	cpu = get_cpu();
 	pool = svc_pool_for_cpu(xprt->xpt_server, cpu);
 
 	atomic_long_inc(&pool->sp_stats.packets);
 
-	dprintk("svc: transport %p put into queue\n", xprt);
 	spin_lock_bh(&pool->sp_lock);
 	list_add_tail(&xprt->xpt_ready, &pool->sp_sockets);
 	pool->sp_stats.sockets_queued++;
@@ -420,7 +416,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 out_unlock:
 	rcu_read_unlock();
 	put_cpu();
-out:
 	trace_svc_xprt_do_enqueue(xprt, rqstp);
 }
 EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
-- 
cgit v1.2.1


From 41f306d0c287e0cc04054135f9f4ceb003ad6795 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:50:42 -0400
Subject: sunrpc: Simplify trace_svc_recv

There doesn't seem to be a lot of value in calling trace_svc_recv
in the failing case.

1. There are two very common cases: one is the transport is not
ready, and the other is shutdown. Neither is terribly interesting.

2. The trace record for the failing case contains nothing but
the status code.

Therefore the trace point call site in the error exit is removed.
Since the trace point is now recording a length instead of a
status, rename the status field and remove the case that records a
zero XID.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/trace/events/sunrpc.h | 14 +++++++-------
 net/sunrpc/svc_xprt.c         |  1 -
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 9bba3070f873..5849bfb3ece2 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -485,28 +485,28 @@ TRACE_EVENT(xs_tcp_data_recv,
 		{ (1UL << RQ_BUSY),		"RQ_BUSY"})
 
 TRACE_EVENT(svc_recv,
-	TP_PROTO(struct svc_rqst *rqst, int status),
+	TP_PROTO(struct svc_rqst *rqst, int len),
 
-	TP_ARGS(rqst, status),
+	TP_ARGS(rqst, len),
 
 	TP_STRUCT__entry(
 		__field(u32, xid)
-		__field(int, status)
+		__field(int, len)
 		__field(unsigned long, flags)
 		__dynamic_array(unsigned char, addr, rqst->rq_addrlen)
 	),
 
 	TP_fast_assign(
-		__entry->xid = status > 0 ? be32_to_cpu(rqst->rq_xid) : 0;
-		__entry->status = status;
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->len = len;
 		__entry->flags = rqst->rq_flags;
 		memcpy(__get_dynamic_array(addr),
 			&rqst->rq_addr, rqst->rq_addrlen);
 	),
 
-	TP_printk("addr=%pIScp xid=0x%08x status=%d flags=%s",
+	TP_printk("addr=%pIScp xid=0x%08x len=%d flags=%s",
 			(struct sockaddr *)__get_dynamic_array(addr),
-			__entry->xid, __entry->status,
+			__entry->xid, __entry->len,
 			show_rqstp_flags(__entry->flags))
 );
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 5fe150c78d0a..47384d0b1673 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -848,7 +848,6 @@ out_release:
 	rqstp->rq_res.len = 0;
 	svc_xprt_release(rqstp);
 out:
-	trace_svc_recv(rqstp, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(svc_recv);
-- 
cgit v1.2.1


From ece200ddd54b9ce840cfee554fb812560c545c7d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:51:00 -0400
Subject: sunrpc: Save remote presentation address in svc_xprt for trace events

TP_printk defines a format string that is passed to user space for
converting raw trace event records to something human-readable.

My user space's printf (Oracle Linux 7), however, does not have a
%pI format specifier. The result is that what is supposed to be an
IP address in the output of "trace-cmd report" is just a string that
says the field couldn't be displayed.

To fix this, adopt the same approach as the client: maintain a pre-
formated presentation address for occasions when %pI is not
available.

The location of the trace_svc_send trace point is adjusted so that
rqst->rq_xprt is not NULL when the trace event is recorded.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h          |  4 ++
 include/trace/events/sunrpc.h            | 89 ++++++++++++--------------------
 net/sunrpc/svc_xprt.c                    |  3 +-
 net/sunrpc/svcsock.c                     |  1 +
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  4 +-
 5 files changed, 43 insertions(+), 58 deletions(-)

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 19475acb68ea..c3d72066d4b1 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -83,6 +83,7 @@ struct svc_xprt {
 	size_t			xpt_locallen;	/* length of address */
 	struct sockaddr_storage	xpt_remote;	/* remote peer's address */
 	size_t			xpt_remotelen;	/* length of address */
+	char			xpt_remotebuf[INET6_ADDRSTRLEN + 10];
 	struct rpc_wait_queue	xpt_bc_pending;	/* backchannel wait queue */
 	struct list_head	xpt_users;	/* callbacks on free */
 
@@ -152,7 +153,10 @@ static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
 {
 	memcpy(&xprt->xpt_remote, sa, salen);
 	xprt->xpt_remotelen = salen;
+	snprintf(xprt->xpt_remotebuf, sizeof(xprt->xpt_remotebuf) - 1,
+		 "%pISpc", sa);
 }
+
 static inline unsigned short svc_addr_port(const struct sockaddr *sa)
 {
 	const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 5849bfb3ece2..1ec8c4c45766 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -493,20 +493,18 @@ TRACE_EVENT(svc_recv,
 		__field(u32, xid)
 		__field(int, len)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, rqst->rq_addrlen)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->len = len;
 		__entry->flags = rqst->rq_flags;
-		memcpy(__get_dynamic_array(addr),
-			&rqst->rq_addr, rqst->rq_addrlen);
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
 	),
 
-	TP_printk("addr=%pIScp xid=0x%08x len=%d flags=%s",
-			(struct sockaddr *)__get_dynamic_array(addr),
-			__entry->xid, __entry->len,
+	TP_printk("addr=%s xid=0x%08x len=%d flags=%s",
+			__get_str(addr), __entry->xid, __entry->len,
 			show_rqstp_flags(__entry->flags))
 );
 
@@ -519,20 +517,18 @@ DECLARE_EVENT_CLASS(svc_rqst_event,
 	TP_STRUCT__entry(
 		__field(u32, xid)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, rqst->rq_addrlen)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->flags = rqst->rq_flags;
-		memcpy(__get_dynamic_array(addr),
-			&rqst->rq_addr, rqst->rq_addrlen);
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
 	),
 
-	TP_printk("addr=%pIScp rq_xid=0x%08x flags=%s",
-		(struct sockaddr *)__get_dynamic_array(addr),
-		__entry->xid,
-		show_rqstp_flags(__entry->flags))
+	TP_printk("addr=%s xid=0x%08x flags=%s",
+			__get_str(addr), __entry->xid,
+			show_rqstp_flags(__entry->flags))
 );
 
 DEFINE_EVENT(svc_rqst_event, svc_defer,
@@ -553,21 +549,19 @@ DECLARE_EVENT_CLASS(svc_rqst_status,
 		__field(u32, xid)
 		__field(int, status)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, rqst->rq_addrlen)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->status = status;
 		__entry->flags = rqst->rq_flags;
-		memcpy(__get_dynamic_array(addr),
-			&rqst->rq_addr, rqst->rq_addrlen);
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
 	),
 
-	TP_printk("addr=%pIScp rq_xid=0x%08x status=%d flags=%s",
-		(struct sockaddr *)__get_dynamic_array(addr),
-		__entry->xid,
-		__entry->status, show_rqstp_flags(__entry->flags))
+	TP_printk("addr=%s xid=0x%08x status=%d flags=%s",
+		  __get_str(addr), __entry->xid,
+		  __entry->status, show_rqstp_flags(__entry->flags))
 );
 
 DEFINE_EVENT(svc_rqst_status, svc_process,
@@ -604,26 +598,19 @@ TRACE_EVENT(svc_xprt_do_enqueue,
 		__field(struct svc_xprt *, xprt)
 		__field(int, pid)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, xprt != NULL ?
-			xprt->xpt_remotelen : 0)
+		__string(addr, xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xprt = xprt;
 		__entry->pid = rqst? rqst->rq_task->pid : 0;
-		if (xprt) {
-			memcpy(__get_dynamic_array(addr),
-				&xprt->xpt_remote,
-				xprt->xpt_remotelen);
-			__entry->flags = xprt->xpt_flags;
-		} else
-			__entry->flags = 0;
+		__entry->flags = xprt->xpt_flags;
+		__assign_str(addr, xprt->xpt_remotebuf);
 	),
 
-	TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt,
-		__get_dynamic_array_len(addr) != 0 ?
-			(struct sockaddr *)__get_dynamic_array(addr) : NULL,
-		__entry->pid, show_svc_xprt_flags(__entry->flags))
+	TP_printk("xprt=%p addr=%s pid=%d flags=%s",
+			__entry->xprt, __get_str(addr),
+			__entry->pid, show_svc_xprt_flags(__entry->flags))
 );
 
 DECLARE_EVENT_CLASS(svc_xprt_event,
@@ -634,19 +621,18 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
 	TP_STRUCT__entry(
 		__field(struct svc_xprt *, xprt)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, xprt->xpt_remotelen)
+		__string(addr, xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xprt = xprt;
 		__entry->flags = xprt->xpt_flags;
-		memcpy(__get_dynamic_array(addr),
-		       &xprt->xpt_remote, xprt->xpt_remotelen);
+		__assign_str(addr, xprt->xpt_remotebuf);
 	),
 
-	TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt,
-		(struct sockaddr *)__get_dynamic_array(addr),
-		show_svc_xprt_flags(__entry->flags))
+	TP_printk("xprt=%p addr=%s flags=%s",
+			__entry->xprt, __get_str(addr),
+			show_svc_xprt_flags(__entry->flags))
 );
 
 DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue,
@@ -682,25 +668,18 @@ TRACE_EVENT(svc_handle_xprt,
 		__field(struct svc_xprt *, xprt)
 		__field(int, len)
 		__field(unsigned long, flags)
-		__dynamic_array(unsigned char, addr, xprt != NULL ?
-			xprt->xpt_remotelen : 0)
+		__string(addr, xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xprt = xprt;
 		__entry->len = len;
-		if (xprt) {
-			memcpy(__get_dynamic_array(addr),
-					&xprt->xpt_remote,
-					xprt->xpt_remotelen);
-			__entry->flags = xprt->xpt_flags;
-		} else
-			__entry->flags = 0;
+		__entry->flags = xprt->xpt_flags;
+		__assign_str(addr, xprt->xpt_remotebuf);
 	),
 
-	TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt,
-		__get_dynamic_array_len(addr) != 0 ?
-			(struct sockaddr *)__get_dynamic_array(addr) : NULL,
+	TP_printk("xprt=%p addr=%s len=%d flags=%s",
+		__entry->xprt, __get_str(addr),
 		__entry->len, show_svc_xprt_flags(__entry->flags))
 );
 
@@ -712,18 +691,16 @@ DECLARE_EVENT_CLASS(svc_deferred_event,
 
 	TP_STRUCT__entry(
 		__field(u32, xid)
-		__dynamic_array(unsigned char, addr, dr->addrlen)
+		__string(addr, dr->xprt->xpt_remotebuf)
 	),
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(*(__be32 *)(dr->args +
 						       (dr->xprt_hlen>>2)));
-		memcpy(__get_dynamic_array(addr), &dr->addr, dr->addrlen);
+		__assign_str(addr, dr->xprt->xpt_remotebuf);
 	),
 
-	TP_printk("addr=%pIScp xid=0x%08x",
-		(struct sockaddr *)__get_dynamic_array(addr),
-		__entry->xid)
+	TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid)
 );
 
 DEFINE_EVENT(svc_deferred_event, svc_drop_deferred,
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 47384d0b1673..f745754a55ea 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -173,6 +173,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
 	set_bit(XPT_BUSY, &xprt->xpt_flags);
 	rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending");
 	xprt->xpt_net = get_net(net);
+	strcpy(xprt->xpt_remotebuf, "uninitialized");
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
 
@@ -894,12 +895,12 @@ int svc_send(struct svc_rqst *rqstp)
 		len = xprt->xpt_ops->xpo_sendto(rqstp);
 	mutex_unlock(&xprt->xpt_mutex);
 	rpc_wake_up(&xprt->xpt_bc_pending);
+	trace_svc_send(rqstp, len);
 	svc_xprt_release(rqstp);
 
 	if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
 		len = 0;
 out:
-	trace_svc_send(rqstp, len);
 	return len;
 }
 
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 9b6703588e35..4ca1d92b531a 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1310,6 +1310,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
 	set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
 	if (sk->sk_state == TCP_LISTEN) {
 		dprintk("setting up TCP socket for listening\n");
+		strcpy(svsk->sk_xprt.xpt_remotebuf, "listener");
 		set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
 		sk->sk_data_ready = svc_tcp_listen_data_ready;
 		set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 17da06d6b8e5..96cc8f6597d3 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -401,8 +401,10 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
 	 */
 	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
 
-	if (listener)
+	if (listener) {
+		strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener");
 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
+	}
 
 	return cma_xprt;
 }
-- 
cgit v1.2.1


From 0b9547bf6b94317b3f8e2496dc2b44cb6e599b01 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:51:22 -0400
Subject: sunrpc: Re-purpose trace_svc_process

Currently, trace_svc_process has two call sites:

1. Just after a call to svc_send. svc_send already invokes
   trace_svc_send with the same arguments just before returning

2. Just before a call to svc_drop. svc_drop already invokes
   trace_svc_drop with the same arguments just after it is called

Therefore trace_svc_process does not provide any additional
information not already provided by these other trace points.

However, it would be useful to record the incoming RPC procedure.
So reuse trace_svc_process for this purpose.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/trace/events/sunrpc.h | 30 ++++++++++++++++++++++++++----
 net/sunrpc/svc.c              |  9 +++------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 1ec8c4c45766..5a8157c04900 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -508,6 +508,32 @@ TRACE_EVENT(svc_recv,
 			show_rqstp_flags(__entry->flags))
 );
 
+TRACE_EVENT(svc_process,
+	TP_PROTO(const struct svc_rqst *rqst, const char *name),
+
+	TP_ARGS(rqst, name),
+
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(u32, vers)
+		__field(u32, proc)
+		__string(service, name)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
+	),
+
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->vers = rqst->rq_vers;
+		__entry->proc = rqst->rq_proc;
+		__assign_str(service, name);
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+	),
+
+	TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u",
+			__get_str(addr), __entry->xid,
+			__get_str(service), __entry->vers, __entry->proc)
+);
+
 DECLARE_EVENT_CLASS(svc_rqst_event,
 
 	TP_PROTO(struct svc_rqst *rqst),
@@ -564,10 +590,6 @@ DECLARE_EVENT_CLASS(svc_rqst_status,
 		  __entry->status, show_rqstp_flags(__entry->flags))
 );
 
-DEFINE_EVENT(svc_rqst_status, svc_process,
-	TP_PROTO(struct svc_rqst *rqst, int status),
-	TP_ARGS(rqst, status));
-
 DEFINE_EVENT(svc_rqst_status, svc_send,
 	TP_PROTO(struct svc_rqst *rqst, int status),
 	TP_ARGS(rqst, status));
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 387cc4add6f6..f19987f5d3b5 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1255,6 +1255,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
 
 	/* Syntactic check complete */
 	serv->sv_stats->rpccnt++;
+	trace_svc_process(rqstp, progp->pg_name);
 
 	/* Build the reply header. */
 	statp = resv->iov_base +resv->iov_len;
@@ -1431,14 +1432,10 @@ svc_process(struct svc_rqst *rqstp)
 	}
 
 	/* Returns 1 for send, 0 for drop */
-	if (likely(svc_process_common(rqstp, argv, resv))) {
-		int ret = svc_send(rqstp);
+	if (likely(svc_process_common(rqstp, argv, resv)))
+		return svc_send(rqstp);
 
-		trace_svc_process(rqstp, ret);
-		return ret;
-	}
 out_drop:
-	trace_svc_process(rqstp, 0);
 	svc_drop(rqstp);
 	return 0;
 }
-- 
cgit v1.2.1


From aaba72cd4e793fbf1c04e06dee3d2c3710339678 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:51:39 -0400
Subject: sunrpc: Report per-RPC execution stats

Introduce a mechanism to report the server-side execution latency of
each RPC. The goal is to enable user space to filter the trace
record for latency outliers, build histograms, etc.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h    |  1 +
 include/trace/events/sunrpc.h | 21 +++++++++++++++++++++
 net/sunrpc/svc_xprt.c         |  3 ++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 786ae2255f05..3bd7504066e1 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -283,6 +283,7 @@ struct svc_rqst {
 	int			rq_reserved;	/* space on socket outq
 						 * reserved for this request
 						 */
+	ktime_t			rq_stime;	/* start time */
 
 	struct cache_req	rq_chandle;	/* handle passed to caches for 
 						 * request delaying 
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 5a8157c04900..1b383f71ccd7 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -705,6 +705,27 @@ TRACE_EVENT(svc_handle_xprt,
 		__entry->len, show_svc_xprt_flags(__entry->flags))
 );
 
+TRACE_EVENT(svc_stats_latency,
+	TP_PROTO(const struct svc_rqst *rqst),
+
+	TP_ARGS(rqst),
+
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(unsigned long, execute)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
+	),
+
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->execute = ktime_to_us(ktime_sub(ktime_get(),
+							 rqst->rq_stime));
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+	),
+
+	TP_printk("addr=%s xid=0x%08x execute-us=%lu",
+		__get_str(addr), __entry->xid, __entry->execute)
+);
 
 DECLARE_EVENT_CLASS(svc_deferred_event,
 	TP_PROTO(struct svc_deferred_req *dr),
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index f745754a55ea..a7425da14f5b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -782,7 +782,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
 			len = svc_deferred_recv(rqstp);
 		else
 			len = xprt->xpt_ops->xpo_recvfrom(rqstp);
-		dprintk("svc: got len=%d\n", len);
+		rqstp->rq_stime = ktime_get();
 		rqstp->rq_reserved = serv->sv_max_mesg;
 		atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
 	}
@@ -888,6 +888,7 @@ int svc_send(struct svc_rqst *rqstp)
 
 	/* Grab mutex to serialize outgoing data. */
 	mutex_lock(&xprt->xpt_mutex);
+	trace_svc_stats_latency(rqstp);
 	if (test_bit(XPT_DEAD, &xprt->xpt_flags)
 			|| test_bit(XPT_CLOSE, &xprt->xpt_flags))
 		len = -ENOTCONN;
-- 
cgit v1.2.1


From 55f5088c22cc83dbc64394abfbf76cd1ff5e7cd0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:52:27 -0400
Subject: svc: Report xprt dequeue latency

Record the time between when a rqstp is enqueued on a transport
and when it is dequeued. This includes how long the rqstp waits on
the queue and how long it takes the kernel scheduler to wake a
nfsd thread to service it.

The svc_xprt_dequeue trace point is altered to include the number
of microseconds between xprt_enqueue and xprt_dequeue.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc.h    |  1 +
 include/trace/events/sunrpc.h | 30 ++++++++++++++++++++++++++----
 net/sunrpc/svc_xprt.c         |  4 ++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3bd7504066e1..dc4c009deec1 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -272,6 +272,7 @@ struct svc_rqst {
 #define	RQ_BUSY		(6)			/* request is busy */
 #define	RQ_DATA		(7)			/* request has data */
 	unsigned long		rq_flags;	/* flags field */
+	ktime_t			rq_qtime;	/* enqueue time */
 
 	void *			rq_argp;	/* decoded arguments */
 	void *			rq_resp;	/* xdr'd results */
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index 1b383f71ccd7..922cb8968fb2 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -657,14 +657,36 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
 			show_svc_xprt_flags(__entry->flags))
 );
 
-DEFINE_EVENT(svc_xprt_event, svc_xprt_dequeue,
-	TP_PROTO(struct svc_xprt *xprt),
-	TP_ARGS(xprt));
-
 DEFINE_EVENT(svc_xprt_event, svc_xprt_no_write_space,
 	TP_PROTO(struct svc_xprt *xprt),
 	TP_ARGS(xprt));
 
+TRACE_EVENT(svc_xprt_dequeue,
+	TP_PROTO(struct svc_rqst *rqst),
+
+	TP_ARGS(rqst),
+
+	TP_STRUCT__entry(
+		__field(struct svc_xprt *, xprt)
+		__field(unsigned long, flags)
+		__field(unsigned long, wakeup)
+		__string(addr, rqst->rq_xprt->xpt_remotebuf)
+	),
+
+	TP_fast_assign(
+		__entry->xprt = rqst->rq_xprt;
+		__entry->flags = rqst->rq_xprt->xpt_flags;
+		__entry->wakeup = ktime_to_us(ktime_sub(ktime_get(),
+							rqst->rq_qtime));
+		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+	),
+
+	TP_printk("xprt=%p addr=%s flags=%s wakeup-us=%lu",
+			__entry->xprt, __get_str(addr),
+			show_svc_xprt_flags(__entry->flags),
+			__entry->wakeup)
+);
+
 TRACE_EVENT(svc_wake_up,
 	TP_PROTO(int pid),
 
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index a7425da14f5b..5185efb9027b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -409,6 +409,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
 		if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags))
 			continue;
 		atomic_long_inc(&pool->sp_stats.threads_woken);
+		rqstp->rq_qtime = ktime_get();
 		wake_up_process(rqstp->rq_task);
 		goto out_unlock;
 	}
@@ -530,7 +531,6 @@ void svc_wake_up(struct svc_serv *serv)
 		if (test_bit(RQ_BUSY, &rqstp->rq_flags))
 			continue;
 		rcu_read_unlock();
-		dprintk("svc: daemon %p woken up.\n", rqstp);
 		wake_up_process(rqstp->rq_task);
 		trace_svc_wake_up(rqstp->rq_task->pid);
 		return;
@@ -726,7 +726,7 @@ out_found:
 		rqstp->rq_chandle.thread_wait = 5*HZ;
 	else
 		rqstp->rq_chandle.thread_wait = 1*HZ;
-	trace_svc_xprt_dequeue(rqstp->rq_xprt);
+	trace_svc_xprt_dequeue(rqstp);
 	return rqstp->rq_xprt;
 }
 
-- 
cgit v1.2.1


From afa720a0910bd0fe4a714ab9b902ff6e845900c3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:52:43 -0400
Subject: nfsd: Fix NFSD trace points

nfsd-1915  [003] 77915.780959: write_opened:
	[FAILED TO PARSE] xid=3286130958 fh=0 offset=154624 len=1
nfsd-1915  [003] 77915.780960: write_io_done:
	[FAILED TO PARSE] xid=3286130958 fh=0 offset=154624 len=1
nfsd-1915  [003] 77915.780964: write_done:
	[FAILED TO PARSE] xid=3286130958 fh=0 offset=154624 len=1

Byte swapping and knfsd_fh_hash() are not available in "trace-cmd
report", where the print format string is actually used. These
data transformations have to be done during the TP_fast_assign step.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/trace.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 8b2f1d92c579..3aa3f3b1eeaa 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -18,19 +18,24 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
 		 int		len),
 	TP_ARGS(rqstp, fhp, offset, len),
 	TP_STRUCT__entry(
-		__field(__be32, xid)
-		__field_struct(struct knfsd_fh, fh)
+		__field(u32, xid)
+		__field(u32, fh_hash)
 		__field(loff_t, offset)
 		__field(int, len)
 	),
 	TP_fast_assign(
-		__entry->xid = rqstp->rq_xid,
-		fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		do {
+			struct knfsd_fh fh;
+
+			fh_copy_shallow(&fh, &fhp->fh_handle);
+			__entry->fh_hash = knfsd_fh_hash(&fh);
+		} while (0);
 		__entry->offset = offset;
 		__entry->len = len;
 	),
-	TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
-		  __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%d",
+		  __entry->xid, __entry->fh_hash,
 		  __entry->offset, __entry->len)
 )
 
-- 
cgit v1.2.1


From 79e0b4e247b3d1787ade451cc0187e8d3b0e6e53 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:52:57 -0400
Subject: nfsd: Record request byte count, not count of vectors

Byte count is more helpful to know than vector count.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/trace.h | 15 +++++----------
 fs/nfsd/vfs.c   | 16 ++++++++--------
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 3aa3f3b1eeaa..9b7950602fad 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -15,26 +15,21 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
 	TP_PROTO(struct svc_rqst *rqstp,
 		 struct svc_fh	*fhp,
 		 loff_t		offset,
-		 int		len),
+		 unsigned long	len),
 	TP_ARGS(rqstp, fhp, offset, len),
 	TP_STRUCT__entry(
 		__field(u32, xid)
 		__field(u32, fh_hash)
 		__field(loff_t, offset)
-		__field(int, len)
+		__field(unsigned long, len)
 	),
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqstp->rq_xid);
-		do {
-			struct knfsd_fh fh;
-
-			fh_copy_shallow(&fh, &fhp->fh_handle);
-			__entry->fh_hash = knfsd_fh_hash(&fh);
-		} while (0);
+		__entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
 		__entry->offset = offset;
 		__entry->len = len;
 	),
-	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%d",
+	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu",
 		  __entry->xid, __entry->fh_hash,
 		  __entry->offset, __entry->len)
 )
@@ -44,7 +39,7 @@ DEFINE_EVENT(nfsd_io_class, name,		\
 	TP_PROTO(struct svc_rqst *rqstp,	\
 		 struct svc_fh	*fhp,		\
 		 loff_t		offset,		\
-		 int		len),		\
+		 unsigned long	len),		\
 	TP_ARGS(rqstp, fhp, offset, len))
 
 DEFINE_NFSD_IO_EVENT(read_start);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a3c9bfa77def..e2eab4c37030 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1024,27 +1024,27 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct raparms	*ra;
 	__be32 err;
 
-	trace_read_start(rqstp, fhp, offset, vlen);
+	trace_read_start(rqstp, fhp, offset, *count);
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (err)
 		return err;
 
 	ra = nfsd_init_raparms(file);
 
-	trace_read_opened(rqstp, fhp, offset, vlen);
+	trace_read_opened(rqstp, fhp, offset, *count);
 
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
 		err = nfsd_splice_read(rqstp, file, offset, count);
 	else
 		err = nfsd_readv(file, offset, vec, vlen, count);
 
-	trace_read_io_done(rqstp, fhp, offset, vlen);
+	trace_read_io_done(rqstp, fhp, offset, *count);
 
 	if (ra)
 		nfsd_put_raparams(file, ra);
 	fput(file);
 
-	trace_read_done(rqstp, fhp, offset, vlen);
+	trace_read_done(rqstp, fhp, offset, *count);
 
 	return err;
 }
@@ -1061,18 +1061,18 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	struct file *file = NULL;
 	__be32 err = 0;
 
-	trace_write_start(rqstp, fhp, offset, vlen);
+	trace_write_start(rqstp, fhp, offset, *cnt);
 
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
 	if (err)
 		goto out;
 
-	trace_write_opened(rqstp, fhp, offset, vlen);
+	trace_write_opened(rqstp, fhp, offset, *cnt);
 	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
-	trace_write_io_done(rqstp, fhp, offset, vlen);
+	trace_write_io_done(rqstp, fhp, offset, *cnt);
 	fput(file);
 out:
-	trace_write_done(rqstp, fhp, offset, vlen);
+	trace_write_done(rqstp, fhp, offset, *cnt);
 	return err;
 }
 
-- 
cgit v1.2.1


From f394b62b7b0dd24757e9813e19f4b6091c5a28f6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:53:11 -0400
Subject: nfsd: Add "nfsd_" to trace point names

Follow naming convention used in client and in sunrpc layers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4layouts.c | 16 ++++++++--------
 fs/nfsd/nfs4proc.c    |  4 ++--
 fs/nfsd/trace.h       |  4 ++--
 fs/nfsd/vfs.c         | 16 ++++++++--------
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 7d888369f85a..228faf00a594 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -165,7 +165,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
 	struct nfs4_file *fp = ls->ls_stid.sc_file;
 
-	trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+	trace_nfsd_layoutstate_free(&ls->ls_stid.sc_stateid);
 
 	spin_lock(&clp->cl_lock);
 	list_del_init(&ls->ls_perclnt);
@@ -264,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	list_add(&ls->ls_perfile, &fp->fi_lo_states);
 	spin_unlock(&fp->fi_lock);
 
-	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+	trace_nfsd_layoutstate_alloc(&ls->ls_stid.sc_stateid);
 	return ls;
 }
 
@@ -334,7 +334,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
 	if (list_empty(&ls->ls_layouts))
 		goto out_unlock;
 
-	trace_layout_recall(&ls->ls_stid.sc_stateid);
+	trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
 
 	refcount_inc(&ls->ls_stid.sc_count);
 	nfsd4_run_cb(&ls->ls_recall);
@@ -507,7 +507,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 						false, lrp->lr_layout_type,
 						&ls);
 	if (nfserr) {
-		trace_layout_return_lookup_fail(&lrp->lr_sid);
+		trace_nfsd_layout_return_lookup_fail(&lrp->lr_sid);
 		return nfserr;
 	}
 
@@ -523,7 +523,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 			nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
 		lrp->lrs_present = 1;
 	} else {
-		trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+		trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
 		nfs4_unhash_stid(&ls->ls_stid);
 		lrp->lrs_present = 0;
 	}
@@ -694,7 +694,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		/*
 		 * Unknown error or non-responding client, we'll need to fence.
 		 */
-		trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
+		trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
 
 		ops = nfsd4_layout_ops[ls->ls_layout_type];
 		if (ops->fence_client)
@@ -703,7 +703,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 			nfsd4_cb_layout_fail(ls);
 		return -1;
 	case -NFS4ERR_NOMATCHING_LAYOUT:
-		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+		trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
 		task->tk_status = 0;
 		return 1;
 	}
@@ -716,7 +716,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 	LIST_HEAD(reaplist);
 
-	trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+	trace_nfsd_layout_recall_release(&ls->ls_stid.sc_stateid);
 
 	nfsd4_return_all_layouts(ls, &reaplist);
 	nfsd4_free_layouts(&reaplist);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 40dfb759d38d..792fbb851046 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1411,7 +1411,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
 	nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
 						true, lgp->lg_layout_type, &ls);
 	if (nfserr) {
-		trace_layout_get_lookup_fail(&lgp->lg_sid);
+		trace_nfsd_layout_get_lookup_fail(&lgp->lg_sid);
 		goto out;
 	}
 
@@ -1480,7 +1480,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
 						false, lcp->lc_layout_type,
 						&ls);
 	if (nfserr) {
-		trace_layout_commit_lookup_fail(&lcp->lc_sid);
+		trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid);
 		/* fixup error code as per RFC5661 */
 		if (nfserr == nfserr_bad_stateid)
 			nfserr = nfserr_badlayout;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 9b7950602fad..30e04852a4e0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -35,7 +35,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
 )
 
 #define DEFINE_NFSD_IO_EVENT(name)		\
-DEFINE_EVENT(nfsd_io_class, name,		\
+DEFINE_EVENT(nfsd_io_class, nfsd_##name,	\
 	TP_PROTO(struct svc_rqst *rqstp,	\
 		 struct svc_fh	*fhp,		\
 		 loff_t		offset,		\
@@ -76,7 +76,7 @@ DECLARE_EVENT_CLASS(nfsd_stateid_class,
 )
 
 #define DEFINE_STATEID_EVENT(name) \
-DEFINE_EVENT(nfsd_stateid_class, name, \
+DEFINE_EVENT(nfsd_stateid_class, nfsd_##name, \
 	TP_PROTO(stateid_t *stp), \
 	TP_ARGS(stp))
 DEFINE_STATEID_EVENT(layoutstate_alloc);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index e2eab4c37030..857031952970 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1024,27 +1024,27 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct raparms	*ra;
 	__be32 err;
 
-	trace_read_start(rqstp, fhp, offset, *count);
+	trace_nfsd_read_start(rqstp, fhp, offset, *count);
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (err)
 		return err;
 
 	ra = nfsd_init_raparms(file);
 
-	trace_read_opened(rqstp, fhp, offset, *count);
+	trace_nfsd_read_opened(rqstp, fhp, offset, *count);
 
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
 		err = nfsd_splice_read(rqstp, file, offset, count);
 	else
 		err = nfsd_readv(file, offset, vec, vlen, count);
 
-	trace_read_io_done(rqstp, fhp, offset, *count);
+	trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
 
 	if (ra)
 		nfsd_put_raparams(file, ra);
 	fput(file);
 
-	trace_read_done(rqstp, fhp, offset, *count);
+	trace_nfsd_read_done(rqstp, fhp, offset, *count);
 
 	return err;
 }
@@ -1061,18 +1061,18 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	struct file *file = NULL;
 	__be32 err = 0;
 
-	trace_write_start(rqstp, fhp, offset, *cnt);
+	trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
 
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
 	if (err)
 		goto out;
 
-	trace_write_opened(rqstp, fhp, offset, *cnt);
+	trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
 	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
-	trace_write_io_done(rqstp, fhp, offset, *cnt);
+	trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
 	fput(file);
 out:
-	trace_write_done(rqstp, fhp, offset, *cnt);
+	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
 	return err;
 }
 
-- 
cgit v1.2.1


From d890be159a71395bd4ceca09233a8a56d7da69ee Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:53:27 -0400
Subject: nfsd: Add I/O trace points in the NFSv4 write path

NFSv4 write compound processing invokes nfsd_vfs_write directly. The
trace points currently in nfsd_write are not effective for NFSv4
writes.

Move the trace points into the shared nfsd_vfs_write() helper.

After the I/O, we also want to record any local I/O error that
might have occurred, and the total count of bytes that were actually
moved (rather than the requested number).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c |  7 +++++--
 fs/nfsd/trace.h    | 33 +++++++++++++++++++++++++++++++++
 fs/nfsd/vfs.c      | 23 ++++++++++++-----------
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 792fbb851046..b773b314f599 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1001,6 +1001,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (write->wr_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
+	cnt = write->wr_buflen;
+	trace_nfsd_write_start(rqstp, &cstate->current_fh,
+			       write->wr_offset, cnt);
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 						stateid, WR_STATE, &filp, NULL);
 	if (status) {
@@ -1008,7 +1011,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
 
@@ -1021,7 +1023,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	fput(filp);
 
 	write->wr_bytes_written = cnt;
-
+	trace_nfsd_write_done(rqstp, &cstate->current_fh,
+			      write->wr_offset, cnt);
 	return status;
 }
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 30e04852a4e0..653e9ee754d2 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -51,6 +51,39 @@ DEFINE_NFSD_IO_EVENT(write_opened);
 DEFINE_NFSD_IO_EVENT(write_io_done);
 DEFINE_NFSD_IO_EVENT(write_done);
 
+DECLARE_EVENT_CLASS(nfsd_err_class,
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh	*fhp,
+		 loff_t		offset,
+		 int		status),
+	TP_ARGS(rqstp, fhp, offset, status),
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(u32, fh_hash)
+		__field(loff_t, offset)
+		__field(int, status)
+	),
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->status = status;
+	),
+	TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld status=%d",
+		  __entry->xid, __entry->fh_hash,
+		  __entry->offset, __entry->status)
+)
+
+#define DEFINE_NFSD_ERR_EVENT(name)		\
+DEFINE_EVENT(nfsd_err_class, nfsd_##name,	\
+	TP_PROTO(struct svc_rqst *rqstp,	\
+		 struct svc_fh	*fhp,		\
+		 loff_t		offset,		\
+		 int		len),		\
+	TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_ERR_EVENT(write_err);
+
 #include "state.h"
 
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 857031952970..ee59a0b24fab 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -965,13 +965,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 {
 	struct svc_export	*exp;
 	struct iov_iter		iter;
-	__be32			err = 0;
+	__be32			nfserr;
 	int			host_err;
 	int			use_wgather;
 	loff_t			pos = offset;
 	unsigned int		pflags = current->flags;
 	rwf_t			flags = 0;
 
+	trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
+
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
 		/*
 		 * We want less throttling in balance_dirty_pages()
@@ -994,22 +996,23 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	host_err = vfs_iter_write(file, &iter, &pos, flags);
 	if (host_err < 0)
 		goto out_nfserr;
-	*cnt = host_err;
-	nfsdstats.io_write += host_err;
+	nfsdstats.io_write += *cnt;
 	fsnotify_modify(file);
 
 	if (stable && use_wgather)
 		host_err = wait_for_concurrent_writes(file);
 
 out_nfserr:
-	dprintk("nfsd: write complete host_err=%d\n", host_err);
-	if (host_err >= 0)
-		err = 0;
-	else
-		err = nfserrno(host_err);
+	if (host_err >= 0) {
+		trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
+		nfserr = nfs_ok;
+	} else {
+		trace_nfsd_write_err(rqstp, fhp, offset, host_err);
+		nfserr = nfserrno(host_err);
+	}
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
 		current_restore_flags(pflags, PF_LESS_THROTTLE);
-	return err;
+	return nfserr;
 }
 
 /*
@@ -1067,9 +1070,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
 	if (err)
 		goto out;
 
-	trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
 	err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable);
-	trace_nfsd_write_io_done(rqstp, fhp, offset, *cnt);
 	fput(file);
 out:
 	trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
-- 
cgit v1.2.1


From 87c5942e8fae81ac296267654a047eb2db81592b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 28 Mar 2018 13:29:11 -0400
Subject: nfsd: Add I/O trace points in the NFSv4 read proc

NFSv4 read compound processing invokes nfsd_splice_read and
nfs_readv directly, so the trace points currently in nfsd_read are
not invoked for NFSv4 reads.

Move the NFSD READ trace points to common helpers so that NFSv4
reads are captured.

Also, record any local I/O error that occurs, the total count of
bytes that were actually returned, and whether splice or vectored
read was used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c |  5 +++++
 fs/nfsd/nfs4xdr.c  | 10 ++++++----
 fs/nfsd/trace.h    |  4 +++-
 fs/nfsd/vfs.c      | 34 ++++++++++++++++++----------------
 fs/nfsd/vfs.h      | 11 +++++++----
 5 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b773b314f599..250e0c550fd8 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -751,6 +751,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (read->rd_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
+	trace_nfsd_read_start(rqstp, &cstate->current_fh,
+			      read->rd_offset, read->rd_length);
+
 	/*
 	 * If we do a zero copy read, then a client will see read data
 	 * that reflects the state of the file *after* performing the
@@ -783,6 +786,8 @@ nfsd4_read_release(union nfsd4_op_u *u)
 {
 	if (u->read.rd_filp)
 		fput(u->read.rd_filp);
+	trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp,
+			     u->read.rd_offset, u->read.rd_length);
 }
 
 static __be32
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9f8ef2c6e508..dd72cdf3662e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3427,8 +3427,9 @@ static __be32 nfsd4_encode_splice_read(
 		return nfserr_resource;
 
 	len = maxcount;
-	nfserr = nfsd_splice_read(read->rd_rqstp, file,
-				  read->rd_offset, &maxcount);
+	nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
+				  file, read->rd_offset, &maxcount);
+	read->rd_length = maxcount;
 	if (nfserr) {
 		/*
 		 * nfsd_splice_actor may have already messed with the
@@ -3511,8 +3512,9 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
 	read->rd_vlen = v;
 
 	len = maxcount;
-	nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
-			read->rd_vlen, &maxcount);
+	nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+			    resp->rqstp->rq_vec, read->rd_vlen, &maxcount);
+	read->rd_length = maxcount;
 	if (nfserr)
 		return nfserr;
 	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 653e9ee754d2..a8bbd9d00a9e 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -43,7 +43,8 @@ DEFINE_EVENT(nfsd_io_class, nfsd_##name,	\
 	TP_ARGS(rqstp, fhp, offset, len))
 
 DEFINE_NFSD_IO_EVENT(read_start);
-DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_splice);
+DEFINE_NFSD_IO_EVENT(read_vector);
 DEFINE_NFSD_IO_EVENT(read_io_done);
 DEFINE_NFSD_IO_EVENT(read_done);
 DEFINE_NFSD_IO_EVENT(write_start);
@@ -82,6 +83,7 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name,	\
 		 int		len),		\
 	TP_ARGS(rqstp, fhp, offset, len))
 
+DEFINE_NFSD_ERR_EVENT(read_err);
 DEFINE_NFSD_ERR_EVENT(write_err);
 
 #include "state.h"
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ee59a0b24fab..2410b093a2e6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -881,20 +881,24 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
 	return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
-static __be32
-nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
+static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+			       struct file *file, loff_t offset,
+			       unsigned long *count, int host_err)
 {
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
 		*count = host_err;
 		fsnotify_access(file);
+		trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
 		return 0;
-	} else 
+	} else {
+		trace_nfsd_read_err(rqstp, fhp, offset, host_err);
 		return nfserrno(host_err);
+	}
 }
 
-__be32 nfsd_splice_read(struct svc_rqst *rqstp,
-		     struct file *file, loff_t offset, unsigned long *count)
+__be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+			struct file *file, loff_t offset, unsigned long *count)
 {
 	struct splice_desc sd = {
 		.len		= 0,
@@ -904,21 +908,23 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp,
 	};
 	int host_err;
 
+	trace_nfsd_read_splice(rqstp, fhp, offset, *count);
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 	host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
-	return nfsd_finish_read(file, count, host_err);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
 }
 
-__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
-		unsigned long *count)
+__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		  struct file *file, loff_t offset,
+		  struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct iov_iter iter;
 	int host_err;
 
+	trace_nfsd_read_vector(rqstp, fhp, offset, *count);
 	iov_iter_kvec(&iter, READ | ITER_KVEC, vec, vlen, *count);
 	host_err = vfs_iter_read(file, &iter, &offset, 0);
-
-	return nfsd_finish_read(file, count, host_err);
+	return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err);
 }
 
 /*
@@ -1034,14 +1040,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	ra = nfsd_init_raparms(file);
 
-	trace_nfsd_read_opened(rqstp, fhp, offset, *count);
-
 	if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
-		err = nfsd_splice_read(rqstp, file, offset, count);
+		err = nfsd_splice_read(rqstp, fhp, file, offset, count);
 	else
-		err = nfsd_readv(file, offset, vec, vlen, count);
-
-	trace_nfsd_read_io_done(rqstp, fhp, offset, *count);
+		err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count);
 
 	if (ra)
 		nfsd_put_raparams(file, ra);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index be6d8e00453f..a7e107309f76 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -78,10 +78,13 @@ __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
 struct raparms;
-__be32		nfsd_splice_read(struct svc_rqst *,
-				struct file *, loff_t, unsigned long *);
-__be32		nfsd_readv(struct file *, loff_t, struct kvec *, int,
-				unsigned long *);
+__be32		nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+				struct file *file, loff_t offset,
+				unsigned long *count);
+__be32		nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp,
+				struct file *file, loff_t offset,
+				struct kvec *vec, int vlen,
+				unsigned long *count);
 __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
 				loff_t, struct kvec *, int, unsigned long *);
 __be32 		nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
-- 
cgit v1.2.1


From fff4080b2f1389ecf3028d72eb70e1837df48b01 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:53:54 -0400
Subject: nfsd: Trace NFSv4 COMPOUND execution

This helps record the identity and timing of the ops in each NFSv4
COMPOUND, replacing dprintk calls that did much the same thing.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c |  9 +++------
 fs/nfsd/trace.h    | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 250e0c550fd8..c6157ece46b1 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1721,12 +1721,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 		goto encode_op;
 	}
 
+	trace_nfsd_compound(rqstp, args->opcnt);
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
-		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
-			resp->opcnt, args->opcnt, op->opnum,
-			nfsd4_op_name(op->opnum));
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -1800,9 +1798,8 @@ encode_op:
 			status = op->status;
 		}
 
-		dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
-			args->ops, args->opcnt, resp->opcnt, op->opnum,
-			be32_to_cpu(status));
+		trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
+					   nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
 		nfsd4_increment_op_stats(op->opnum);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a8bbd9d00a9e..80933e4334d8 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -11,6 +11,45 @@
 #include <linux/tracepoint.h>
 #include "nfsfh.h"
 
+TRACE_EVENT(nfsd_compound,
+	TP_PROTO(const struct svc_rqst *rqst,
+		 u32 args_opcnt),
+	TP_ARGS(rqst, args_opcnt),
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(u32, args_opcnt)
+	),
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqst->rq_xid);
+		__entry->args_opcnt = args_opcnt;
+	),
+	TP_printk("xid=0x%08x opcnt=%u",
+		__entry->xid, __entry->args_opcnt)
+)
+
+TRACE_EVENT(nfsd_compound_status,
+	TP_PROTO(u32 args_opcnt,
+		 u32 resp_opcnt,
+		 __be32 status,
+		 const char *name),
+	TP_ARGS(args_opcnt, resp_opcnt, status, name),
+	TP_STRUCT__entry(
+		__field(u32, args_opcnt)
+		__field(u32, resp_opcnt)
+		__field(int, status)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->args_opcnt = args_opcnt;
+		__entry->resp_opcnt = resp_opcnt;
+		__entry->status = be32_to_cpu(status);
+		__assign_str(name, name);
+	),
+	TP_printk("op=%u/%u %s status=%d",
+		__entry->resp_opcnt, __entry->args_opcnt,
+		__get_str(name), __entry->status)
+)
+
 DECLARE_EVENT_CLASS(nfsd_io_class,
 	TP_PROTO(struct svc_rqst *rqstp,
 		 struct svc_fh	*fhp,
-- 
cgit v1.2.1


From 8154ef2776aa512a3eaa0e7db030dc4803354d61 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:54:07 -0400
Subject: NFSD: Clean up legacy NFS WRITE argument XDR decoders

Move common code in NFSD's legacy NFS WRITE decoders into a helper.
The immediate benefit is reduction of code duplication and some nice
micro-optimizations (see below).

In the long term, this helper can perform a per-transport call-out
to fill the rq_vec (say, using RDMA Reads).

The legacy WRITE decoders and procs are changed to work like NFSv4,
which constructs the rq_vec just before it is about to call
vfs_writev.

Why? Calling a transport call-out from the proc instead of the XDR
decoder means that the incoming FH can be resolved to a particular
filesystem and file. This would allow pages from the backing file to
be presented to the transport to be filled, rather than presenting
anonymous pages and copying or flipping them into the file's page
cache later.

I also prefer using the pages in rq_arg.pages, instead of pulling
the data pages directly out of the rqstp::rq_pages array. This is
currently the way the NFSv3 write decoder works, but the other two
do not seem to take this approach. Fixing this removes the only
reference to rq_pages found in NFSD, eliminating an NFSD assumption
about how transports use the pages in rq_pages.

Lastly, avoid setting up the first element of rq_vec as a zero-
length buffer. This happens with an RDMA transport when a normal
Read chunk is present because the data payload is in rq_arg's
page list (none of it is in the head buffer).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3proc.c         |  8 ++++++--
 fs/nfsd/nfs3xdr.c          | 16 ++++------------
 fs/nfsd/nfsproc.c          |  9 +++++++--
 fs/nfsd/nfsxdr.c           | 14 ++------------
 fs/nfsd/xdr.h              |  2 +-
 fs/nfsd/xdr3.h             |  2 +-
 include/linux/sunrpc/svc.h |  2 ++
 net/sunrpc/svc.c           | 42 ++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 1d0ce3c57d93..2dd95ebf4935 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -192,6 +192,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
 	struct nfsd3_writeres *resp = rqstp->rq_resp;
 	__be32	nfserr;
 	unsigned long cnt = argp->len;
+	unsigned int nvecs;
 
 	dprintk("nfsd: WRITE(3)    %s %d bytes at %Lu%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -201,9 +202,12 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
 
 	fh_copy(&resp->fh, &argp->fh);
 	resp->committed = argp->stable;
+	nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt);
+	if (!nvecs)
+		RETURN_STATUS(nfserr_io);
 	nfserr = nfsd_write(rqstp, &resp->fh, argp->offset,
-				rqstp->rq_vec, argp->vlen,
-				&cnt, resp->committed);
+			    rqstp->rq_vec, nvecs, &cnt,
+			    resp->committed);
 	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 1a70581e1cb2..e19fc5d8bcb5 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -391,7 +391,7 @@ int
 nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd3_writeargs *args = rqstp->rq_argp;
-	unsigned int len, v, hdr, dlen;
+	unsigned int len, hdr, dlen;
 	u32 max_blocksize = svc_max_payload(rqstp);
 	struct kvec *head = rqstp->rq_arg.head;
 	struct kvec *tail = rqstp->rq_arg.tail;
@@ -433,17 +433,9 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
 		args->count = max_blocksize;
 		len = args->len = max_blocksize;
 	}
-	rqstp->rq_vec[0].iov_base = (void*)p;
-	rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
-	v = 0;
-	while (len > rqstp->rq_vec[v].iov_len) {
-		len -= rqstp->rq_vec[v].iov_len;
-		v++;
-		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
-		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
-	}
-	rqstp->rq_vec[v].iov_len = len;
-	args->vlen = v + 1;
+
+	args->first.iov_base = (void *)p;
+	args->first.iov_len = head->iov_len - hdr;
 	return 1;
 }
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 43c0419b8ddb..1995ea6bfd2b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -212,13 +212,18 @@ nfsd_proc_write(struct svc_rqst *rqstp)
 	struct nfsd_attrstat *resp = rqstp->rq_resp;
 	__be32	nfserr;
 	unsigned long cnt = argp->len;
+	unsigned int nvecs;
 
 	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
 		argp->len, argp->offset);
 
-	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset,
-				rqstp->rq_vec, argp->vlen, &cnt, NFS_DATA_SYNC);
+	nvecs = svc_fill_write_vector(rqstp, &argp->first, cnt);
+	if (!nvecs)
+		return nfserr_io;
+	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
+			    argp->offset, rqstp->rq_vec, nvecs,
+			    &cnt, NFS_DATA_SYNC);
 	return nfsd_return_attrs(nfserr, resp);
 }
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 79b6064f8977..db24ae8b67e0 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -287,7 +287,6 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
 	struct nfsd_writeargs *args = rqstp->rq_argp;
 	unsigned int len, hdr, dlen;
 	struct kvec *head = rqstp->rq_arg.head;
-	int v;
 
 	p = decode_fh(p, &args->fh);
 	if (!p)
@@ -323,17 +322,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
 	if (dlen < XDR_QUADLEN(len)*4)
 		return 0;
 
-	rqstp->rq_vec[0].iov_base = (void*)p;
-	rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
-	v = 0;
-	while (len > rqstp->rq_vec[v].iov_len) {
-		len -= rqstp->rq_vec[v].iov_len;
-		v++;
-		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
-		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
-	}
-	rqstp->rq_vec[v].iov_len = len;
-	args->vlen = v + 1;
+	args->first.iov_base = (void *)p;
+	args->first.iov_len = head->iov_len - hdr;
 	return 1;
 }
 
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 2f4f22e6b8cb..a765c414015e 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -34,7 +34,7 @@ struct nfsd_writeargs {
 	svc_fh			fh;
 	__u32			offset;
 	int			len;
-	int			vlen;
+	struct kvec		first;
 };
 
 struct nfsd_createargs {
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 056bf8a7364e..deccf7f691e9 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -41,7 +41,7 @@ struct nfsd3_writeargs {
 	__u32			count;
 	int			stable;
 	__u32			len;
-	int			vlen;
+	struct kvec		first;
 };
 
 struct nfsd3_createargs {
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index dc4c009deec1..fb3fcacc1e98 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -495,6 +495,8 @@ void		   svc_wake_up(struct svc_serv *);
 void		   svc_reserve(struct svc_rqst *rqstp, int space);
 struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
 char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
+unsigned int	   svc_fill_write_vector(struct svc_rqst *rqstp,
+					 struct kvec *first, size_t total);
 
 #define	RPC_MAX_ADDRBUFLEN	(63U)
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index f19987f5d3b5..a155e2de19aa 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1533,3 +1533,45 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
 	return max;
 }
 EXPORT_SYMBOL_GPL(svc_max_payload);
+
+/**
+ * svc_fill_write_vector - Construct data argument for VFS write call
+ * @rqstp: svc_rqst to operate on
+ * @first: buffer containing first section of write payload
+ * @total: total number of bytes of write payload
+ *
+ * Returns the number of elements populated in the data argument array.
+ */
+unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
+				   size_t total)
+{
+	struct kvec *vec = rqstp->rq_vec;
+	struct page **pages;
+	unsigned int i;
+
+	/* Some types of transport can present the write payload
+	 * entirely in rq_arg.pages. In this case, @first is empty.
+	 */
+	i = 0;
+	if (first->iov_len) {
+		vec[i].iov_base = first->iov_base;
+		vec[i].iov_len = min_t(size_t, total, first->iov_len);
+		total -= vec[i].iov_len;
+		++i;
+	}
+
+	WARN_ON_ONCE(rqstp->rq_arg.page_base != 0);
+	pages = rqstp->rq_arg.pages;
+	while (total) {
+		vec[i].iov_base = page_address(*pages);
+		vec[i].iov_len = min_t(size_t, total, PAGE_SIZE);
+		total -= vec[i].iov_len;
+		++i;
+
+		++pages;
+	}
+
+	WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec));
+	return i;
+}
+EXPORT_SYMBOL_GPL(svc_fill_write_vector);
-- 
cgit v1.2.1


From 38a70315599dedacd9ff3bd1016f9048c9d0ad12 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 27 Mar 2018 10:54:21 -0400
Subject: NFSD: Clean up legacy NFS SYMLINK argument XDR decoders

Move common code in NFSD's legacy SYMLINK decoders into a helper.
The immediate benefits include:

 - one fewer data copies on transports that support DDP
 - consistent error checking across all versions
 - reduction of code duplication
 - support for both legal forms of SYMLINK requests on RDMA
   transports for all versions of NFS (in particular, NFSv2, for
   completeness)

In the long term, this helper is an appropriate spot to perform a
per-transport call-out to fill the pathname argument using, say,
RDMA Reads.

Filling the pathname in the proc function also means that eventually
the incoming filehandle can be interpreted so that filesystem-
specific memory can be allocated as a sink for the pathname
argument, rather than using anonymous pages.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs3proc.c         | 10 +++++++
 fs/nfsd/nfs3xdr.c          | 51 +++++++++--------------------------
 fs/nfsd/nfsproc.c          | 14 +++++-----
 fs/nfsd/nfsxdr.c           | 49 +++++++++++++++++++--------------
 fs/nfsd/xdr.h              |  1 +
 fs/nfsd/xdr3.h             |  1 +
 fs/nfsd/xdr4.h             |  2 ++
 include/linux/sunrpc/svc.h |  2 ++
 net/sunrpc/svc.c           | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 132 insertions(+), 65 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2dd95ebf4935..6259a4b8579f 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -283,6 +283,16 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
 	struct nfsd3_diropres *resp = rqstp->rq_resp;
 	__be32	nfserr;
 
+	if (argp->tlen == 0)
+		RETURN_STATUS(nfserr_inval);
+	if (argp->tlen > NFS3_MAXPATHLEN)
+		RETURN_STATUS(nfserr_nametoolong);
+
+	argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+						argp->tlen);
+	if (IS_ERR(argp->tname))
+		RETURN_STATUS(nfserrno(PTR_ERR(argp->tname)));
+
 	dprintk("nfsd: SYMLINK(3)  %s %.*s -> %.*s\n",
 				SVCFH_fmt(&argp->ffh),
 				argp->flen, argp->fname,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e19fc5d8bcb5..3192b544a441 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -481,51 +481,24 @@ int
 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd3_symlinkargs *args = rqstp->rq_argp;
-	unsigned int len, avail;
-	char *old, *new;
-	struct kvec *vec;
+	char *base = (char *)p;
+	size_t dlen;
 
 	if (!(p = decode_fh(p, &args->ffh)) ||
-	    !(p = decode_filename(p, &args->fname, &args->flen))
-		)
+	    !(p = decode_filename(p, &args->fname, &args->flen)))
 		return 0;
 	p = decode_sattr3(p, &args->attrs);
 
-	/* now decode the pathname, which might be larger than the first page.
-	 * As we have to check for nul's anyway, we copy it into a new page
-	 * This page appears in the rq_res.pages list, but as pages_len is always
-	 * 0, it won't get in the way
-	 */
-	len = ntohl(*p++);
-	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
-		return 0;
-	args->tname = new = page_address(*(rqstp->rq_next_page++));
-	args->tlen = len;
-	/* first copy and check from the first page */
-	old = (char*)p;
-	vec = &rqstp->rq_arg.head[0];
-	if ((void *)old > vec->iov_base + vec->iov_len)
-		return 0;
-	avail = vec->iov_len - (old - (char*)vec->iov_base);
-	while (len && avail && *old) {
-		*new++ = *old++;
-		len--;
-		avail--;
-	}
-	/* now copy next page if there is one */
-	if (len && !avail && rqstp->rq_arg.page_len) {
-		avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
-		old = page_address(rqstp->rq_arg.pages[0]);
-	}
-	while (len && avail && *old) {
-		*new++ = *old++;
-		len--;
-		avail--;
-	}
-	*new = '\0';
-	if (len)
-		return 0;
+	args->tlen = ntohl(*p++);
 
+	args->first.iov_base = p;
+	args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+	args->first.iov_len -= (char *)p - base;
+
+	dlen = args->first.iov_len + rqstp->rq_arg.page_len +
+	       rqstp->rq_arg.tail[0].iov_len;
+	if (dlen < XDR_QUADLEN(args->tlen) << 2)
+		return 0;
 	return 1;
 }
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 1995ea6bfd2b..f107f9fa8e15 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -449,17 +449,19 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
 	struct svc_fh	newfh;
 	__be32		nfserr;
 
+	if (argp->tlen > NFS_MAXPATHLEN)
+		return nfserr_nametoolong;
+
+	argp->tname = svc_fill_symlink_pathname(rqstp, &argp->first,
+						argp->tlen);
+	if (IS_ERR(argp->tname))
+		return nfserrno(PTR_ERR(argp->tname));
+
 	dprintk("nfsd: SYMLINK  %s %.*s -> %.*s\n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
 		argp->tlen, argp->tname);
 
 	fh_init(&newfh, NFS_FHSIZE);
-	/*
-	 * Crazy hack: the request fits in a page, and already-decoded
-	 * attributes follow argp->tname, so it's safe to just write a
-	 * null to ensure it's null-terminated:
-	 */
-	argp->tname[argp->tlen] = '\0';
 	nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
 						 argp->tname, &newfh);
 
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index db24ae8b67e0..a43e8260520a 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -70,22 +70,6 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp)
 	return p;
 }
 
-static __be32 *
-decode_pathname(__be32 *p, char **namp, unsigned int *lenp)
-{
-	char		*name;
-	unsigned int	i;
-
-	if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) {
-		for (i = 0, name = *namp; i < *lenp; i++, name++) {
-			if (*name == '\0')
-				return NULL;
-		}
-	}
-
-	return p;
-}
-
 static __be32 *
 decode_sattr(__be32 *p, struct iattr *iap)
 {
@@ -384,14 +368,39 @@ int
 nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
 {
 	struct nfsd_symlinkargs *args = rqstp->rq_argp;
+	char *base = (char *)p;
+	size_t xdrlen;
 
 	if (   !(p = decode_fh(p, &args->ffh))
-	    || !(p = decode_filename(p, &args->fname, &args->flen))
-	    || !(p = decode_pathname(p, &args->tname, &args->tlen)))
+	    || !(p = decode_filename(p, &args->fname, &args->flen)))
 		return 0;
-	p = decode_sattr(p, &args->attrs);
 
-	return xdr_argsize_check(rqstp, p);
+	args->tlen = ntohl(*p++);
+	if (args->tlen == 0)
+		return 0;
+
+	args->first.iov_base = p;
+	args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+	args->first.iov_len -= (char *)p - base;
+
+	/* This request is never larger than a page. Therefore,
+	 * transport will deliver either:
+	 * 1. pathname in the pagelist -> sattr is in the tail.
+	 * 2. everything in the head buffer -> sattr is in the head.
+	 */
+	if (rqstp->rq_arg.page_len) {
+		if (args->tlen != rqstp->rq_arg.page_len)
+			return 0;
+		p = rqstp->rq_arg.tail[0].iov_base;
+	} else {
+		xdrlen = XDR_QUADLEN(args->tlen);
+		if (xdrlen > args->first.iov_len - (8 * sizeof(__be32)))
+			return 0;
+		p += xdrlen;
+	}
+	decode_sattr(p, &args->attrs);
+
+	return 1;
 }
 
 int
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index a765c414015e..ea7cca3a64b7 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -72,6 +72,7 @@ struct nfsd_symlinkargs {
 	char *			tname;
 	unsigned int		tlen;
 	struct iattr		attrs;
+	struct kvec		first;
 };
 
 struct nfsd_readdirargs {
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index deccf7f691e9..2cb29e961a76 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -90,6 +90,7 @@ struct nfsd3_symlinkargs {
 	char *			tname;
 	unsigned int		tlen;
 	struct iattr		attrs;
+	struct kvec		first;
 };
 
 struct nfsd3_readdirargs {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 7cbc129092fe..468020fe2a07 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -110,6 +110,7 @@ struct nfsd4_create {
 		struct {
 			u32 datalen;
 			char *data;
+			struct kvec first;
 		} link;   /* NF4LNK */
 		struct {
 			u32 specdata1;
@@ -124,6 +125,7 @@ struct nfsd4_create {
 };
 #define cr_datalen	u.link.datalen
 #define cr_data		u.link.data
+#define cr_first	u.link.first
 #define cr_specdata1	u.dev.specdata1
 #define cr_specdata2	u.dev.specdata2
 
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index fb3fcacc1e98..574368e8a16f 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -497,6 +497,8 @@ struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
 char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
 unsigned int	   svc_fill_write_vector(struct svc_rqst *rqstp,
 					 struct kvec *first, size_t total);
+char		  *svc_fill_symlink_pathname(struct svc_rqst *rqstp,
+					     struct kvec *first, size_t total);
 
 #define	RPC_MAX_ADDRBUFLEN	(63U)
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a155e2de19aa..30a4226baf03 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1575,3 +1575,70 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct kvec *first,
 	return i;
 }
 EXPORT_SYMBOL_GPL(svc_fill_write_vector);
+
+/**
+ * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call
+ * @rqstp: svc_rqst to operate on
+ * @first: buffer containing first section of pathname
+ * @total: total length of the pathname argument
+ *
+ * Returns pointer to a NUL-terminated string, or an ERR_PTR. The buffer is
+ * released automatically when @rqstp is recycled.
+ */
+char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first,
+				size_t total)
+{
+	struct xdr_buf *arg = &rqstp->rq_arg;
+	struct page **pages;
+	char *result;
+
+	/* VFS API demands a NUL-terminated pathname. This function
+	 * uses a page from @rqstp as the pathname buffer, to enable
+	 * direct placement. Thus the total buffer size is PAGE_SIZE.
+	 * Space in this buffer for NUL-termination requires that we
+	 * cap the size of the returned symlink pathname just a
+	 * little early.
+	 */
+	if (total > PAGE_SIZE - 1)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	/* Some types of transport can present the pathname entirely
+	 * in rq_arg.pages. If not, then copy the pathname into one
+	 * page.
+	 */
+	pages = arg->pages;
+	WARN_ON_ONCE(arg->page_base != 0);
+	if (first->iov_base == 0) {
+		result = page_address(*pages);
+		result[total] = '\0';
+	} else {
+		size_t len, remaining;
+		char *dst;
+
+		result = page_address(*(rqstp->rq_next_page++));
+		dst = result;
+		remaining = total;
+
+		len = min_t(size_t, total, first->iov_len);
+		memcpy(dst, first->iov_base, len);
+		dst += len;
+		remaining -= len;
+
+		/* No more than one page left */
+		if (remaining) {
+			len = min_t(size_t, remaining, PAGE_SIZE);
+			memcpy(dst, page_address(*pages), len);
+			dst += len;
+		}
+
+		*dst = '\0';
+	}
+
+	/* Sanity check: we don't allow the pathname argument to
+	 * contain a NUL byte.
+	 */
+	if (strlen(result) != total)
+		return ERR_PTR(-EINVAL);
+	return result;
+}
+EXPORT_SYMBOL_GPL(svc_fill_symlink_pathname);
-- 
cgit v1.2.1


From f3aefb6a7066e24bfea7fcf1b07907576de69d63 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 28 Mar 2018 10:57:22 -0700
Subject: sunrpc: remove incorrect HMAC request initialization

make_checksum_hmac_md5() is allocating an HMAC transform and doing
crypto API calls in the following order:

    crypto_ahash_init()
    crypto_ahash_setkey()
    crypto_ahash_digest()

This is wrong because it makes no sense to init() the request before a
key has been set, given that the initial state depends on the key.  And
digest() is short for init() + update() + final(), so in this case
there's no need to explicitly call init() at all.

Before commit 9fa68f620041 ("crypto: hash - prevent using keyed hashes
without setting key") the extra init() had no real effect, at least for
the software HMAC implementation.  (There are also hardware drivers that
implement HMAC-MD5, and it's not immediately obvious how gracefully they
handle init() before setkey().)  But now the crypto API detects this
incorrect initialization and returns -ENOKEY.  This is breaking NFS
mounts in some cases.

Fix it by removing the incorrect call to crypto_ahash_init().

Reported-by: Michael Young <m.a.young@durham.ac.uk>
Fixes: 9fa68f620041 ("crypto: hash - prevent using keyed hashes without setting key")
Fixes: fffdaef2eb4a ("gss_krb5: Add support for rc4-hmac encryption")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 net/sunrpc/auth_gss/gss_krb5_crypto.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index 12649c9fedab..8654494b4d0a 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -237,9 +237,6 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
 
 	ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
 
-	err = crypto_ahash_init(req);
-	if (err)
-		goto out;
 	err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
 	if (err)
 		goto out;
-- 
cgit v1.2.1


From 880a3a5325489a143269a8e172e7563ebf9897bc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 21 Mar 2018 17:19:02 -0400
Subject: nfsd: fix incorrect umasks

We're neglecting to clear the umask after it's set, which can cause a
later unrelated rpc to (incorrectly) use the same umask if it happens to
be processed by the same thread.

There's a more subtle problem here too:

An NFSv4 compound request is decoded all in one pass before any
operations are executed.

Currently we're setting current->fs->umask at the time we decode the
compound.  In theory a single compound could contain multiple creates
each setting a umask.  In that case we'd end up using whichever umask
was passed in the *last* operation as the umask for all the creates,
whether that was correct or not.

So, we should just be saving the umask at decode time and waiting to set
it until we actually process the corresponding operation.

In practice it's unlikely any client would do multiple creates in a
single compound.  And even if it did they'd likely be from the same
process (hence carry the same umask).  So this is a little academic, but
we should get it right anyway.

Fixes: 47057abde515 (nfsd: add support for the umask attribute)
Cc: stable@vger.kernel.org
Reported-by: Lucash Stach <l.stach@pengutronix.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 12 ++++++++++--
 fs/nfsd/nfs4xdr.c  |  8 +++-----
 fs/nfsd/xdr4.h     |  2 ++
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c6157ece46b1..5d99e8810b85 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -32,6 +32,7 @@
  *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+#include <linux/fs_struct.h>
 #include <linux/file.h>
 #include <linux/falloc.h>
 #include <linux/slab.h>
@@ -252,11 +253,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3 except EXCLUSIVE4_1.
 		 */
+		current->fs->umask = open->op_umask;
 		status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
 					open->op_fname.len, &open->op_iattr,
 					*resfh, open->op_createmode,
 					(u32 *)open->op_verf.data,
 					&open->op_truncate, &open->op_created);
+		current->fs->umask = 0;
 
 		if (!status && open->op_label.len)
 			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
@@ -603,6 +606,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
+	current->fs->umask = create->cr_umask;
 	switch (create->cr_type) {
 	case NF4LNK:
 		status = nfsd_symlink(rqstp, &cstate->current_fh,
@@ -611,20 +615,22 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		break;
 
 	case NF4BLK:
+		status = nfserr_inval;
 		rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
 		if (MAJOR(rdev) != create->cr_specdata1 ||
 		    MINOR(rdev) != create->cr_specdata2)
-			return nfserr_inval;
+			goto out_umask;
 		status = nfsd_create(rqstp, &cstate->current_fh,
 				     create->cr_name, create->cr_namelen,
 				     &create->cr_iattr, S_IFBLK, rdev, &resfh);
 		break;
 
 	case NF4CHR:
+		status = nfserr_inval;
 		rdev = MKDEV(create->cr_specdata1, create->cr_specdata2);
 		if (MAJOR(rdev) != create->cr_specdata1 ||
 		    MINOR(rdev) != create->cr_specdata2)
-			return nfserr_inval;
+			goto out_umask;
 		status = nfsd_create(rqstp, &cstate->current_fh,
 				     create->cr_name, create->cr_namelen,
 				     &create->cr_iattr,S_IFCHR, rdev, &resfh);
@@ -668,6 +674,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	fh_dup2(&cstate->current_fh, &resfh);
 out:
 	fh_put(&resfh);
+out_umask:
+	current->fs->umask = 0;
 	return status;
 }
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index dd72cdf3662e..1d048dd95464 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -33,7 +33,6 @@
  *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <linux/fs_struct.h>
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
@@ -682,7 +681,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 
 	status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
 				    &create->cr_acl, &create->cr_label,
-				    &current->fs->umask);
+				    &create->cr_umask);
 	if (status)
 		goto out;
 
@@ -927,7 +926,6 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 	case NFS4_OPEN_NOCREATE:
 		break;
 	case NFS4_OPEN_CREATE:
-		current->fs->umask = 0;
 		READ_BUF(4);
 		open->op_createmode = be32_to_cpup(p++);
 		switch (open->op_createmode) {
@@ -935,7 +933,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		case NFS4_CREATE_GUARDED:
 			status = nfsd4_decode_fattr(argp, open->op_bmval,
 				&open->op_iattr, &open->op_acl, &open->op_label,
-				&current->fs->umask);
+				&open->op_umask);
 			if (status)
 				goto out;
 			break;
@@ -950,7 +948,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 			COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
 			status = nfsd4_decode_fattr(argp, open->op_bmval,
 				&open->op_iattr, &open->op_acl, &open->op_label,
-				&current->fs->umask);
+				&open->op_umask);
 			if (status)
 				goto out;
 			break;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 468020fe2a07..17c453a7999c 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -119,6 +119,7 @@ struct nfsd4_create {
 	} u;
 	u32		cr_bmval[3];        /* request */
 	struct iattr	cr_iattr;           /* request */
+	int		cr_umask;           /* request */
 	struct nfsd4_change_info  cr_cinfo; /* response */
 	struct nfs4_acl *cr_acl;
 	struct xdr_netobj cr_label;
@@ -230,6 +231,7 @@ struct nfsd4_open {
 	u32		op_why_no_deleg;    /* response - DELEG_NONE_EXT only */
 	u32		op_create;     	    /* request */
 	u32		op_createmode;      /* request */
+	int		op_umask;           /* request */
 	u32		op_bmval[3];        /* request */
 	struct iattr	op_iattr;           /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
 	nfs4_verifier	op_verf __attribute__((aligned(32)));
-- 
cgit v1.2.1