From 03a6c82218b9a87014b2c6c4e178294fdc8ebd8a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:40 +0000
Subject: rxrpc: The mutex lock returned by rxrpc_accept_call() needs releasing

The caller of rxrpc_accept_call() must release the lock on call->user_mutex
returned by that function.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/sendmsg.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 7d2595582c09..3a99b1a908df 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -619,8 +619,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		/* The socket is now unlocked. */
 		if (IS_ERR(call))
 			return PTR_ERR(call);
-		rxrpc_put_call(call, rxrpc_call_put);
-		return 0;
+		ret = 0;
+		goto out_put_unlock;
 	}
 
 	call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID);
@@ -689,6 +689,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		ret = rxrpc_send_data(rx, call, msg, len, NULL);
 	}
 
+out_put_unlock:
 	mutex_unlock(&call->user_mutex);
 error_put:
 	rxrpc_put_call(call, rxrpc_call_put);
-- 
cgit v1.2.3


From 48ca24636d5a26f1b7b8e69c54bccf19394843e5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:40 +0000
Subject: rxrpc: Don't set upgrade by default in sendmsg()

Don't set upgrade by default when creating a call from sendmsg().  This is
a holdover from when I was testing the code.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/sendmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 3a99b1a908df..94555c94b2d8 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -602,7 +602,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		.abort_code	= 0,
 		.command	= RXRPC_CMD_SEND_DATA,
 		.exclusive	= false,
-		.upgrade	= true,
+		.upgrade	= false,
 	};
 
 	_enter("");
-- 
cgit v1.2.3


From 9faaff593404a9c4e5abc6839a641635d7b9d0cd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:40 +0000
Subject: rxrpc: Provide a different lockdep key for call->user_mutex for
 kernel calls

Provide a different lockdep key for rxrpc_call::user_mutex when the call is
made on a kernel socket, such as by the AFS filesystem.

The problem is that lockdep registers a false positive between userspace
calling the sendmsg syscall on a user socket where call->user_mutex is held
whilst userspace memory is accessed whereas the AFS filesystem may perform
operations with mmap_sem held by the caller.

In such a case, the following warning is produced.

======================================================
WARNING: possible circular locking dependency detected
4.14.0-fscache+ #243 Tainted: G            E
------------------------------------------------------
modpost/16701 is trying to acquire lock:
 (&vnode->io_lock){+.+.}, at: [<ffffffffa000fc40>] afs_begin_vnode_operation+0x33/0x77 [kafs]

but task is already holding lock:
 (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&mm->mmap_sem){++++}:
       __might_fault+0x61/0x89
       _copy_from_iter_full+0x40/0x1fa
       rxrpc_send_data+0x8dc/0xff3
       rxrpc_do_sendmsg+0x62f/0x6a1
       rxrpc_sendmsg+0x166/0x1b7
       sock_sendmsg+0x2d/0x39
       ___sys_sendmsg+0x1ad/0x22b
       __sys_sendmsg+0x41/0x62
       do_syscall_64+0x89/0x1be
       return_from_SYSCALL_64+0x0/0x75

-> #2 (&call->user_mutex){+.+.}:
       __mutex_lock+0x86/0x7d2
       rxrpc_new_client_call+0x378/0x80e
       rxrpc_kernel_begin_call+0xf3/0x154
       afs_make_call+0x195/0x454 [kafs]
       afs_vl_get_capabilities+0x193/0x198 [kafs]
       afs_vl_lookup_vldb+0x5f/0x151 [kafs]
       afs_create_volume+0x2e/0x2f4 [kafs]
       afs_mount+0x56a/0x8d7 [kafs]
       mount_fs+0x6a/0x109
       vfs_kern_mount+0x67/0x135
       do_mount+0x90b/0xb57
       SyS_mount+0x72/0x98
       do_syscall_64+0x89/0x1be
       return_from_SYSCALL_64+0x0/0x75

-> #1 (k-sk_lock-AF_RXRPC){+.+.}:
       lock_sock_nested+0x74/0x8a
       rxrpc_kernel_begin_call+0x8a/0x154
       afs_make_call+0x195/0x454 [kafs]
       afs_fs_get_capabilities+0x17a/0x17f [kafs]
       afs_probe_fileserver+0xf7/0x2f0 [kafs]
       afs_select_fileserver+0x83f/0x903 [kafs]
       afs_fetch_status+0x89/0x11d [kafs]
       afs_iget+0x16f/0x4f8 [kafs]
       afs_mount+0x6c6/0x8d7 [kafs]
       mount_fs+0x6a/0x109
       vfs_kern_mount+0x67/0x135
       do_mount+0x90b/0xb57
       SyS_mount+0x72/0x98
       do_syscall_64+0x89/0x1be
       return_from_SYSCALL_64+0x0/0x75

-> #0 (&vnode->io_lock){+.+.}:
       lock_acquire+0x174/0x19f
       __mutex_lock+0x86/0x7d2
       afs_begin_vnode_operation+0x33/0x77 [kafs]
       afs_fetch_data+0x80/0x12a [kafs]
       afs_readpages+0x314/0x405 [kafs]
       __do_page_cache_readahead+0x203/0x2ba
       filemap_fault+0x179/0x54d
       __do_fault+0x17/0x60
       __handle_mm_fault+0x6d7/0x95c
       handle_mm_fault+0x24e/0x2a3
       __do_page_fault+0x301/0x486
       do_page_fault+0x236/0x259
       page_fault+0x22/0x30
       __clear_user+0x3d/0x60
       padzero+0x1c/0x2b
       load_elf_binary+0x785/0xdc7
       search_binary_handler+0x81/0x1ff
       do_execveat_common.isra.14+0x600/0x888
       do_execve+0x1f/0x21
       SyS_execve+0x28/0x2f
       do_syscall_64+0x89/0x1be
       return_from_SYSCALL_64+0x0/0x75

other info that might help us debug this:

Chain exists of:
  &vnode->io_lock --> &call->user_mutex --> &mm->mmap_sem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(&call->user_mutex);
                               lock(&mm->mmap_sem);
  lock(&vnode->io_lock);

 *** DEADLOCK ***

1 lock held by modpost/16701:
 #0:  (&mm->mmap_sem){++++}, at: [<ffffffff8104376a>] __do_page_fault+0x1ef/0x486

stack backtrace:
CPU: 0 PID: 16701 Comm: modpost Tainted: G            E   4.14.0-fscache+ #243
Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
Call Trace:
 dump_stack+0x67/0x8e
 print_circular_bug+0x341/0x34f
 check_prev_add+0x11f/0x5d4
 ? add_lock_to_list.isra.12+0x8b/0x8b
 ? add_lock_to_list.isra.12+0x8b/0x8b
 ? __lock_acquire+0xf77/0x10b4
 __lock_acquire+0xf77/0x10b4
 lock_acquire+0x174/0x19f
 ? afs_begin_vnode_operation+0x33/0x77 [kafs]
 __mutex_lock+0x86/0x7d2
 ? afs_begin_vnode_operation+0x33/0x77 [kafs]
 ? afs_begin_vnode_operation+0x33/0x77 [kafs]
 ? afs_begin_vnode_operation+0x33/0x77 [kafs]
 afs_begin_vnode_operation+0x33/0x77 [kafs]
 afs_fetch_data+0x80/0x12a [kafs]
 afs_readpages+0x314/0x405 [kafs]
 __do_page_cache_readahead+0x203/0x2ba
 ? filemap_fault+0x179/0x54d
 filemap_fault+0x179/0x54d
 __do_fault+0x17/0x60
 __handle_mm_fault+0x6d7/0x95c
 handle_mm_fault+0x24e/0x2a3
 __do_page_fault+0x301/0x486
 do_page_fault+0x236/0x259
 page_fault+0x22/0x30
RIP: 0010:__clear_user+0x3d/0x60
RSP: 0018:ffff880071e93da0 EFLAGS: 00010202
RAX: 0000000000000000 RBX: 000000000000011c RCX: 000000000000011c
RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000000000060f720
RBP: 000000000060f720 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000001 R11: ffff8800b5459b68 R12: ffff8800ce150e00
R13: 000000000060f720 R14: 00000000006127a8 R15: 0000000000000000
 padzero+0x1c/0x2b
 load_elf_binary+0x785/0xdc7
 search_binary_handler+0x81/0x1ff
 do_execveat_common.isra.14+0x600/0x888
 do_execve+0x1f/0x21
 SyS_execve+0x28/0x2f
 do_syscall_64+0x89/0x1be
 entry_SYSCALL64_slow_path+0x25/0x25
RIP: 0033:0x7fdb6009ee07
RSP: 002b:00007fff566d9728 EFLAGS: 00000246 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 000055ba57280900 RCX: 00007fdb6009ee07
RDX: 000055ba5727f270 RSI: 000055ba5727cac0 RDI: 000055ba57280900
RBP: 000055ba57280900 R08: 00007fff566d9700 R09: 0000000000000000
R10: 000055ba5727cac0 R11: 0000000000000246 R12: 0000000000000000
R13: 000055ba5727cac0 R14: 000055ba5727f270 R15: 0000000000000000

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h |  2 +-
 net/rxrpc/call_accept.c |  2 +-
 net/rxrpc/call_object.c | 19 +++++++++++++++----
 3 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index b2151993d384..a972887b3f5d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -672,7 +672,7 @@ extern unsigned int rxrpc_max_call_lifetime;
 extern struct kmem_cache *rxrpc_call_jar;
 
 struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
-struct rxrpc_call *rxrpc_alloc_call(gfp_t);
+struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t);
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
 					 struct rxrpc_conn_parameters *,
 					 struct sockaddr_rxrpc *,
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index cbd1701e813a..3028298ca561 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -94,7 +94,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
 	/* Now it gets complicated, because calls get registered with the
 	 * socket here, particularly if a user ID is preassigned by the user.
 	 */
-	call = rxrpc_alloc_call(gfp);
+	call = rxrpc_alloc_call(rx, gfp);
 	if (!call)
 		return -ENOMEM;
 	call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 4c7fbc6dcce7..1f141dc08ad2 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -55,6 +55,8 @@ static void rxrpc_call_timer_expired(unsigned long _call)
 		rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
 }
 
+static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
+
 /*
  * find an extant server call
  * - called in process context with IRQs enabled
@@ -95,7 +97,7 @@ found_extant_call:
 /*
  * allocate a new call
  */
-struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
+struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
 {
 	struct rxrpc_call *call;
 
@@ -114,6 +116,14 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
 		goto nomem_2;
 
 	mutex_init(&call->user_mutex);
+
+	/* Prevent lockdep reporting a deadlock false positive between the afs
+	 * filesystem and sys_sendmsg() via the mmap sem.
+	 */
+	if (rx->sk.sk_kern_sock)
+		lockdep_set_class(&call->user_mutex,
+				  &rxrpc_call_user_mutex_lock_class_key);
+
 	setup_timer(&call->timer, rxrpc_call_timer_expired,
 		    (unsigned long)call);
 	INIT_WORK(&call->processor, &rxrpc_process_call);
@@ -151,7 +161,8 @@ nomem:
 /*
  * Allocate a new client call.
  */
-static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
+static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+						  struct sockaddr_rxrpc *srx,
 						  gfp_t gfp)
 {
 	struct rxrpc_call *call;
@@ -159,7 +170,7 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct sockaddr_rxrpc *srx,
 
 	_enter("");
 
-	call = rxrpc_alloc_call(gfp);
+	call = rxrpc_alloc_call(rx, gfp);
 	if (!call)
 		return ERR_PTR(-ENOMEM);
 	call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
@@ -210,7 +221,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 
 	_enter("%p,%lx", rx, user_call_ID);
 
-	call = rxrpc_alloc_client_call(srx, gfp);
+	call = rxrpc_alloc_client_call(rx, srx, gfp);
 	if (IS_ERR(call)) {
 		release_sock(&rx->sk);
 		_leave(" = %ld", PTR_ERR(call));
-- 
cgit v1.2.3


From 3136ef49a14ccc148becf813074e08fc92fc9b23 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Delay terminal ACK transmission on a client call

Delay terminal ACK transmission on a client call by deferring it to the
connection processor.  This allows it to be skipped if we can send the next
call instead, the first DATA packet of which will implicitly ack this call.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h | 17 ++++++++++++
 net/rxrpc/conn_client.c | 18 ++++++++++++
 net/rxrpc/conn_event.c  | 74 ++++++++++++++++++++++++++++++++++++++++---------
 net/rxrpc/conn_object.c | 10 +++++++
 net/rxrpc/recvmsg.c     |  2 ++
 5 files changed, 108 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index a972887b3f5d..d1213d503f30 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -338,8 +338,17 @@ enum rxrpc_conn_flag {
 	RXRPC_CONN_DONT_REUSE,		/* Don't reuse this connection */
 	RXRPC_CONN_COUNTED,		/* Counted by rxrpc_nr_client_conns */
 	RXRPC_CONN_PROBING_FOR_UPGRADE,	/* Probing for service upgrade */
+	RXRPC_CONN_FINAL_ACK_0,		/* Need final ACK for channel 0 */
+	RXRPC_CONN_FINAL_ACK_1,		/* Need final ACK for channel 1 */
+	RXRPC_CONN_FINAL_ACK_2,		/* Need final ACK for channel 2 */
+	RXRPC_CONN_FINAL_ACK_3,		/* Need final ACK for channel 3 */
 };
 
+#define RXRPC_CONN_FINAL_ACK_MASK ((1UL << RXRPC_CONN_FINAL_ACK_0) |	\
+				   (1UL << RXRPC_CONN_FINAL_ACK_1) |	\
+				   (1UL << RXRPC_CONN_FINAL_ACK_2) |	\
+				   (1UL << RXRPC_CONN_FINAL_ACK_3))
+
 /*
  * Events that can be raised upon a connection.
  */
@@ -393,6 +402,7 @@ struct rxrpc_connection {
 #define RXRPC_ACTIVE_CHANS_MASK	((1 << RXRPC_MAXCALLS) - 1)
 	struct list_head	waiting_calls;	/* Calls waiting for channels */
 	struct rxrpc_channel {
+		unsigned long		final_ack_at;	/* Time at which to issue final ACK */
 		struct rxrpc_call __rcu	*call;		/* Active call */
 		u32			call_id;	/* ID of current call */
 		u32			call_counter;	/* Call ID counter */
@@ -404,6 +414,7 @@ struct rxrpc_connection {
 		};
 	} channels[RXRPC_MAXCALLS];
 
+	struct timer_list	timer;		/* Conn event timer */
 	struct work_struct	processor;	/* connection event processor */
 	union {
 		struct rb_node	client_node;	/* Node in local->client_conns */
@@ -861,6 +872,12 @@ static inline void rxrpc_put_connection(struct rxrpc_connection *conn)
 		rxrpc_put_service_conn(conn);
 }
 
+static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn,
+					   unsigned long expire_at)
+{
+	timer_reduce(&conn->timer, expire_at);
+}
+
 /*
  * conn_service.c
  */
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 5f9624bd311c..cfb997593da9 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -554,6 +554,11 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
 
 	trace_rxrpc_client(conn, channel, rxrpc_client_chan_activate);
 
+	/* Cancel the final ACK on the previous call if it hasn't been sent yet
+	 * as the DATA packet will implicitly ACK it.
+	 */
+	clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags);
+
 	write_lock_bh(&call->state_lock);
 	if (!test_bit(RXRPC_CALL_TX_LASTQ, &call->flags))
 		call->state = RXRPC_CALL_CLIENT_SEND_REQUEST;
@@ -813,6 +818,19 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 		goto out_2;
 	}
 
+	/* Schedule the final ACK to be transmitted in a short while so that it
+	 * can be skipped if we find a follow-on call.  The first DATA packet
+	 * of the follow on call will implicitly ACK this call.
+	 */
+	if (test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
+		unsigned long final_ack_at = jiffies + 2;
+
+		WRITE_ONCE(chan->final_ack_at, final_ack_at);
+		smp_wmb(); /* vs rxrpc_process_delayed_final_acks() */
+		set_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags);
+		rxrpc_reduce_conn_timer(conn, final_ack_at);
+	}
+
 	/* Things are more complex and we need the cache lock.  We might be
 	 * able to simply idle the conn or it might now be lurking on the wait
 	 * list.  It might even get moved back to the active list whilst we're
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 59a51a56e7c8..9e9a8db1bc9c 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -24,9 +24,10 @@
  * Retransmit terminal ACK or ABORT of the previous call.
  */
 static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
-				       struct sk_buff *skb)
+				       struct sk_buff *skb,
+				       unsigned int channel)
 {
-	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	struct rxrpc_skb_priv *sp = skb ? rxrpc_skb(skb) : NULL;
 	struct rxrpc_channel *chan;
 	struct msghdr msg;
 	struct kvec iov;
@@ -48,7 +49,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 
 	_enter("%d", conn->debug_id);
 
-	chan = &conn->channels[sp->hdr.cid & RXRPC_CHANNELMASK];
+	chan = &conn->channels[channel];
 
 	/* If the last call got moved on whilst we were waiting to run, just
 	 * ignore this packet.
@@ -56,7 +57,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	call_id = READ_ONCE(chan->last_call);
 	/* Sync with __rxrpc_disconnect_call() */
 	smp_rmb();
-	if (call_id != sp->hdr.callNumber)
+	if (skb && call_id != sp->hdr.callNumber)
 		return;
 
 	msg.msg_name	= &conn->params.peer->srx.transport;
@@ -65,9 +66,9 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	msg.msg_controllen = 0;
 	msg.msg_flags	= 0;
 
-	pkt.whdr.epoch		= htonl(sp->hdr.epoch);
-	pkt.whdr.cid		= htonl(sp->hdr.cid);
-	pkt.whdr.callNumber	= htonl(sp->hdr.callNumber);
+	pkt.whdr.epoch		= htonl(conn->proto.epoch);
+	pkt.whdr.cid		= htonl(conn->proto.cid);
+	pkt.whdr.callNumber	= htonl(call_id);
 	pkt.whdr.seq		= 0;
 	pkt.whdr.type		= chan->last_type;
 	pkt.whdr.flags		= conn->out_clientflag;
@@ -87,11 +88,11 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 		mtu = conn->params.peer->if_mtu;
 		mtu -= conn->params.peer->hdrsize;
 		pkt.ack.bufferSpace	= 0;
-		pkt.ack.maxSkew		= htons(skb->priority);
-		pkt.ack.firstPacket	= htonl(chan->last_seq);
-		pkt.ack.previousPacket	= htonl(chan->last_seq - 1);
-		pkt.ack.serial		= htonl(sp->hdr.serial);
-		pkt.ack.reason		= RXRPC_ACK_DUPLICATE;
+		pkt.ack.maxSkew		= htons(skb ? skb->priority : 0);
+		pkt.ack.firstPacket	= htonl(chan->last_seq + 1);
+		pkt.ack.previousPacket	= htonl(chan->last_seq);
+		pkt.ack.serial		= htonl(skb ? sp->hdr.serial : 0);
+		pkt.ack.reason		= skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
 		pkt.ack.nAcks		= 0;
 		pkt.info.rxMTU		= htonl(rxrpc_rx_mtu);
 		pkt.info.maxMTU		= htonl(mtu);
@@ -272,7 +273,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_DATA:
 	case RXRPC_PACKET_TYPE_ACK:
-		rxrpc_conn_retransmit_call(conn, skb);
+		rxrpc_conn_retransmit_call(conn, skb,
+					   sp->hdr.cid & RXRPC_CHANNELMASK);
 		return 0;
 
 	case RXRPC_PACKET_TYPE_BUSY:
@@ -378,6 +380,48 @@ abort:
 	_leave(" [aborted]");
 }
 
+/*
+ * Process delayed final ACKs that we haven't subsumed into a subsequent call.
+ */
+static void rxrpc_process_delayed_final_acks(struct rxrpc_connection *conn)
+{
+	unsigned long j = jiffies, next_j;
+	unsigned int channel;
+	bool set;
+
+again:
+	next_j = j + LONG_MAX;
+	set = false;
+	for (channel = 0; channel < RXRPC_MAXCALLS; channel++) {
+		struct rxrpc_channel *chan = &conn->channels[channel];
+		unsigned long ack_at;
+
+		if (!test_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags))
+			continue;
+
+		smp_rmb(); /* vs rxrpc_disconnect_client_call */
+		ack_at = READ_ONCE(chan->final_ack_at);
+
+		if (time_before(j, ack_at)) {
+			if (time_before(ack_at, next_j)) {
+				next_j = ack_at;
+				set = true;
+			}
+			continue;
+		}
+
+		if (test_and_clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel,
+				       &conn->flags))
+			rxrpc_conn_retransmit_call(conn, NULL, channel);
+	}
+
+	j = jiffies;
+	if (time_before_eq(next_j, j))
+		goto again;
+	if (set)
+		rxrpc_reduce_conn_timer(conn, next_j);
+}
+
 /*
  * connection-level event processor
  */
@@ -394,6 +438,10 @@ void rxrpc_process_connection(struct work_struct *work)
 	if (test_and_clear_bit(RXRPC_CONN_EV_CHALLENGE, &conn->events))
 		rxrpc_secure_connection(conn);
 
+	/* Process delayed ACKs whose time has come. */
+	if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK)
+		rxrpc_process_delayed_final_acks(conn);
+
 	/* go through the conn-level event packets, releasing the ref on this
 	 * connection that each one has when we've finished with it */
 	while ((skb = skb_dequeue(&conn->rx_queue))) {
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index fe575798592f..a01a3791f06a 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -24,6 +24,14 @@ unsigned int rxrpc_connection_expiry = 10 * 60;
 
 static void rxrpc_destroy_connection(struct rcu_head *);
 
+static void rxrpc_connection_timer(struct timer_list *timer)
+{
+	struct rxrpc_connection *conn =
+		container_of(timer, struct rxrpc_connection, timer);
+
+	rxrpc_queue_conn(conn);
+}
+
 /*
  * allocate a new connection
  */
@@ -38,6 +46,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
 		INIT_LIST_HEAD(&conn->cache_link);
 		spin_lock_init(&conn->channel_lock);
 		INIT_LIST_HEAD(&conn->waiting_calls);
+		timer_setup(&conn->timer, &rxrpc_connection_timer, 0);
 		INIT_WORK(&conn->processor, &rxrpc_process_connection);
 		INIT_LIST_HEAD(&conn->proc_link);
 		INIT_LIST_HEAD(&conn->link);
@@ -332,6 +341,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
 
 	_net("DESTROY CONN %d", conn->debug_id);
 
+	del_timer_sync(&conn->timer);
 	rxrpc_purge_queue(&conn->rx_queue);
 
 	conn->security->clear(conn);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 8510a98b87e1..be0b9ae13893 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -144,11 +144,13 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
 	ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
 
+#if 0 // TODO: May want to transmit final ACK under some circumstances anyway
 	if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
 		rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
 				  rxrpc_propose_ack_terminal_ack);
 		rxrpc_send_ack_packet(call, false);
 	}
+#endif
 
 	write_lock_bh(&call->state_lock);
 
-- 
cgit v1.2.3


From 4812417894770f8c13e5dd8a66479ae44f4b01ff Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Split the call params from the operation params

When rxrpc_sendmsg() parses the control message buffer, it places the
parameters extracted into a structure, but lumps together call parameters
(such as user call ID) with operation parameters (such as whether to send
data, send an abort or accept a call).

Split the call parameters out into their own structure, a copy of which is
then embedded in the operation parameters struct.

The call parameters struct is then passed down into the places that need it
instead of passing the individual parameters.  This allows for extra call
parameters to be added.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  8 ++++++--
 net/rxrpc/ar-internal.h | 31 +++++++++++++++++++++++++++++-
 net/rxrpc/call_object.c | 15 +++++++--------
 net/rxrpc/sendmsg.c     | 51 +++++++++++++++++--------------------------------
 4 files changed, 60 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 9b5c46b052fd..c0cdcf980ffc 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -285,6 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 					   bool upgrade)
 {
 	struct rxrpc_conn_parameters cp;
+	struct rxrpc_call_params p;
 	struct rxrpc_call *call;
 	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
 	int ret;
@@ -302,6 +303,10 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	if (key && !key->payload.data[0])
 		key = NULL; /* a no-security key */
 
+	memset(&p, 0, sizeof(p));
+	p.user_call_ID = user_call_ID;
+	p.tx_total_len = tx_total_len;
+
 	memset(&cp, 0, sizeof(cp));
 	cp.local		= rx->local;
 	cp.key			= key;
@@ -309,8 +314,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	cp.exclusive		= false;
 	cp.upgrade		= upgrade;
 	cp.service_id		= srx->srx_service;
-	call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, tx_total_len,
-				     gfp);
+	call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp);
 	/* The socket has been unlocked. */
 	if (!IS_ERR(call)) {
 		call->notify_rx = notify_rx;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index d1213d503f30..ba63f2231107 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -643,6 +643,35 @@ struct rxrpc_ack_summary {
 	u8			cumulative_acks;
 };
 
+/*
+ * sendmsg() cmsg-specified parameters.
+ */
+enum rxrpc_command {
+	RXRPC_CMD_SEND_DATA,		/* send data message */
+	RXRPC_CMD_SEND_ABORT,		/* request abort generation */
+	RXRPC_CMD_ACCEPT,		/* [server] accept incoming call */
+	RXRPC_CMD_REJECT_BUSY,		/* [server] reject a call as busy */
+};
+
+struct rxrpc_call_params {
+	s64			tx_total_len;	/* Total Tx data length (if send data) */
+	unsigned long		user_call_ID;	/* User's call ID */
+	struct {
+		u32		hard;		/* Maximum lifetime (sec) */
+		u32		idle;		/* Max time since last data packet (msec) */
+		u32		normal;		/* Max time since last call packet (msec) */
+	} timeouts;
+	u8			nr_timeouts;	/* Number of timeouts specified */
+};
+
+struct rxrpc_send_params {
+	struct rxrpc_call_params call;
+	u32			abort_code;	/* Abort code to Tx (if abort) */
+	enum rxrpc_command	command : 8;	/* The command to implement */
+	bool			exclusive;	/* Shared or exclusive call */
+	bool			upgrade;	/* If the connection is upgradeable */
+};
+
 #include <trace/events/rxrpc.h>
 
 /*
@@ -687,7 +716,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t);
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
 					 struct rxrpc_conn_parameters *,
 					 struct sockaddr_rxrpc *,
-					 unsigned long, s64, gfp_t);
+					 struct rxrpc_call_params *, gfp_t);
 int rxrpc_retry_client_call(struct rxrpc_sock *,
 			    struct rxrpc_call *,
 			    struct rxrpc_conn_parameters *,
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 1f141dc08ad2..c3e1fa854471 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -208,8 +208,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 					 struct rxrpc_conn_parameters *cp,
 					 struct sockaddr_rxrpc *srx,
-					 unsigned long user_call_ID,
-					 s64 tx_total_len,
+					 struct rxrpc_call_params *p,
 					 gfp_t gfp)
 	__releases(&rx->sk.sk_lock.slock)
 {
@@ -219,7 +218,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 	const void *here = __builtin_return_address(0);
 	int ret;
 
-	_enter("%p,%lx", rx, user_call_ID);
+	_enter("%p,%lx", rx, p->user_call_ID);
 
 	call = rxrpc_alloc_client_call(rx, srx, gfp);
 	if (IS_ERR(call)) {
@@ -228,9 +227,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 		return call;
 	}
 
-	call->tx_total_len = tx_total_len;
+	call->tx_total_len = p->tx_total_len;
 	trace_rxrpc_call(call, rxrpc_call_new_client, atomic_read(&call->usage),
-			 here, (const void *)user_call_ID);
+			 here, (const void *)p->user_call_ID);
 
 	/* We need to protect a partially set up call against the user as we
 	 * will be acting outside the socket lock.
@@ -246,16 +245,16 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 		parent = *pp;
 		xcall = rb_entry(parent, struct rxrpc_call, sock_node);
 
-		if (user_call_ID < xcall->user_call_ID)
+		if (p->user_call_ID < xcall->user_call_ID)
 			pp = &(*pp)->rb_left;
-		else if (user_call_ID > xcall->user_call_ID)
+		else if (p->user_call_ID > xcall->user_call_ID)
 			pp = &(*pp)->rb_right;
 		else
 			goto error_dup_user_ID;
 	}
 
 	rcu_assign_pointer(call->socket, rx);
-	call->user_call_ID = user_call_ID;
+	call->user_call_ID = p->user_call_ID;
 	__set_bit(RXRPC_CALL_HAS_USERID, &call->flags);
 	rxrpc_get_call(call, rxrpc_call_got_userid);
 	rb_link_node(&call->sock_node, parent, pp);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 94555c94b2d8..de5ab327c18a 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -21,22 +21,6 @@
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-enum rxrpc_command {
-	RXRPC_CMD_SEND_DATA,		/* send data message */
-	RXRPC_CMD_SEND_ABORT,		/* request abort generation */
-	RXRPC_CMD_ACCEPT,		/* [server] accept incoming call */
-	RXRPC_CMD_REJECT_BUSY,		/* [server] reject a call as busy */
-};
-
-struct rxrpc_send_params {
-	s64			tx_total_len;	/* Total Tx data length (if send data) */
-	unsigned long		user_call_ID;	/* User's call ID */
-	u32			abort_code;	/* Abort code to Tx (if abort) */
-	enum rxrpc_command	command : 8;	/* The command to implement */
-	bool			exclusive;	/* Shared or exclusive call */
-	bool			upgrade;	/* If the connection is upgradeable */
-};
-
 /*
  * Wait for space to appear in the Tx queue or a signal to occur.
  */
@@ -480,11 +464,11 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 			if (msg->msg_flags & MSG_CMSG_COMPAT) {
 				if (len != sizeof(u32))
 					return -EINVAL;
-				p->user_call_ID = *(u32 *)CMSG_DATA(cmsg);
+				p->call.user_call_ID = *(u32 *)CMSG_DATA(cmsg);
 			} else {
 				if (len != sizeof(unsigned long))
 					return -EINVAL;
-				p->user_call_ID = *(unsigned long *)
+				p->call.user_call_ID = *(unsigned long *)
 					CMSG_DATA(cmsg);
 			}
 			got_user_ID = true;
@@ -522,10 +506,10 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 			break;
 
 		case RXRPC_TX_LENGTH:
-			if (p->tx_total_len != -1 || len != sizeof(__s64))
+			if (p->call.tx_total_len != -1 || len != sizeof(__s64))
 				return -EINVAL;
-			p->tx_total_len = *(__s64 *)CMSG_DATA(cmsg);
-			if (p->tx_total_len < 0)
+			p->call.tx_total_len = *(__s64 *)CMSG_DATA(cmsg);
+			if (p->call.tx_total_len < 0)
 				return -EINVAL;
 			break;
 
@@ -536,7 +520,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 
 	if (!got_user_ID)
 		return -EINVAL;
-	if (p->tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA)
+	if (p->call.tx_total_len != -1 && p->command != RXRPC_CMD_SEND_DATA)
 		return -EINVAL;
 	_leave(" = 0");
 	return 0;
@@ -576,8 +560,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
 	cp.exclusive		= rx->exclusive | p->exclusive;
 	cp.upgrade		= p->upgrade;
 	cp.service_id		= srx->srx_service;
-	call = rxrpc_new_client_call(rx, &cp, srx, p->user_call_ID,
-				     p->tx_total_len, GFP_KERNEL);
+	call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL);
 	/* The socket is now unlocked */
 
 	_leave(" = %p\n", call);
@@ -597,12 +580,12 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 	int ret;
 
 	struct rxrpc_send_params p = {
-		.tx_total_len	= -1,
-		.user_call_ID	= 0,
-		.abort_code	= 0,
-		.command	= RXRPC_CMD_SEND_DATA,
-		.exclusive	= false,
-		.upgrade	= false,
+		.call.tx_total_len	= -1,
+		.call.user_call_ID	= 0,
+		.abort_code		= 0,
+		.command		= RXRPC_CMD_SEND_DATA,
+		.exclusive		= false,
+		.upgrade		= false,
 	};
 
 	_enter("");
@@ -615,7 +598,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		ret = -EINVAL;
 		if (rx->sk.sk_state != RXRPC_SERVER_LISTENING)
 			goto error_release_sock;
-		call = rxrpc_accept_call(rx, p.user_call_ID, NULL);
+		call = rxrpc_accept_call(rx, p.call.user_call_ID, NULL);
 		/* The socket is now unlocked. */
 		if (IS_ERR(call))
 			return PTR_ERR(call);
@@ -623,7 +606,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		goto out_put_unlock;
 	}
 
-	call = rxrpc_find_call_by_user_ID(rx, p.user_call_ID);
+	call = rxrpc_find_call_by_user_ID(rx, p.call.user_call_ID);
 	if (!call) {
 		ret = -EBADSLT;
 		if (p.command != RXRPC_CMD_SEND_DATA)
@@ -653,13 +636,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 			goto error_put;
 		}
 
-		if (p.tx_total_len != -1) {
+		if (p.call.tx_total_len != -1) {
 			ret = -EINVAL;
 			if (call->tx_total_len != -1 ||
 			    call->tx_pending ||
 			    call->tx_top != 0)
 				goto error_put;
-			call->tx_total_len = p.tx_total_len;
+			call->tx_total_len = p.call.tx_total_len;
 		}
 	}
 
-- 
cgit v1.2.3


From a158bdd3247b9656df36ba133235fff702e9fdc3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Fix call timeouts

Fix the rxrpc call expiration timeouts and make them settable from
userspace.  By analogy with other rx implementations, there should be three
timeouts:

 (1) "Normal timeout"

     This is set for all calls and is triggered if we haven't received any
     packets from the peer in a while.  It is measured from the last time
     we received any packet on that call.  This is not reset by any
     connection packets (such as CHALLENGE/RESPONSE packets).

     If a service operation takes a long time, the server should generate
     PING ACKs at a duration that's substantially less than the normal
     timeout so is to keep both sides alive.  This is set at 1/6 of normal
     timeout.

 (2) "Idle timeout"

     This is set only for a service call and is triggered if we stop
     receiving the DATA packets that comprise the request data.  It is
     measured from the last time we received a DATA packet.

 (3) "Hard timeout"

     This can be set for a call and specified the maximum lifetime of that
     call.  It should not be specified by default.  Some operations (such
     as volume transfer) take a long time.

Allow userspace to set/change the timeouts on a call with sendmsg, using a
control message:

	RXRPC_SET_CALL_TIMEOUTS

The data to the message is a number of 32-bit words, not all of which need
be given:

	u32 hard_timeout;	/* sec from first packet */
	u32 idle_timeout;	/* msec from packet Rx */
	u32 normal_timeout;	/* msec from data Rx */

This can be set in combination with any other sendmsg() that affects a
call.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  69 +++++++++++------
 include/uapi/linux/rxrpc.h   |   1 +
 net/rxrpc/ar-internal.h      |  37 +++++----
 net/rxrpc/call_event.c       | 179 ++++++++++++++++++++-----------------------
 net/rxrpc/call_object.c      |  27 ++++---
 net/rxrpc/conn_client.c      |   4 +-
 net/rxrpc/input.c            |  34 +++++++-
 net/rxrpc/misc.c             |  19 ++---
 net/rxrpc/recvmsg.c          |   2 +-
 net/rxrpc/sendmsg.c          |  59 +++++++++++---
 net/rxrpc/sysctl.c           |  60 +++++++--------
 11 files changed, 290 insertions(+), 201 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index ebe96796027a..01dcbc2164b5 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -138,10 +138,20 @@ enum rxrpc_rtt_rx_trace {
 
 enum rxrpc_timer_trace {
 	rxrpc_timer_begin,
+	rxrpc_timer_exp_ack,
+	rxrpc_timer_exp_hard,
+	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_normal,
+	rxrpc_timer_exp_ping,
+	rxrpc_timer_exp_resend,
 	rxrpc_timer_expired,
 	rxrpc_timer_init_for_reply,
 	rxrpc_timer_init_for_send_reply,
+	rxrpc_timer_restart,
 	rxrpc_timer_set_for_ack,
+	rxrpc_timer_set_for_hard,
+	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
 	rxrpc_timer_set_for_resend,
 	rxrpc_timer_set_for_send,
@@ -296,12 +306,22 @@ enum rxrpc_congest_change {
 #define rxrpc_timer_traces \
 	EM(rxrpc_timer_begin,			"Begin ") \
 	EM(rxrpc_timer_expired,			"*EXPR*") \
+	EM(rxrpc_timer_exp_ack,			"ExpAck") \
+	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
+	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_normal,		"ExpNml") \
+	EM(rxrpc_timer_exp_ping,		"ExpPng") \
+	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
 	EM(rxrpc_timer_init_for_reply,		"IniRpl") \
 	EM(rxrpc_timer_init_for_send_reply,	"SndRpl") \
+	EM(rxrpc_timer_restart,			"Restrt") \
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
+	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
+	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
 	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
-	E_(rxrpc_timer_set_for_send,		"SetTx ")
+	E_(rxrpc_timer_set_for_send,		"SetSnd")
 
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
@@ -932,39 +952,44 @@ TRACE_EVENT(rxrpc_rtt_rx,
 
 TRACE_EVENT(rxrpc_timer,
 	    TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now, unsigned long now_j),
+		     unsigned long now),
 
-	    TP_ARGS(call, why, now, now_j),
+	    TP_ARGS(call, why, now),
 
 	    TP_STRUCT__entry(
 		    __field(struct rxrpc_call *,		call		)
 		    __field(enum rxrpc_timer_trace,		why		)
-		    __field_struct(ktime_t,			now		)
-		    __field_struct(ktime_t,			expire_at	)
-		    __field_struct(ktime_t,			ack_at		)
-		    __field_struct(ktime_t,			resend_at	)
-		    __field(unsigned long,			now_j		)
-		    __field(unsigned long,			timer		)
+		    __field(long,				now		)
+		    __field(long,				ack_at		)
+		    __field(long,				resend_at	)
+		    __field(long,				ping_at		)
+		    __field(long,				expect_rx_by	)
+		    __field(long,				expect_req_by	)
+		    __field(long,				expect_term_by	)
+		    __field(long,				timer		)
 			     ),
 
 	    TP_fast_assign(
-		    __entry->call	= call;
-		    __entry->why	= why;
-		    __entry->now	= now;
-		    __entry->expire_at	= call->expire_at;
-		    __entry->ack_at	= call->ack_at;
-		    __entry->resend_at	= call->resend_at;
-		    __entry->now_j	= now_j;
-		    __entry->timer	= call->timer.expires;
+		    __entry->call		= call;
+		    __entry->why		= why;
+		    __entry->now		= now;
+		    __entry->ack_at		= call->ack_at;
+		    __entry->resend_at		= call->resend_at;
+		    __entry->expect_rx_by	= call->expect_rx_by;
+		    __entry->expect_req_by	= call->expect_req_by;
+		    __entry->expect_term_by	= call->expect_term_by;
+		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%p %s x=%lld a=%lld r=%lld t=%ld",
+	    TP_printk("c=%p %s a=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
-		      ktime_to_ns(ktime_sub(__entry->expire_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->ack_at, __entry->now)),
-		      ktime_to_ns(ktime_sub(__entry->resend_at, __entry->now)),
-		      __entry->timer - __entry->now_j)
+		      __entry->ack_at - __entry->now,
+		      __entry->resend_at - __entry->now,
+		      __entry->expect_rx_by - __entry->now,
+		      __entry->expect_req_by - __entry->now,
+		      __entry->expect_term_by - __entry->now,
+		      __entry->timer - __entry->now)
 	    );
 
 TRACE_EVENT(rxrpc_rx_lose,
diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h
index 9d4afea308a4..9335d92c14a4 100644
--- a/include/uapi/linux/rxrpc.h
+++ b/include/uapi/linux/rxrpc.h
@@ -59,6 +59,7 @@ enum rxrpc_cmsg_type {
 	RXRPC_EXCLUSIVE_CALL	= 10,	/* s-: Call should be on exclusive connection */
 	RXRPC_UPGRADE_SERVICE	= 11,	/* s-: Request service upgrade for client call */
 	RXRPC_TX_LENGTH		= 12,	/* s-: Total length of Tx data */
+	RXRPC_SET_CALL_TIMEOUT	= 13,	/* s-: Set one or more call timeouts */
 	RXRPC__SUPPORTED
 };
 
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index ba63f2231107..548411371048 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -468,9 +468,9 @@ enum rxrpc_call_flag {
 enum rxrpc_call_event {
 	RXRPC_CALL_EV_ACK,		/* need to generate ACK */
 	RXRPC_CALL_EV_ABORT,		/* need to generate abort */
-	RXRPC_CALL_EV_TIMER,		/* Timer expired */
 	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
 	RXRPC_CALL_EV_PING,		/* Ping send required */
+	RXRPC_CALL_EV_EXPIRED,		/* Expiry occurred */
 };
 
 /*
@@ -514,10 +514,14 @@ struct rxrpc_call {
 	struct rxrpc_peer	*peer;		/* Peer record for remote address */
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
 	struct mutex		user_mutex;	/* User access mutex */
-	ktime_t			ack_at;		/* When deferred ACK needs to happen */
-	ktime_t			resend_at;	/* When next resend needs to happen */
-	ktime_t			ping_at;	/* When next to send a ping */
-	ktime_t			expire_at;	/* When the call times out */
+	unsigned long		ack_at;		/* When deferred ACK needs to happen */
+	unsigned long		resend_at;	/* When next resend needs to happen */
+	unsigned long		ping_at;	/* When next to send a ping */
+	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
+	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
+	unsigned long		expect_term_by;	/* When we expect call termination by */
+	u32			next_rx_timo;	/* Timeout for next Rx packet (jif) */
+	u32			next_req_timo;	/* Timeout for next Rx request packet (jif) */
 	struct timer_list	timer;		/* Combined event timer */
 	struct work_struct	processor;	/* Event processor */
 	rxrpc_notify_rx_t	notify_rx;	/* kernel service Rx notification function */
@@ -697,12 +701,19 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * call_event.c
  */
-void __rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
-void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace, ktime_t);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool,
 		       enum rxrpc_propose_ack_trace);
 void rxrpc_process_call(struct work_struct *);
 
+static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call,
+					   unsigned long expire_at,
+					   unsigned long now,
+					   enum rxrpc_timer_trace why)
+{
+	trace_rxrpc_timer(call, why, now);
+	timer_reduce(&call->timer, expire_at);
+}
+
 /*
  * call_object.c
  */
@@ -843,8 +854,8 @@ static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call,
  */
 extern unsigned int rxrpc_max_client_connections;
 extern unsigned int rxrpc_reap_client_connections;
-extern unsigned int rxrpc_conn_idle_client_expiry;
-extern unsigned int rxrpc_conn_idle_client_fast_expiry;
+extern unsigned long rxrpc_conn_idle_client_expiry;
+extern unsigned long rxrpc_conn_idle_client_fast_expiry;
 extern struct idr rxrpc_client_conn_ids;
 
 void rxrpc_destroy_client_conn_ids(void);
@@ -976,13 +987,13 @@ static inline void rxrpc_queue_local(struct rxrpc_local *local)
  * misc.c
  */
 extern unsigned int rxrpc_max_backlog __read_mostly;
-extern unsigned int rxrpc_requested_ack_delay;
-extern unsigned int rxrpc_soft_ack_delay;
-extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned long rxrpc_requested_ack_delay;
+extern unsigned long rxrpc_soft_ack_delay;
+extern unsigned long rxrpc_idle_ack_delay;
 extern unsigned int rxrpc_rx_window_size;
 extern unsigned int rxrpc_rx_mtu;
 extern unsigned int rxrpc_rx_jumbo_max;
-extern unsigned int rxrpc_resend_timeout;
+extern unsigned long rxrpc_resend_timeout;
 
 extern const s8 rxrpc_ack_priority[];
 
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3574508baf9a..c14395d5ad8c 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -21,80 +21,6 @@
 #include <net/af_rxrpc.h>
 #include "ar-internal.h"
 
-/*
- * Set the timer
- */
-void __rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		       ktime_t now)
-{
-	unsigned long t_j, now_j = jiffies;
-	ktime_t t;
-	bool queue = false;
-
-	if (call->state < RXRPC_CALL_COMPLETE) {
-		t = call->expire_at;
-		if (!ktime_after(t, now)) {
-			trace_rxrpc_timer(call, why, now, now_j);
-			queue = true;
-			goto out;
-		}
-
-		if (!ktime_after(call->resend_at, now)) {
-			call->resend_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
-				queue = true;
-		} else if (ktime_before(call->resend_at, t)) {
-			t = call->resend_at;
-		}
-
-		if (!ktime_after(call->ack_at, now)) {
-			call->ack_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ack_at, t)) {
-			t = call->ack_at;
-		}
-
-		if (!ktime_after(call->ping_at, now)) {
-			call->ping_at = call->expire_at;
-			if (!test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
-				queue = true;
-		} else if (ktime_before(call->ping_at, t)) {
-			t = call->ping_at;
-		}
-
-		t_j = nsecs_to_jiffies(ktime_to_ns(ktime_sub(t, now)));
-		t_j += jiffies;
-
-		/* We have to make sure that the calculated jiffies value falls
-		 * at or after the nsec value, or we may loop ceaselessly
-		 * because the timer times out, but we haven't reached the nsec
-		 * timeout yet.
-		 */
-		t_j++;
-
-		if (call->timer.expires != t_j || !timer_pending(&call->timer)) {
-			mod_timer(&call->timer, t_j);
-			trace_rxrpc_timer(call, why, now, now_j);
-		}
-	}
-
-out:
-	if (queue)
-		rxrpc_queue_call(call);
-}
-
-/*
- * Set the timer
- */
-void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why,
-		     ktime_t now)
-{
-	read_lock_bh(&call->state_lock);
-	__rxrpc_set_timer(call, why, now);
-	read_unlock_bh(&call->state_lock);
-}
-
 /*
  * Propose a PING ACK be sent.
  */
@@ -106,12 +32,13 @@ static void rxrpc_propose_ping(struct rxrpc_call *call,
 		    !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events))
 			rxrpc_queue_call(call);
 	} else {
-		ktime_t now = ktime_get_real();
-		ktime_t ping_at = ktime_add_ms(now, rxrpc_idle_ack_delay);
+		unsigned long now = jiffies;
+		unsigned long ping_at = now + rxrpc_idle_ack_delay;
 
-		if (ktime_before(ping_at, call->ping_at)) {
-			call->ping_at = ping_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ping, now);
+		if (time_before(ping_at, call->ping_at)) {
+			WRITE_ONCE(call->ping_at, ping_at);
+			rxrpc_reduce_call_timer(call, ping_at, now,
+						rxrpc_timer_set_for_ping);
 		}
 	}
 }
@@ -125,8 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 				enum rxrpc_propose_ack_trace why)
 {
 	enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
-	unsigned int expiry = rxrpc_soft_ack_delay;
-	ktime_t now, ack_at;
+	unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
 	/* Pings are handled specially because we don't want to accidentally
@@ -190,11 +116,12 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		    background)
 			rxrpc_queue_call(call);
 	} else {
-		now = ktime_get_real();
-		ack_at = ktime_add_ms(now, expiry);
-		if (ktime_before(ack_at, call->ack_at)) {
-			call->ack_at = ack_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_ack, now);
+		now = jiffies;
+		ack_at = jiffies + expiry;
+		if (time_before(ack_at, call->ack_at)) {
+			WRITE_ONCE(call->ack_at, ack_at);
+			rxrpc_reduce_call_timer(call, ack_at, now,
+						rxrpc_timer_set_for_ack);
 		}
 	}
 
@@ -227,18 +154,20 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
 /*
  * Perform retransmission of NAK'd and unack'd packets.
  */
-static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
+static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 {
 	struct rxrpc_skb_priv *sp;
 	struct sk_buff *skb;
+	unsigned long resend_at;
 	rxrpc_seq_t cursor, seq, top;
-	ktime_t max_age, oldest, ack_ts;
+	ktime_t now, max_age, oldest, ack_ts;
 	int ix;
 	u8 annotation, anno_type, retrans = 0, unacked = 0;
 
 	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
 
-	max_age = ktime_sub_ms(now, rxrpc_resend_timeout);
+	now = ktime_get_real();
+	max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ);
 
 	spin_lock_bh(&call->lock);
 
@@ -282,7 +211,9 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 				       ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
 	}
 
-	call->resend_at = ktime_add_ms(oldest, rxrpc_resend_timeout);
+	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now)));
+	resend_at += jiffies + rxrpc_resend_timeout;
+	WRITE_ONCE(call->resend_at, resend_at);
 
 	if (unacked)
 		rxrpc_congestion_timeout(call);
@@ -292,7 +223,8 @@ static void rxrpc_resend(struct rxrpc_call *call, ktime_t now)
 	 * retransmitting data.
 	 */
 	if (!retrans) {
-		rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_resend);
 		spin_unlock_bh(&call->lock);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
 		if (ktime_to_ns(ack_ts) < call->peer->rtt)
@@ -364,7 +296,7 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
-	ktime_t now;
+	unsigned long now, next, t;
 
 	rxrpc_see_call(call);
 
@@ -384,8 +316,50 @@ recheck_state:
 		goto out_put;
 	}
 
-	now = ktime_get_real();
-	if (ktime_before(call->expire_at, now)) {
+	/* Work out if any timeouts tripped */
+	now = jiffies;
+	t = READ_ONCE(call->expect_rx_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_req_by);
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST &&
+	    time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->expect_term_by);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now);
+		set_bit(RXRPC_CALL_EV_EXPIRED, &call->events);
+	}
+
+	t = READ_ONCE(call->ack_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now);
+		cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_ACK, &call->events);
+	}
+
+	t = READ_ONCE(call->ping_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
+		cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_PING, &call->events);
+	}
+
+	t = READ_ONCE(call->resend_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now);
+		cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_RESEND, &call->events);
+	}
+
+	/* Process events */
+	if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) {
 		rxrpc_abort_call("EXP", call, 0, RX_USER_ABORT, -ETIME);
 		set_bit(RXRPC_CALL_EV_ABORT, &call->events);
 		goto recheck_state;
@@ -408,7 +382,22 @@ recheck_state:
 		goto recheck_state;
 	}
 
-	rxrpc_set_timer(call, rxrpc_timer_set_for_resend, now);
+	/* Make sure the timer is restarted */
+	next = call->expect_rx_by;
+
+#define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; }
+	
+	set(call->expect_req_by);
+	set(call->expect_term_by);
+	set(call->ack_at);
+	set(call->resend_at);
+	set(call->ping_at);
+
+	now = jiffies;
+	if (time_after_eq(now, next))
+		goto recheck_state;
+
+	rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart);
 
 	/* other events may have been raised since we started checking */
 	if (call->events && call->state < RXRPC_CALL_COMPLETE) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c3e1fa854471..b305970a9b63 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -51,8 +51,10 @@ static void rxrpc_call_timer_expired(unsigned long _call)
 
 	_enter("%d", call->debug_id);
 
-	if (call->state < RXRPC_CALL_COMPLETE)
-		rxrpc_set_timer(call, rxrpc_timer_expired, ktime_get_real());
+	if (call->state < RXRPC_CALL_COMPLETE) {
+		trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies);
+		rxrpc_queue_call(call);
+	}
 }
 
 static struct lock_class_key rxrpc_call_user_mutex_lock_class_key;
@@ -139,6 +141,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
 	atomic_set(&call->usage, 1);
 	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
 	call->tx_total_len = -1;
+	call->next_rx_timo = 20 * HZ;
+	call->next_req_timo = 1 * HZ;
 
 	memset(&call->sock_node, 0xed, sizeof(call->sock_node));
 
@@ -189,15 +193,16 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
  */
 static void rxrpc_start_call_timer(struct rxrpc_call *call)
 {
-	ktime_t now = ktime_get_real(), expire_at;
-
-	expire_at = ktime_add_ms(now, rxrpc_max_call_lifetime);
-	call->expire_at = expire_at;
-	call->ack_at = expire_at;
-	call->ping_at = expire_at;
-	call->resend_at = expire_at;
-	call->timer.expires = jiffies + LONG_MAX / 2;
-	rxrpc_set_timer(call, rxrpc_timer_begin, now);
+	unsigned long now = jiffies;
+	unsigned long j = now + MAX_JIFFY_OFFSET;
+
+	call->ack_at = j;
+	call->resend_at = j;
+	call->ping_at = j;
+	call->expect_rx_by = j;
+	call->expect_req_by = j;
+	call->expect_term_by = j;
+	call->timer.expires = now;
 }
 
 /*
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index cfb997593da9..97f6a8de4845 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -85,8 +85,8 @@
 
 __read_mostly unsigned int rxrpc_max_client_connections = 1000;
 __read_mostly unsigned int rxrpc_reap_client_connections = 900;
-__read_mostly unsigned int rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
-__read_mostly unsigned int rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_expiry = 2 * 60 * HZ;
+__read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
 
 /*
  * We use machine-unique IDs for our client connections.
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 1b592073ec96..c89647eae86d 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -318,16 +318,18 @@ bad_state:
 static bool rxrpc_receiving_reply(struct rxrpc_call *call)
 {
 	struct rxrpc_ack_summary summary = { 0 };
+	unsigned long now, timo;
 	rxrpc_seq_t top = READ_ONCE(call->tx_top);
 
 	if (call->ackr_reason) {
 		spin_lock_bh(&call->lock);
 		call->ackr_reason = 0;
-		call->resend_at = call->expire_at;
-		call->ack_at = call->expire_at;
 		spin_unlock_bh(&call->lock);
-		rxrpc_set_timer(call, rxrpc_timer_init_for_reply,
-				ktime_get_real());
+		now = jiffies;
+		timo = now + MAX_JIFFY_OFFSET;
+		WRITE_ONCE(call->resend_at, timo);
+		WRITE_ONCE(call->ack_at, timo);
+		trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now);
 	}
 
 	if (!test_bit(RXRPC_CALL_TX_LAST, &call->flags))
@@ -437,6 +439,19 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb,
 	if (state >= RXRPC_CALL_COMPLETE)
 		return;
 
+	if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST) {
+		unsigned long timo = READ_ONCE(call->next_req_timo);
+		unsigned long now, expect_req_by;
+
+		if (timo) {
+			now = jiffies;
+			expect_req_by = now + timo;
+			WRITE_ONCE(call->expect_req_by, expect_req_by);
+			rxrpc_reduce_call_timer(call, expect_req_by, now,
+						rxrpc_timer_set_for_idle);
+		}
+	}
+
 	/* Received data implicitly ACKs all of the request packets we sent
 	 * when we're acting as a client.
 	 */
@@ -908,9 +923,20 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call,
 				    struct sk_buff *skb, u16 skew)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long timo;
 
 	_enter("%p,%p", call, skb);
 
+	timo = READ_ONCE(call->next_rx_timo);
+	if (timo) {
+		unsigned long now = jiffies, expect_rx_by;
+
+		expect_rx_by = jiffies + timo;
+		WRITE_ONCE(call->expect_rx_by, expect_rx_by);
+		rxrpc_reduce_call_timer(call, expect_rx_by, now,
+					rxrpc_timer_set_for_normal);
+	}
+	
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_DATA:
 		rxrpc_input_data(call, skb, skew);
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 1a2d4b112064..c1d9e7fd7448 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -20,34 +20,29 @@
  */
 unsigned int rxrpc_max_backlog __read_mostly = 10;
 
-/*
- * Maximum lifetime of a call (in mx).
- */
-unsigned int rxrpc_max_call_lifetime = 60 * 1000;
-
 /*
  * How long to wait before scheduling ACK generation after seeing a
- * packet with RXRPC_REQUEST_ACK set (in ms).
+ * packet with RXRPC_REQUEST_ACK set (in jiffies).
  */
-unsigned int rxrpc_requested_ack_delay = 1;
+unsigned long rxrpc_requested_ack_delay = 1;
 
 /*
- * How long to wait before scheduling an ACK with subtype DELAY (in ms).
+ * How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
  *
  * We use this when we've received new data packets.  If those packets aren't
  * all consumed within this time we will send a DELAY ACK if an ACK was not
  * requested to let the sender know it doesn't need to resend.
  */
-unsigned int rxrpc_soft_ack_delay = 1 * 1000;
+unsigned long rxrpc_soft_ack_delay = HZ;
 
 /*
- * How long to wait before scheduling an ACK with subtype IDLE (in ms).
+ * How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
  *
  * We use this when we've consumed some previously soft-ACK'd packets when
  * further packets aren't immediately received to decide when to send an IDLE
  * ACK let the other end know that it can free up its Tx buffer space.
  */
-unsigned int rxrpc_idle_ack_delay = 0.5 * 1000;
+unsigned long rxrpc_idle_ack_delay = HZ / 2;
 
 /*
  * Receive window size in packets.  This indicates the maximum number of
@@ -75,7 +70,7 @@ unsigned int rxrpc_rx_jumbo_max = 4;
 /*
  * Time till packet resend (in milliseconds).
  */
-unsigned int rxrpc_resend_timeout = 4 * 1000;
+unsigned long rxrpc_resend_timeout = 4 * HZ;
 
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index be0b9ae13893..0b6609da80b7 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -163,7 +163,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	case RXRPC_CALL_SERVER_RECV_REQUEST:
 		call->tx_phase = true;
 		call->state = RXRPC_CALL_SERVER_ACK_REQUEST;
-		call->ack_at = call->expire_at;
+		call->expect_req_by = jiffies + MAX_JIFFY_OFFSET;
 		write_unlock_bh(&call->state_lock);
 		rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true,
 				  rxrpc_propose_ack_processing_op);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index de5ab327c18a..03e0676db28c 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -158,6 +158,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			       rxrpc_notify_end_tx_t notify_end_tx)
 {
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+	unsigned long now;
 	rxrpc_seq_t seq = sp->hdr.seq;
 	int ret, ix;
 	u8 annotation = RXRPC_TX_ANNO_UNACK;
@@ -197,11 +198,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 			break;
 		case RXRPC_CALL_SERVER_ACK_REQUEST:
 			call->state = RXRPC_CALL_SERVER_SEND_REPLY;
-			call->ack_at = call->expire_at;
+			now = jiffies;
+			WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET);
 			if (call->ackr_reason == RXRPC_ACK_DELAY)
 				call->ackr_reason = 0;
-			__rxrpc_set_timer(call, rxrpc_timer_init_for_send_reply,
-					  ktime_get_real());
+			trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now);
 			if (!last)
 				break;
 			/* Fall through */
@@ -223,14 +224,12 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 		_debug("need instant resend %d", ret);
 		rxrpc_instant_resend(call, ix);
 	} else {
-		ktime_t now = ktime_get_real(), resend_at;
+		unsigned long now = jiffies, resend_at;
 
-		resend_at = ktime_add_ms(now, rxrpc_resend_timeout);
-
-		if (ktime_before(resend_at, call->resend_at)) {
-			call->resend_at = resend_at;
-			rxrpc_set_timer(call, rxrpc_timer_set_for_send, now);
-		}
+		resend_at = now + rxrpc_resend_timeout;
+		WRITE_ONCE(call->resend_at, resend_at);
+		rxrpc_reduce_call_timer(call, resend_at, now,
+					rxrpc_timer_set_for_send);
 	}
 
 	rxrpc_free_skb(skb, rxrpc_skb_tx_freed);
@@ -513,6 +512,19 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p)
 				return -EINVAL;
 			break;
 
+		case RXRPC_SET_CALL_TIMEOUT:
+			if (len & 3 || len < 4 || len > 12)
+				return -EINVAL;
+			memcpy(&p->call.timeouts, CMSG_DATA(cmsg), len);
+			p->call.nr_timeouts = len / 4;
+			if (p->call.timeouts.hard > INT_MAX / HZ)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 2 && p->call.timeouts.idle > 60 * 60 * 1000)
+				return -ERANGE;
+			if (p->call.nr_timeouts >= 3 && p->call.timeouts.normal > 60 * 60 * 1000)
+				return -ERANGE;
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -577,11 +589,13 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 {
 	enum rxrpc_call_state state;
 	struct rxrpc_call *call;
+	unsigned long now, j;
 	int ret;
 
 	struct rxrpc_send_params p = {
 		.call.tx_total_len	= -1,
 		.call.user_call_ID	= 0,
+		.call.nr_timeouts	= 0,
 		.abort_code		= 0,
 		.command		= RXRPC_CMD_SEND_DATA,
 		.exclusive		= false,
@@ -646,6 +660,31 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 		}
 	}
 
+	switch (p.call.nr_timeouts) {
+	case 3:
+		j = msecs_to_jiffies(p.call.timeouts.normal);
+		if (p.call.timeouts.normal > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_rx_timo, j);
+		/* Fall through */
+	case 2:
+		j = msecs_to_jiffies(p.call.timeouts.idle);
+		if (p.call.timeouts.idle > 0 && j == 0)
+			j = 1;
+		WRITE_ONCE(call->next_req_timo, j);
+		/* Fall through */
+	case 1:
+		if (p.call.timeouts.hard > 0) {
+			j = msecs_to_jiffies(p.call.timeouts.hard);
+			now = jiffies;
+			j += now;
+			WRITE_ONCE(call->expect_term_by, j);
+			rxrpc_reduce_call_timer(call, j, now,
+						rxrpc_timer_set_for_hard);
+		}
+		break;
+	}
+
 	state = READ_ONCE(call->state);
 	_debug("CALL %d USR %lx ST %d on CONN %p",
 	       call->debug_id, call->user_call_ID, state, call->conn);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 34c706d2f79c..4a7af7aff37d 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -21,6 +21,8 @@ static const unsigned int four = 4;
 static const unsigned int thirtytwo = 32;
 static const unsigned int n_65535 = 65535;
 static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
+static const unsigned long one_jiffy = 1;
+static const unsigned long max_jiffies = MAX_JIFFY_OFFSET;
 
 /*
  * RxRPC operating parameters.
@@ -29,64 +31,60 @@ static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1;
  * information on the individual parameters.
  */
 static struct ctl_table rxrpc_sysctl_table[] = {
-	/* Values measured in milliseconds */
+	/* Values measured in milliseconds but used in jiffies */
 	{
 		.procname	= "req_ack_delay",
 		.data		= &rxrpc_requested_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&zero,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "soft_ack_delay",
 		.data		= &rxrpc_soft_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_ack_delay",
 		.data		= &rxrpc_idle_ack_delay,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
-	},
-	{
-		.procname	= "resend_timeout",
-		.data		= &rxrpc_resend_timeout,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_expiry",
 		.data		= &rxrpc_conn_idle_client_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 	{
 		.procname	= "idle_conn_fast_expiry",
 		.data		= &rxrpc_conn_idle_client_fast_expiry,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
-
-	/* Values measured in seconds but used in jiffies */
 	{
-		.procname	= "max_call_lifetime",
-		.data		= &rxrpc_max_call_lifetime,
-		.maxlen		= sizeof(unsigned int),
+		.procname	= "resend_timeout",
+		.data		= &rxrpc_resend_timeout,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-		.extra1		= (void *)&one,
+		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
+		.extra1		= (void *)&one_jiffy,
+		.extra2		= (void *)&max_jiffies,
 	},
 
 	/* Non-time values */
-- 
cgit v1.2.3


From 8637abaa72fe8200a4a37579e6ec5273db85d2c1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Don't transmit DELAY ACKs immediately on proposal

Don't transmit a DELAY ACK immediately on proposal when the Rx window is
rotated, but rather defer it to the work function.  This means that we have
a chance to queue/consume more received packets before we actually send the
DELAY ACK, or even cancel it entirely, thereby reducing the number of
packets transmitted.

We do, however, want to continue sending other types of packet immediately,
particularly REQUESTED ACKs, as they may be used for RTT calculation by the
other side.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/recvmsg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 0b6609da80b7..fad5f42a3abd 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -219,9 +219,9 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
 		    after_eq(top, call->ackr_seen + 2) ||
 		    (hard_ack == top && after(hard_ack, call->ackr_consumed)))
 			rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial,
-					  true, false,
+					  true, true,
 					  rxrpc_propose_ack_rotate_rx);
-		if (call->ackr_reason)
+		if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY)
 			rxrpc_send_ack_packet(call, false);
 	}
 }
-- 
cgit v1.2.3


From beb8e5e4f38cc3e4c2839cfc143e0312bf53d0e0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:41 +0000
Subject: rxrpc: Express protocol timeouts in terms of RTT

Express protocol timeouts for data retransmission and deferred ack
generation in terms on RTT rather than specified timeouts once we have
sufficient RTT samples.

For the moment, this requires just one RTT sample to be able to use this
for ack deferral and two for data retransmission.

The data retransmission timeout is set at RTT*1.5 and the ACK deferral
timeout is set at RTT.

Note that the calculated timeout is limited to a minimum of 4ns to make
sure it doesn't happen too quickly.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_event.c | 22 ++++++++++++++++++----
 net/rxrpc/sendmsg.c    |  7 +++++++
 2 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index c14395d5ad8c..da91f16ac77c 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -52,7 +52,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 				enum rxrpc_propose_ack_trace why)
 {
 	enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use;
-	unsigned long now, ack_at, expiry = rxrpc_soft_ack_delay;
+	unsigned long expiry = rxrpc_soft_ack_delay;
 	s8 prior = rxrpc_ack_priority[ack_reason];
 
 	/* Pings are handled specially because we don't want to accidentally
@@ -116,7 +116,13 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 		    background)
 			rxrpc_queue_call(call);
 	} else {
-		now = jiffies;
+		unsigned long now = jiffies, ack_at;
+
+		if (call->peer->rtt_usage > 0)
+			ack_at = nsecs_to_jiffies(call->peer->rtt);
+		else
+			ack_at = expiry;
+
 		ack_at = jiffies + expiry;
 		if (time_before(ack_at, call->ack_at)) {
 			WRITE_ONCE(call->ack_at, ack_at);
@@ -160,14 +166,22 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 	struct sk_buff *skb;
 	unsigned long resend_at;
 	rxrpc_seq_t cursor, seq, top;
-	ktime_t now, max_age, oldest, ack_ts;
+	ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo;
 	int ix;
 	u8 annotation, anno_type, retrans = 0, unacked = 0;
 
 	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
 
+	if (call->peer->rtt_usage > 1)
+		timeout = ns_to_ktime(call->peer->rtt * 3 / 2);
+	else
+		timeout = ms_to_ktime(rxrpc_resend_timeout);
+	min_timeo = ns_to_ktime((1000000000 / HZ) * 4);
+	if (ktime_before(timeout, min_timeo))
+		timeout = min_timeo;
+
 	now = ktime_get_real();
-	max_age = ktime_sub_ms(now, rxrpc_resend_timeout * 1000 / HZ);
+	max_age = ktime_sub(now, timeout);
 
 	spin_lock_bh(&call->lock);
 
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 03e0676db28c..c56ee54fdd1f 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -226,6 +226,13 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 	} else {
 		unsigned long now = jiffies, resend_at;
 
+		if (call->peer->rtt_usage > 1)
+			resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2);
+		else
+			resend_at = rxrpc_resend_timeout;
+		if (resend_at < 1)
+			resend_at = 1;
+
 		resend_at = now + rxrpc_resend_timeout;
 		WRITE_ONCE(call->resend_at, resend_at);
 		rxrpc_reduce_call_timer(call, resend_at, now,
-- 
cgit v1.2.3


From bd1fdf8cfdf3fdbccd2b21c33ec649ebd7429af7 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Add a timeout for detecting lost ACKs/lost DATA

Add an extra timeout that is set/updated when we send a DATA packet that
has the request-ack flag set.  This allows us to detect if we don't get an
ACK in response to the latest flagged packet.

The ACK packet is adjudged to have been lost if it doesn't turn up within
2*RTT of the transmission.

If the timeout occurs, we schedule the sending of a PING ACK to find out
the state of the other side.  If a new DATA packet is ready to go sooner,
we cancel the sending of the ping and set the request-ack flag on that
instead.

If we get back a PING-RESPONSE ACK that indicates a lower tx_top than what
we had at the time of the ping transmission, we adjudge all the DATA
packets sent between the response tx_top and the ping-time tx_top to have
been lost and retransmit immediately.

Rather than sending a PING ACK, we could just pick a DATA packet and
speculatively retransmit that with request-ack set.  It should result in
either a REQUESTED ACK or a DUPLICATE ACK which we can then use in lieu the
a PING-RESPONSE ACK mentioned above.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h | 11 +++++++++--
 net/rxrpc/ar-internal.h      |  6 +++++-
 net/rxrpc/call_event.c       | 26 ++++++++++++++++++++++----
 net/rxrpc/call_object.c      |  1 +
 net/rxrpc/input.c            | 40 ++++++++++++++++++++++++++++++++++++++++
 net/rxrpc/output.c           | 20 ++++++++++++++++++--
 net/rxrpc/recvmsg.c          |  4 ++--
 net/rxrpc/sendmsg.c          |  2 +-
 8 files changed, 98 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 01dcbc2164b5..84ade8b76a19 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -141,6 +141,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_exp_ack,
 	rxrpc_timer_exp_hard,
 	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_lost_ack,
 	rxrpc_timer_exp_normal,
 	rxrpc_timer_exp_ping,
 	rxrpc_timer_exp_resend,
@@ -151,6 +152,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_set_for_ack,
 	rxrpc_timer_set_for_hard,
 	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_lost_ack,
 	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
 	rxrpc_timer_set_for_resend,
@@ -309,6 +311,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_exp_ack,			"ExpAck") \
 	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
 	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_lost_ack,		"ExpLoA") \
 	EM(rxrpc_timer_exp_normal,		"ExpNml") \
 	EM(rxrpc_timer_exp_ping,		"ExpPng") \
 	EM(rxrpc_timer_exp_resend,		"ExpRsn") \
@@ -318,6 +321,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
 	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
 	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_lost_ack,	"SetLoA") \
 	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
 	EM(rxrpc_timer_set_for_resend,		"SetRTx") \
@@ -961,6 +965,7 @@ TRACE_EVENT(rxrpc_timer,
 		    __field(enum rxrpc_timer_trace,		why		)
 		    __field(long,				now		)
 		    __field(long,				ack_at		)
+		    __field(long,				ack_lost_at	)
 		    __field(long,				resend_at	)
 		    __field(long,				ping_at		)
 		    __field(long,				expect_rx_by	)
@@ -974,6 +979,7 @@ TRACE_EVENT(rxrpc_timer,
 		    __entry->why		= why;
 		    __entry->now		= now;
 		    __entry->ack_at		= call->ack_at;
+		    __entry->ack_lost_at	= call->ack_lost_at;
 		    __entry->resend_at		= call->resend_at;
 		    __entry->expect_rx_by	= call->expect_rx_by;
 		    __entry->expect_req_by	= call->expect_req_by;
@@ -981,10 +987,11 @@ TRACE_EVENT(rxrpc_timer,
 		    __entry->timer		= call->timer.expires;
 			   ),
 
-	    TP_printk("c=%p %s a=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
+	    TP_printk("c=%p %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld",
 		      __entry->call,
 		      __print_symbolic(__entry->why, rxrpc_timer_traces),
 		      __entry->ack_at - __entry->now,
+		      __entry->ack_lost_at - __entry->now,
 		      __entry->resend_at - __entry->now,
 		      __entry->expect_rx_by - __entry->now,
 		      __entry->expect_req_by - __entry->now,
@@ -1105,7 +1112,7 @@ TRACE_EVENT(rxrpc_congest,
 		    memcpy(&__entry->sum, summary, sizeof(__entry->sum));
 			   ),
 
-	    TP_printk("c=%p %08x %s %08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s",
+	    TP_printk("c=%p r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s",
 		      __entry->call,
 		      __entry->ack_serial,
 		      __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names),
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 548411371048..7e7b817c69f0 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -471,6 +471,7 @@ enum rxrpc_call_event {
 	RXRPC_CALL_EV_RESEND,		/* Tx resend required */
 	RXRPC_CALL_EV_PING,		/* Ping send required */
 	RXRPC_CALL_EV_EXPIRED,		/* Expiry occurred */
+	RXRPC_CALL_EV_ACK_LOST,		/* ACK may be lost, send ping */
 };
 
 /*
@@ -515,6 +516,7 @@ struct rxrpc_call {
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
 	struct mutex		user_mutex;	/* User access mutex */
 	unsigned long		ack_at;		/* When deferred ACK needs to happen */
+	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
 	unsigned long		resend_at;	/* When next resend needs to happen */
 	unsigned long		ping_at;	/* When next to send a ping */
 	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
@@ -624,6 +626,8 @@ struct rxrpc_call {
 	ktime_t			acks_latest_ts;	/* Timestamp of latest ACK received */
 	rxrpc_serial_t		acks_latest;	/* serial number of latest ACK received */
 	rxrpc_seq_t		acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
+	rxrpc_seq_t		acks_lost_top;	/* tx_top at the time lost-ack ping sent */
+	rxrpc_serial_t		acks_lost_ping;	/* Serial number of probe ACK */
 };
 
 /*
@@ -1011,7 +1015,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
 /*
  * output.c
  */
-int rxrpc_send_ack_packet(struct rxrpc_call *, bool);
+int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
 void rxrpc_reject_packets(struct rxrpc_local *);
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index da91f16ac77c..c65666b2f39e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -245,7 +245,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 			goto out;
 		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
 				  rxrpc_propose_ack_ping_for_lost_ack);
-		rxrpc_send_ack_packet(call, true);
+		rxrpc_send_ack_packet(call, true, NULL);
 		goto out;
 	}
 
@@ -310,6 +310,7 @@ void rxrpc_process_call(struct work_struct *work)
 {
 	struct rxrpc_call *call =
 		container_of(work, struct rxrpc_call, processor);
+	rxrpc_serial_t *send_ack;
 	unsigned long now, next, t;
 
 	rxrpc_see_call(call);
@@ -358,6 +359,13 @@ recheck_state:
 		set_bit(RXRPC_CALL_EV_ACK, &call->events);
 	}
 
+	t = READ_ONCE(call->ack_lost_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_lost_ack, now);
+		cmpxchg(&call->ack_lost_at, t, now + MAX_JIFFY_OFFSET);
+		set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
+	}
+
 	t = READ_ONCE(call->ping_at);
 	if (time_after_eq(now, t)) {
 		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
@@ -379,15 +387,24 @@ recheck_state:
 		goto recheck_state;
 	}
 
-	if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events)) {
+	send_ack = NULL;
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) {
+		call->acks_lost_top = call->tx_top;
+		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false,
+				  rxrpc_propose_ack_ping_for_lost_ack);
+		send_ack = &call->acks_lost_ping;
+	}
+
+	if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) ||
+	    send_ack) {
 		if (call->ackr_reason) {
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, send_ack);
 			goto recheck_state;
 		}
 	}
 
 	if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) {
-		rxrpc_send_ack_packet(call, true);
+		rxrpc_send_ack_packet(call, true, NULL);
 		goto recheck_state;
 	}
 
@@ -404,6 +421,7 @@ recheck_state:
 	set(call->expect_req_by);
 	set(call->expect_term_by);
 	set(call->ack_at);
+	set(call->ack_lost_at);
 	set(call->resend_at);
 	set(call->ping_at);
 
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index b305970a9b63..7ee3d6ce5aa2 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -197,6 +197,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call)
 	unsigned long j = now + MAX_JIFFY_OFFSET;
 
 	call->ack_at = j;
+	call->ack_lost_at = j;
 	call->resend_at = j;
 	call->ping_at = j;
 	call->expect_rx_by = j;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index c89647eae86d..23a5e61d8f79 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -630,6 +630,43 @@ found:
 			   orig_serial, ack_serial, sent_at, resp_time);
 }
 
+/*
+ * Process the response to a ping that we sent to find out if we lost an ACK.
+ *
+ * If we got back a ping response that indicates a lower tx_top than what we
+ * had at the time of the ping transmission, we adjudge all the DATA packets
+ * sent between the response tx_top and the ping-time tx_top to have been lost.
+ */
+static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call)
+{
+	rxrpc_seq_t top, bottom, seq;
+	bool resend = false;
+
+	spin_lock_bh(&call->lock);
+
+	bottom = call->tx_hard_ack + 1;
+	top = call->acks_lost_top;
+	if (before(bottom, top)) {
+		for (seq = bottom; before_eq(seq, top); seq++) {
+			int ix = seq & RXRPC_RXTX_BUFF_MASK;
+			u8 annotation = call->rxtx_annotations[ix];
+			u8 anno_type = annotation & RXRPC_TX_ANNO_MASK;
+
+			if (anno_type != RXRPC_TX_ANNO_UNACK)
+				continue;
+			annotation &= ~RXRPC_TX_ANNO_MASK;
+			annotation |= RXRPC_TX_ANNO_RETRANS;
+			call->rxtx_annotations[ix] = annotation;
+			resend = true;
+		}
+	}
+
+	spin_unlock_bh(&call->lock);
+
+	if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
+		rxrpc_queue_call(call);
+}
+
 /*
  * Process a ping response.
  */
@@ -645,6 +682,9 @@ static void rxrpc_input_ping_response(struct rxrpc_call *call,
 	smp_rmb();
 	ping_serial = call->ping_serial;
 
+	if (orig_serial == call->acks_lost_ping)
+		rxrpc_input_check_for_lost_ack(call);
+
 	if (!test_bit(RXRPC_CALL_PINGING, &call->flags) ||
 	    before(orig_serial, ping_serial))
 		return;
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f47659c7b224..efe06edce189 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -95,7 +95,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
 /*
  * Send an ACK call packet.
  */
-int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
+int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
+			  rxrpc_serial_t *_serial)
 {
 	struct rxrpc_connection *conn = NULL;
 	struct rxrpc_ack_buffer *pkt;
@@ -165,6 +166,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
 			   ntohl(pkt->ack.firstPacket),
 			   ntohl(pkt->ack.serial),
 			   pkt->ack.reason, pkt->ack.nAcks);
+	if (_serial)
+		*_serial = serial;
 
 	if (ping) {
 		call->ping_serial = serial;
@@ -323,7 +326,8 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	 * ACKs if a DATA packet appears to have been lost.
 	 */
 	if (!(sp->hdr.flags & RXRPC_LAST_PACKET) &&
-	    (retrans ||
+	    (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||
+	     retrans ||
 	     call->cong_mode == RXRPC_CALL_SLOW_START ||
 	     (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
 	     ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
@@ -370,6 +374,18 @@ done:
 		if (whdr.flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = now;
 			trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
+			if (call->peer->rtt_usage > 1) {
+				unsigned long nowj = jiffies, ack_lost_at;
+
+				ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt);
+				if (ack_lost_at < 1)
+					ack_lost_at = 1;
+
+				ack_lost_at += nowj;
+				WRITE_ONCE(call->ack_lost_at, ack_lost_at);
+				rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
+							rxrpc_timer_set_for_lost_ack);
+			}
 		}
 	}
 	_leave(" = %d [%u]", ret, call->peer->maxdata);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index fad5f42a3abd..cc21e8db25b0 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -148,7 +148,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
 	if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
 		rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
 				  rxrpc_propose_ack_terminal_ack);
-		rxrpc_send_ack_packet(call, false);
+		rxrpc_send_ack_packet(call, false, NULL);
 	}
 #endif
 
@@ -222,7 +222,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
 					  true, true,
 					  rxrpc_propose_ack_rotate_rx);
 		if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY)
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, NULL);
 	}
 }
 
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index c56ee54fdd1f..a1c53ac066a1 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -285,7 +285,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
 	do {
 		/* Check to see if there's a ping ACK to reply to. */
 		if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE)
-			rxrpc_send_ack_packet(call, false);
+			rxrpc_send_ack_packet(call, false, NULL);
 
 		if (!skb) {
 			size_t size, chunk, max, space;
-- 
cgit v1.2.3


From 415f44e43282a16ec0808c7ccfd401762e587437 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Add keepalive for a call

We need to transmit a packet every so often to act as a keepalive for the
peer (which has a timeout from the last time it received a packet) and also
to prevent any intervening firewalls from closing the route.

Do this by resetting a timer every time we transmit a packet.  If the timer
ever expires, we transmit a PING ACK packet and thereby also elicit a PING
RESPONSE ACK from the other side - which prevents our last-rx timeout from
expiring.

The timer is set to 1/6 of the last-rx timeout so that we can detect the
other side going away if it misses 6 replies in a row.

This is particularly necessary for servers where the processing of the
service function may take a significant amount of time.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  6 ++++++
 net/rxrpc/ar-internal.h      |  1 +
 net/rxrpc/call_event.c       | 10 ++++++++++
 net/rxrpc/output.c           | 23 +++++++++++++++++++++++
 4 files changed, 40 insertions(+)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 84ade8b76a19..e98fed6de497 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -141,6 +141,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_exp_ack,
 	rxrpc_timer_exp_hard,
 	rxrpc_timer_exp_idle,
+	rxrpc_timer_exp_keepalive,
 	rxrpc_timer_exp_lost_ack,
 	rxrpc_timer_exp_normal,
 	rxrpc_timer_exp_ping,
@@ -152,6 +153,7 @@ enum rxrpc_timer_trace {
 	rxrpc_timer_set_for_ack,
 	rxrpc_timer_set_for_hard,
 	rxrpc_timer_set_for_idle,
+	rxrpc_timer_set_for_keepalive,
 	rxrpc_timer_set_for_lost_ack,
 	rxrpc_timer_set_for_normal,
 	rxrpc_timer_set_for_ping,
@@ -162,6 +164,7 @@ enum rxrpc_timer_trace {
 enum rxrpc_propose_ack_trace {
 	rxrpc_propose_ack_client_tx_end,
 	rxrpc_propose_ack_input_data,
+	rxrpc_propose_ack_ping_for_keepalive,
 	rxrpc_propose_ack_ping_for_lost_ack,
 	rxrpc_propose_ack_ping_for_lost_reply,
 	rxrpc_propose_ack_ping_for_params,
@@ -311,6 +314,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_exp_ack,			"ExpAck") \
 	EM(rxrpc_timer_exp_hard,		"ExpHrd") \
 	EM(rxrpc_timer_exp_idle,		"ExpIdl") \
+	EM(rxrpc_timer_exp_keepalive,		"ExpKA ") \
 	EM(rxrpc_timer_exp_lost_ack,		"ExpLoA") \
 	EM(rxrpc_timer_exp_normal,		"ExpNml") \
 	EM(rxrpc_timer_exp_ping,		"ExpPng") \
@@ -321,6 +325,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_timer_set_for_ack,		"SetAck") \
 	EM(rxrpc_timer_set_for_hard,		"SetHrd") \
 	EM(rxrpc_timer_set_for_idle,		"SetIdl") \
+	EM(rxrpc_timer_set_for_keepalive,	"KeepAl") \
 	EM(rxrpc_timer_set_for_lost_ack,	"SetLoA") \
 	EM(rxrpc_timer_set_for_normal,		"SetNml") \
 	EM(rxrpc_timer_set_for_ping,		"SetPng") \
@@ -330,6 +335,7 @@ enum rxrpc_congest_change {
 #define rxrpc_propose_ack_traces \
 	EM(rxrpc_propose_ack_client_tx_end,	"ClTxEnd") \
 	EM(rxrpc_propose_ack_input_data,	"DataIn ") \
+	EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \
 	EM(rxrpc_propose_ack_ping_for_lost_ack,	"LostAck") \
 	EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
 	EM(rxrpc_propose_ack_ping_for_params,	"Params ") \
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7e7b817c69f0..cdcbc798f921 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -519,6 +519,7 @@ struct rxrpc_call {
 	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
 	unsigned long		resend_at;	/* When next resend needs to happen */
 	unsigned long		ping_at;	/* When next to send a ping */
+	unsigned long		keepalive_at;	/* When next to send a keepalive ping */
 	unsigned long		expect_rx_by;	/* When we expect to get a packet by */
 	unsigned long		expect_req_by;	/* When we expect to get a request DATA packet by */
 	unsigned long		expect_term_by;	/* When we expect call termination by */
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index c65666b2f39e..bda952ffe6a6 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -366,6 +366,15 @@ recheck_state:
 		set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
 	}
 
+	t = READ_ONCE(call->keepalive_at);
+	if (time_after_eq(now, t)) {
+		trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now);
+		cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET);
+		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, true,
+				  rxrpc_propose_ack_ping_for_keepalive);
+		set_bit(RXRPC_CALL_EV_PING, &call->events);
+	}
+
 	t = READ_ONCE(call->ping_at);
 	if (time_after_eq(now, t)) {
 		trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now);
@@ -423,6 +432,7 @@ recheck_state:
 	set(call->ack_at);
 	set(call->ack_lost_at);
 	set(call->resend_at);
+	set(call->keepalive_at);
 	set(call->ping_at);
 
 	now = jiffies;
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index efe06edce189..42410e910aff 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -32,6 +32,24 @@ struct rxrpc_abort_buffer {
 	__be32 abort_code;
 };
 
+/*
+ * Arrange for a keepalive ping a certain time after we last transmitted.  This
+ * lets the far side know we're still interested in this call and helps keep
+ * the route through any intervening firewall open.
+ *
+ * Receiving a response to the ping will prevent the ->expect_rx_by timer from
+ * expiring.
+ */
+static void rxrpc_set_keepalive(struct rxrpc_call *call)
+{
+	unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6;
+
+	keepalive_at += now;
+	WRITE_ONCE(call->keepalive_at, keepalive_at);
+	rxrpc_reduce_call_timer(call, keepalive_at, now,
+				rxrpc_timer_set_for_keepalive);
+}
+
 /*
  * Fill out an ACK packet.
  */
@@ -205,6 +223,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 				call->ackr_seen = top;
 			spin_unlock_bh(&call->lock);
 		}
+
+		rxrpc_set_keepalive(call);
 	}
 
 out:
@@ -388,6 +408,9 @@ done:
 			}
 		}
 	}
+
+	rxrpc_set_keepalive(call);
+
 	_leave(" = %d [%u]", ret, call->peer->maxdata);
 	return ret;
 
-- 
cgit v1.2.3


From f859ab61875978eeaa539740ff7f7d91f5d60006 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Fix service endpoint expiry

RxRPC service endpoints expire like they're supposed to by the following
means:

 (1) Mark dead rxrpc_net structs (with ->live) rather than twiddling the
     global service conn timeout, otherwise the first rxrpc_net struct to
     die will cause connections on all others to expire immediately from
     then on.

 (2) Mark local service endpoints for which the socket has been closed
     (->service_closed) so that the expiration timeout can be much
     shortened for service and client connections going through that
     endpoint.

 (3) rxrpc_put_service_conn() needs to schedule the reaper when the usage
     count reaches 1, not 0, as idle conns have a 1 count.

 (4) The accumulator for the earliest time we might want to schedule for
     should be initialised to jiffies + MAX_JIFFY_OFFSET, not ULONG_MAX as
     the comparison functions use signed arithmetic.

 (5) Simplify the expiration handling, adding the expiration value to the
     idle timestamp each time rather than keeping track of the time in the
     past before which the idle timestamp must go to be expired.  This is
     much easier to read.

 (6) Ignore the timeouts if the net namespace is dead.

 (7) Restart the service reaper work item rather the client reaper.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/trace/events/rxrpc.h |  2 ++
 net/rxrpc/af_rxrpc.c         | 13 +++++++++++++
 net/rxrpc/ar-internal.h      |  3 +++
 net/rxrpc/conn_client.c      |  2 ++
 net/rxrpc/conn_object.c      | 42 ++++++++++++++++++++++++------------------
 net/rxrpc/net_ns.c           |  3 +++
 6 files changed, 47 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index e98fed6de497..36cb50c111a6 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -49,6 +49,7 @@ enum rxrpc_conn_trace {
 	rxrpc_conn_put_client,
 	rxrpc_conn_put_service,
 	rxrpc_conn_queued,
+	rxrpc_conn_reap_service,
 	rxrpc_conn_seen,
 };
 
@@ -221,6 +222,7 @@ enum rxrpc_congest_change {
 	EM(rxrpc_conn_put_client,		"PTc") \
 	EM(rxrpc_conn_put_service,		"PTs") \
 	EM(rxrpc_conn_queued,			"QUE") \
+	EM(rxrpc_conn_reap_service,		"RPs") \
 	E_(rxrpc_conn_seen,			"SEE")
 
 #define rxrpc_client_traces \
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index c0cdcf980ffc..abb524c2b8f8 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -867,6 +867,19 @@ static int rxrpc_release_sock(struct sock *sk)
 	sock_orphan(sk);
 	sk->sk_shutdown = SHUTDOWN_MASK;
 
+	/* We want to kill off all connections from a service socket
+	 * as fast as possible because we can't share these; client
+	 * sockets, on the other hand, can share an endpoint.
+	 */
+	switch (sk->sk_state) {
+	case RXRPC_SERVER_BOUND:
+	case RXRPC_SERVER_BOUND2:
+	case RXRPC_SERVER_LISTENING:
+	case RXRPC_SERVER_LISTEN_DISABLED:
+		rx->local->service_closed = true;
+		break;
+	}
+
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	sk->sk_state = RXRPC_CLOSE;
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index cdcbc798f921..a0082c407005 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -84,6 +84,7 @@ struct rxrpc_net {
 	unsigned int		nr_client_conns;
 	unsigned int		nr_active_client_conns;
 	bool			kill_all_client_conns;
+	bool			live;
 	spinlock_t		client_conn_cache_lock; /* Lock for ->*_client_conns */
 	spinlock_t		client_conn_discard_lock; /* Prevent multiple discarders */
 	struct list_head	waiting_client_conns;
@@ -265,6 +266,7 @@ struct rxrpc_local {
 	rwlock_t		services_lock;	/* lock for services list */
 	int			debug_id;	/* debug ID for printks */
 	bool			dead;
+	bool			service_closed;	/* Service socket closed */
 	struct sockaddr_rxrpc	srx;		/* local address */
 };
 
@@ -881,6 +883,7 @@ void rxrpc_process_connection(struct work_struct *);
  * conn_object.c
  */
 extern unsigned int rxrpc_connection_expiry;
+extern unsigned int rxrpc_closed_conn_expiry;
 
 struct rxrpc_connection *rxrpc_alloc_connection(gfp_t);
 struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *,
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 97f6a8de4845..785dfdb9fef1 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -1079,6 +1079,8 @@ next:
 		expiry = rxrpc_conn_idle_client_expiry;
 		if (nr_conns > rxrpc_reap_client_connections)
 			expiry = rxrpc_conn_idle_client_fast_expiry;
+		if (conn->params.local->service_closed)
+			expiry = rxrpc_closed_conn_expiry * HZ;
 
 		conn_expires_at = conn->idle_timestamp + expiry;
 
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index a01a3791f06a..726eda6140ae 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -20,7 +20,8 @@
 /*
  * Time till a connection expires after last use (in seconds).
  */
-unsigned int rxrpc_connection_expiry = 10 * 60;
+unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60;
+unsigned int __read_mostly rxrpc_closed_conn_expiry = 10;
 
 static void rxrpc_destroy_connection(struct rcu_head *);
 
@@ -321,7 +322,7 @@ void rxrpc_put_service_conn(struct rxrpc_connection *conn)
 	n = atomic_dec_return(&conn->usage);
 	trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
 	ASSERTCMP(n, >=, 0);
-	if (n == 0) {
+	if (n == 1) {
 		rxnet = conn->params.local->rxnet;
 		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0);
 	}
@@ -363,15 +364,14 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	struct rxrpc_net *rxnet =
 		container_of(to_delayed_work(work),
 			     struct rxrpc_net, service_conn_reaper);
-	unsigned long reap_older_than, earliest, idle_timestamp, now;
+	unsigned long expire_at, earliest, idle_timestamp, now;
 
 	LIST_HEAD(graveyard);
 
 	_enter("");
 
 	now = jiffies;
-	reap_older_than = now - rxrpc_connection_expiry * HZ;
-	earliest = ULONG_MAX;
+	earliest = now + MAX_JIFFY_OFFSET;
 
 	write_lock(&rxnet->conn_lock);
 	list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) {
@@ -381,15 +381,21 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 		if (conn->state == RXRPC_CONN_SERVICE_PREALLOC)
 			continue;
 
-		idle_timestamp = READ_ONCE(conn->idle_timestamp);
-		_debug("reap CONN %d { u=%d,t=%ld }",
-		       conn->debug_id, atomic_read(&conn->usage),
-		       (long)reap_older_than - (long)idle_timestamp);
-
-		if (time_after(idle_timestamp, reap_older_than)) {
-			if (time_before(idle_timestamp, earliest))
-				earliest = idle_timestamp;
-			continue;
+		if (rxnet->live) {
+			idle_timestamp = READ_ONCE(conn->idle_timestamp);
+			expire_at = idle_timestamp + rxrpc_connection_expiry * HZ;
+			if (conn->params.local->service_closed)
+				expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ;
+
+			_debug("reap CONN %d { u=%d,t=%ld }",
+			       conn->debug_id, atomic_read(&conn->usage),
+			       (long)expire_at - (long)now);
+
+			if (time_before(now, expire_at)) {
+				if (time_before(expire_at, earliest))
+					earliest = expire_at;
+				continue;
+			}
 		}
 
 		/* The usage count sits at 1 whilst the object is unused on the
@@ -397,6 +403,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 		 */
 		if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
 			continue;
+		trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0);
 
 		if (rxrpc_conn_is_client(conn))
 			BUG();
@@ -407,10 +414,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	}
 	write_unlock(&rxnet->conn_lock);
 
-	if (earliest != ULONG_MAX) {
-		_debug("reschedule reaper %ld", (long) earliest - now);
+	if (earliest != now + MAX_JIFFY_OFFSET) {
+		_debug("reschedule reaper %ld", (long)earliest - (long)now);
 		ASSERT(time_after(earliest, now));
-		rxrpc_queue_delayed_work(&rxnet->client_conn_reaper,
+		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper,
 					 earliest - now);
 	}
 
@@ -439,7 +446,6 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
 	rxrpc_destroy_all_client_connections(rxnet);
 
-	rxrpc_connection_expiry = 0;
 	cancel_delayed_work(&rxnet->client_conn_reaper);
 	rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0);
 	flush_workqueue(rxrpc_workqueue);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 7edceb8522f5..684c51d600c7 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -22,6 +22,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 	int ret;
 
+	rxnet->live = true;
 	get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch));
 	rxnet->epoch |= RXRPC_RANDOM_EPOCH;
 
@@ -60,6 +61,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 	return 0;
 
 err_proc:
+	rxnet->live = false;
 	return ret;
 }
 
@@ -70,6 +72,7 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 
+	rxnet->live = false;
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
 	rxrpc_destroy_all_locals(rxnet);
-- 
cgit v1.2.3


From 3d18cbb7fd0cfdf0b2ca18139950a4b0c1a0a220 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 24 Nov 2017 10:18:42 +0000
Subject: rxrpc: Fix conn expiry timers

Fix the rxrpc connection expiry timers so that connections for closed
AF_RXRPC sockets get deleted in a more timely fashion, freeing up the
transport UDP port much more quickly.

 (1) Replace the delayed work items with work items plus timers so that
     timer_reduce() can be used to shorten them and so that the timer
     doesn't requeue the work item if the net namespace is dead.

 (2) Don't use queue_delayed_work() as that won't alter the timeout if the
     timer is already running.

 (3) Don't rearm the timers if the network namespace is dead.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  2 ++
 net/rxrpc/ar-internal.h |  6 ++++--
 net/rxrpc/conn_client.c | 30 +++++++++++++++++++-----------
 net/rxrpc/conn_object.c | 28 +++++++++++++++++-----------
 net/rxrpc/net_ns.c      | 30 ++++++++++++++++++++++++++----
 5 files changed, 68 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index abb524c2b8f8..8f7cf4c042be 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -895,6 +895,8 @@ static int rxrpc_release_sock(struct sock *sk)
 	rxrpc_release_calls_on_socket(rx);
 	flush_workqueue(rxrpc_workqueue);
 	rxrpc_purge_queue(&sk->sk_receive_queue);
+	rxrpc_queue_work(&rx->local->rxnet->service_conn_reaper);
+	rxrpc_queue_work(&rx->local->rxnet->client_conn_reaper);
 
 	rxrpc_put_local(rx->local);
 	rx->local = NULL;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index a0082c407005..416688381eb7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -79,7 +79,8 @@ struct rxrpc_net {
 	struct list_head	conn_proc_list;	/* List of conns in this namespace for proc */
 	struct list_head	service_conns;	/* Service conns in this namespace */
 	rwlock_t		conn_lock;	/* Lock for ->conn_proc_list, ->service_conns */
-	struct delayed_work	service_conn_reaper;
+	struct work_struct	service_conn_reaper;
+	struct timer_list	service_conn_reap_timer;
 
 	unsigned int		nr_client_conns;
 	unsigned int		nr_active_client_conns;
@@ -90,7 +91,8 @@ struct rxrpc_net {
 	struct list_head	waiting_client_conns;
 	struct list_head	active_client_conns;
 	struct list_head	idle_client_conns;
-	struct delayed_work	client_conn_reaper;
+	struct work_struct	client_conn_reaper;
+	struct timer_list	client_conn_reap_timer;
 
 	struct list_head	local_endpoints;
 	struct mutex		local_mutex;	/* Lock for ->local_endpoints */
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 785dfdb9fef1..7f74ca3059f8 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -691,7 +691,7 @@ int rxrpc_connect_call(struct rxrpc_call *call,
 
 	_enter("{%d,%lx},", call->debug_id, call->user_call_ID);
 
-	rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper.work);
+	rxrpc_discard_expired_client_conns(&rxnet->client_conn_reaper);
 	rxrpc_cull_active_client_conns(rxnet);
 
 	ret = rxrpc_get_client_conn(call, cp, srx, gfp);
@@ -756,6 +756,18 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
 	}
 }
 
+/*
+ * Set the reap timer.
+ */
+static void rxrpc_set_client_reap_timer(struct rxrpc_net *rxnet)
+{
+	unsigned long now = jiffies;
+	unsigned long reap_at = now + rxrpc_conn_idle_client_expiry;
+
+	if (rxnet->live)
+		timer_reduce(&rxnet->client_conn_reap_timer, reap_at);
+}
+
 /*
  * Disconnect a client call.
  */
@@ -896,9 +908,7 @@ idle_connection:
 		list_move_tail(&conn->cache_link, &rxnet->idle_client_conns);
 		if (rxnet->idle_client_conns.next == &conn->cache_link &&
 		    !rxnet->kill_all_client_conns)
-			queue_delayed_work(rxrpc_workqueue,
-					   &rxnet->client_conn_reaper,
-					   rxrpc_conn_idle_client_expiry);
+			rxrpc_set_client_reap_timer(rxnet);
 	} else {
 		trace_rxrpc_client(conn, channel, rxrpc_client_to_inactive);
 		conn->cache_state = RXRPC_CONN_CLIENT_INACTIVE;
@@ -1036,8 +1046,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
 {
 	struct rxrpc_connection *conn;
 	struct rxrpc_net *rxnet =
-		container_of(to_delayed_work(work),
-			     struct rxrpc_net, client_conn_reaper);
+		container_of(work, struct rxrpc_net, client_conn_reaper);
 	unsigned long expiry, conn_expires_at, now;
 	unsigned int nr_conns;
 	bool did_discard = false;
@@ -1116,9 +1125,8 @@ not_yet_expired:
 	 */
 	_debug("not yet");
 	if (!rxnet->kill_all_client_conns)
-		queue_delayed_work(rxrpc_workqueue,
-				   &rxnet->client_conn_reaper,
-				   conn_expires_at - now);
+		timer_reduce(&rxnet->client_conn_reap_timer,
+			     conn_expires_at);
 
 out:
 	spin_unlock(&rxnet->client_conn_cache_lock);
@@ -1138,9 +1146,9 @@ void rxrpc_destroy_all_client_connections(struct rxrpc_net *rxnet)
 	rxnet->kill_all_client_conns = true;
 	spin_unlock(&rxnet->client_conn_cache_lock);
 
-	cancel_delayed_work(&rxnet->client_conn_reaper);
+	del_timer_sync(&rxnet->client_conn_reap_timer);
 
-	if (!queue_delayed_work(rxrpc_workqueue, &rxnet->client_conn_reaper, 0))
+	if (!rxrpc_queue_work(&rxnet->client_conn_reaper))
 		_debug("destroy: queue failed");
 
 	_leave("");
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 726eda6140ae..1aad04a32d5e 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -310,22 +310,30 @@ rxrpc_get_connection_maybe(struct rxrpc_connection *conn)
 	return conn;
 }
 
+/*
+ * Set the service connection reap timer.
+ */
+static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet,
+					 unsigned long reap_at)
+{
+	if (rxnet->live)
+		timer_reduce(&rxnet->service_conn_reap_timer, reap_at);
+}
+
 /*
  * Release a service connection
  */
 void rxrpc_put_service_conn(struct rxrpc_connection *conn)
 {
-	struct rxrpc_net *rxnet;
 	const void *here = __builtin_return_address(0);
 	int n;
 
 	n = atomic_dec_return(&conn->usage);
 	trace_rxrpc_conn(conn, rxrpc_conn_put_service, n, here);
 	ASSERTCMP(n, >=, 0);
-	if (n == 1) {
-		rxnet = conn->params.local->rxnet;
-		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper, 0);
-	}
+	if (n == 1)
+		rxrpc_set_service_reap_timer(conn->params.local->rxnet,
+					     jiffies + rxrpc_connection_expiry);
 }
 
 /*
@@ -362,8 +370,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 {
 	struct rxrpc_connection *conn, *_p;
 	struct rxrpc_net *rxnet =
-		container_of(to_delayed_work(work),
-			     struct rxrpc_net, service_conn_reaper);
+		container_of(work, struct rxrpc_net, service_conn_reaper);
 	unsigned long expire_at, earliest, idle_timestamp, now;
 
 	LIST_HEAD(graveyard);
@@ -417,8 +424,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 	if (earliest != now + MAX_JIFFY_OFFSET) {
 		_debug("reschedule reaper %ld", (long)earliest - (long)now);
 		ASSERT(time_after(earliest, now));
-		rxrpc_queue_delayed_work(&rxnet->service_conn_reaper,
-					 earliest - now);
+		rxrpc_set_service_reap_timer(rxnet, earliest);		
 	}
 
 	while (!list_empty(&graveyard)) {
@@ -446,8 +452,8 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
 	rxrpc_destroy_all_client_connections(rxnet);
 
-	cancel_delayed_work(&rxnet->client_conn_reaper);
-	rxrpc_queue_delayed_work(&rxnet->client_conn_reaper, 0);
+	del_timer_sync(&rxnet->service_conn_reap_timer);
+	rxrpc_queue_work(&rxnet->service_conn_reaper);
 	flush_workqueue(rxrpc_workqueue);
 
 	write_lock(&rxnet->conn_lock);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 684c51d600c7..f18c9248e0d4 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -14,6 +14,24 @@
 
 unsigned int rxrpc_net_id;
 
+static void rxrpc_client_conn_reap_timeout(struct timer_list *timer)
+{
+	struct rxrpc_net *rxnet =
+		container_of(timer, struct rxrpc_net, client_conn_reap_timer);
+
+	if (rxnet->live)
+		rxrpc_queue_work(&rxnet->client_conn_reaper);
+}
+
+static void rxrpc_service_conn_reap_timeout(struct timer_list *timer)
+{
+	struct rxrpc_net *rxnet =
+		container_of(timer, struct rxrpc_net, service_conn_reap_timer);
+
+	if (rxnet->live)
+		rxrpc_queue_work(&rxnet->service_conn_reaper);
+}
+
 /*
  * Initialise a per-network namespace record.
  */
@@ -32,8 +50,10 @@ static __net_init int rxrpc_init_net(struct net *net)
 	INIT_LIST_HEAD(&rxnet->conn_proc_list);
 	INIT_LIST_HEAD(&rxnet->service_conns);
 	rwlock_init(&rxnet->conn_lock);
-	INIT_DELAYED_WORK(&rxnet->service_conn_reaper,
-			  rxrpc_service_connection_reaper);
+	INIT_WORK(&rxnet->service_conn_reaper,
+		  rxrpc_service_connection_reaper);
+	timer_setup(&rxnet->service_conn_reap_timer,
+		    rxrpc_service_conn_reap_timeout, 0);
 
 	rxnet->nr_client_conns = 0;
 	rxnet->nr_active_client_conns = 0;
@@ -43,8 +63,10 @@ static __net_init int rxrpc_init_net(struct net *net)
 	INIT_LIST_HEAD(&rxnet->waiting_client_conns);
 	INIT_LIST_HEAD(&rxnet->active_client_conns);
 	INIT_LIST_HEAD(&rxnet->idle_client_conns);
-	INIT_DELAYED_WORK(&rxnet->client_conn_reaper,
-			  rxrpc_discard_expired_client_conns);
+	INIT_WORK(&rxnet->client_conn_reaper,
+		  rxrpc_discard_expired_client_conns);
+	timer_setup(&rxnet->client_conn_reap_timer,
+		    rxrpc_client_conn_reap_timeout, 0);
 
 	INIT_LIST_HEAD(&rxnet->local_endpoints);
 	mutex_init(&rxnet->local_mutex);
-- 
cgit v1.2.3


From 01a95b2141e337dea15ad48e60a35c72a9b10205 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 24 Nov 2017 09:35:25 +0100
Subject: cfg80211: select CRYPTO_SHA256 if needed

When regulatory database certificates are built-in, they're
currently using the SHA256 digest algorithm, so add that to
the build in that case.

Also add a note that for custom certificates, one may need
to add the right algorithms.

Reported-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/Kconfig | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index da91bb547db3..1abcc4fc4df1 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -20,6 +20,10 @@ config CFG80211
 	tristate "cfg80211 - wireless configuration API"
 	depends on RFKILL || !RFKILL
 	select FW_LOADER
+	# may need to update this when certificates are changed and are
+	# using a different algorithm, though right now they shouldn't
+	# (this is here rather than below to allow it to be a module)
+	select CRYPTO_SHA256 if CFG80211_USE_KERNEL_REGDB_KEYS
 	---help---
 	  cfg80211 is the Linux wireless LAN (802.11) configuration API.
 	  Enable this if you have a wireless device.
@@ -113,6 +117,9 @@ config CFG80211_EXTRA_REGDB_KEYDIR
 	  certificates like in the kernel sources (net/wireless/certs/)
 	  that shall be accepted for a signed regulatory database.
 
+	  Note that you need to also select the correct CRYPTO_<hash> modules
+	  for your certificates, and if cfg80211 is built-in they also must be.
+
 config CFG80211_REG_CELLULAR_HINTS
 	bool "cfg80211 regulatory support for cellular base station hints"
 	depends on CFG80211_CERTIFICATION_ONUS
-- 
cgit v1.2.3


From a60b3f515d30d0fe8537c64671926879a3548103 Mon Sep 17 00:00:00 2001
From: Roman Kapl <code@rkapl.cz>
Date: Fri, 24 Nov 2017 12:27:58 +0100
Subject: net: sched: crash on blocks with goto chain action

tcf_block_put_ext has assumed that all filters (and thus their goto
actions) are destroyed in RCU callback and thus can not race with our
list iteration. However, that is not true during netns cleanup (see
tcf_exts_get_net comment).

Prevent the user after free by holding all chains (except 0, that one is
already held). foreach_safe is not enough in this case.

To reproduce, run the following in a netns and then delete the ns:
    ip link add dtest type dummy
    tc qdisc add dev dtest ingress
    tc filter add dev dtest chain 1 parent ffff: handle 1 prio 1 flower action goto chain 2

Fixes: 822e86d997 ("net_sched: remove tcf_block_put_deferred()")
Signed-off-by: Roman Kapl <code@rkapl.cz>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 7d97f612c9b9..ddcf04b4ab43 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -336,7 +336,8 @@ static void tcf_block_put_final(struct work_struct *work)
 	struct tcf_chain *chain, *tmp;
 
 	rtnl_lock();
-	/* Only chain 0 should be still here. */
+
+	/* At this point, all the chains should have refcnt == 1. */
 	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
 		tcf_chain_put(chain);
 	rtnl_unlock();
@@ -344,15 +345,21 @@ static void tcf_block_put_final(struct work_struct *work)
 }
 
 /* XXX: Standalone actions are not allowed to jump to any chain, and bound
- * actions should be all removed after flushing. However, filters are now
- * destroyed in tc filter workqueue with RTNL lock, they can not race here.
+ * actions should be all removed after flushing.
  */
 void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
 		       struct tcf_block_ext_info *ei)
 {
-	struct tcf_chain *chain, *tmp;
+	struct tcf_chain *chain;
 
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+	/* Hold a refcnt for all chains, except 0, so that they don't disappear
+	 * while we are iterating.
+	 */
+	list_for_each_entry(chain, &block->chain_list, list)
+		if (chain->index)
+			tcf_chain_hold(chain);
+
+	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
 
 	tcf_block_offload_unbind(block, q, ei);
-- 
cgit v1.2.3


From afbea2cd253b5198350dfd8edb963567d05827d6 Mon Sep 17 00:00:00 2001
From: Jorgen Hansen <jhansen@vmware.com>
Date: Fri, 24 Nov 2017 06:25:28 -0800
Subject: VSOCK: Don't call vsock_stream_has_data in atomic context

When using the host personality, VMCI will grab a mutex for any
queue pair access. In the detach callback for the vmci vsock
transport, we call vsock_stream_has_data while holding a spinlock,
and vsock_stream_has_data will access a queue pair.

To avoid this, we can simply omit calling vsock_stream_has_data
for host side queue pairs, since the QPs are empty per default
when the guest has detached.

This bug affects users of VMware Workstation using kernel version
4.4 and later.

Testing: Ran vsock tests between guest and host, and verified that
with this change, the host isn't calling vsock_stream_has_data
during detach. Ran mixedTest between guest and host using both
guest and host as server.

v2: Rebased on top of recent change to sk_state values
Reviewed-by: Adit Ranadive <aditr@vmware.com>
Reviewed-by: Aditya Sarwade <asarwade@vmware.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Jorgen Hansen <jhansen@vmware.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/vmci_transport.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 391775e3575c..56573dc85709 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -797,9 +797,13 @@ static void vmci_transport_handle_detach(struct sock *sk)
 
 		/* We should not be sending anymore since the peer won't be
 		 * there to receive, but we can still receive if there is data
-		 * left in our consume queue.
+		 * left in our consume queue. If the local endpoint is a host,
+		 * we can't call vsock_stream_has_data, since that may block,
+		 * but a host endpoint can't read data once the VM has
+		 * detached, so there is no available data in that case.
 		 */
-		if (vsock_stream_has_data(vsk) <= 0) {
+		if (vsk->local_addr.svm_cid == VMADDR_CID_HOST ||
+		    vsock_stream_has_data(vsk) <= 0) {
 			sk->sk_state = TCP_CLOSE;
 
 			if (sk->sk_state == TCP_SYN_SENT) {
@@ -2144,7 +2148,7 @@ module_exit(vmci_transport_exit);
 
 MODULE_AUTHOR("VMware, Inc.");
 MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
-MODULE_VERSION("1.0.4.0-k");
+MODULE_VERSION("1.0.5.0-k");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("vmware_vsock");
 MODULE_ALIAS_NETPROTO(PF_VSOCK);
-- 
cgit v1.2.3


From 9e741045faea17e28663f14a45f7f3304827c968 Mon Sep 17 00:00:00 2001
From: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Date: Fri, 24 Nov 2017 11:36:06 -0500
Subject: net: dsa: fix 'increment on 0' warning

Setting the refcount to 0 when allocating a tree to match the number of
switch devices it holds may cause an 'increment on 0; use-after-free',
if CONFIG_REFCOUNT_FULL is enabled.

To fix this, do not decrement the refcount of a newly allocated tree,
increment it when an already allocated tree is found, and decrement it
after the probing of a switch, as done with the previous behavior.

At the same time, make dsa_tree_get and dsa_tree_put accept a NULL
argument to simplify callers, and return the tree after incrementation,
as most kref users like of_node_get and of_node_put do.

Fixes: 8e5bf9759a06 ("net: dsa: simplify tree reference counting")
Signed-off-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa2.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 44e3fb7dec8c..1e287420ff49 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -51,9 +51,7 @@ static struct dsa_switch_tree *dsa_tree_alloc(int index)
 	INIT_LIST_HEAD(&dst->list);
 	list_add_tail(&dsa_tree_list, &dst->list);
 
-	/* Initialize the reference counter to the number of switches, not 1 */
 	kref_init(&dst->refcount);
-	refcount_set(&dst->refcount.refcount, 0);
 
 	return dst;
 }
@@ -64,20 +62,23 @@ static void dsa_tree_free(struct dsa_switch_tree *dst)
 	kfree(dst);
 }
 
-static struct dsa_switch_tree *dsa_tree_touch(int index)
+static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
 {
-	struct dsa_switch_tree *dst;
-
-	dst = dsa_tree_find(index);
-	if (!dst)
-		dst = dsa_tree_alloc(index);
+	if (dst)
+		kref_get(&dst->refcount);
 
 	return dst;
 }
 
-static void dsa_tree_get(struct dsa_switch_tree *dst)
+static struct dsa_switch_tree *dsa_tree_touch(int index)
 {
-	kref_get(&dst->refcount);
+	struct dsa_switch_tree *dst;
+
+	dst = dsa_tree_find(index);
+	if (dst)
+		return dsa_tree_get(dst);
+	else
+		return dsa_tree_alloc(index);
 }
 
 static void dsa_tree_release(struct kref *ref)
@@ -91,7 +92,8 @@ static void dsa_tree_release(struct kref *ref)
 
 static void dsa_tree_put(struct dsa_switch_tree *dst)
 {
-	kref_put(&dst->refcount, dsa_tree_release);
+	if (dst)
+		kref_put(&dst->refcount, dsa_tree_release);
 }
 
 static bool dsa_port_is_dsa(struct dsa_port *port)
@@ -765,6 +767,7 @@ int dsa_register_switch(struct dsa_switch *ds)
 
 	mutex_lock(&dsa2_mutex);
 	err = dsa_switch_probe(ds);
+	dsa_tree_put(ds->dst);
 	mutex_unlock(&dsa2_mutex);
 
 	return err;
-- 
cgit v1.2.3


From 2734166e89639c973c6e125ac8bcfc2d9db72b70 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Sat, 25 Nov 2017 13:14:40 -0600
Subject: net: openvswitch: datapath: fix data type in queue_gso_packets

gso_type is being used in binary AND operations together with SKB_GSO_UDP.
The issue is that variable gso_type is of type unsigned short and
SKB_GSO_UDP expands to more than 16 bits:

SKB_GSO_UDP = 1 << 16

this makes any binary AND operation between gso_type and SKB_GSO_UDP to
be always zero, hence making some code unreachable and likely causing
undesired behavior.

Fix this by changing the data type of variable gso_type to unsigned int.

Addresses-Coverity-ID: 1462223
Fixes: 0c19f846d582 ("net: accept UFO datagrams from tuntap and packet")
Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 99cfafc2a139..ef38e5aecd28 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -308,7 +308,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 			     const struct dp_upcall_info *upcall_info,
 				 uint32_t cutlen)
 {
-	unsigned short gso_type = skb_shinfo(skb)->gso_type;
+	unsigned int gso_type = skb_shinfo(skb)->gso_type;
 	struct sw_flow_key later_key;
 	struct sk_buff *segs, *nskb;
 	int err;
-- 
cgit v1.2.3


From 67c8d22a73128ff910e2287567132530abcf5b71 Mon Sep 17 00:00:00 2001
From: zhangliping <zhangliping02@baidu.com>
Date: Sat, 25 Nov 2017 22:02:12 +0800
Subject: openvswitch: fix the incorrect flow action alloc size

If we want to add a datapath flow, which has more than 500 vxlan outputs'
action, we will get the following error reports:
  openvswitch: netlink: Flow action size 32832 bytes exceeds max
  openvswitch: netlink: Flow action size 32832 bytes exceeds max
  openvswitch: netlink: Actions may not be safe on all matching packets
  ... ...

It seems that we can simply enlarge the MAX_ACTIONS_BUFSIZE to fix it, but
this is not the root cause. For example, for a vxlan output action, we need
about 60 bytes for the nlattr, but after it is converted to the flow
action, it only occupies 24 bytes. This means that we can still support
more than 1000 vxlan output actions for a single datapath flow under the
the current 32k max limitation.

So even if the nla_len(attr) is larger than MAX_ACTIONS_BUFSIZE, we
shouldn't report EINVAL and keep it move on, as the judgement can be
done by the reserve_sfa_size.

Signed-off-by: zhangliping <zhangliping02@baidu.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/flow_netlink.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index dc424798ba6f..624ea74353dd 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2241,14 +2241,11 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb)
 
 #define MAX_ACTIONS_BUFSIZE	(32 * 1024)
 
-static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
+static struct sw_flow_actions *nla_alloc_flow_actions(int size)
 {
 	struct sw_flow_actions *sfa;
 
-	if (size > MAX_ACTIONS_BUFSIZE) {
-		OVS_NLERR(log, "Flow action size %u bytes exceeds max", size);
-		return ERR_PTR(-EINVAL);
-	}
+	WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
 
 	sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
 	if (!sfa)
@@ -2321,12 +2318,15 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
 	new_acts_size = ksize(*sfa) * 2;
 
 	if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
-		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
+		if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) {
+			OVS_NLERR(log, "Flow action size exceeds max %u",
+				  MAX_ACTIONS_BUFSIZE);
 			return ERR_PTR(-EMSGSIZE);
+		}
 		new_acts_size = MAX_ACTIONS_BUFSIZE;
 	}
 
-	acts = nla_alloc_flow_actions(new_acts_size, log);
+	acts = nla_alloc_flow_actions(new_acts_size);
 	if (IS_ERR(acts))
 		return (void *)acts;
 
@@ -3059,7 +3059,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 {
 	int err;
 
-	*sfa = nla_alloc_flow_actions(nla_len(attr), log);
+	*sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE));
 	if (IS_ERR(*sfa))
 		return PTR_ERR(*sfa);
 
-- 
cgit v1.2.3


From 7b6ddeaf27eca72795ceeae2f0f347db1b5f9a30 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 21 Nov 2017 14:46:08 +0100
Subject: mac80211: use QoS NDP for AP probing

When connected to a QoS/WMM AP, mac80211 should use a QoS NDP
for probing it, instead of a regular non-QoS one, fix this.

Change all the drivers to *not* allow QoS NDP for now, even
though it looks like most of them should be OK with that.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath9k/channel.c |  2 +-
 drivers/net/wireless/st/cw1200/sta.c     |  4 ++--
 drivers/net/wireless/ti/wl1251/main.c    |  2 +-
 drivers/net/wireless/ti/wlcore/cmd.c     |  5 +++--
 include/net/mac80211.h                   |  8 +++++++-
 net/mac80211/mlme.c                      |  2 +-
 net/mac80211/tx.c                        | 29 +++++++++++++++++++++++++++--
 7 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/drivers/net/wireless/ath/ath9k/channel.c b/drivers/net/wireless/ath/ath9k/channel.c
index dfb26f03c1a2..1b05b5d7a038 100644
--- a/drivers/net/wireless/ath/ath9k/channel.c
+++ b/drivers/net/wireless/ath/ath9k/channel.c
@@ -1113,7 +1113,7 @@ ath_chanctx_send_vif_ps_frame(struct ath_softc *sc, struct ath_vif *avp,
 		if (!avp->assoc)
 			return false;
 
-		skb = ieee80211_nullfunc_get(sc->hw, vif);
+		skb = ieee80211_nullfunc_get(sc->hw, vif, false);
 		if (!skb)
 			return false;
 
diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c
index 03687a80d6e9..38678e9a0562 100644
--- a/drivers/net/wireless/st/cw1200/sta.c
+++ b/drivers/net/wireless/st/cw1200/sta.c
@@ -198,7 +198,7 @@ void __cw1200_cqm_bssloss_sm(struct cw1200_common *priv,
 
 		priv->bss_loss_state++;
 
-		skb = ieee80211_nullfunc_get(priv->hw, priv->vif);
+		skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false);
 		WARN_ON(!skb);
 		if (skb)
 			cw1200_tx(priv->hw, NULL, skb);
@@ -2265,7 +2265,7 @@ static int cw1200_upload_null(struct cw1200_common *priv)
 		.rate = 0xFF,
 	};
 
-	frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif);
+	frame.skb = ieee80211_nullfunc_get(priv->hw, priv->vif, false);
 	if (!frame.skb)
 		return -ENOMEM;
 
diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c
index 9915d83a4a30..6d02c660b4ab 100644
--- a/drivers/net/wireless/ti/wl1251/main.c
+++ b/drivers/net/wireless/ti/wl1251/main.c
@@ -566,7 +566,7 @@ static int wl1251_build_null_data(struct wl1251 *wl)
 		size = sizeof(struct wl12xx_null_data_template);
 		ptr = NULL;
 	} else {
-		skb = ieee80211_nullfunc_get(wl->hw, wl->vif);
+		skb = ieee80211_nullfunc_get(wl->hw, wl->vif, false);
 		if (!skb)
 			goto out;
 		size = skb->len;
diff --git a/drivers/net/wireless/ti/wlcore/cmd.c b/drivers/net/wireless/ti/wlcore/cmd.c
index 2bfc12fdc929..761cf8573a80 100644
--- a/drivers/net/wireless/ti/wlcore/cmd.c
+++ b/drivers/net/wireless/ti/wlcore/cmd.c
@@ -1069,7 +1069,8 @@ int wl12xx_cmd_build_null_data(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 		ptr = NULL;
 	} else {
 		skb = ieee80211_nullfunc_get(wl->hw,
-					     wl12xx_wlvif_to_vif(wlvif));
+					     wl12xx_wlvif_to_vif(wlvif),
+					     false);
 		if (!skb)
 			goto out;
 		size = skb->len;
@@ -1096,7 +1097,7 @@ int wl12xx_cmd_build_klv_null_data(struct wl1271 *wl,
 	struct sk_buff *skb = NULL;
 	int ret = -ENOMEM;
 
-	skb = ieee80211_nullfunc_get(wl->hw, vif);
+	skb = ieee80211_nullfunc_get(wl->hw, vif, false);
 	if (!skb)
 		goto out;
 
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index cc9073e45be9..eec143cca1c0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -4470,18 +4470,24 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
  * ieee80211_nullfunc_get - retrieve a nullfunc template
  * @hw: pointer obtained from ieee80211_alloc_hw().
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @qos_ok: QoS NDP is acceptable to the caller, this should be set
+ *	if at all possible
  *
  * Creates a Nullfunc template which can, for example, uploaded to
  * hardware. The template must be updated after association so that correct
  * BSSID and address is used.
  *
+ * If @qos_ndp is set and the association is to an AP with QoS/WMM, the
+ * returned packet will be QoS NDP.
+ *
  * Note: Caller (or hardware) is responsible for setting the
  * &IEEE80211_FCTL_PM bit as well as Duration and Sequence Control fields.
  *
  * Return: The nullfunc template. %NULL on error.
  */
 struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
-				       struct ieee80211_vif *vif);
+				       struct ieee80211_vif *vif,
+				       bool qos_ok);
 
 /**
  * ieee80211_probereq_get - retrieve a Probe Request template
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 04460440d731..c244691deab9 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -895,7 +895,7 @@ void ieee80211_send_nullfunc(struct ieee80211_local *local,
 	struct ieee80211_hdr_3addr *nullfunc;
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 
-	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif);
+	skb = ieee80211_nullfunc_get(&local->hw, &sdata->vif, true);
 	if (!skb)
 		return;
 
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 7b8154474b9e..3160954fc406 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4438,13 +4438,15 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw,
 EXPORT_SYMBOL(ieee80211_pspoll_get);
 
 struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
-				       struct ieee80211_vif *vif)
+				       struct ieee80211_vif *vif,
+				       bool qos_ok)
 {
 	struct ieee80211_hdr_3addr *nullfunc;
 	struct ieee80211_sub_if_data *sdata;
 	struct ieee80211_if_managed *ifmgd;
 	struct ieee80211_local *local;
 	struct sk_buff *skb;
+	bool qos = false;
 
 	if (WARN_ON(vif->type != NL80211_IFTYPE_STATION))
 		return NULL;
@@ -4453,7 +4455,17 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
 	ifmgd = &sdata->u.mgd;
 	local = sdata->local;
 
-	skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*nullfunc));
+	if (qos_ok) {
+		struct sta_info *sta;
+
+		rcu_read_lock();
+		sta = sta_info_get(sdata, ifmgd->bssid);
+		qos = sta && sta->sta.wme;
+		rcu_read_unlock();
+	}
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(*nullfunc) + 2);
 	if (!skb)
 		return NULL;
 
@@ -4463,6 +4475,19 @@ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw,
 	nullfunc->frame_control = cpu_to_le16(IEEE80211_FTYPE_DATA |
 					      IEEE80211_STYPE_NULLFUNC |
 					      IEEE80211_FCTL_TODS);
+	if (qos) {
+		__le16 qos = cpu_to_le16(7);
+
+		BUILD_BUG_ON((IEEE80211_STYPE_QOS_NULLFUNC |
+			      IEEE80211_STYPE_NULLFUNC) !=
+			     IEEE80211_STYPE_QOS_NULLFUNC);
+		nullfunc->frame_control |=
+			cpu_to_le16(IEEE80211_STYPE_QOS_NULLFUNC);
+		skb->priority = 7;
+		skb_set_queue_mapping(skb, IEEE80211_AC_VO);
+		skb_put_data(skb, &qos, sizeof(qos));
+	}
+
 	memcpy(nullfunc->addr1, ifmgd->bssid, ETH_ALEN);
 	memcpy(nullfunc->addr2, vif->addr, ETH_ALEN);
 	memcpy(nullfunc->addr3, ifmgd->bssid, ETH_ALEN);
-- 
cgit v1.2.3


From fbbdad5edf0bb59786a51b94a9d006bc8c2da9a2 Mon Sep 17 00:00:00 2001
From: Chun-Yeow Yeoh <yeohchunyeow@gmail.com>
Date: Tue, 14 Nov 2017 23:20:05 +0800
Subject: mac80211: fix the update of path metric for RANN frame

The previous path metric update from RANN frame has not considered
the own link metric toward the transmitting mesh STA. Fix this.

Reported-by: Michael65535
Signed-off-by: Chun-Yeow Yeoh <yeohchunyeow@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mesh_hwmp.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 4f7826d7b47c..4394463a0c2e 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -797,7 +797,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	struct mesh_path *mpath;
 	u8 ttl, flags, hopcount;
 	const u8 *orig_addr;
-	u32 orig_sn, metric, metric_txsta, interval;
+	u32 orig_sn, new_metric, orig_metric, last_hop_metric, interval;
 	bool root_is_gate;
 
 	ttl = rann->rann_ttl;
@@ -808,7 +808,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	interval = le32_to_cpu(rann->rann_interval);
 	hopcount = rann->rann_hopcount;
 	hopcount++;
-	metric = le32_to_cpu(rann->rann_metric);
+	orig_metric = le32_to_cpu(rann->rann_metric);
 
 	/*  Ignore our own RANNs */
 	if (ether_addr_equal(orig_addr, sdata->vif.addr))
@@ -825,7 +825,10 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 		return;
 	}
 
-	metric_txsta = airtime_link_metric_get(local, sta);
+	last_hop_metric = airtime_link_metric_get(local, sta);
+	new_metric = orig_metric + last_hop_metric;
+	if (new_metric < orig_metric)
+		new_metric = MAX_METRIC;
 
 	mpath = mesh_path_lookup(sdata, orig_addr);
 	if (!mpath) {
@@ -838,7 +841,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	}
 
 	if (!(SN_LT(mpath->sn, orig_sn)) &&
-	    !(mpath->sn == orig_sn && metric < mpath->rann_metric)) {
+	    !(mpath->sn == orig_sn && new_metric < mpath->rann_metric)) {
 		rcu_read_unlock();
 		return;
 	}
@@ -856,7 +859,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 	}
 
 	mpath->sn = orig_sn;
-	mpath->rann_metric = metric + metric_txsta;
+	mpath->rann_metric = new_metric;
 	mpath->is_root = true;
 	/* Recording RANNs sender address to send individually
 	 * addressed PREQs destined for root mesh STA */
@@ -876,7 +879,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata,
 		mesh_path_sel_frame_tx(MPATH_RANN, flags, orig_addr,
 				       orig_sn, 0, NULL, 0, broadcast_addr,
 				       hopcount, ttl, interval,
-				       metric + metric_txsta, 0, sdata);
+				       new_metric, 0, sdata);
 	}
 
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 72e2c3438ba3bd2ed640b6b5ea9e58993dd9ab7f Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Sun, 29 Oct 2017 11:51:09 +0200
Subject: mac80211: tear down RX aggregations first

When doing HW restart we tear down aggregations.
Since at this point we are not TX'ing any aggregation, while
the peer is still sending RX aggregation over the air, it will
make sense to tear down the RX aggregations first.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ht.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 41f5e48f8021..167f83b853e6 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -292,7 +292,6 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 
 	mutex_lock(&sta->ampdu_mlme.mtx);
 	for (i = 0; i <  IEEE80211_NUM_TIDS; i++) {
-		___ieee80211_stop_tx_ba_session(sta, i, reason);
 		___ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT,
 						WLAN_REASON_QSTA_LEAVE_QBSS,
 						reason != AGG_STOP_DESTROY_STA &&
@@ -300,6 +299,9 @@ void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta,
 	}
 	mutex_unlock(&sta->ampdu_mlme.mtx);
 
+	for (i = 0; i <  IEEE80211_NUM_TIDS; i++)
+		___ieee80211_stop_tx_ba_session(sta, i, reason);
+
 	/* stopping might queue the work again - so cancel only afterwards */
 	cancel_work_sync(&sta->ampdu_mlme.work);
 
-- 
cgit v1.2.3


From 3aa623da789ff6edf789b5655debf33f50b3a9b2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:05:32 +0800
Subject: sctp: use sizeof(__u16) for each stream number length instead of
 magic number

Now in stream reconf part there are still some places using magic
number 2 for each stream number length. To make it more readable,
this patch is to replace them with sizeof(__u16).

Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index a11db21dc8a0..09c797a10aaa 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -563,7 +563,7 @@ struct sctp_chunk *sctp_process_strreset_outreq(
 		flags = SCTP_STREAM_RESET_INCOMING_SSN;
 	}
 
-	nums = (ntohs(param.p->length) - sizeof(*outreq)) / 2;
+	nums = (ntohs(param.p->length) - sizeof(*outreq)) / sizeof(__u16);
 	if (nums) {
 		str_p = outreq->list_of_streams;
 		for (i = 0; i < nums; i++) {
@@ -627,7 +627,7 @@ struct sctp_chunk *sctp_process_strreset_inreq(
 		goto out;
 	}
 
-	nums = (ntohs(param.p->length) - sizeof(*inreq)) / 2;
+	nums = (ntohs(param.p->length) - sizeof(*inreq)) / sizeof(__u16);
 	str_p = inreq->list_of_streams;
 	for (i = 0; i < nums; i++) {
 		if (ntohs(str_p[i]) >= stream->outcnt) {
@@ -927,7 +927,8 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 		outreq = (struct sctp_strreset_outreq *)req;
 		str_p = outreq->list_of_streams;
-		nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / 2;
+		nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) /
+		       sizeof(__u16);
 
 		if (result == SCTP_STRRESET_PERFORMED) {
 			if (nums) {
@@ -956,7 +957,8 @@ struct sctp_chunk *sctp_process_strreset_resp(
 
 		inreq = (struct sctp_strreset_inreq *)req;
 		str_p = inreq->list_of_streams;
-		nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / 2;
+		nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) /
+		       sizeof(__u16);
 
 		*evp = sctp_ulpevent_make_stream_reset_event(asoc, flags,
 			nums, str_p, GFP_ATOMIC);
-- 
cgit v1.2.3


From d570a59c5b5f8fb3b65fa7b15ddb205a1d55f8d0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:05:33 +0800
Subject: sctp: only allow the out stream reset when the stream outq is empty

Now the out stream reset in sctp stream reconf could be done even if
the stream outq is not empty. It means that users can not be sure
since which msg the new ssn will be used.

To make this more synchronous, it shouldn't allow to do out stream
reset until these chunks in unsent outq all are sent out.

This patch checks the corresponding stream outqs when sending and
processing the request . If any of them has unsent chunks in outq,
it will return -EAGAIN instead or send SCTP_STRRESET_IN_PROGRESS
back to the sender.

Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter")
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 09c797a10aaa..b20903712d67 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -254,6 +254,30 @@ static int sctp_send_reconf(struct sctp_association *asoc,
 	return retval;
 }
 
+static bool sctp_stream_outq_is_empty(struct sctp_stream *stream,
+				      __u16 str_nums, __be16 *str_list)
+{
+	struct sctp_association *asoc;
+	__u16 i;
+
+	asoc = container_of(stream, struct sctp_association, stream);
+	if (!asoc->outqueue.out_qlen)
+		return true;
+
+	if (!str_nums)
+		return false;
+
+	for (i = 0; i < str_nums; i++) {
+		__u16 sid = ntohs(str_list[i]);
+
+		if (stream->out[sid].ext &&
+		    !list_empty(&stream->out[sid].ext->outq))
+			return false;
+	}
+
+	return true;
+}
+
 int sctp_send_reset_streams(struct sctp_association *asoc,
 			    struct sctp_reset_streams *params)
 {
@@ -317,6 +341,11 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
 	for (i = 0; i < str_nums; i++)
 		nstr_list[i] = htons(str_list[i]);
 
+	if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) {
+		retval = -EAGAIN;
+		goto out;
+	}
+
 	chunk = sctp_make_strreset_req(asoc, str_nums, nstr_list, out, in);
 
 	kfree(nstr_list);
@@ -636,6 +665,12 @@ struct sctp_chunk *sctp_process_strreset_inreq(
 		}
 	}
 
+	if (!sctp_stream_outq_is_empty(stream, nums, str_p)) {
+		result = SCTP_STRRESET_IN_PROGRESS;
+		asoc->strreset_inseq--;
+		goto err;
+	}
+
 	chunk = sctp_make_strreset_req(asoc, nums, str_p, 1, 0);
 	if (!chunk)
 		goto out;
-- 
cgit v1.2.3


From 5c6144a0eb5366ae07fc5059301b139338f39bbd Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:05:34 +0800
Subject: sctp: only allow the asoc reset when the asoc outq is empty

As it says in rfc6525#section5.1.4, before sending the request,

   C2:  The sender has either no outstanding TSNs or considers all
        outstanding TSNs abandoned.

Prior to this patch, it tried to consider all outstanding TSNs abandoned
by dropping all chunks in all outqs with sctp_outq_free (even including
sacked, retransmit and transmitted queues) when doing this reset, which
is too aggressive.

To make it work gently, this patch will only allow the asoc reset when
the sender has no outstanding TSNs by checking if unsent, transmitted
and retransmit are all empty with sctp_outq_is_empty before sending
and processing the request.

Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index b20903712d67..f3b7d2779c18 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -406,6 +406,9 @@ int sctp_send_reset_assoc(struct sctp_association *asoc)
 	if (asoc->strreset_outstanding)
 		return -EINPROGRESS;
 
+	if (!sctp_outq_is_empty(&asoc->outqueue))
+		return -EAGAIN;
+
 	chunk = sctp_make_strreset_tsnreq(asoc);
 	if (!chunk)
 		return -ENOMEM;
@@ -728,6 +731,12 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 		}
 		goto err;
 	}
+
+	if (!sctp_outq_is_empty(&asoc->outqueue)) {
+		result = SCTP_STRRESET_IN_PROGRESS;
+		goto err;
+	}
+
 	asoc->strreset_inseq++;
 
 	if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ))
-- 
cgit v1.2.3


From 159f2a7456c6ae95c1e1a58e8b8ec65ef12d51cf Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:05:35 +0800
Subject: sctp: avoid flushing unsent queue when doing asoc reset

Now when doing asoc reset, it cleans up sacked and abandoned queues
by calling sctp_outq_free where it also cleans up unsent, retransmit
and transmitted queues.

It's safe for the sender of response, as these 3 queues are empty at
that time. But when the receiver of response is doing the reset, the
users may already enqueue some chunks into unsent during the time
waiting the response, and these chunks should not be flushed.

To void the chunks in it would be removed, it moves the queue into a
temp list, then gets it back after sctp_outq_free is done.

The patch also fixes some incorrect comments in
sctp_process_strreset_tsnreq.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index f3b7d2779c18..9dd5bfe3860c 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -747,9 +747,10 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 		goto out;
 	}
 
-	/* G3: The same processing as though a SACK chunk with no gap report
-	 *     and a cumulative TSN ACK of the Sender's Next TSN minus 1 were
-	 *     received MUST be performed.
+	/* G4: The same processing as though a FWD-TSN chunk (as defined in
+	 *     [RFC3758]) with all streams affected and a new cumulative TSN
+	 *     ACK of the Receiver's Next TSN minus 1 were received MUST be
+	 *     performed.
 	 */
 	max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map);
 	sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen);
@@ -764,10 +765,9 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 	sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
 			 init_tsn, GFP_ATOMIC);
 
-	/* G4: The same processing as though a FWD-TSN chunk (as defined in
-	 *     [RFC3758]) with all streams affected and a new cumulative TSN
-	 *     ACK of the Receiver's Next TSN minus 1 were received MUST be
-	 *     performed.
+	/* G3: The same processing as though a SACK chunk with no gap report
+	 *     and a cumulative TSN ACK of the Sender's Next TSN minus 1 were
+	 *     received MUST be performed.
 	 */
 	sctp_outq_free(&asoc->outqueue);
 
@@ -1021,6 +1021,7 @@ struct sctp_chunk *sctp_process_strreset_resp(
 		if (result == SCTP_STRRESET_PERFORMED) {
 			__u32 mtsn = sctp_tsnmap_get_max_tsn_seen(
 						&asoc->peer.tsn_map);
+			LIST_HEAD(temp);
 
 			sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn);
 			sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC);
@@ -1029,7 +1030,13 @@ struct sctp_chunk *sctp_process_strreset_resp(
 					 SCTP_TSN_MAP_INITIAL,
 					 stsn, GFP_ATOMIC);
 
+			/* Clean up sacked and abandoned queues only. As the
+			 * out_chunk_list may not be empty, splice it to temp,
+			 * then get it back after sctp_outq_free is done.
+			 */
+			list_splice_init(&asoc->outqueue.out_chunk_list, &temp);
 			sctp_outq_free(&asoc->outqueue);
+			list_splice_init(&temp, &asoc->outqueue.out_chunk_list);
 
 			asoc->next_tsn = rtsn;
 			asoc->ctsn_ack_point = asoc->next_tsn - 1;
-- 
cgit v1.2.3


From 52a395896a051a3d5c34fba67c324f69ec5e67c6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sat, 25 Nov 2017 21:05:36 +0800
Subject: sctp: set sender next_tsn for the old result with ctsn_ack_point plus
 1

When doing asoc reset, if the sender of the response has already sent some
chunk and increased asoc->next_tsn before the duplicate request comes, the
response will use the old result with an incorrect sender next_tsn.

Better than asoc->next_tsn, asoc->ctsn_ack_point can't be changed after
the sender of the response has performed the asoc reset and before the
peer has confirmed it, and it's value is still asoc->next_tsn original
value minus 1.

This patch sets sender next_tsn for the old result with ctsn_ack_point
plus 1 when processing the duplicate request, to make sure the sender
next_tsn value peer gets will be always right.

Fixes: 692787cef651 ("sctp: implement receiver-side procedures for the SSN/TSN Reset Request Parameter")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 9dd5bfe3860c..a20145b3a949 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -725,7 +725,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
 		i = asoc->strreset_inseq - request_seq - 1;
 		result = asoc->strreset_result[i];
 		if (result == SCTP_STRRESET_PERFORMED) {
-			next_tsn = asoc->next_tsn;
+			next_tsn = asoc->ctsn_ack_point + 1;
 			init_tsn =
 				sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1;
 		}
-- 
cgit v1.2.3


From 2e724dca7749223204bbae21745c0e3fc932700a Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 27 Nov 2017 20:13:39 +0100
Subject: tipc: eliminate access after delete in group_filter_msg()

KASAN revealed another access after delete in group.c. This time
it found that we read the header of a received message after the
buffer has been released.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/group.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/group.c b/net/tipc/group.c
index 12777cac638a..95fec2c057d6 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -497,6 +497,7 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq,
 	while ((skb = skb_peek(defq))) {
 		hdr = buf_msg(skb);
 		mtyp = msg_type(hdr);
+		blks = msg_blocks(hdr);
 		deliver = true;
 		ack = false;
 		update = false;
@@ -546,7 +547,6 @@ void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq,
 		if (!update)
 			continue;
 
-		blks = msg_blocks(hdr);
 		tipc_group_update_rcv_win(grp, blks, node, port, xmitq);
 	}
 	return;
-- 
cgit v1.2.3


From 08f46070dde2a835a7a5bfd4c70372c27bff5d88 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 26 Nov 2017 20:16:06 +0800
Subject: sctp: force SCTP_ERROR_INV_STRM with __u32 when calling
 sctp_chunk_fail

This patch is to force SCTP_ERROR_INV_STRM with right type to
fit in sctp_chunk_fail to avoid the sparse error.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/stream.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index a20145b3a949..76ea66be0bbe 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -64,7 +64,7 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream,
 		 */
 
 		/* Mark as failed send. */
-		sctp_chunk_fail(ch, SCTP_ERROR_INV_STRM);
+		sctp_chunk_fail(ch, (__force __u32)SCTP_ERROR_INV_STRM);
 		if (asoc->peer.prsctp_capable &&
 		    SCTP_PR_PRIO_ENABLED(ch->sinfo.sinfo_flags))
 			asoc->sent_cnt_removable--;
-- 
cgit v1.2.3


From 1ba896f6f52bfafac6dec4ca583cdd9a073858e8 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 26 Nov 2017 20:16:08 +0800
Subject: sctp: remove extern from stream sched

Now each stream sched ops is defined in different .c file and
added into the global ops in another .c file, it uses extern
to make this work.

However extern is not good coding style to get them in and
even make C=2 reports errors for this.

This patch adds sctp_sched_ops_xxx_init for each stream sched
ops in their .c file, then get them into the global ops by
calling them when initializing sctp module.

Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler")
Fixes: ac1ed8b82cd6 ("sctp: introduce round robin stream scheduler")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/sctp.h         |  5 +++++
 include/net/sctp/stream_sched.h |  5 +++++
 net/sctp/protocol.c             |  1 +
 net/sctp/stream_sched.c         | 25 ++++++++++++++++++-------
 net/sctp/stream_sched_prio.c    |  7 ++++++-
 net/sctp/stream_sched_rr.c      |  7 ++++++-
 6 files changed, 41 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 749a42882437..906a9c0efa71 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -194,6 +194,11 @@ void sctp_remaddr_proc_exit(struct net *net);
  */
 int sctp_offload_init(void);
 
+/*
+ * sctp/stream_sched.c
+ */
+void sctp_sched_ops_init(void);
+
 /*
  * sctp/stream.c
  */
diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index c676550a4c7d..5c5da48f65e7 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -69,4 +69,9 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
 int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
 struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
 
+void sctp_sched_ops_register(enum sctp_sched_type sched,
+			     struct sctp_sched_ops *sched_ops);
+void sctp_sched_ops_prio_init(void);
+void sctp_sched_ops_rr_init(void);
+
 #endif /* __sctp_stream_sched_h__ */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f5172c21349b..6a38c2503649 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1499,6 +1499,7 @@ static __init int sctp_init(void)
 	INIT_LIST_HEAD(&sctp_address_families);
 	sctp_v4_pf_init();
 	sctp_v6_pf_init();
+	sctp_sched_ops_init();
 
 	status = register_pernet_subsys(&sctp_defaults_ops);
 	if (status)
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 0b83ec51e43b..d8c162a4089c 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -119,16 +119,27 @@ static struct sctp_sched_ops sctp_sched_fcfs = {
 	.unsched_all = sctp_sched_fcfs_unsched_all,
 };
 
+static void sctp_sched_ops_fcfs_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs);
+}
+
 /* API to other parts of the stack */
 
-extern struct sctp_sched_ops sctp_sched_prio;
-extern struct sctp_sched_ops sctp_sched_rr;
+static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1];
 
-static struct sctp_sched_ops *sctp_sched_ops[] = {
-	&sctp_sched_fcfs,
-	&sctp_sched_prio,
-	&sctp_sched_rr,
-};
+void sctp_sched_ops_register(enum sctp_sched_type sched,
+			     struct sctp_sched_ops *sched_ops)
+{
+	sctp_sched_ops[sched] = sched_ops;
+}
+
+void sctp_sched_ops_init(void)
+{
+	sctp_sched_ops_fcfs_init();
+	sctp_sched_ops_prio_init();
+	sctp_sched_ops_rr_init();
+}
 
 int sctp_sched_set_sched(struct sctp_association *asoc,
 			 enum sctp_sched_type sched)
diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c
index 384dbf3c8760..7997d35dd0fd 100644
--- a/net/sctp/stream_sched_prio.c
+++ b/net/sctp/stream_sched_prio.c
@@ -333,7 +333,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream)
 			sctp_sched_prio_unsched(soute);
 }
 
-struct sctp_sched_ops sctp_sched_prio = {
+static struct sctp_sched_ops sctp_sched_prio = {
 	.set = sctp_sched_prio_set,
 	.get = sctp_sched_prio_get,
 	.init = sctp_sched_prio_init,
@@ -345,3 +345,8 @@ struct sctp_sched_ops sctp_sched_prio = {
 	.sched_all = sctp_sched_prio_sched_all,
 	.unsched_all = sctp_sched_prio_unsched_all,
 };
+
+void sctp_sched_ops_prio_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_PRIO, &sctp_sched_prio);
+}
diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c
index 7612a438c5b9..1155692448f1 100644
--- a/net/sctp/stream_sched_rr.c
+++ b/net/sctp/stream_sched_rr.c
@@ -187,7 +187,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream)
 		sctp_sched_rr_unsched(stream, soute);
 }
 
-struct sctp_sched_ops sctp_sched_rr = {
+static struct sctp_sched_ops sctp_sched_rr = {
 	.set = sctp_sched_rr_set,
 	.get = sctp_sched_rr_get,
 	.init = sctp_sched_rr_init,
@@ -199,3 +199,8 @@ struct sctp_sched_ops sctp_sched_rr = {
 	.sched_all = sctp_sched_rr_sched_all,
 	.unsched_all = sctp_sched_rr_unsched_all,
 };
+
+void sctp_sched_ops_rr_init(void)
+{
+	sctp_sched_ops_register(SCTP_SS_RR, &sctp_sched_rr);
+}
-- 
cgit v1.2.3


From 57f015f5eccf25fd4a3336fe3cbbee920a8fba6f Mon Sep 17 00:00:00 2001
From: Mike Maloney <maloney@google.com>
Date: Tue, 28 Nov 2017 10:44:29 -0500
Subject: packet: fix crash in fanout_demux_rollover()

syzkaller found a race condition fanout_demux_rollover() while removing
a packet socket from a fanout group.

po->rollover is read and operated on during packet_rcv_fanout(), via
fanout_demux_rollover(), but the pointer is currently cleared before the
synchronization in packet_release().   It is safer to delay the cleanup
until after synchronize_net() has been called, ensuring all calls to
packet_rcv_fanout() for this socket have finished.

To further simplify synchronization around the rollover structure, set
po->rollover in fanout_add() only if there are no errors.  This removes
the need for rcu in the struct and in the call to
packet_getsockopt(..., PACKET_ROLLOVER_STATS, ...).

Crashing stack trace:
 fanout_demux_rollover+0xb6/0x4d0 net/packet/af_packet.c:1392
 packet_rcv_fanout+0x649/0x7c8 net/packet/af_packet.c:1487
 dev_queue_xmit_nit+0x835/0xc10 net/core/dev.c:1953
 xmit_one net/core/dev.c:2975 [inline]
 dev_hard_start_xmit+0x16b/0xac0 net/core/dev.c:2995
 __dev_queue_xmit+0x17a4/0x2050 net/core/dev.c:3476
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3509
 neigh_connected_output+0x489/0x720 net/core/neighbour.c:1379
 neigh_output include/net/neighbour.h:482 [inline]
 ip6_finish_output2+0xad1/0x22a0 net/ipv6/ip6_output.c:120
 ip6_finish_output+0x2f9/0x920 net/ipv6/ip6_output.c:146
 NF_HOOK_COND include/linux/netfilter.h:239 [inline]
 ip6_output+0x1f4/0x850 net/ipv6/ip6_output.c:163
 dst_output include/net/dst.h:459 [inline]
 NF_HOOK.constprop.35+0xff/0x630 include/linux/netfilter.h:250
 mld_sendpack+0x6a8/0xcc0 net/ipv6/mcast.c:1660
 mld_send_initial_cr.part.24+0x103/0x150 net/ipv6/mcast.c:2072
 mld_send_initial_cr net/ipv6/mcast.c:2056 [inline]
 ipv6_mc_dad_complete+0x99/0x130 net/ipv6/mcast.c:2079
 addrconf_dad_completed+0x595/0x970 net/ipv6/addrconf.c:4039
 addrconf_dad_work+0xac9/0x1160 net/ipv6/addrconf.c:3971
 process_one_work+0xbf0/0x1bc0 kernel/workqueue.c:2113
 worker_thread+0x223/0x1990 kernel/workqueue.c:2247
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:432

Fixes: 0648ab70afe6 ("packet: rollover prepare: per-socket state")
Fixes: 509c7a1ecc860 ("packet: avoid panic in packet_getsockopt()")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Mike Maloney <maloney@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 32 ++++++++++----------------------
 net/packet/internal.h  |  1 -
 2 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 737092ca9b4e..1b7bb9d9865e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1687,7 +1687,6 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		atomic_long_set(&rollover->num, 0);
 		atomic_long_set(&rollover->num_huge, 0);
 		atomic_long_set(&rollover->num_failed, 0);
-		po->rollover = rollover;
 	}
 
 	if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
@@ -1745,6 +1744,8 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
 			__dev_remove_pack(&po->prot_hook);
 			po->fanout = match;
+			po->rollover = rollover;
+			rollover = NULL;
 			refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
 			__fanout_link(sk, po);
 			err = 0;
@@ -1758,10 +1759,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	}
 
 out:
-	if (err && rollover) {
-		kfree_rcu(rollover, rcu);
-		po->rollover = NULL;
-	}
+	kfree(rollover);
 	mutex_unlock(&fanout_mutex);
 	return err;
 }
@@ -1785,11 +1783,6 @@ static struct packet_fanout *fanout_release(struct sock *sk)
 			list_del(&f->list);
 		else
 			f = NULL;
-
-		if (po->rollover) {
-			kfree_rcu(po->rollover, rcu);
-			po->rollover = NULL;
-		}
 	}
 	mutex_unlock(&fanout_mutex);
 
@@ -3029,6 +3022,7 @@ static int packet_release(struct socket *sock)
 	synchronize_net();
 
 	if (f) {
+		kfree(po->rollover);
 		fanout_release_data(f);
 		kfree(f);
 	}
@@ -3843,7 +3837,6 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	void *data = &val;
 	union tpacket_stats_u st;
 	struct tpacket_rollover_stats rstats;
-	struct packet_rollover *rollover;
 
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -3922,18 +3915,13 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		       0);
 		break;
 	case PACKET_ROLLOVER_STATS:
-		rcu_read_lock();
-		rollover = rcu_dereference(po->rollover);
-		if (rollover) {
-			rstats.tp_all = atomic_long_read(&rollover->num);
-			rstats.tp_huge = atomic_long_read(&rollover->num_huge);
-			rstats.tp_failed = atomic_long_read(&rollover->num_failed);
-			data = &rstats;
-			lv = sizeof(rstats);
-		}
-		rcu_read_unlock();
-		if (!rollover)
+		if (!po->rollover)
 			return -EINVAL;
+		rstats.tp_all = atomic_long_read(&po->rollover->num);
+		rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
+		rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
+		data = &rstats;
+		lv = sizeof(rstats);
 		break;
 	case PACKET_TX_HAS_OFF:
 		val = po->tp_tx_has_off;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 562fbc155006..a1d2b2319ae9 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -95,7 +95,6 @@ struct packet_fanout {
 
 struct packet_rollover {
 	int			sock;
-	struct rcu_head		rcu;
 	atomic_long_t		num;
 	atomic_long_t		num_huge;
 	atomic_long_t		num_failed;
-- 
cgit v1.2.3


From 15fe076edea787807a7cdc168df832544b58eba6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 28 Nov 2017 08:03:30 -0800
Subject: net/packet: fix a race in packet_bind() and packet_notifier()

syzbot reported crashes [1] and provided a C repro easing bug hunting.

When/if packet_do_bind() calls __unregister_prot_hook() and releases
po->bind_lock, another thread can run packet_notifier() and process an
NETDEV_UP event.

This calls register_prot_hook() and hooks again the socket right before
first thread is able to grab again po->bind_lock.

Fixes this issue by temporarily setting po->num to 0, as suggested by
David Miller.

[1]
dev_remove_pack: ffff8801bf16fa80 not found
------------[ cut here ]------------
kernel BUG at net/core/dev.c:7945!  ( BUG_ON(!list_empty(&dev->ptype_all)); )
invalid opcode: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
device syz0 entered promiscuous mode
CPU: 0 PID: 3161 Comm: syzkaller404108 Not tainted 4.14.0+ #190
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
task: ffff8801cc57a500 task.stack: ffff8801cc588000
RIP: 0010:netdev_run_todo+0x772/0xae0 net/core/dev.c:7945
RSP: 0018:ffff8801cc58f598 EFLAGS: 00010293
RAX: ffff8801cc57a500 RBX: dffffc0000000000 RCX: ffffffff841f75b2
RDX: 0000000000000000 RSI: 1ffff100398b1ede RDI: ffff8801bf1f8810
device syz0 entered promiscuous mode
RBP: ffff8801cc58f898 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801bf1f8cd8
R13: ffff8801cc58f870 R14: ffff8801bf1f8780 R15: ffff8801cc58f7f0
FS:  0000000001716880(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020b13000 CR3: 0000000005e25000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 rtnl_unlock+0xe/0x10 net/core/rtnetlink.c:106
 tun_detach drivers/net/tun.c:670 [inline]
 tun_chr_close+0x49/0x60 drivers/net/tun.c:2845
 __fput+0x333/0x7f0 fs/file_table.c:210
 ____fput+0x15/0x20 fs/file_table.c:244
 task_work_run+0x199/0x270 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x9bb/0x1ae0 kernel/exit.c:865
 do_group_exit+0x149/0x400 kernel/exit.c:968
 SYSC_exit_group kernel/exit.c:979 [inline]
 SyS_exit_group+0x1d/0x20 kernel/exit.c:977
 entry_SYSCALL_64_fastpath+0x1f/0x96
RIP: 0033:0x44ad19

Fixes: 30f7ea1c2b5f ("packet: race condition in packet_bind")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Francesco Ruggeri <fruggeri@aristanetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1b7bb9d9865e..da215e5c1399 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3091,6 +3091,10 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	if (need_rehook) {
 		if (po->running) {
 			rcu_read_unlock();
+			/* prevents packet_notifier() from calling
+			 * register_prot_hook()
+			 */
+			po->num = 0;
 			__unregister_prot_hook(sk, true);
 			rcu_read_lock();
 			dev_curr = po->prot_hook.dev;
@@ -3099,6 +3103,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 								 dev->ifindex);
 		}
 
+		BUG_ON(po->running);
 		po->num = proto;
 		po->prot_hook.type = proto;
 
-- 
cgit v1.2.3


From 25415cec502a1232b19fffc85465882b19a90415 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 27 Nov 2017 11:11:41 -0800
Subject: cls_bpf: don't decrement net's refcount when offload fails

When cls_bpf offload was added it seemed like a good idea to
call cls_bpf_delete_prog() instead of extending the error
handling path, since the software state is fully initialized
at that point.  This handling of errors without jumping to
the end of the function is error prone, as proven by later
commit missing that extra call to __cls_bpf_delete_prog().

__cls_bpf_delete_prog() is now expected to be invoked with
a reference on exts->net or the field zeroed out.  The call
on the offload's error patch does not fullfil this requirement,
leading to each error stealing a reference on net namespace.

Create a function undoing what cls_bpf_set_parms() did and
use it from __cls_bpf_delete_prog() and the error path.

Fixes: aae2c35ec892 ("cls_bpf: use tcf_exts_get_net() before call_rcu()")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_bpf.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index a9f3e317055c..6fe798c2df1a 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -258,11 +258,8 @@ static int cls_bpf_init(struct tcf_proto *tp)
 	return 0;
 }
 
-static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
+static void cls_bpf_free_parms(struct cls_bpf_prog *prog)
 {
-	tcf_exts_destroy(&prog->exts);
-	tcf_exts_put_net(&prog->exts);
-
 	if (cls_bpf_is_ebpf(prog))
 		bpf_prog_put(prog->filter);
 	else
@@ -270,6 +267,14 @@ static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
 
 	kfree(prog->bpf_name);
 	kfree(prog->bpf_ops);
+}
+
+static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog)
+{
+	tcf_exts_destroy(&prog->exts);
+	tcf_exts_put_net(&prog->exts);
+
+	cls_bpf_free_parms(prog);
 	kfree(prog);
 }
 
@@ -514,12 +519,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 		goto errout_idr;
 
 	ret = cls_bpf_offload(tp, prog, oldprog);
-	if (ret) {
-		if (!oldprog)
-			idr_remove_ext(&head->handle_idr, prog->handle);
-		__cls_bpf_delete_prog(prog);
-		return ret;
-	}
+	if (ret)
+		goto errout_parms;
 
 	if (!tc_in_hw(prog->gen_flags))
 		prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW;
@@ -537,6 +538,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
 	*arg = prog;
 	return 0;
 
+errout_parms:
+	cls_bpf_free_parms(prog);
 errout_idr:
 	if (!oldprog)
 		idr_remove_ext(&head->handle_idr, prog->handle);
-- 
cgit v1.2.3


From f85729d07cd649bf69820f14bdad36326a24a699 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 28 Nov 2017 14:28:39 +0100
Subject: sch_sfq: fix null pointer dereference at timer expiration

While converting sch_sfq to use timer_setup(), the commit cdeabbb88134
("net: sched: Convert timers to use timer_setup()") forgot to
initialize the 'sch' field. As a result, the timer callback tries to
dereference a NULL pointer, and the kernel does oops.

Fix it initializing such field at qdisc creation time.

Fixes: cdeabbb88134 ("net: sched: Convert timers to use timer_setup()")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_sfq.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 890f4a4564e7..09c1203c1711 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -724,6 +724,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	int i;
 	int err;
 
+	q->sch = sch;
 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
 
 	err = tcf_block_get(&q->block, &q->filter_list, sch);
-- 
cgit v1.2.3


From a8dd397903a6e57157f6265911f7d35681364427 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 26 Nov 2017 20:56:07 +0800
Subject: sctp: use right member as the param of list_for_each_entry

Commit d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues
when migrating a sock") made a mistake that using 'list' as the param of
list_for_each_entry to traverse the retransmit, sacked and abandoned
queues, while chunks are using 'transmitted_list' to link into these
queues.

It could cause NULL dereference panic if there are chunks in any of these
queues when peeling off one asoc.

So use the chunk member 'transmitted_list' instead in this patch.

Fixes: d04adf1b3551 ("sctp: reset owner sk for data chunks on out queues when migrating a sock")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 3204a9b29407..014847e25648 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -188,13 +188,13 @@ static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
 		list_for_each_entry(chunk, &t->transmitted, transmitted_list)
 			cb(chunk);
 
-	list_for_each_entry(chunk, &q->retransmit, list)
+	list_for_each_entry(chunk, &q->retransmit, transmitted_list)
 		cb(chunk);
 
-	list_for_each_entry(chunk, &q->sacked, list)
+	list_for_each_entry(chunk, &q->sacked, transmitted_list)
 		cb(chunk);
 
-	list_for_each_entry(chunk, &q->abandoned, list)
+	list_for_each_entry(chunk, &q->abandoned, transmitted_list)
 		cb(chunk);
 
 	list_for_each_entry(chunk, &q->out_chunk_list, list)
-- 
cgit v1.2.3


From 4a5def7f6a758aef1a0a3b10e981881c1e914f69 Mon Sep 17 00:00:00 2001
From: Jorgen Hansen <jhansen@vmware.com>
Date: Mon, 27 Nov 2017 05:29:32 -0800
Subject: VSOCK: Don't set sk_state to TCP_CLOSE before testing it

A recent commit (3b4477d2dcf2) converted the sk_state to use
TCP constants. In that change, vmci_transport_handle_detach
was changed such that sk->sk_state was set to TCP_CLOSE before
we test whether it is TCP_SYN_SENT. This change moves the
sk_state change back to the original locations in that function.

Signed-off-by: Jorgen Hansen <jhansen@vmware.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/vmw_vsock/vmci_transport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 56573dc85709..a7a73ffe675b 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -804,8 +804,6 @@ static void vmci_transport_handle_detach(struct sock *sk)
 		 */
 		if (vsk->local_addr.svm_cid == VMADDR_CID_HOST ||
 		    vsock_stream_has_data(vsk) <= 0) {
-			sk->sk_state = TCP_CLOSE;
-
 			if (sk->sk_state == TCP_SYN_SENT) {
 				/* The peer may detach from a queue pair while
 				 * we are still in the connecting state, i.e.,
@@ -815,10 +813,12 @@ static void vmci_transport_handle_detach(struct sock *sk)
 				 * event like a reset.
 				 */
 
+				sk->sk_state = TCP_CLOSE;
 				sk->sk_err = ECONNRESET;
 				sk->sk_error_report(sk);
 				return;
 			}
+			sk->sk_state = TCP_CLOSE;
 		}
 		sk->sk_state_change(sk);
 	}
-- 
cgit v1.2.3


From d51aae68b142f48232257e96ce317db25445418d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Mon, 27 Nov 2017 18:37:21 +0100
Subject: net: sched: cbq: create block for q->link.block

q->link.block is not initialized, that leads to EINVAL when one tries to
add filter there. So initialize it properly.

This can be reproduced by:
$ tc qdisc add dev eth0 root handle 1: cbq avpkt 1000 rate 1000Mbit bandwidth 1000Mbit
$ tc filter add dev eth0 parent 1: protocol ip prio 100 u32 match ip protocol 0 0x00 flowid 1:1

Reported-by: Jaroslav Aster <jaster@redhat.com>
Reported-by: Ivan Vecera <ivecera@redhat.com>
Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Reviewed-by: Ivan Vecera <ivecera@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_cbq.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 6361be7881f1..525eb3a6d625 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1158,9 +1158,13 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
 		return -EINVAL;
 
+	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch);
+	if (err)
+		goto put_rtab;
+
 	err = qdisc_class_hash_init(&q->clhash);
 	if (err < 0)
-		goto put_rtab;
+		goto put_block;
 
 	q->link.sibling = &q->link;
 	q->link.common.classid = sch->handle;
@@ -1194,6 +1198,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	cbq_addprio(q, &q->link);
 	return 0;
 
+put_block:
+	tcf_block_put(q->link.block);
+
 put_rtab:
 	qdisc_put_rtab(q->link.R_tab);
 	return err;
-- 
cgit v1.2.3