From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:33 -0800
Subject: bpf: sk_msg, fix socket data_ready events

When a skb verdict program is in-use and either another BPF program
redirects to that socket or the new SK_PASS support is used the
data_ready callback does not wake up application. Instead because
the stream parser/verdict is using the sk data_ready callback we wake
up the stream parser/verdict block.

Fix this by adding a helper to check if the stream parser block is
enabled on the sk and if so call the saved pointer which is the
upper layers wake up function.

This fixes application stalls observed when an application is waiting
for data in a blocking read().

Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv4/tcp_bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index a47c1cdf90fc..87503343743d 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -198,7 +198,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 		msg->sg.start = i;
 		msg->sg.size -= apply_bytes;
 		sk_psock_queue_msg(psock, tmp);
-		sk->sk_data_ready(sk);
+		sk_psock_data_ready(sk, psock);
 	} else {
 		sk_msg_free(sk, tmp);
 		kfree(tmp);
-- 
cgit v1.2.1


From 0608c69c9a805c6264689d7eab4203eab88cf1da Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:35 -0800
Subject: bpf: sk_msg, sock{map|hash} redirect through ULP

A sockmap program that redirects through a kTLS ULP enabled socket
will not work correctly because the ULP layer is skipped. This
fixes the behavior to call through the ULP layer on redirect to
ensure any operations required on the data stream at the ULP layer
continue to be applied.

To do this we add an internal flag MSG_SENDPAGE_NOPOLICY to avoid
calling the BPF layer on a redirected message. This is
required to avoid calling the BPF layer multiple times (possibly
recursively) which is not the current/expected behavior without
ULPs. In the future we may add a redirect flag if users _do_
want the policy applied again but this would need to work for both
ULP and non-ULP sockets and be opt-in to avoid breaking existing
programs.

Also to avoid polluting the flag space with an internal flag we
reuse the flag space overlapping MSG_SENDPAGE_NOPOLICY with
MSG_WAITFORONE. Here WAITFORONE is specific to recv path and
SENDPAGE_NOPOLICY is only used for sendpage hooks. The last thing
to verify is user space API is masked correctly to ensure the flag
can not be set by user. (Note this needs to be true regardless
because we have internal flags already in-use that user space
should not be able to set). But for completeness we have two UAPI
paths into sendpage, sendfile and splice.

In the sendfile case the function do_sendfile() zero's flags,

./fs/read_write.c:
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
		   	    size_t count, loff_t max)
 {
   ...
   fl = 0;
#if 0
   /*
    * We need to debate whether we can enable this or not. The
    * man page documents EAGAIN return for the output at least,
    * and the application is arguably buggy if it doesn't expect
    * EAGAIN on a non-blocking file descriptor.
    */
    if (in.file->f_flags & O_NONBLOCK)
	fl = SPLICE_F_NONBLOCK;
#endif
    file_start_write(out.file);
    retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
 }

In the splice case the pipe_to_sendpage "actor" is used which
masks flags with SPLICE_F_MORE.

./fs/splice.c:
 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
 {
   ...
   more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
   ...
 }

Confirming what we expect that internal flags  are in fact internal
to socket side.

Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv4/tcp_bpf.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 87503343743d..1bb7321a256d 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -8,6 +8,7 @@
 #include <linux/wait.h>
 
 #include <net/inet_common.h>
+#include <net/tls.h>
 
 static bool tcp_bpf_stream_read(const struct sock *sk)
 {
@@ -218,6 +219,8 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
 	u32 off;
 
 	while (1) {
+		bool has_tx_ulp;
+
 		sge = sk_msg_elem(msg, msg->sg.start);
 		size = (apply && apply_bytes < sge->length) ?
 			apply_bytes : sge->length;
@@ -226,7 +229,15 @@ static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
 
 		tcp_rate_check_app_limited(sk);
 retry:
-		ret = do_tcp_sendpages(sk, page, off, size, flags);
+		has_tx_ulp = tls_sw_has_ctx_tx(sk);
+		if (has_tx_ulp) {
+			flags |= MSG_SENDPAGE_NOPOLICY;
+			ret = kernel_sendpage_locked(sk,
+						     page, off, size, flags);
+		} else {
+			ret = do_tcp_sendpages(sk, page, off, size, flags);
+		}
+
 		if (ret <= 0)
 			return ret;
 		if (apply)
-- 
cgit v1.2.1