From 33966dd0e2f68f26943cd9ee93ec6abbc6547a8e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 13 Jan 2009 16:04:36 -0800 Subject: tcp: splice as many packets as possible at once As spotted by Willy Tarreau, current splice() from tcp socket to pipe is not optimal. It processes at most one segment per call. This results in low performance and very high overhead due to syscall rate when splicing from interfaces which do not support LRO. Willy provided a patch inside tcp_splice_read(), but a better fix is to let tcp_read_sock() process as many segments as possible, so that tcp_rcv_space_adjust() and tcp_cleanup_rbuf() are called less often. With this change, splice() behaves like tcp_recvmsg(), being able to consume many skbs in one system call. With typical 1460 bytes of payload per frame, that means splice(SPLICE_F_NONBLOCK) can return 16*1460 = 23360 bytes. Signed-off-by: Willy Tarreau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ce572f9dff02..48ada1b2d2c4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -522,8 +522,12 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; + int ret; - return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); + ret = skb_splice_bits(skb, offset, tss->pipe, rd_desc->count, tss->flags); + if (ret > 0) + rd_desc->count -= ret; + return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) @@ -531,6 +535,7 @@ static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, + .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); @@ -611,11 +616,13 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, tss.len -= ret; spliced += ret; + if (!timeo) + break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || + (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } -- cgit v1.2.1 From 4e704ee3c2cd38748ca59d835435d6a7e7f6f613 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 14 Jan 2009 20:41:12 -0800 Subject: gso: Ensure that the packet is long enough When we get a GSO packet from an untrusted source, we need to ensure that it is sufficiently long so that we don't end up crashing. Based on discovery and patch by Ian Campbell. Signed-off-by: Herbert Xu Tested-by: Ian Campbell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 48ada1b2d2c4..0cd71b84e483 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2389,7 +2389,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) unsigned int seq; __be32 delta; unsigned int oldlen; - unsigned int len; + unsigned int mss; if (!pskb_may_pull(skb, sizeof(*th))) goto out; @@ -2405,10 +2405,13 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) oldlen = (u16)~skb->len; __skb_pull(skb, thlen); + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ int type = skb_shinfo(skb)->gso_type; - int mss; if (unlikely(type & ~(SKB_GSO_TCPV4 | @@ -2419,7 +2422,6 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; - mss = skb_shinfo(skb)->gso_size; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; @@ -2430,8 +2432,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) if (IS_ERR(segs)) goto out; - len = skb_shinfo(skb)->gso_size; - delta = htonl(oldlen + (thlen + len)); + delta = htonl(oldlen + (thlen + mss)); skb = segs; th = tcp_hdr(skb); @@ -2447,7 +2448,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) csum_fold(csum_partial(skb_transport_header(skb), thlen, skb->csum)); - seq += len; + seq += mss; skb = skb->next; th = tcp_hdr(skb); -- cgit v1.2.1 From 9fa5fdf291c9b58b1cb8b4bb2a0ee57efa21d635 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Mon, 26 Jan 2009 22:15:31 -0800 Subject: tcp: Fix length tcp_splice_data_recv passes to skb_splice_bits. tcp_splice_data_recv has two lengths to consider: the len parameter it gets from tcp_read_sock, which specifies the amount of data in the skb, and rd_desc->count, which is the amount of data the splice caller still wants. Currently it passes just the latter to skb_splice_bits, which then splices min(rd_desc->count, skb->len - offset) bytes. Most of the time this is fine, except when the skb contains urgent data. In that case len goes only up to the urgent byte and is less than skb->len - offset. By ignoring len tcp_splice_data_recv may a) splice data tcp_read_sock told it not to, b) return to tcp_read_sock a value > len. Now, tcp_read_sock doesn't handle used > len and leaves the socket in a bad state (both sk_receive_queue and copied_seq are bad at that point) resulting in duplicated data and corruption. Fix by passing min(rd_desc->count, len) to skb_splice_bits. Signed-off-by: Dimitris Michailidis Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..76b148bcb0dc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -524,7 +524,8 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, rd_desc->count, tss->flags); + ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), + tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; -- cgit v1.2.1