From d8c6f4b9b7848bca8babfc0ae43a50c8ab22fbb9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 1 Mar 2013 07:44:08 +0000 Subject: ipv[4|6]: correct dropwatch false positive in local_deliver_finish I had a report recently of a user trying to use dropwatch to localise some frame loss, and they were getting false positives. Turned out they were using a user space SCTP stack that used raw sockets to grab frames. When we don't have a registered protocol for a given packet, we record it as a drop, even if a raw socket receieves the frame. We should only record the drop in the event a raw socket doesnt exist to receive the frames Tested by the reported successfully Signed-off-by: Neil Horman Reported-by: William Reich Tested-by: William Reich CC: "David S. Miller" CC: William Reich CC: eric.dumazet@gmail.com Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 87abd3e2bd32..2bdf802e28e2 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -228,9 +228,11 @@ static int ip_local_deliver_finish(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } - } else + kfree_skb(skb); + } else { IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); - kfree_skb(skb); + consume_skb(skb); + } } } out: -- cgit v1.2.1 From aab2b4bf224ef8358d262f95b568b8ad0cecf0a0 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 4 Mar 2013 06:23:05 +0000 Subject: tcp: fix double-counted receiver RTT when leaving receiver fast path We should not update ts_recent and call tcp_rcv_rtt_measure_ts() both before and after going to step5. That wastes CPU and double-counts the receiver-side RTT sample. Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a759e19496d2..0d9bdacce99f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5485,6 +5485,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if (tcp_checksum_complete_user(sk, skb)) goto csum_error; + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5496,9 +5499,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - if ((int)skb->truesize > sk->sk_forward_alloc) - goto step5; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ -- cgit v1.2.1 From fa2b04f4502d74659e4e4b1294c6d88e08ece032 Mon Sep 17 00:00:00 2001 From: David Ward Date: Tue, 5 Mar 2013 17:06:32 +0000 Subject: net/ipv4: Timestamp option cannot overflow with prespecified addresses When a router forwards a packet that contains the IPv4 timestamp option, if there is no space left in the option for the router to add its own timestamp, then the router increments the Overflow value in the option. However, if the addresses of the routers are prespecified in the option, then the overflow condition cannot happen: the option is structured so that each prespecified router has a place to write its timestamp. Other routers do not add a timestamp, so there will never be a lack of space. This fix ensures that the Overflow value in the IPv4 timestamp option is not incremented when the addresses of the routers are prespecified, even if the Pointer value is greater than the Length value. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index f6289bf6f332..310a3647c83d 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -423,7 +423,7 @@ int ip_options_compile(struct net *net, put_unaligned_be32(midtime, timeptr); opt->is_changed = 1; } - } else { + } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { unsigned int overflow = optptr[3]>>4; if (overflow == 15) { pp_ptr = optptr + 3; -- cgit v1.2.1 From c10cb5fc0fc9fa605e01f715118bde5ba5a98616 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 7 Mar 2013 02:34:33 +0000 Subject: Fix: sparse warning in inet_csk_prepare_forced_close In e337e24d66 (inet: Fix kmemleak in tcp_v4/6_syn_recv_sock and dccp_v4/6_request_recv_sock) I introduced the function inet_csk_prepare_forced_close, which does a call to bh_unlock_sock(). This produces a sparse-warning. This patch adds the missing __releases. Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7d1874be1df3..786d97aee751 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -735,6 +735,7 @@ EXPORT_SYMBOL(inet_csk_destroy_sock); * tcp/dccp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) + __releases(&sk->sk_lock.slock) { /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); -- cgit v1.2.1 From 4660c7f498c07c43173142ea95145e9dac5a6d14 Mon Sep 17 00:00:00 2001 From: David Ward Date: Mon, 11 Mar 2013 10:43:39 +0000 Subject: net/ipv4: Ensure that location of timestamp option is stored This is needed in order to detect if the timestamp option appears more than once in a packet, to remove the option if the packet is fragmented, etc. My previous change neglected to store the option location when the router addresses were prespecified and Pointer > Length. But now the option location is also stored when Flag is an unrecognized value, to ensure these option handling behaviors are still performed. Signed-off-by: David Ward Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 310a3647c83d..ec7264514a82 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -370,7 +370,6 @@ int ip_options_compile(struct net *net, } switch (optptr[3]&0xF) { case IPOPT_TS_TSONLY: - opt->ts = optptr - iph; if (skb) timeptr = &optptr[optptr[2]-1]; opt->ts_needtime = 1; @@ -381,7 +380,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; if (rt) { spec_dst_fill(&spec_dst, skb); memcpy(&optptr[optptr[2]-1], &spec_dst, 4); @@ -396,7 +394,6 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 2; goto error; } - opt->ts = optptr - iph; { __be32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); @@ -429,12 +426,12 @@ int ip_options_compile(struct net *net, pp_ptr = optptr + 3; goto error; } - opt->ts = optptr - iph; if (skb) { optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); opt->is_changed = 1; } } + opt->ts = optptr - iph; break; case IPOPT_RA: if (optlen < 4) { -- cgit v1.2.1 From 16fad69cfe4adbbfa813de516757b87bcae36d93 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Mar 2013 05:40:32 +0000 Subject: tcp: fix skb_availroom() Chrome OS team reported a crash on a Pixel ChromeBook in TCP stack : https://code.google.com/p/chromium/issues/detail?id=182056 commit a21d45726acac (tcp: avoid order-1 allocations on wifi and tx path) did a poor choice adding an 'avail_size' field to skb, while what we really needed was a 'reserved_tailroom' one. It would have avoided commit 22b4a4f22da (tcp: fix retransmit of partially acked frames) and this commit. Crash occurs because skb_split() is not aware of the 'avail_size' management (and should not be aware) Signed-off-by: Eric Dumazet Reported-by: Mukesh Agrawal Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47e854fcae24..e22020790709 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -775,7 +775,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) * Make sure that we have exactly size bytes * available to the caller, no more, no less. */ - skb->avail_size = size; + skb->reserved_tailroom = skb->end - skb->tail - size; return skb; } __kfree_skb(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2b4461074da..817fbb396bc8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1298,7 +1298,6 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = min_t(int, len, skb_headlen(skb)); if (eat) { __skb_pull(skb, eat); - skb->avail_size -= eat; len -= eat; if (!len) return; -- cgit v1.2.1 From 8c6216d7f118a128678270824b6a1286a63863ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Wed, 13 Mar 2013 02:37:49 +0000 Subject: Revert "ip_gre: make ipgre_tunnel_xmit() not parse network header as IP unconditionally" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 412ed94744d16806fbec3bd250fd94e71cde5a1f. The commit is wrong as tiph points to the outer IPv4 header which is installed at ipgre_header() and not the inner one which is protocol dependant. This commit broke succesfully opennhrp which use PF_PACKET socket with ETH_P_NHRP protocol. Additionally ssl_addr is set to the link-layer IPv4 address. This address is written by ipgre_header() to the skb earlier, and this is the IPv4 header tiph should point to - regardless of the inner protocol payload. Signed-off-by: Timo Teräs Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d0ef0e674ec5..91d66dbde9c0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -798,10 +798,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (dev->header_ops && dev->type == ARPHRD_IPGRE) { gre_hlen = 0; - if (skb->protocol == htons(ETH_P_IP)) - tiph = (const struct iphdr *)skb->data; - else - tiph = &tunnel->parms.iph; + tiph = (const struct iphdr *)skb->data; } else { gre_hlen = tunnel->hlen; tiph = &tunnel->parms.iph; -- cgit v1.2.1 From 0d4f0608619de59fd8169dd8e72aadc28d80e715 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Mar 2013 07:01:28 +0000 Subject: tcp: dont handle MTU reduction on LISTEN socket When an ICMP ICMP_FRAG_NEEDED (or ICMPV6_PKT_TOOBIG) message finds a LISTEN socket, and this socket is currently owned by the user, we set TCP_MTU_REDUCED_DEFERRED flag in listener tsq_flags. This is bad because if we clone the parent before it had a chance to clear the flag, the child inherits the tsq_flags value, and next tcp_release_cb() on the child will decrement sk_refcnt. Result is that we might free a live TCP socket, as reported by Dormando. IPv4: Attempt to release TCP socket in state 1 Fix this issue by testing sk_state against TCP_LISTEN early, so that we set TCP_MTU_REDUCED_DEFERRED on appropriate sockets (not a LISTEN one) This bug was introduced in commit 563d34d05786 (tcp: dont drop MTU reduction indications) Reported-by: dormando Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4a8ec457310f..d09203c63264 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -274,13 +274,6 @@ static void tcp_v4_mtu_reduced(struct sock *sk) struct inet_sock *inet = inet_sk(sk); u32 mtu = tcp_sk(sk)->mtu_info; - /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs - * send out by Linux are always <576bytes so they should go through - * unfragmented). - */ - if (sk->sk_state == TCP_LISTEN) - return; - dst = inet_csk_update_pmtu(sk, mtu); if (!dst) return; @@ -408,6 +401,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + /* We are not interested in TCP_LISTEN and open_requests + * (SYN-ACKs send out by Linux are always <576bytes so + * they should go through unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + goto out; + tp->mtu_info = info; if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); -- cgit v1.2.1 From 5a3da1fe9561828d0ca7eca664b16ec2b9bf0055 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Fri, 15 Mar 2013 11:32:30 +0000 Subject: inet: limit length of fragment queue hash table bucket lists This patch introduces a constant limit of the fragment queue hash table bucket list lengths. Currently the limit 128 is choosen somewhat arbitrary and just ensures that we can fill up the fragment cache with empty packets up to the default ip_frag_high_thresh limits. It should just protect from list iteration eating considerable amounts of cpu. If we reach the maximum length in one hash bucket a warning is printed. This is implemented on the caller side of inet_frag_find to distinguish between the different users of inet_fragment.c. I dropped the out of memory warning in the ipv4 fragment lookup path, because we already get a warning by the slab allocator. Cc: Eric Dumazet Cc: Jesper Dangaard Brouer Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 20 +++++++++++++++++++- net/ipv4/ip_fragment.c | 11 ++++------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 245ae078a07f..f4fd23de9b13 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -21,6 +21,7 @@ #include #include +#include #include static void inet_frag_secret_rebuild(unsigned long dummy) @@ -277,6 +278,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, __releases(&f->lock) { struct inet_frag_queue *q; + int depth = 0; hlist_for_each_entry(q, &f->hash[hash], list) { if (q->net == nf && f->match(q, key)) { @@ -284,9 +286,25 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, read_unlock(&f->lock); return q; } + depth++; } read_unlock(&f->lock); - return inet_frag_create(nf, f, key); + if (depth <= INETFRAGS_MAXDEPTH) + return inet_frag_create(nf, f, key); + else + return ERR_PTR(-ENOBUFS); } EXPORT_SYMBOL(inet_frag_find); + +void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, + const char *prefix) +{ + static const char msg[] = "inet_frag_find: Fragment hash bucket" + " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) + ". Dropping fragment.\n"; + + if (PTR_ERR(q) == -ENOBUFS) + LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg); +} +EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b6d30acb600c..a6445b843ef4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -292,14 +292,11 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (q == NULL) - goto out_nomem; - + if (IS_ERR_OR_NULL(q)) { + inet_frag_maybe_warn_overflow(q, pr_fmt()); + return NULL; + } return container_of(q, struct ipq, q); - -out_nomem: - LIMIT_NETDEBUG(KERN_ERR pr_fmt("ip_frag_create: no memory left !\n")); - return NULL; } /* Is the fragment too far ahead to be part of ipq? */ -- cgit v1.2.1 From 3dd6664fac7e6041bfc8756ae9e8c78f59108cd9 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 19 Mar 2013 13:09:59 +0000 Subject: netfilter: remove unused "config IP_NF_QUEUE" Kconfig symbol IP_NF_QUEUE is unused since commit d16cf20e2f2f13411eece7f7fb72c17d141c4a84 ("netfilter: remove ip_queue support"). Let's remove it too. Signed-off-by: Paul Bolle Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/Kconfig | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ce2d43e1f09f..0d755c50994b 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,19 +36,6 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config IP_NF_QUEUE - tristate "IP Userspace queueing via NETLINK (OBSOLETE)" - depends on NETFILTER_ADVANCED - help - Netfilter has the ability to queue packets to user space: the - netlink device can be used to access them using this driver. - - This option enables the old IPv4-only "ip_queue" implementation - which has been obsoleted by the new "nfnetlink_queue" code (see - CONFIG_NETFILTER_NETLINK_QUEUE). - - To compile it as a module, choose M here. If unsure, say N. - config IP_NF_IPTABLES tristate "IP tables support (required for filtering/masq/NAT)" default m if NETFILTER_ADVANCED=n -- cgit v1.2.1 From 44046a593eb770dbecdabf1c82bcd252f2a8337b Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 19 Mar 2013 06:11:12 +0000 Subject: udp: add encap_destroy callback Users of udp encapsulation currently have an encap_rcv callback which they can use to hook into the udp receive path. In situations where a encapsulation user allocates resources associated with a udp encap socket, it may be convenient to be able to also hook the proto .destroy operation. For example, if an encap user holds a reference to the udp socket, the destroy hook might be used to relinquish this reference. This patch adds a socket destroy hook into udp, which is set and enabled in the same way as the existing encap_rcv hook. Signed-off-by: Tom Parkin Signed-off-by: James Chapman Signed-off-by: David S. Miller --- net/ipv4/udp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 265c42cf963c..0a073a263720 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1762,9 +1762,16 @@ int udp_rcv(struct sk_buff *skb) void udp_destroy_sock(struct sock *sk) { + struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); + if (static_key_false(&udp_encap_needed) && up->encap_type) { + void (*encap_destroy)(struct sock *sk); + encap_destroy = ACCESS_ONCE(up->encap_destroy); + if (encap_destroy) + encap_destroy(sk); + } } /* -- cgit v1.2.1 From 283951f95b067877ca5ea77afaa212bb1e0507b5 Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Tue, 19 Mar 2013 08:19:29 +0000 Subject: ipconfig: Fix newline handling in log message. When using ipconfig the logs currently look like: Single name server: [ 3.467270] IP-Config: Complete: [ 3.470613] device=eth0, hwaddr=ac:de:48:00:00:01, ipaddr=172.16.42.2, mask=255.255.255.0, gw=172.16.42.1 [ 3.480670] host=infigo-1, domain=, nis-domain=(none) [ 3.486166] bootserver=172.16.42.1, rootserver=172.16.42.1, rootpath= [ 3.492910] nameserver0=172.16.42.1[ 3.496853] ALSA device list: Three name servers: [ 3.496949] IP-Config: Complete: [ 3.500293] device=eth0, hwaddr=ac:de:48:00:00:01, ipaddr=172.16.42.2, mask=255.255.255.0, gw=172.16.42.1 [ 3.510367] host=infigo-1, domain=, nis-domain=(none) [ 3.515864] bootserver=172.16.42.1, rootserver=172.16.42.1, rootpath= [ 3.522635] nameserver0=172.16.42.1, nameserver1=172.16.42.100 [ 3.529149] , nameserver2=172.16.42.200 Fix newline handling for these cases Signed-off-by: Martin Fuzzey Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 98cbc6877019..bf6c5cf31aed 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1522,7 +1522,8 @@ static int __init ip_auto_config(void) } for (i++; i < CONF_NAMESERVERS_MAX; i++) if (ic_nameservers[i] != NONE) - pr_cont(", nameserver%u=%pI4\n", i, &ic_nameservers[i]); + pr_cont(", nameserver%u=%pI4", i, &ic_nameservers[i]); + pr_cont("\n"); #endif /* !SILENT */ return 0; -- cgit v1.2.1 From f4541d60a449afd40448b06496dcd510f505928e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Mar 2013 17:36:09 +0000 Subject: tcp: preserve ACK clocking in TSO A long standing problem with TSO is the fact that tcp_tso_should_defer() rearms the deferred timer, while it should not. Current code leads to following bad bursty behavior : 20:11:24.484333 IP A > B: . 297161:316921(19760) ack 1 win 119 20:11:24.484337 IP B > A: . ack 263721 win 1117 20:11:24.485086 IP B > A: . ack 265241 win 1117 20:11:24.485925 IP B > A: . ack 266761 win 1117 20:11:24.486759 IP B > A: . ack 268281 win 1117 20:11:24.487594 IP B > A: . ack 269801 win 1117 20:11:24.488430 IP B > A: . ack 271321 win 1117 20:11:24.489267 IP B > A: . ack 272841 win 1117 20:11:24.490104 IP B > A: . ack 274361 win 1117 20:11:24.490939 IP B > A: . ack 275881 win 1117 20:11:24.491775 IP B > A: . ack 277401 win 1117 20:11:24.491784 IP A > B: . 316921:332881(15960) ack 1 win 119 20:11:24.492620 IP B > A: . ack 278921 win 1117 20:11:24.493448 IP B > A: . ack 280441 win 1117 20:11:24.494286 IP B > A: . ack 281961 win 1117 20:11:24.495122 IP B > A: . ack 283481 win 1117 20:11:24.495958 IP B > A: . ack 285001 win 1117 20:11:24.496791 IP B > A: . ack 286521 win 1117 20:11:24.497628 IP B > A: . ack 288041 win 1117 20:11:24.498459 IP B > A: . ack 289561 win 1117 20:11:24.499296 IP B > A: . ack 291081 win 1117 20:11:24.500133 IP B > A: . ack 292601 win 1117 20:11:24.500970 IP B > A: . ack 294121 win 1117 20:11:24.501388 IP B > A: . ack 295641 win 1117 20:11:24.501398 IP A > B: . 332881:351881(19000) ack 1 win 119 While the expected behavior is more like : 20:19:49.259620 IP A > B: . 197601:202161(4560) ack 1 win 119 20:19:49.260446 IP B > A: . ack 154281 win 1212 20:19:49.261282 IP B > A: . ack 155801 win 1212 20:19:49.262125 IP B > A: . ack 157321 win 1212 20:19:49.262136 IP A > B: . 202161:206721(4560) ack 1 win 119 20:19:49.262958 IP B > A: . ack 158841 win 1212 20:19:49.263795 IP B > A: . ack 160361 win 1212 20:19:49.264628 IP B > A: . ack 161881 win 1212 20:19:49.264637 IP A > B: . 206721:211281(4560) ack 1 win 119 20:19:49.265465 IP B > A: . ack 163401 win 1212 20:19:49.265886 IP B > A: . ack 164921 win 1212 20:19:49.266722 IP B > A: . ack 166441 win 1212 20:19:49.266732 IP A > B: . 211281:215841(4560) ack 1 win 119 20:19:49.267559 IP B > A: . ack 167961 win 1212 20:19:49.268394 IP B > A: . ack 169481 win 1212 20:19:49.269232 IP B > A: . ack 171001 win 1212 20:19:49.269241 IP A > B: . 215841:221161(5320) ack 1 win 119 Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Van Jacobson Cc: Neal Cardwell Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 817fbb396bc8..5d0b4387cba6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1809,8 +1809,11 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) goto send_now; } - /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies << 1); + /* Ok, it looks like it is advisable to defer. + * Do not rearm the timer if already set to not break TCP ACK clocking. + */ + if (!tp->tso_deferred) + tp->tso_deferred = 1 | (jiffies << 1); return true; -- cgit v1.2.1 From 7ebe183c6d444ef5587d803b64a1f4734b18c564 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Sun, 24 Mar 2013 10:42:25 +0000 Subject: tcp: undo spurious timeout after SACK reneging On SACK reneging the sender immediately retransmits and forces a timeout but disables Eifel (undo). If the (buggy) receiver does not drop any packet this can trigger a false slow-start retransmit storm driven by the ACKs of the original packets. This can be detected with undo and TCP timestamps. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0d9bdacce99f..3bd55bad230a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2059,11 +2059,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (!how) { - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - tp->undo_marker = tp->snd_una; - } else { + tp->undo_marker = tp->snd_una; + if (how) { tp->sacked_out = 0; tp->fackets_out = 0; } -- cgit v1.2.1 From 330305cc4a6b0cb75c22fc01b8826f0ad755550f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 24 Mar 2013 17:36:29 +0000 Subject: ipv4: Fix ip-header identification for gso packets. ip-header id needs to be incremented even if IP_DF flag is set. This behaviour was changed in commit 490ab08127cebc25e3a26 (IP_GRE: Fix IP-Identification). Following patch fixes it so that identification is always incremented. Reported-by: Cong Wang Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68f6a94f7661..c929d9c1c4b6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1333,8 +1333,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->frag_off |= htons(IP_MF); offset += (skb->len - skb->mac_len - iph->ihl * 4); } else { - if (!(iph->frag_off & htons(IP_DF))) - iph->id = htons(id++); + iph->id = htons(id++); } iph->tot_len = htons(skb->len - skb->mac_len); iph->check = 0; -- cgit v1.2.1 From 34e2ed34a035de07277cca817fe8264324398141 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Apr 2013 08:33:00 +0000 Subject: net: ipv4: notify when address lifetime changes if userspace changes lifetime of address, send netlink notification and call notifier. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index f678507bc829..96083b7a436b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -802,8 +802,10 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) return -EEXIST; - - set_ifa_lifetime(ifa_existing, valid_lft, prefered_lft); + ifa = ifa_existing; + set_ifa_lifetime(ifa, valid_lft, prefered_lft); + rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); + blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } return 0; } -- cgit v1.2.1 From 05a324b9c50c3edbe0ce48ee3e37b210859ef1ae Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Apr 2013 23:39:38 +0000 Subject: net: ipv4: reset check_lifetime_work after changing lifetime This will result in calling check_lifetime in nearest opportunity and that function will adjust next time to call check_lifetime correctly. Without this, check_lifetime is called in time computed by previous run, not affecting modified lifetime. Introduced by: commit 5c766d642bcaf "ipv4: introduce address lifetime" Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 96083b7a436b..00386e02e708 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -804,6 +804,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg return -EEXIST; ifa = ifa_existing; set_ifa_lifetime(ifa, valid_lft, prefered_lft); + cancel_delayed_work(&check_lifetime_work); + schedule_delayed_work(&check_lifetime_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); } -- cgit v1.2.1 From c988d1e8cbf722e34ee6124b8b89d47fec655b51 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 4 Apr 2013 23:39:39 +0000 Subject: net: ipv4: fix schedule while atomic bug in check_lifetime() move might_sleep operations out of the rcu_read_lock() section. Also fix iterating over ifa_dev->ifa_list Introduced by: commit 5c766d642bcaf "ipv4: introduce address lifetime" Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 58 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 00386e02e708..c6287cd978c2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -587,13 +587,16 @@ static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; + struct hlist_node *n; int i; now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - rcu_read_lock(); for (i = 0; i < IN4_ADDR_HSIZE; i++) { + bool change_needed = false; + + rcu_read_lock(); hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { unsigned long age; @@ -606,16 +609,7 @@ static void check_lifetime(struct work_struct *work) if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { - struct in_ifaddr **ifap ; - - rtnl_lock(); - for (ifap = &ifa->ifa_dev->ifa_list; - *ifap != NULL; ifap = &ifa->ifa_next) { - if (*ifap == ifa) - inet_del_ifa(ifa->ifa_dev, - ifap, 1); - } - rtnl_unlock(); + change_needed = true; } else if (ifa->ifa_preferred_lft == INFINITY_LIFE_TIME) { continue; @@ -625,10 +619,8 @@ static void check_lifetime(struct work_struct *work) next = ifa->ifa_tstamp + ifa->ifa_valid_lft * HZ; - if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) { - ifa->ifa_flags |= IFA_F_DEPRECATED; - rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); - } + if (!(ifa->ifa_flags & IFA_F_DEPRECATED)) + change_needed = true; } else if (time_before(ifa->ifa_tstamp + ifa->ifa_preferred_lft * HZ, next)) { @@ -636,8 +628,42 @@ static void check_lifetime(struct work_struct *work) ifa->ifa_preferred_lft * HZ; } } + rcu_read_unlock(); + if (!change_needed) + continue; + rtnl_lock(); + hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { + unsigned long age; + + if (ifa->ifa_flags & IFA_F_PERMANENT) + continue; + + /* We try to batch several events at once. */ + age = (now - ifa->ifa_tstamp + + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; + + if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && + age >= ifa->ifa_valid_lft) { + struct in_ifaddr **ifap; + + for (ifap = &ifa->ifa_dev->ifa_list; + *ifap != NULL; ifap = &(*ifap)->ifa_next) { + if (*ifap == ifa) { + inet_del_ifa(ifa->ifa_dev, + ifap, 1); + break; + } + } + } else if (ifa->ifa_preferred_lft != + INFINITY_LIFE_TIME && + age >= ifa->ifa_preferred_lft && + !(ifa->ifa_flags & IFA_F_DEPRECATED)) { + ifa->ifa_flags |= IFA_F_DEPRECATED; + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); + } + } + rtnl_unlock(); } - rcu_read_unlock(); next_sec = round_jiffies_up(next); next_sched = next; -- cgit v1.2.1 From ca10b9e9a8ca7342ee07065289cbe74ac128c169 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Apr 2013 17:58:11 +0000 Subject: selinux: add a skb_owned_by() hook Commit 90ba9b1986b5ac (tcp: tcp_make_synack() can use alloc_skb()) broke certain SELinux/NetLabel configurations by no longer correctly assigning the sock to the outgoing SYNACK packet. Cost of atomic operations on the LISTEN socket is quite big, and we would like it to happen only if really needed. This patch introduces a new security_ops->skb_owned_by() method, that is a void operation unless selinux is active. Reported-by: Miroslav Vadkerti Diagnosed-by: Paul Moore Signed-off-by: Eric Dumazet Cc: "David S. Miller" Cc: linux-security-module@vger.kernel.org Acked-by: James Morris Tested-by: Paul Moore Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5d0b4387cba6..b44cf81d8178 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2709,6 +2709,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); skb_dst_set(skb, dst); + security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) -- cgit v1.2.1 From d66954a066158781ccf9c13c91d0316970fe57b6 Mon Sep 17 00:00:00 2001 From: Dmitry Popov Date: Thu, 11 Apr 2013 08:55:07 +0000 Subject: tcp: incoming connections might use wrong route under synflood There is a bug in cookie_v4_check (net/ipv4/syncookies.c): flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, ireq->loc_addr, th->source, th->dest); Here we do not respect sk->sk_bound_dev_if, therefore wrong dst_entry may be taken. This dst_entry is used by new socket (get_cookie_sock -> tcp_v4_syn_recv_sock), so its packets may take the wrong path. Signed-off-by: Dmitry Popov Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index ef54377fb11c..397e0f69435f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -349,8 +349,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, + flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark, + RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP, inet_sk_flowi_flags(sk), (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, ireq->loc_addr, th->source, th->dest); -- cgit v1.2.1 From 50bceae9bd3569d56744882f3012734d48a1d413 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 11 Apr 2013 10:57:18 +0000 Subject: tcp: Reallocate headroom if it would overflow csum_start If a TCP retransmission gets partially ACKed and collapsed multiple times it is possible for the headroom to grow beyond 64K which will overflow the 16bit skb->csum_start which is based on the start of the headroom. It has been observed rarely in the wild with IPoIB due to the 64K MTU. Verify if the acking and collapsing resulted in a headroom exceeding what csum_start can cover and reallocate the headroom if so. A big thank you to Jim Foraker and the team at LLNL for helping out with the investigation and testing. Reported-by: Jim Foraker Signed-off-by: Thomas Graf Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b44cf81d8178..509912a5ff98 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2388,8 +2388,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ TCP_SKB_CB(skb)->when = tcp_time_stamp; - /* make sure skb->data is aligned on arches that require it */ - if (unlikely(NET_IP_ALIGN && ((unsigned long)skb->data & 3))) { + /* make sure skb->data is aligned on arches that require it + * and check if ack-trimming & collapsing extended the headroom + * beyond what csum_start can cover. + */ + if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || + skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); return nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -- cgit v1.2.1 From 06848c10f720cbc20e3b784c0df24930b7304b93 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 13 Apr 2013 15:49:03 +0000 Subject: esp4: fix error return code in esp_output() Fix to return a negative error code from the error handling case instead of 0, as returned elsewhere in this function. Signed-off-by: Wei Yongjun Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 3b4f0cd2e63e..4cfe34d4cc96 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -139,8 +139,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) /* skb is pure payload to encrypt */ - err = -ENOMEM; - esp = x->data; aead = esp->aead; alen = crypto_aead_authsize(aead); @@ -176,8 +174,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); - if (!tmp) + if (!tmp) { + err = -ENOMEM; goto error; + } seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); -- cgit v1.2.1 From 97599dc792b45b1669c3cdb9a4b365aad0232f65 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Apr 2013 12:55:41 +0000 Subject: net: drop dst before queueing fragments Commit 4a94445c9a5c (net: Use ip_route_input_noref() in input path) added a bug in IP defragmentation handling, as non refcounted dst could escape an RCU protected section. Commit 64f3b9e203bd068 (net: ip_expire() must revalidate route) fixed the case of timeouts, but not the general problem. Tom Parkin noticed crashes in UDP stack and provided a patch, but further analysis permitted us to pinpoint the root cause. Before queueing a packet into a frag list, we must drop its dst, as this dst has limited lifetime (RCU protected) When/if a packet is finally reassembled, we use the dst of the very last skb, still protected by RCU and valid, as the dst of the reassembled packet. Use same logic in IPv6, as there is no need to hold dst references. Reported-by: Tom Parkin Tested-by: Tom Parkin Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a6445b843ef4..52c273ea05c3 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -248,8 +248,7 @@ static void ip_expire(unsigned long arg) if (!head->dev) goto out_rcu_unlock; - /* skb dst is stale, drop it, and perform route lookup again */ - skb_dst_drop(head); + /* skb has no dst, perform route lookup again */ iph = ip_hdr(head); err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); @@ -523,9 +522,16 @@ found: qp->q.max_size = skb->len + ihl; if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) - return ip_frag_reasm(qp, prev, dev); + qp->q.meat == qp->q.len) { + unsigned long orefdst = skb->_skb_refdst; + skb->_skb_refdst = 0UL; + err = ip_frag_reasm(qp, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } + + skb_dst_drop(skb); inet_frag_lru_move(&qp->q); return -EINPROGRESS; -- cgit v1.2.1 From f83a7ea2075ca896f2dbf07672bac9cf3682ff74 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Apr 2013 22:45:24 +0000 Subject: netfilter: xt_rpfilter: skip locally generated broadcast/multicast, too Alex Efros reported rpfilter module doesn't match following packets: IN=br.qemu SRC=192.168.2.1 DST=192.168.2.255 [ .. ] (netfilter bugzilla #814). Problem is that network stack arranges for the locally generated broadcasts to appear on the interface they were sent out, so the IFF_LOOPBACK check doesn't trigger. As -m rpfilter is restricted to PREROUTING, we can check for existing rtable instead, it catches locally-generated broad/multicast case, too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_rpfilter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index c30130062cd6..c49dcd0284a0 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -66,6 +66,12 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, return dev_match; } +static bool rpfilter_is_local(const struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + return rt && (rt->rt_flags & RTCF_LOCAL); +} + static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_rpfilter_info *info; @@ -76,7 +82,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) info = par->matchinfo; invert = info->flags & XT_RPFILTER_INVERT; - if (par->in->flags & IFF_LOOPBACK) + if (rpfilter_is_local(skb)) return true ^ invert; iph = ip_hdr(skb); -- cgit v1.2.1 From 12fb3dd9dc3c64ba7d64cec977cca9b5fb7b1d4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Apr 2013 07:19:48 +0000 Subject: tcp: call tcp_replace_ts_recent() from tcp_ack() commit bd090dfc634d (tcp: tcp_replace_ts_recent() should not be called from tcp_validate_incoming()) introduced a TS ecr bug in slow path processing. 1 A > B P. 1:10001(10000) ack 1 2 B < A . 1:1(0) ack 1 win 257 3 A > B . 1:1001(1000) ack 1 win 227 4 A > B . 1001:2001(1000) ack 1 win 227 (ecr 200 should be ecr 300 in packets 3 & 4) Problem is tcp_ack() can trigger send of new packets (retransmits), reflecting the prior TSval, instead of the TSval contained in the currently processed incoming packet. Fix this by calling tcp_replace_ts_recent() from tcp_ack() after the checks, but before the actions. Reported-by: Yuchung Cheng Signed-off-by: Eric Dumazet Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 64 +++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3bd55bad230a..13b9c08fc158 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -113,6 +113,7 @@ int sysctl_tcp_early_retrans __read_mostly = 2; #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ +#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -3564,6 +3565,27 @@ static void tcp_send_challenge_ack(struct sock *sk) } } +static void tcp_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = get_seconds(); +} + +static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if (tcp_paws_check(&tp->rx_opt, 0)) + tcp_store_ts_recent(tp); + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3607,6 +3629,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); + /* ts_recent update must be made after we are sure that the packet + * is in window. + */ + if (flag & FLAG_UPDATE_TS_RECENT) + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { /* Window is constant, pure forward advance. * No more checks are required. @@ -3927,27 +3955,6 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) EXPORT_SYMBOL(tcp_parse_md5sig_option); #endif -static inline void tcp_store_ts_recent(struct tcp_sock *tp) -{ - tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = get_seconds(); -} - -static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) -{ - if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { - /* PAWS bug workaround wrt. ACK frames, the PAWS discard - * extra check below makes sure this can only happen - * for pure ACK frames. -DaveM - * - * Not only, also it occurs for expired timestamps. - */ - - if (tcp_paws_check(&tp->rx_opt, 0)) - tcp_store_ts_recent(tp); - } -} - /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM * * It is not fatal. If this ACK does _not_ change critical state (seqs, window) @@ -5543,14 +5550,9 @@ slow_path: return 0; step5: - if (tcp_ack(sk, skb, FLAG_SLOWPATH) < 0) + if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. */ @@ -5986,7 +5988,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* step 5: check the ACK field */ if (true) { - int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT) > 0; switch (sk->sk_state) { case TCP_SYN_RECV: @@ -6137,11 +6140,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } } - /* ts_recent update must be made after we are sure that the packet - * is in window. - */ - tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - /* step 6: check the URG bit */ tcp_urg(sk, skb, th); -- cgit v1.2.1