summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_output.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-04-15 09:00:47 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-15 09:00:47 -0700
commit6c373ca89399c5a3f7ef210ad8f63dc3437da345 (patch)
tree74d1ec65087df1da1021b43ac51acc1ee8601809 /net/ipv4/tcp_output.c
parentbb0fd7ab0986105765d11baa82e619c618a235aa (diff)
parent9f9151412dd7aae0e3f51a89ae4a1f8755fdb4d0 (diff)
downloadtalos-obmc-linux-6c373ca89399c5a3f7ef210ad8f63dc3437da345.tar.gz
talos-obmc-linux-6c373ca89399c5a3f7ef210ad8f63dc3437da345.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) Add BQL support to via-rhine, from Tino Reichardt. 2) Integrate SWITCHDEV layer support into the DSA layer, so DSA drivers can support hw switch offloading. From Floria Fainelli. 3) Allow 'ip address' commands to initiate multicast group join/leave, from Madhu Challa. 4) Many ipv4 FIB lookup optimizations from Alexander Duyck. 5) Support EBPF in cls_bpf classifier and act_bpf action, from Daniel Borkmann. 6) Remove the ugly compat support in ARP for ugly layers like ax25, rose, etc. And use this to clean up the neigh layer, then use it to implement MPLS support. All from Eric Biederman. 7) Support L3 forwarding offloading in switches, from Scott Feldman. 8) Collapse the LOCAL and MAIN ipv4 FIB tables when possible, to speed up route lookups even further. From Alexander Duyck. 9) Many improvements and bug fixes to the rhashtable implementation, from Herbert Xu and Thomas Graf. In particular, in the case where an rhashtable user bulk adds a large number of items into an empty table, we expand the table much more sanely. 10) Don't make the tcp_metrics hash table per-namespace, from Eric Biederman. 11) Extend EBPF to access SKB fields, from Alexei Starovoitov. 12) Split out new connection request sockets so that they can be established in the main hash table. Much less false sharing since hash lookups go direct to the request sockets instead of having to go first to the listener then to the request socks hashed underneath. From Eric Dumazet. 13) Add async I/O support for crytpo AF_ALG sockets, from Tadeusz Struk. 14) Support stable privacy address generation for RFC7217 in IPV6. From Hannes Frederic Sowa. 15) Hash network namespace into IP frag IDs, also from Hannes Frederic Sowa. 16) Convert PTP get/set methods to use 64-bit time, from Richard Cochran. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1816 commits) fm10k: Bump driver version to 0.15.2 fm10k: corrected VF multicast update fm10k: mbx_update_max_size does not drop all oversized messages fm10k: reset head instead of calling update_max_size fm10k: renamed mbx_tx_dropped to mbx_tx_oversized fm10k: update xcast mode before synchronizing multicast addresses fm10k: start service timer on probe fm10k: fix function header comment fm10k: comment next_vf_mbx flow fm10k: don't handle mailbox events in iov_event path and always process mailbox fm10k: use separate workqueue for fm10k driver fm10k: Set PF queues to unlimited bandwidth during virtualization fm10k: expose tx_timeout_count as an ethtool stat fm10k: only increment tx_timeout_count in Tx hang path fm10k: remove extraneous "Reset interface" message fm10k: separate PF only stats so that VF does not display them fm10k: use hw->mac.max_queues for stats fm10k: only show actual queues, not the maximum in hardware fm10k: allow creation of VLAN on default vid fm10k: fix unused warnings ...
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--net/ipv4/tcp_output.c177
1 files changed, 122 insertions, 55 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1db253e36045..8c8d7e06b72f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -518,17 +518,26 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
+
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
- *ptr++ = htonl((TCPOPT_EXP << 24) |
- ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
- TCPOPT_FASTOPEN_MAGIC);
-
- memcpy(ptr, foc->val, foc->len);
- if ((foc->len & 3) == 2) {
- u8 *align = ((u8 *)ptr) + foc->len;
- align[0] = align[1] = TCPOPT_NOP;
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
}
- ptr += (foc->len + 3) >> 2;
+ ptr += (len + 3) >> 2;
}
}
@@ -565,7 +574,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+ if (likely(sysctl_tcp_timestamps && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
@@ -583,13 +592,17 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
if (fastopen && fastopen->cookie.len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
opts->fastopen_cookie = &fastopen->cookie;
remaining -= need;
tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
}
}
@@ -601,15 +614,14 @@ static unsigned int tcp_synack_options(struct sock *sk,
struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5,
+ const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
- if (*md5) {
+ if (md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
@@ -620,8 +632,6 @@ static unsigned int tcp_synack_options(struct sock *sk,
*/
ireq->tstamp_ok &= !ireq->sack_ok;
}
-#else
- *md5 = NULL;
#endif
/* We always send an MSS option. */
@@ -645,7 +655,10 @@ static unsigned int tcp_synack_options(struct sock *sk,
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (foc != NULL && foc->len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
@@ -989,7 +1002,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
- md5, sk, NULL, skb);
+ md5, sk, skb);
}
#endif
@@ -1151,7 +1164,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, gfp);
- if (buff == NULL)
+ if (!buff)
return -ENOMEM; /* We'll just try again later. */
sk->sk_wmem_queued += buff->truesize;
@@ -1354,6 +1367,8 @@ void tcp_mtup_init(struct sock *sk)
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
@@ -1708,7 +1723,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
return tcp_fragment(sk, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp);
- if (unlikely(buff == NULL))
+ if (unlikely(!buff))
return -ENOMEM;
sk->sk_wmem_queued += buff->truesize;
@@ -1752,20 +1767,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 send_win, cong_win, limit, in_flight;
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
int win_divisor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
goto send_now;
- /* Defer for less than two clock ticks. */
- if (tp->tso_deferred &&
- (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
@@ -1807,11 +1825,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
goto send_now;
}
- /* Ok, it looks like it is advisable to defer.
- * Do not rearm the timer if already set to not break TCP ACK clocking.
- */
- if (!tp->tso_deferred)
- tp->tso_deferred = 1 | (jiffies << 1);
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
if (cong_win < send_win && cong_win < skb->len)
*is_cwnd_limited = true;
@@ -1819,10 +1840,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
return true;
send_now:
- tp->tso_deferred = 0;
return false;
}
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -1837,11 +1882,13 @@ static int tcp_mtu_probe(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
@@ -1854,12 +1901,25 @@ static int tcp_mtu_probe(struct sock *sk)
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
@@ -1881,7 +1941,8 @@ static int tcp_mtu_probe(struct sock *sk)
}
/* We're allowed to probe. Build it now. */
- if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ if (!nskb)
return -1;
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
@@ -2179,7 +2240,7 @@ void tcp_send_loss_probe(struct sock *sk)
int mss = tcp_current_mss(sk);
int err = -1;
- if (tcp_send_head(sk) != NULL) {
+ if (tcp_send_head(sk)) {
err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
goto rearm_timer;
}
@@ -2689,7 +2750,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == tcp_send_head(sk))
break;
/* we could do better than to assign each time */
- if (hole == NULL)
+ if (!hole)
tp->retransmit_skb_hint = skb;
/* Assume this retransmit will generate
@@ -2713,7 +2774,7 @@ begin_fwd:
if (!tcp_can_forward_retransmit(sk))
break;
/* Backtrack if necessary to non-L'ed skb */
- if (hole != NULL) {
+ if (hole) {
skb = hole;
hole = NULL;
}
@@ -2721,7 +2782,7 @@ begin_fwd:
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
@@ -2766,7 +2827,7 @@ void tcp_send_fin(struct sock *sk)
*/
mss_now = tcp_current_mss(sk);
- if (tcp_send_head(sk) != NULL) {
+ if (tcp_send_head(sk)) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
@@ -2824,14 +2885,14 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
- if (nskb == NULL)
+ if (!nskb)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
__skb_header_release(nskb);
@@ -2866,7 +2927,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
struct sk_buff *skb;
- struct tcp_md5sig_key *md5;
+ struct tcp_md5sig_key *md5 = NULL;
int tcp_header_size;
int mss;
@@ -2879,7 +2940,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
skb_reserve(skb, MAX_TCP_HEADER);
skb_dst_set(skb, dst);
- security_skb_owned_by(skb, sk);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
@@ -2892,7 +2952,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
else
#endif
skb_mstamp_get(&skb->skb_mstamp);
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+
+#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
+ md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
@@ -2923,12 +2988,14 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
- if (md5) {
+ if (md5)
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
- md5, NULL, req, skb);
- }
+ md5, req_to_sk(req), skb);
+ rcu_read_unlock();
#endif
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
@@ -2966,7 +3033,7 @@ static void tcp_connect_init(struct sock *sk)
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+ if (tp->af_specific->md5_lookup(sk, sk))
tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
@@ -3252,7 +3319,7 @@ void tcp_send_ack(struct sock *sk)
* sock.
*/
buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (buff == NULL) {
+ if (!buff) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3296,7 +3363,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
/* We don't queue it, tcp_transmit_skb() sets ownership. */
skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (skb == NULL)
+ if (!skb)
return -1;
/* Reserve space for headers and set control bits. */
@@ -3327,8 +3394,8 @@ int tcp_write_wakeup(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return -1;
- if ((skb = tcp_send_head(sk)) != NULL &&
- before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
OpenPOWER on IntegriCloud