summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c185
1 files changed, 153 insertions, 32 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c21e8a22fb3b..316ebdf8151d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -79,6 +79,7 @@
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -266,7 +267,7 @@ static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}
static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
@@ -359,7 +360,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
+ WRITE_ONCE(sk->sk_sndbuf,
+ min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -483,8 +485,9 @@ static void tcp_clamp_window(struct sock *sk)
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- net->ipv4.sysctl_tcp_rmem[2]);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min(atomic_read(&sk->sk_rmem_alloc),
+ net->ipv4.sysctl_tcp_rmem[2]));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -648,7 +651,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
rcvbuf = min_t(u64, rcvwin * rcvmem,
sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = rcvbuf;
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
/* Make the window clamp follow along. */
tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
@@ -913,9 +916,10 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if (!tp->retransmit_skb_hint ||
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+ (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
@@ -1420,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
- if (!tcp_skb_can_collapse_to(prev))
+ if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -1725,8 +1729,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
/* Ignore very old stuff early */
- if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+ if (i == 0)
+ first_sack_index = -1;
continue;
+ }
used_sacks++;
}
@@ -2666,7 +2673,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
- if ((flag & FLAG_SND_UNA_ADVANCED || tp->fastopen_rsk) &&
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
tcp_try_undo_loss(sk, false))
return;
@@ -2990,7 +2997,7 @@ void tcp_rearm_rto(struct sock *sk)
/* If the retrans timer is currently being used by Fast Open
* for SYN-ACK retrans purpose, stay put.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return;
if (!tp->packets_out) {
@@ -3158,6 +3165,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retransmit_skb_hint = NULL;
if (unlikely(skb == tp->lost_skb_hint))
tp->lost_skb_hint = NULL;
+ tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3362,7 +3370,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
- tp->rcv_nxt = seq;
+ WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
@@ -3548,7 +3556,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
- if (unlikely(rexmit == 2)) {
+ if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
@@ -3782,6 +3790,49 @@ static void smc_parse_options(const struct tcphdr *th,
#endif
}
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ */
+static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
+}
+
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
* But, this can also be called on packets in the established flow when
* the fast version below fails.
@@ -3875,6 +3926,10 @@ void tcp_parse_options(const struct net *net,
*/
break;
#endif
+ case TCPOPT_MPTCP:
+ mptcp_parse_option(skb, ptr, opsize, opt_rx);
+ break;
+
case TCPOPT_FASTOPEN:
tcp_parse_fastopen_option(
opsize - TCPOLEN_FASTOPEN_BASE,
@@ -4216,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
* The receiver remembers and reflects via DSACKs. Leverage the
* DSACK state and change the txhash to re-route speculatively.
*/
- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq)
+ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) {
sk_rethink_txhash(sk);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+ }
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -4375,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+ if (!mptcp_skb_can_collapse(to, from))
+ return false;
+
#ifdef CONFIG_TLS_DEVICE
if (from->decrypted != to->decrypted)
return false;
@@ -4512,6 +4572,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4712,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
bool fragstolen;
int eaten;
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
+
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
@@ -4883,7 +4947,7 @@ restart:
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and mptcp allow collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4892,7 +4956,7 @@ restart:
break;
}
- if (n && n != tail &&
+ if (n && n != tail && mptcp_skb_can_collapse(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
@@ -4925,6 +4989,7 @@ restart:
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4944,6 +5009,7 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
+ !mptcp_skb_can_collapse(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
#ifdef CONFIG_TLS_DEVICE
@@ -5312,7 +5378,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -5768,6 +5834,10 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
skb_rbtree_walk_from(data) {
if (__tcp_retransmit_skb(sk, data, 1))
break;
@@ -5838,8 +5908,14 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ inet_csk_reset_xmit_timer(sk,
+ ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, TCP_RTO_MAX);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
@@ -5888,7 +5964,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5914,10 +5990,13 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
+ if (sk_is_mptcp(sk))
+ mptcp_rcv_synsent(sk);
+
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
smc_check_reset_syn(tp);
@@ -5991,8 +6070,8 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -6043,6 +6122,8 @@ reset_and_undo:
static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
{
+ struct request_sock *req;
+
tcp_try_undo_loss(sk, false);
/* Reset rtx states to prevent spurious retransmits_timed_out() */
@@ -6052,7 +6133,9 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
/* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
* we no longer need req so release it.
*/
- reqsk_fastopen_remove(sk, tcp_sk(sk)->fastopen_rsk, false);
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ reqsk_fastopen_remove(sk, req, false);
/* Re-arm the timer because data may have been sent out.
* This is similar to the regular data transmission case
@@ -6127,7 +6210,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
- req = tp->fastopen_rsk;
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
@@ -6167,7 +6251,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_try_undo_spurious_syn(sk);
tp->retrans_stamp = 0;
tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -6274,8 +6358,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb, &tp->rx_opt);
break;
+ }
/* fall through */
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
@@ -6422,9 +6509,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
+static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
@@ -6444,7 +6529,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ proto, sk->sk_num, msg);
return want_cookie;
}
@@ -6466,6 +6551,36 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
}
}
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
@@ -6487,7 +6602,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
*/
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+ want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
if (!want_cookie)
goto drop;
}
@@ -6503,6 +6618,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
@@ -6525,6 +6643,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
af_ops->init_req(req, sk, skb);
+ if (IS_ENABLED(CONFIG_MPTCP) && want_cookie)
+ tcp_rsk(req)->is_mptcp = 0;
+
if (security_inet_conn_request(sk, skb, req))
goto drop_and_free;
OpenPOWER on IntegriCloud