summaryrefslogtreecommitdiffstats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c135
1 files changed, 96 insertions, 39 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1fa15beb8380..f98a1882e537 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -205,11 +206,6 @@
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
- *
* Description of States:
*
* TCP_SYN_SENT sent a connection request, waiting for ack
@@ -321,6 +317,11 @@ struct tcp_splice_state {
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
+
void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
@@ -855,7 +856,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
if (likely(!size)) {
skb = sk->sk_tx_skb_cache;
- if (skb && !skb_cloned(skb)) {
+ if (skb) {
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
sk->sk_tx_skb_cache = NULL;
pskb_trim(skb, 0);
@@ -934,6 +935,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
+/* In some cases, both sendpage() and sendmsg() could have added
+ * an skb to the write queue, but failed adding payload on it.
+ * We need to remove it to consume less memory, but more
+ * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
+ * users.
+ */
+static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb && !skb->len) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ sk_wmem_free_skb(sk, skb);
+ }
+}
+
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags)
{
@@ -983,6 +1000,9 @@ new_segment:
if (!skb)
goto wait_for_memory;
+#ifdef CONFIG_TLS_DEVICE
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
skb_entail(sk, skb);
copy = size_goal;
}
@@ -1060,6 +1080,7 @@ out:
return copied;
do_error:
+ tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
if (copied)
goto out;
out_err:
@@ -1161,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
+ int process_backlog = 0;
bool zc = false;
long timeo;
@@ -1253,9 +1274,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
@@ -1263,7 +1285,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
- process_backlog = true;
+ process_backlog++;
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -1384,18 +1406,11 @@ out_nopush:
sock_zerocopy_put(uarg);
return copied + copied_syn;
+do_error:
+ skb = tcp_write_queue_tail(sk);
do_fault:
- if (!skb->len) {
- tcp_unlink_write_queue(skb, sk);
- /* It is the one place in all of TCP, except connection
- * reset, where we can be unlinking the send_head.
- */
- if (tcp_write_queue_empty(sk))
- tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- sk_wmem_free_skb(sk, skb);
- }
+ tcp_remove_empty_skb(sk, skb);
-do_error:
if (copied + copied_syn)
goto out;
out_err:
@@ -1775,18 +1790,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
break;
frags = skb_shinfo(skb)->frags;
while (offset) {
- if (frags->size > offset)
+ if (skb_frag_size(frags) > offset)
goto out;
- offset -= frags->size;
+ offset -= skb_frag_size(frags);
frags++;
}
}
- if (frags->size != PAGE_SIZE || frags->page_offset) {
+ if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
int remaining = zc->recv_skip_hint;
- while (remaining && (frags->size != PAGE_SIZE ||
- frags->page_offset)) {
- remaining -= frags->size;
+ while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+ skb_frag_off(frags))) {
+ remaining -= skb_frag_size(frags);
frags++;
}
zc->recv_skip_hint -= remaining;
@@ -2613,6 +2628,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
tp->bytes_sent = 0;
+ tp->bytes_acked = 0;
+ tp->bytes_received = 0;
tp->bytes_retrans = 0;
tp->duplicate_sack[0].start_seq = 0;
tp->duplicate_sack[0].end_seq = 0;
@@ -2634,6 +2651,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->rx_opt.saw_tstamp = 0;
tp->rx_opt.dsack = 0;
tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
/* Clean up fastopen related fields */
@@ -2740,6 +2758,21 @@ static int tcp_repair_options_est(struct sock *sk,
return 0;
}
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+ if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+}
+
/*
* Socket option code for TCP.
*/
@@ -2767,7 +2800,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true, true);
+ err = tcp_set_congestion_control(sk, name, true, true,
+ ns_capable(sock_net(sk)->user_ns,
+ CAP_NET_ADMIN));
release_sock(sk);
return err;
}
@@ -2790,15 +2825,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
return err;
}
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
- if (optlen != sizeof(key))
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
return -EINVAL;
if (copy_from_user(key, optval, optlen))
return -EFAULT;
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
}
default:
/* fallthru */
@@ -3082,6 +3125,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+ case TCP_TX_DELAY:
+ if (val)
+ tcp_enable_tx_delay();
+ tp->tcp_tx_delay = val;
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -3246,6 +3294,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3452,21 +3502,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
return 0;
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
struct tcp_fastopen_context *ctx;
+ unsigned int key_len = 0;
if (get_user(len, optlen))
return -EFAULT;
rcu_read_lock();
ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
- if (ctx)
- memcpy(key, ctx->key, sizeof(key));
- else
- len = 0;
+ if (ctx) {
+ key_len = tcp_fastopen_context_len(ctx) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ memcpy(&key[0], &ctx->key[0], key_len);
+ }
rcu_read_unlock();
- len = min_t(unsigned int, len, sizeof(key));
+ len = min_t(unsigned int, len, key_len);
if (put_user(len, optlen))
return -EFAULT;
if (copy_to_user(optval, key, len))
@@ -3539,6 +3591,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_no_cookie;
break;
+ case TCP_TX_DELAY:
+ val = tp->tcp_tx_delay;
+ break;
+
case TCP_TIMESTAMP:
val = tcp_time_stamp_raw() + tp->tsoffset;
break;
@@ -3742,8 +3798,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
sg_set_page(&sg, page, skb_frag_size(f),
@@ -3872,6 +3928,7 @@ void __init tcp_init(void)
unsigned long limit;
unsigned int i;
+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
FIELD_SIZEOF(struct sk_buff, cb));
OpenPOWER on IntegriCloud