Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 135
1 file changed, 96 insertions, 39 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1fa15beb8380..f98a1882e537 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * INET		An implementation of the TCP/IP protocol suite for the LINUX
  *		operating system.  INET is implemented using the  BSD Socket
@@ -205,11 +206,6 @@
  *	Hirokazu Takahashi	:	Use copy_from_user() instead of
  *					csum_and_copy_from_user() if possible.
  *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or(at your option) any later version.
- *
  * Description of States:
  *
  *	TCP_SYN_SENT		sent a connection request, waiting for ack
@@ -321,6 +317,11 @@ struct tcp_splice_state {
 unsigned long tcp_memory_pressure __read_mostly;
 EXPORT_SYMBOL_GPL(tcp_memory_pressure);
 
+DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
+EXPORT_SYMBOL(tcp_rx_skb_cache_key);
+
+DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
+
 void tcp_enter_memory_pressure(struct sock *sk)
 {
 	unsigned long val;
@@ -855,7 +856,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
 
 	if (likely(!size)) {
 		skb = sk->sk_tx_skb_cache;
-		if (skb && !skb_cloned(skb)) {
+		if (skb) {
 			skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 			sk->sk_tx_skb_cache = NULL;
 			pskb_trim(skb, 0);
@@ -934,6 +935,22 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
 	return mss_now;
 }
 
+/* In some cases, both sendpage() and sendmsg() could have added
+ * an skb to the write queue, but failed adding payload on it.
+ * We need to remove it to consume less memory, but more
+ * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
+ * users.
+ */
+static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb && !skb->len) {
+		tcp_unlink_write_queue(skb, sk);
+		if (tcp_write_queue_empty(sk))
+			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+		sk_wmem_free_skb(sk, skb);
+	}
+}
+
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			 size_t size, int flags)
 {
@@ -983,6 +1000,9 @@ new_segment:
 		if (!skb)
 			goto wait_for_memory;
 
+#ifdef CONFIG_TLS_DEVICE
+		skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
 		skb_entail(sk, skb);
 		copy = size_goal;
 	}
@@ -1060,6 +1080,7 @@ out:
 	return copied;
 
 do_error:
+	tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
 	if (copied)
 		goto out;
 out_err:
@@ -1161,7 +1182,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	struct sockcm_cookie sockc;
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
-	bool process_backlog = false;
+	int process_backlog = 0;
 	bool zc = false;
 	long timeo;
 
@@ -1253,9 +1274,10 @@ new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
 
-			if (process_backlog && sk_flush_backlog(sk)) {
-				process_backlog = false;
-				goto restart;
+			if (unlikely(process_backlog >= 16)) {
+				process_backlog = 0;
+				if (sk_flush_backlog(sk))
+					goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
 			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
@@ -1263,7 +1285,7 @@ new_segment:
 			if (!skb)
 				goto wait_for_memory;
 
-			process_backlog = true;
+			process_backlog++;
 			skb->ip_summed = CHECKSUM_PARTIAL;
 
 			skb_entail(sk, skb);
@@ -1384,18 +1406,11 @@ out_nopush:
 	sock_zerocopy_put(uarg);
 	return copied + copied_syn;
 
+do_error:
+	skb = tcp_write_queue_tail(sk);
 do_fault:
-	if (!skb->len) {
-		tcp_unlink_write_queue(skb, sk);
-		/* It is the one place in all of TCP, except connection
-		 * reset, where we can be unlinking the send_head.
-		 */
-		if (tcp_write_queue_empty(sk))
-			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-		sk_wmem_free_skb(sk, skb);
-	}
+	tcp_remove_empty_skb(sk, skb);
 
-do_error:
 	if (copied + copied_syn)
 		goto out;
 out_err:
@@ -1775,18 +1790,18 @@ static int tcp_zerocopy_receive(struct sock *sk,
 				break;
 			frags = skb_shinfo(skb)->frags;
 			while (offset) {
-				if (frags->size > offset)
+				if (skb_frag_size(frags) > offset)
 					goto out;
-				offset -= frags->size;
+				offset -= skb_frag_size(frags);
 				frags++;
 			}
 		}
-		if (frags->size != PAGE_SIZE || frags->page_offset) {
+		if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
 			int remaining = zc->recv_skip_hint;
 
-			while (remaining && (frags->size != PAGE_SIZE ||
-					     frags->page_offset)) {
-				remaining -= frags->size;
+			while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+					     skb_frag_off(frags))) {
+				remaining -= skb_frag_size(frags);
 				frags++;
 			}
 			zc->recv_skip_hint -= remaining;
@@ -2613,6 +2628,8 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tcp_saved_syn_free(tp);
 	tp->compressed_ack = 0;
 	tp->bytes_sent = 0;
+	tp->bytes_acked = 0;
+	tp->bytes_received = 0;
 	tp->bytes_retrans = 0;
 	tp->duplicate_sack[0].start_seq = 0;
 	tp->duplicate_sack[0].end_seq = 0;
@@ -2634,6 +2651,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rx_opt.saw_tstamp = 0;
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
+	tp->rcv_ooopack = 0;
 
 
 	/* Clean up fastopen related fields */
@@ -2740,6 +2758,21 @@ static int tcp_repair_options_est(struct sock *sk,
 	return 0;
 }
 
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_SYMBOL(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(void)
+{
+	if (!static_branch_unlikely(&tcp_tx_delay_enabled)) {
+		static int __tcp_tx_delay_enabled = 0;
+
+		if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+			static_branch_enable(&tcp_tx_delay_enabled);
+			pr_info("TCP_TX_DELAY enabled\n");
+		}
+	}
+}
+
 /*
  *	Socket option code for TCP.
  */
@@ -2767,7 +2800,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name, true, true);
+		err = tcp_set_congestion_control(sk, name, true, true,
+						 ns_capable(sock_net(sk)->user_ns,
+							    CAP_NET_ADMIN));
 		release_sock(sk);
 		return err;
 	}
@@ -2790,15 +2825,23 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		return err;
 	}
 	case TCP_FASTOPEN_KEY: {
-		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+		__u8 *backup_key = NULL;
 
-		if (optlen != sizeof(key))
+		/* Allow a backup key as well to facilitate key rotation
+		 * First key is the active one.
+		 */
+		if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+		    optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
 			return -EINVAL;
 
 		if (copy_from_user(key, optval, optlen))
 			return -EFAULT;
 
-		return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+		if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+			backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+		return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
 	}
 	default:
 		/* fallthru */
@@ -3082,6 +3125,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->recvmsg_inq = val;
 		break;
+	case TCP_TX_DELAY:
+		if (val)
+			tcp_enable_tx_delay();
+		tp->tcp_tx_delay = val;
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -3246,6 +3294,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_bytes_retrans = tp->bytes_retrans;
 	info->tcpi_dsack_dups = tp->dsack_dups;
 	info->tcpi_reord_seen = tp->reord_seen;
+	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+	info->tcpi_snd_wnd = tp->snd_wnd;
 	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3452,21 +3502,23 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		return 0;
 
 	case TCP_FASTOPEN_KEY: {
-		__u8 key[TCP_FASTOPEN_KEY_LENGTH];
+		__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
 		struct tcp_fastopen_context *ctx;
+		unsigned int key_len = 0;
 
 		if (get_user(len, optlen))
 			return -EFAULT;
 
 		rcu_read_lock();
 		ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
-		if (ctx)
-			memcpy(key, ctx->key, sizeof(key));
-		else
-			len = 0;
+		if (ctx) {
+			key_len = tcp_fastopen_context_len(ctx) *
+					TCP_FASTOPEN_KEY_LENGTH;
+			memcpy(&key[0], &ctx->key[0], key_len);
+		}
 		rcu_read_unlock();
 
-		len = min_t(unsigned int, len, sizeof(key));
+		len = min_t(unsigned int, len, key_len);
 		if (put_user(len, optlen))
 			return -EFAULT;
 		if (copy_to_user(optval, key, len))
@@ -3539,6 +3591,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = tp->fastopen_no_cookie;
 		break;
 
+	case TCP_TX_DELAY:
+		val = tp->tcp_tx_delay;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp_raw() + tp->tsoffset;
 		break;
@@ -3742,8 +3798,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 		return 1;
 
 	for (i = 0; i < shi->nr_frags; ++i) {
-		const struct skb_frag_struct *f = &shi->frags[i];
-		unsigned int offset = f->page_offset;
+		const skb_frag_t *f = &shi->frags[i];
+		unsigned int offset = skb_frag_off(f);
 		struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
 
 		sg_set_page(&sg, page, skb_frag_size(f),
@@ -3872,6 +3928,7 @@ void __init tcp_init(void)
 	unsigned long limit;
 	unsigned int i;
 
+	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
 		     FIELD_SIZEOF(struct sk_buff, cb));
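The diff above adds a TCP_TX_DELAY socket option (set via do_tcp_setsockopt(), read back via do_tcp_getsockopt()). A minimal userspace sketch of using it is shown below; the option number fallback (37) and the unit (microseconds of added transmit delay) are assumptions based on the upstream uapi headers, not something stated in this diff.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_TX_DELAY
#define TCP_TX_DELAY 37	/* assumed option number if the libc header lacks it */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int delay = 1000;	/* assumed unit: microseconds of extra tx delay */

	if (fd < 0)
		return 1;
	/* Arms tcp_enable_tx_delay() in the kernel the first time any socket sets it. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_TX_DELAY, &delay, sizeof(delay)) < 0)
		perror("setsockopt(TCP_TX_DELAY)");
	return 0;
}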
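The TCP_FASTOPEN_KEY setsockopt path now accepts either a single key (TCP_FASTOPEN_KEY_LENGTH bytes) or a double-length buffer (TCP_FASTOPEN_KEY_BUF_LENGTH) carrying an active key followed by a backup key, to support key rotation. A hedged userspace sketch follows; the 16-byte key length and the option number fallback (33) are assumptions taken from the kernel headers rather than from this diff.

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_FASTOPEN_KEY
#define TCP_FASTOPEN_KEY 33	/* assumed option number if the libc header lacks it */
#endif
#define TFO_KEY_LEN 16		/* assumed to match TCP_FASTOPEN_KEY_LENGTH */

/* Install new_key as the active key while keeping old_key as backup,
 * so cookies minted under old_key still validate during rotation.
 */
static int tfo_rotate_key(int listen_fd, const unsigned char *new_key,
			  const unsigned char *old_key)
{
	unsigned char buf[2 * TFO_KEY_LEN];

	memcpy(buf, new_key, TFO_KEY_LEN);			/* first key: active */
	memcpy(buf + TFO_KEY_LEN, old_key, TFO_KEY_LEN);	/* second key: backup */
	return setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN_KEY,
			  buf, sizeof(buf));
}

Per the comment added in the diff ("First key is the active one"), ordering matters: only the first 16 bytes are used to mint new cookies, while the second 16 bytes only validate cookies issued under the previous key.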