diff options
author | David S. Miller <davem@davemloft.net> | 2014-09-28 16:35:49 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-09-28 16:35:49 -0400 |
commit | dc83d4d8f6c897022c974a00769b7a6efee6aed8 (patch) | |
tree | 0d769e6155899c89ae95c3e31c79ce011eb96a39 | |
parent | ff04a771ad25fc9ba91690e73465b4d34b6bf8b3 (diff) | |
parent | 971f10eca186cab238c49daa91f703c5a001b0b1 (diff) | |
download | blackbird-op-linux-dc83d4d8f6c897022c974a00769b7a6efee6aed8.tar.gz blackbird-op-linux-dc83d4d8f6c897022c974a00769b7a6efee6aed8.zip |
Merge branch 'tcp_skb_cb'
Eric Dumazet says:
====================
tcp: better TCP_SKB_CB layout
TCP had the assumption that IPCB and IP6CB are first members of skb->cb[]
This is fine, except that IPCB/IP6CB are used in TCP for a very short time
in input path.
What really matters for TCP stack is to get skb->next,
TCP_SKB_CB(skb)->seq, and TCP_SKB_CB(skb)->end_seq in the same cache line.
skb that are immediately consumed do not care because whole skb->cb[] is
hot in cpu cache, while skb that sit in wocket write queue or receive queues
do not need TCP_SKB_CB(skb)->header at all.
This patch set implements the prereq for IPv4, IPv6, and TCP to make this
possible. This makes TCP more efficient.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | include/net/ip.h | 15 | ||||
-rw-r--r-- | include/net/ipv6.h | 3 | ||||
-rw-r--r-- | include/net/tcp.h | 12 | ||||
-rw-r--r-- | net/dccp/ipv6.c | 2 | ||||
-rw-r--r-- | net/ipv4/ip_options.c | 6 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 8 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 29 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 5 | ||||
-rw-r--r-- | net/ipv6/af_inet6.c | 4 | ||||
-rw-r--r-- | net/ipv6/syncookies.c | 2 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 12 |
11 files changed, 64 insertions, 34 deletions
diff --git a/include/net/ip.h b/include/net/ip.h index fcd9068fb8c3..0bb620702929 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -180,8 +180,10 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len); #define IP_INC_STATS(net, field) SNMP_INC_STATS64((net)->mib.ip_statistics, field) @@ -511,7 +513,14 @@ int ip_forward(struct sk_buff *skb); void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb); + +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt); +static inline int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +{ + return __ip_options_echo(dopt, skb, &IPCB(skb)->opt); +} + void ip_options_fragment(struct sk_buff *skb); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7e247e9b8765..97f472012438 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -288,7 +288,8 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb); +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { diff --git a/include/net/tcp.h b/include/net/tcp.h index a4201ef216e8..4dc6641ee990 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -696,12 +696,6 @@ static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; /* For incoming frames */ __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ __u32 tcp_tw_isn; /* isn chosen by tcp_timewait_state_process() */ @@ -720,6 +714,12 @@ struct tcp_skb_cb { __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ /* 1 byte hole */ __u32 ack_seq; /* Sequence number ACK'd */ + union { + struct inet_skb_parm h4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 04cb17d4b0ce..ad2acfe1ca61 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -404,7 +404,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ad382499bace..5b3d91be2db0 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -87,17 +87,15 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, * NOTE: dopt cannot point to skb. */ -int ip_options_echo(struct ip_options *dopt, struct sk_buff *skb) +int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb, + const struct ip_options *sopt) { - const struct ip_options *sopt; unsigned char *sptr, *dptr; int soffset, doffset; int optlen; memset(dopt, 0, sizeof(struct ip_options)); - sopt = &(IPCB(skb)->opt); - if (sopt->optlen == 0) return 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 215af2b155cb..c8fa62476461 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1522,8 +1522,10 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .uc_ttl = -1, }; -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, - __be32 saddr, const struct ip_reply_arg *arg, +void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, + const struct ip_options *sopt, + __be32 daddr, __be32 saddr, + const struct ip_reply_arg *arg, unsigned int len) { struct ip_options_data replyopts; @@ -1534,7 +1536,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr, struct sock *sk; struct inet_sock *inet; - if (ip_options_echo(&replyopts.opt.opt, skb)) + if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; ipc.addr = daddr; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3b2e49cb2b61..9ce3eac02957 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -681,8 +681,9 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); @@ -764,8 +765,9 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); + ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, + &arg, arg.iov[0].iov_len); TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } @@ -884,18 +886,16 @@ EXPORT_SYMBOL(tcp_syn_flood_action); */ static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb) { - const struct ip_options *opt = &(IPCB(skb)->opt); + const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt && opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); - if (dopt) { - if (ip_options_echo(&dopt->opt, skb)) { - kfree(dopt); - dopt = NULL; - } + if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) { + kfree(dopt); + dopt = NULL; } } return dopt; @@ -1429,7 +1429,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_SYN_COOKIES if (!th->syn) - sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); + sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt); #endif return sk; } @@ -1634,6 +1634,13 @@ int tcp_v4_rcv(struct sk_buff *skb) th = tcp_hdr(skb); iph = ip_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb), + sizeof(struct inet_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f173b1c4f815..a462fb1db896 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -974,6 +974,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Our usage of tstamp should remain private */ skb->tstamp.tv64 = 0; + + /* Cleanup our debris for IP stacks */ + memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (likely(err <= 0)) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e4865a3ebe1d..34f726f59814 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -672,10 +672,10 @@ int inet6_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); -bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb) +bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, + const struct inet6_skb_parm *opt) { const struct ipv6_pinfo *np = inet6_sk(sk); - const struct inet6_skb_parm *opt = IP6CB(skb); if (np->rxopt.all) { if ((opt->hop && (np->rxopt.bits.hopopts || diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c643dc907ce7..9a2838e93cc5 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -203,7 +203,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->ir_num = ntohs(th->dest); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - if (ipv6_opt_accepted(sk, skb) || + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { atomic_inc(&skb->users); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index de51a88bec6f..132bac137aed 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -742,7 +742,8 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk, ireq->ir_iif = inet6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && - (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo || + (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || np->repflow)) { atomic_inc(&skb->users); @@ -1367,7 +1368,7 @@ ipv6_pktoptions: np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb)) { + if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { skb_set_owner_r(opt_skb, sk); opt_skb = xchg(&np->pktoptions, opt_skb); } else { @@ -1411,6 +1412,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) th = tcp_hdr(skb); hdr = ipv6_hdr(skb); + /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() + * barrier() makes sure compiler wont play fool^Waliasing games. + */ + memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), + sizeof(struct inet6_skb_parm)); + barrier(); + TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); |