diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/ip_forward.c | 3 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 29 | ||||
-rw-r--r-- | net/ipv4/ip_sockglue.c | 8 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_redir_ipv4.c | 8 | ||||
-rw-r--r-- | net/ipv4/ping.c | 5 | ||||
-rw-r--r-- | net/ipv4/route.c | 12 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 32 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 39 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 37 | ||||
-rw-r--r-- | net/ipv4/tcp_scalable.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_veno.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_yeah.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp_diag.c | 4 |
14 files changed, 100 insertions, 86 deletions
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3a83ce5efa80..787b3c294ce6 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -129,7 +129,8 @@ int ip_forward(struct sk_buff *skb) * We now generate an ICMP HOST REDIRECT giving the route * we calculated. */ - if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb)) + if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr && + !skb_sec_path(skb)) ip_rt_send_redirect(skb); skb->priority = rt_tos2priority(iph->tos); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..c373c0708d97 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1506,23 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. - * - * Use a fake percpu inet socket to avoid false sharing and contention. */ -static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { - .sk = { - .__sk_common = { - .skc_refcnt = ATOMIC_INIT(1), - }, - .sk_wmem_alloc = ATOMIC_INIT(1), - .sk_allocation = GFP_ATOMIC, - .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), - }, - .pmtudisc = IP_PMTUDISC_WANT, - .uc_ttl = -1, -}; - -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1532,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct sk_buff *nskb; - struct sock *sk; - struct inet_sock *inet; int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) @@ -1565,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + inet_sk(sk)->tos = arg->tos; - inet->tos = arg->tos; - sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sock_net_set(sk, net); - __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); @@ -1589,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: - put_cpu_var(unicast_sock); - ip_rt_put(rt); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 8a89c738b7a3..6b85adb05003 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -461,17 +461,13 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; - sin->sin_family = AF_UNSPEC; + memset(sin, 0, sizeof(*sin)); if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { - struct inet_sock *inet = inet_sk(sk); - sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; - sin->sin_port = 0; - memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); - if (inet->cmsg_flags) + if (inet_sk(sk)->cmsg_flags) ip_cmsg_recv(msg, skb); } diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c index ff2d23d8c87a..6ecfce63201a 100644 --- a/net/ipv4/netfilter/nft_redir_ipv4.c +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -27,10 +27,10 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr, memset(&mr, 0, sizeof(mr)); if (priv->sreg_proto_min) { - mr.range[0].min.all = (__force __be16) - data[priv->sreg_proto_min].data[0]; - mr.range[0].max.all = (__force __be16) - data[priv->sreg_proto_max].data[0]; + mr.range[0].min.all = + *(__be16 *)&data[priv->sreg_proto_min].data[0]; + mr.range[0].max.all = + *(__be16 *)&data[priv->sreg_proto_max].data[0]; mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c0d82f78d364..2a3720fb5a5f 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -966,8 +966,11 @@ bool ping_rcv(struct sk_buff *skb) sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); if (sk != NULL) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + pr_debug("rcv on socket %p\n", sk); - ping_queue_rcv_skb(sk, skb_get(skb)); + if (skb2) + ping_queue_rcv_skb(sk, skb2); sock_put(sk); return true; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6a2155b02602..52e1f2bf0ca2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (dst->dev->mtu < mtu) return; + if (rt->rt_pmtu && rt->rt_pmtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; @@ -1554,11 +1557,10 @@ static int __mkroute_input(struct sk_buff *skb, do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && + skb->protocol == htons(ETH_P_IP) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { - flags |= RTCF_DOREDIRECT; - do_cache = false; - } + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + IPCB(skb)->flags |= IPSKB_DOREDIRECT; if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -2303,6 +2305,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; + if (IPCB(skb)->flags & IPSKB_DOREDIRECT) + r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_be32(skb, RTA_DST, dst)) goto nla_put_failure; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index bb395d46a389..c037644eafb7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 27ead0dd16bc..8670e68e2ce6 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,26 +291,32 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -void tcp_slow_start(struct tcp_sock *tp, u32 acked) +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); -/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), + * for every packet that was ACKed. + */ +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else { - tp->snd_cwnd_cnt++; + u32 delta = tp->snd_cwnd_cnt / w; + + tp->snd_cwnd_cnt -= delta * w; + tp->snd_cwnd += delta; } + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -329,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + if (tp->snd_cwnd <= tp->snd_ssthresh) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } /* In dangerous area, increase slowly. */ - else - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 6b6002416a73..4b276d1ed980 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,9 +93,7 @@ struct bictcp { u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ -#define ACK_RATIO_SHIFT 4 -#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) - u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ + u16 unused; u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ @@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->bic_K = 0; ca->delay_min = 0; ca->epoch_start = 0; - ca->delayed_ack = 2 << ACK_RATIO_SHIFT; ca->ack_cnt = 0; ca->tcp_cwnd = 0; ca->found = 0; @@ -205,23 +202,30 @@ static u32 cubic_root(u64 a) /* * Compute congestion window to use. */ -static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) { u32 delta, bic_target, max_cnt; u64 offs, t; - ca->ack_cnt++; /* count the number of ACKs */ + ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) return; + /* The CUBIC function can update ca->cnt at most once per jiffy. + * On all cwnd reduction events, ca->epoch_start is set to 0, + * which will force a recalculation of ca->cnt. + */ + if (ca->epoch_start && tcp_time_stamp == ca->last_time) + goto tcp_friendliness; + ca->last_cwnd = cwnd; ca->last_time = tcp_time_stamp; if (ca->epoch_start == 0) { ca->epoch_start = tcp_time_stamp; /* record beginning */ - ca->ack_cnt = 1; /* start counting */ + ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ if (ca->last_max_cwnd <= cwnd) { @@ -283,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ +tcp_friendliness: /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -301,7 +306,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } } - ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } @@ -317,11 +321,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); - tcp_slow_start(tp, acked); - } else { - bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + acked = tcp_slow_start(tp, acked); + if (!acked) + return; } + bictcp_update(ca, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, ca->cnt, acked); } static u32 bictcp_recalc_ssthresh(struct sock *sk) @@ -411,20 +416,10 @@ static void hystart_update(struct sock *sk, u32 delay) */ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { - const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; - if (icsk->icsk_ca_state == TCP_CA_Open) { - u32 ratio = ca->delayed_ack; - - ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ratio += cnt; - - ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); - } - /* Some calls are for duplicates without timetamps */ if (rtt_us < 0) return; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..d22f54482bab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -2428,14 +2430,39 @@ struct proto tcp_prot = { }; EXPORT_SYMBOL(tcp_prot); +static void __net_exit tcp_sk_exit(struct net *net) +{ + int cpu; + + for_each_possible_cpu(cpu) + inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); + free_percpu(net->ipv4.tcp_sk); +} + static int __net_init tcp_sk_init(struct net *net) { + int res, cpu; + + net->ipv4.tcp_sk = alloc_percpu(struct sock *); + if (!net->ipv4.tcp_sk) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct sock *sk; + + res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, + IPPROTO_TCP, net); + if (res) + goto fail; + *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; + } net->ipv4.sysctl_tcp_ecn = 2; return 0; -} -static void __net_exit tcp_sk_exit(struct net *net) -{ +fail: + tcp_sk_exit(net); + + return res; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6824afb65d93..333bcb2415ff 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + 1); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index a4d2d2d88dca..112151eeee45 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In the "non-congestive state", increase cwnd * every rtt. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } else { /* In the "congestive state", increase cwnd * every other rtt. diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index cd7273218598..17d35662930d 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 7927db0a9279..4a000f1dd757 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -99,11 +99,13 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlin s_slot = cb->args[0]; num = s_num = cb->args[1]; - for (slot = s_slot; slot <= table->mask; num = s_num = 0, slot++) { + for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct sock *sk; struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + num = 0; + if (hlist_nulls_empty(&hslot->head)) continue; |