From a9fe8e29945d56f35235a3a0fba99b4cf181d211 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Tue, 2 Sep 2014 15:49:26 +0200 Subject: ipv4: implement igmp_qrv sysctl to tune igmp robustness variable As in IPv6 people might increase the igmp query robustness variable to make sure unsolicited state change reports aren't lost on the network. Add and document this new knob to igmp code. RFCs allow tuning this parameter back to first IGMP RFC, so we also use this setting for all counters, including source specific multicast. Also take over sysctl value when upping the interface and don't reuse the last one seen on the interface. Cc: Flavio Leitner Signed-off-by: Hannes Frederic Sowa Acked-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 79a007c52558..45d156dacd61 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -450,6 +450,16 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, -- cgit v1.2.3 From 49a601589caaf0e93194c0cc9b4ecddbe75dd2d5 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Fri, 5 Sep 2014 15:09:03 +0200 Subject: net/ipv4: bind ip_nonlocal_bind to current netns net.ipv4.ip_nonlocal_bind sysctl was global to all network namespaces. This patch allows to set a different value for each network namespace. Signed-off-by: Vincent Bernat Signed-off-by: David S. Miller --- include/net/ip.h | 2 -- include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 6 +----- net/ipv4/ping.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv6/af_inet6.c | 2 +- net/sctp/protocol.c | 2 +- 7 files changed, 12 insertions(+), 17 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/include/net/ip.h b/include/net/ip.h index c8fd6112bd0b..14bfc8e1bcf9 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -229,8 +229,6 @@ static inline int inet_is_local_reserved_port(struct net *net, int port) } #endif -extern int sysctl_ip_nonlocal_bind; - /* From inetpeer.c */ extern int inet_peer_threshold; extern int inet_peer_minttl; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index aec5e12f9f19..24945cefc4fd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,6 +76,7 @@ struct netns_ipv4 { int sysctl_tcp_ecn; int sysctl_ip_no_pmtu_disc; int sysctl_ip_fwd_use_pmtu; + int sysctl_ip_nonlocal_bind; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d156b3c5f363..b537bd94906c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -418,10 +418,6 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -/* It is off by default, see below. */ -int sysctl_ip_nonlocal_bind __read_mostly; -EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -461,7 +457,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * is temporarily down) */ err = -EADDRNOTAVAIL; - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && addr->sin_addr.s_addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index a3c59a077a5f..57f7c9804139 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -311,7 +311,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr->sin_addr.s_addr == htonl(INADDR_ANY)) chk_addr_ret = RTN_LOCAL; - if ((sysctl_ip_nonlocal_bind == 0 && + if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 && isk->freebind == 0 && isk->transparent == 0 && chk_addr_ret != RTN_LOCAL) || chk_addr_ret == RTN_MULTICAST || diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 45d156dacd61..1599966f4639 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -285,13 +285,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, - { - .procname = "ip_nonlocal_bind", - .data = &sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, @@ -848,6 +841,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b9393e6a21fe..e4865a3ebe1d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -302,7 +302,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Reproduce AF_INET checks to make the bindings consistent */ v4addr = addr->sin6_addr.s6_addr32[3]; chk_addr_ret = inet_addr_type(net, v4addr); - if (!sysctl_ip_nonlocal_bind && + if (!net->ipv4.sysctl_ip_nonlocal_bind && !(inet->freebind || inet->transparent) && v4addr != htonl(INADDR_ANY) && chk_addr_ret != RTN_LOCAL && diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 6240834f4b95..9d2c6c9facb6 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -366,7 +366,7 @@ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && - !sysctl_ip_nonlocal_bind) + !net->ipv4.sysctl_ip_nonlocal_bind) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) -- cgit v1.2.3 From 4cdf507d54525842dfd9f6313fdafba039084046 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2014 07:38:40 -0700 Subject: icmp: add a global rate limitation Current ICMP rate limiting uses inetpeer cache, which is an RBL tree protected by a lock, meaning that hosts can be stuck hard if all cpus want to check ICMP limits. When say a DNS or NTP server process is restarted, inetpeer tree grows quick and machine comes to its knees. iptables can not help because the bottleneck happens before ICMP messages are even cooked and sent. This patch adds a new global limitation, using a token bucket filter, controlled by two new sysctl : icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask are controlled by this limit. Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. Default: 50 Note that if we really want to send millions of ICMP messages per second, we might extend idea and infra added in commit 04ca6973f7c1a ("ip: make IP identifiers less predictable") : add a token bucket in the ip_idents hash and no longer rely on inetpeer. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++ include/net/ip.h | 4 +++ net/ipv4/icmp.c | 64 +++++++++++++++++++++++++++++++--- net/ipv4/sysctl_net_ipv4.c | 16 +++++++++ net/ipv6/icmp.c | 20 ++++++----- 5 files changed, 105 insertions(+), 12 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1b5581a30d77..c7a81ace35d0 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -769,8 +769,21 @@ icmp_ratelimit - INTEGER icmp_ratemask (see below) to specific targets. 0 to disable any limiting, otherwise the minimal space between responses in milliseconds. + Note that another sysctl, icmp_msgs_per_sec limits the number + of ICMP packets sent on all targets. Default: 1000 +icmp_msgs_per_sec - INTEGER + Limit maximal number of ICMP packets sent per second from this host. + Only messages whose type matches icmp_ratemask (see below) are + controlled by this limit. + Default: 1000 + +icmp_msgs_burst - INTEGER + icmp_msgs_per_sec controls number of ICMP packets sent per second, + while icmp_msgs_burst controls the burst size of these packets. + Default: 50 + icmp_ratemask - INTEGER Mask made of ICMP types for which rates are being limited. Significant bits: IHGFEDCBA9876543210 diff --git a/include/net/ip.h b/include/net/ip.h index 14bfc8e1bcf9..fcd9068fb8c3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -548,6 +548,10 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, u32 info); +bool icmp_global_allow(void); +extern int sysctl_icmp_msgs_per_sec; +extern int sysctl_icmp_msgs_burst; + #ifdef CONFIG_PROC_FS int ip_misc_proc_init(void); #endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index ea7d4afe8205..5882f584910e 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -231,12 +231,62 @@ static inline void icmp_xmit_unlock(struct sock *sk) spin_unlock_bh(&sk->sk_lock.slock); } +int sysctl_icmp_msgs_per_sec __read_mostly = 1000; +int sysctl_icmp_msgs_burst __read_mostly = 50; + +static struct { + spinlock_t lock; + u32 credit; + u32 stamp; +} icmp_global = { + .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), +}; + +/** + * icmp_global_allow - Are we allowed to send one more ICMP message ? + * + * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Returns false if we reached the limit and can not send another packet. + * Note: called with BH disabled + */ +bool icmp_global_allow(void) +{ + u32 credit, delta, incr = 0, now = (u32)jiffies; + bool rc = false; + + /* Check if token bucket is empty and cannot be refilled + * without taking the spinlock. + */ + if (!icmp_global.credit) { + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta < HZ / 50) + return false; + } + + spin_lock(&icmp_global.lock); + delta = min_t(u32, now - icmp_global.stamp, HZ); + if (delta >= HZ / 50) { + incr = sysctl_icmp_msgs_per_sec * delta / HZ ; + if (incr) + icmp_global.stamp = now; + } + credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); + if (credit) { + credit--; + rc = true; + } + icmp_global.credit = credit; + spin_unlock(&icmp_global.lock); + return rc; +} +EXPORT_SYMBOL(icmp_global_allow); + /* * Send an ICMP frame. */ -static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) +static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code) { struct dst_entry *dst = &rt->dst; bool rc = true; @@ -253,8 +303,14 @@ static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask)) + goto out; + + rc = false; + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1599966f4639..8a25509c35b3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -730,6 +730,22 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "icmp_msgs_per_sec", + .data = &sysctl_icmp_msgs_per_sec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "icmp_msgs_burst", + .data = &sysctl_icmp_msgs_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { .procname = "udp_mem", .data = &sysctl_udp_mem, diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 394bb824fe4b..141e1f3ab74e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -170,11 +170,11 @@ static bool is_ineligible(const struct sk_buff *skb) /* * Check the ICMP output rate limit */ -static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) +static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, + struct flowi6 *fl6) { - struct dst_entry *dst; struct net *net = sock_net(sk); + struct dst_entry *dst; bool res = false; /* Informational messages are not limited. */ @@ -199,16 +199,20 @@ static inline bool icmpv6_xrlim_allow(struct sock *sk, u8 type, } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; - struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); - res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + if (icmp_global_allow()) { + struct inet_peer *peer; + + peer = inet_getpeer_v6(net->ipv6.peers, + &rt->rt6i_dst.addr, 1); + res = inet_peer_xrlim_allow(peer, tmo); + if (peer) + inet_putpeer(peer); + } } dst_release(dst); return res; -- cgit v1.2.3