diff options
Diffstat (limited to 'net/ipv4')
30 files changed, 730 insertions, 1900 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a5a1050595d1..cbb505ba9324 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER If unsure, say N here. -choice - prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" - depends on IP_ADVANCED_ROUTER - default ASK_IP_FIB_HASH - -config ASK_IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, - June 1999 - - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - <http://www.csc.kth.se/~snilsson/software/dyntrie2/> - -endchoice - -config IP_FIB_HASH - def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER - config IP_FIB_TRIE_STATS bool "FIB TRIE statistics" - depends on IP_FIB_TRIE + depends on IP_ADVANCED_ROUTER ---help--- Keep track of statistics on structure of FIB TRIE table. Useful for testing and measuring TRIE performance. @@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -657,4 +624,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4978d22f9a75..0dc772d0d125 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o \ + fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o -obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 45b89d7bda5a..7ceb80447631 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1231,7 +1231,7 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 86961bec70ab..325053df6e70 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -201,7 +201,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + if (x->props.flags & XFRM_STATE_ALIGN4) + ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + else + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; @@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) - goto out; + if (x->props.flags & XFRM_STATE_ALIGN4) { + if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } else { + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } if (!pskb_may_pull(skb, ah_hlen)) goto out; @@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x) BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ALIGN4) + x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + else + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index df4616fce929..90389281d97a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -51,6 +51,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> +#include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE + * value. So if you change this define, make appropriate changes to + * inet_addr_hash as well. + */ +#define IN4_ADDR_HSIZE 256 +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; +static DEFINE_SPINLOCK(inet_addr_hash_lock); + +static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +{ + u32 val = (__force u32) addr ^ hash_ptr(net, 8); + + return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & + (IN4_ADDR_HSIZE - 1)); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ + unsigned int hash = inet_addr_hash(net, ifa->ifa_address); + + spin_lock(&inet_addr_hash_lock); + hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + spin_unlock(&inet_addr_hash_lock); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ + spin_lock(&inet_addr_hash_lock); + hlist_del_init_rcu(&ifa->hash); + spin_unlock(&inet_addr_hash_lock); +} + +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ + unsigned int hash = inet_addr_hash(net, addr); + struct net_device *result = NULL; + struct in_ifaddr *ifa; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; + if (ifa->ifa_address == addr) { + result = dev; + break; + } + } + if (result && devref) + dev_hold(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(__ip_dev_find); + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } if (!do_promote) { + inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); @@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, /* 2. Unlink it */ *ifap = ifa1->ifa_next; + inet_hash_remove(ifa1); /* 3. Announce address deletion */ @@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_next = *ifap; *ifap = ifa; + inet_hash_insert(dev_net(in_dev->dev), ifa); + /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ @@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] == NULL) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = ifm->ifa_flags; @@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); + INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; if (colon) @@ -1084,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct in_ifaddr *ifa = inet_alloc_ifa(); if (ifa) { + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1720,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = { void __init devinet_init(void) { + int i; + + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet_addr_lst[i]); + register_pernet_subsys(&devinet_ops); register_gifconf(PF_INET, inet_gifconf); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1d2cdd43a878..ad0778a3fa53 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_hash_table(RT_TABLE_LOCAL); + local_table = fib_trie_table(RT_TABLE_LOCAL); if (local_table == NULL) return -ENOMEM; - main_table = fib_hash_table(RT_TABLE_MAIN); + main_table = fib_trie_table(RT_TABLE_MAIN); if (main_table == NULL) goto fail; @@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_hash_table(id); + tb = fib_trie_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); @@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -void fib_select_default(struct net *net, - const struct flowi *flp, struct fib_result *res) -{ - struct fib_table *tb; - int table = RT_TABLE_MAIN; -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) - return; - table = res->r->table; -#endif - tb = fib_get_table(net, table); - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - fib_table_select_default(tb, flp, res); -} - static void fib_flush(struct net *net) { int flushed = 0; @@ -147,46 +132,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/** - * __ip_dev_find - find the first device with a given source address. - * @net: the net namespace - * @addr: the source address - * @devref: if true, take a reference on the found device - * - * If a caller uses devref=false, it should be protected by RCU, or RTNL - */ -struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) -{ - struct flowi fl = { - .fl4_dst = addr, - }; - struct fib_result res = { 0 }; - struct net_device *dev = NULL; - struct fib_table *local_table; - -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - - rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || - fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { - rcu_read_unlock(); - return NULL; - } - if (res.type != RTN_LOCAL) - goto out; - dev = FIB_RES_DEV(res); - - if (dev && devref) - dev_hold(dev); -out: - rcu_read_unlock(); - return dev; -} -EXPORT_SYMBOL(__ip_dev_find); - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -1101,5 +1046,5 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_hash_init(); + fib_trie_init(); } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index b3acb0417b21..000000000000 --- a/net/ipv4/fib_hash.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 FIB: lookup engine and maintenance routines. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/ip_fib.h> - -#include "fib_lookup.h" - -static struct kmem_cache *fn_hash_kmem __read_mostly; -static struct kmem_cache *fn_alias_kmem __read_mostly; - -struct fib_node { - struct hlist_node fn_hash; - struct list_head fn_alias; - __be32 fn_key; - struct fib_alias fn_embedded_alias; -}; - -#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - -struct fn_zone { - struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head __rcu *fz_hash; /* Hash table pointer */ - seqlock_t fz_lock; - u32 fz_hashmask; /* (fz_divisor - 1) */ - - u8 fz_order; /* Zone order (0..32) */ - u8 fz_revorder; /* 32 - fz_order */ - __be32 fz_mask; /* inet_make_mask(order) */ -#define FZ_MASK(fz) ((fz)->fz_mask) - - struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; - - int fz_nent; /* Number of entries */ - int fz_divisor; /* Hash size (mask+1) */ -}; - -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone __rcu *fn_zone_list; -}; - -static inline u32 fn_hash(__be32 key, struct fn_zone *fz) -{ - u32 h = ntohl(key) >> fz->fz_revorder; - h ^= (h>>20); - h ^= (h>>10); - h ^= (h>>5); - h &= fz->fz_hashmask; - return h; -} - -static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) -{ - return dst & FZ_MASK(fz); -} - -static unsigned int fib_hash_genid; - -#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) - -static struct hlist_head *fz_hash_alloc(int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); - - return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); -} - -/* The fib hash lock must be held when this is called. */ -static inline void fn_rebuild_zone(struct fn_zone *fz, - struct hlist_head *old_ht, - int old_divisor) -{ - int i; - - for (i = 0; i < old_divisor; i++) { - struct hlist_node *node, *n; - struct fib_node *f; - - hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head *new_head; - - hlist_del_rcu(&f->fn_hash); - - new_head = rcu_dereference_protected(fz->fz_hash, 1) + - fn_hash(f->fn_key, fz); - hlist_add_head_rcu(&f->fn_hash, new_head); - } - } -} - -static void fz_hash_free(struct hlist_head *hash, int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - kfree(hash); - else - free_pages((unsigned long)hash, get_order(size)); -} - -static void fn_rehash_zone(struct fn_zone *fz) -{ - struct hlist_head *ht, *old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - new_divisor = old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case EMBEDDED_HASH_SIZE: - new_divisor *= EMBEDDED_HASH_SIZE; - break; - case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: - new_divisor *= (EMBEDDED_HASH_SIZE/2); - break; - default: - if ((old_divisor << 1) > FZ_MAX_DIVISOR) { - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } - new_divisor = (old_divisor << 1); - break; - } - - new_hashmask = (new_divisor - 1); - -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", - fz->fz_order, old_divisor); -#endif - - ht = fz_hash_alloc(new_divisor); - - if (ht) { - struct fn_zone nfz; - - memcpy(&nfz, fz, sizeof(nfz)); - - write_seqlock_bh(&fz->fz_lock); - old_ht = rcu_dereference_protected(fz->fz_hash, 1); - RCU_INIT_POINTER(nfz.fz_hash, ht); - nfz.fz_hashmask = new_hashmask; - nfz.fz_divisor = new_divisor; - fn_rebuild_zone(&nfz, old_ht, old_divisor); - fib_hash_genid++; - rcu_assign_pointer(fz->fz_hash, ht); - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - write_sequnlock_bh(&fz->fz_lock); - - if (old_ht != fz->fz_embedded_hash) { - synchronize_rcu(); - fz_hash_free(old_ht, old_divisor); - } - } -} - -static void fn_free_node_rcu(struct rcu_head *head) -{ - struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); - - kmem_cache_free(fn_hash_kmem, f); -} - -static inline void fn_free_node(struct fib_node *f) -{ - call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); -} - -static void fn_free_alias_rcu(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - - kmem_cache_free(fn_alias_kmem, fa); -} - -static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) -{ - fib_release_info(fa->fa_info); - if (fa == &f->fn_embedded_alias) - fa->fa_info = NULL; - else - call_rcu(&fa->rcu, fn_free_alias_rcu); -} - -static struct fn_zone * -fn_new_zone(struct fn_hash *table, int z) -{ - int i; - struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); - if (!fz) - return NULL; - - seqlock_init(&fz->fz_lock); - fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; - fz->fz_hashmask = fz->fz_divisor - 1; - RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); - fz->fz_order = z; - fz->fz_revorder = 32 - z; - fz->fz_mask = inet_make_mask(z); - - /* Find the first not empty zone with more specific mask */ - for (i = z + 1; i <= 32; i++) - if (table->fn_zones[i]) - break; - if (i > 32) { - /* No more specific masks, we are the first. */ - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zone_list)); - rcu_assign_pointer(table->fn_zone_list, fz); - } else { - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zones[i]->fz_next)); - rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); - } - table->fn_zones[z] = fz; - fib_hash_genid++; - return fz; -} - -int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res, - int fib_flags) -{ - int err; - struct fn_zone *fz; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - - rcu_read_lock(); - for (fz = rcu_dereference(t->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; - struct hlist_node *node; - struct fib_node *f; - __be32 k; - unsigned int seq; - - do { - seq = read_seqbegin(&fz->fz_lock); - k = fz_key(flp->fl4_dst, fz); - - head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; - - err = fib_semantic_match(&f->fn_alias, - flp, res, - fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } - } while (read_seqretry(&fz->fz_lock, seq)); - } - err = 1; -out: - rcu_read_unlock(); - return err; -} - -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) -{ - int order, last_idx; - struct hlist_node *node; - struct fib_node *f; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; - struct hlist_head *head; - - if (fz == NULL) - return; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - head = rcu_dereference(fz->fz_hash); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - } - - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - -/* Insert node F to FZ. */ -static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - - hlist_add_head_rcu(&f->fn_hash, head); -} - -/* Return the node in FZ matching KEY. */ -static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); - struct hlist_node *node; - struct fib_node *f; - - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key == key) - return f; - } - - return NULL; -} - - -static struct fib_alias *fib_fast_alloc(struct fib_node *f) -{ - struct fib_alias *fa = &f->fn_embedded_alias; - - if (fa->fa_info != NULL) - fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - return fa; -} - -/* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fib_node *new_f = NULL; - struct fib_node *f; - struct fib_alias *fa, *new_fa; - struct fn_zone *fz; - struct fib_info *fi; - u8 tos = cfg->fc_tos; - __be32 key; - int err; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - fz = table->fn_zones[cfg->fc_dst_len]; - if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) - return -ENOBUFS; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - fi = fib_create_info(cfg); - if (IS_ERR(fi)) - return PTR_ERR(fi); - - if (fz->fz_nent > (fz->fz_divisor<<1) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (cfg->fc_dst_len == 32 || - (1 << cfg->fc_dst_len) > fz->fz_divisor)) - fn_rehash_zone(fz); - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); - - /* Now fa, if non-NULL, points to the first fib alias - * with the same keys [prefix,tos,priority], if such key already - * exists or to the node before which we will insert new one. - * - * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. - */ - - if (fa && fa->fa_tos == tos && - fa->fa_info->fib_priority == fi->fib_priority) { - struct fib_alias *fa_first, *fa_match; - - err = -EEXIST; - if (cfg->fc_nlflags & NLM_F_EXCL) - goto out; - - /* We have 2 goals: - * 1. Find exact match for type, scope, fib_info to avoid - * duplicate routes - * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it - */ - fa_match = NULL; - fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - if (fa->fa_tos != tos) - break; - if (fa->fa_info->fib_priority != fi->fib_priority) - break; - if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { - fa_match = fa; - break; - } - } - - if (cfg->fc_nlflags & NLM_F_REPLACE) { - u8 state; - - fa = fa_first; - if (fa_match) { - if (fa == fa_match) - err = 0; - goto out; - } - err = -ENOBUFS; - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_tos = fa->fa_tos; - new_fa->fa_info = fi; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; - new_fa->fa_state = state & ~FA_S_ACCESSED; - fib_hash_genid++; - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - - fn_free_alias(fa, f); - if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); - return 0; - } - - /* Error if we find a perfect match which - * uses the same scope, type, and nexthop - * information. - */ - if (fa_match) - goto out; - - if (!(cfg->fc_nlflags & NLM_F_APPEND)) - fa = fa_first; - } - - err = -ENOENT; - if (!(cfg->fc_nlflags & NLM_F_CREATE)) - goto out; - - err = -ENOBUFS; - - if (!f) { - new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - INIT_HLIST_NODE(&new_f->fn_hash); - INIT_LIST_HEAD(&new_f->fn_alias); - new_f->fn_key = key; - f = new_f; - } - - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_info = fi; - new_fa->fa_tos = tos; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; - - /* - * Insert new entry to the list. - */ - - if (new_f) - fib_insert_node(fz, new_f); - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : &f->fn_alias)); - fib_hash_genid++; - - if (new_f) - fz->fz_nent++; - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, 0); - return 0; - -out: - if (new_f) - kmem_cache_free(fn_hash_kmem, new_f); - fib_release_info(fi); - return err; -} - -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - struct fib_node *f; - struct fib_alias *fa, *fa_to_delete; - struct fn_zone *fz; - __be32 key; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL) - return -ESRCH; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); - if (!fa) - return -ESRCH; - - fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fa->fa_tos != cfg->fc_tos) - break; - - if ((!cfg->fc_type || - fa->fa_type == cfg->fc_type) && - (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && - (!cfg->fc_protocol || - fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { - fa_to_delete = fa; - break; - } - } - - if (fa_to_delete) { - int kill_fn; - - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, 0); - - kill_fn = 0; - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_fn = 1; - } - fib_hash_genid++; - - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - fn_free_alias(fa, f); - if (kill_fn) { - fn_free_node(f); - fz->fz_nent--; - } - - return 0; - } - return -ESRCH; -} - -static int fn_flush_list(struct fn_zone *fz, int idx) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; - struct hlist_node *node, *n; - struct fib_node *f; - int found = 0; - - hlist_for_each_entry_safe(f, node, n, head, fn_hash) { - struct fib_alias *fa, *fa_node; - int kill_f; - - kill_f = 0; - list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_f = 1; - } - fib_hash_genid++; - - fn_free_alias(fa, f); - found++; - } - } - if (kill_f) { - fn_free_node(f); - fz->fz_nent--; - } - } - return found; -} - -/* caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz; - int found = 0; - - for (fz = rtnl_dereference(table->fn_zone_list); - fz != NULL; - fz = rtnl_dereference(fz->fz_next)) { - int i; - - for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); - } - return found; -} - -void fib_free_table(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz, *next; - - next = table->fn_zone_list; - while (next != NULL) { - fz = next; - next = fz->fz_next; - - if (fz->fz_hash != fz->fz_embedded_hash) - fz_hash_free(fz->fz_hash, fz->fz_divisor); - - kfree(fz); - } - - kfree(tb); -} - -static inline int -fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz, - struct hlist_head *head) -{ - struct hlist_node *node; - struct fib_node *f; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - if (i < s_i) - goto next; - - if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - fa->fa_scope, - f->fn_key, - fz->fz_order, - fa->fa_tos, - fa->fa_info, - NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } -next: - i++; - } - } - cb->args[4] = i; - return skb->len; -} - -static inline int -fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz) -{ - int h, s_h; - struct hlist_head *head = rcu_dereference(fz->fz_hash); - - if (head == NULL) - return skb->len; - s_h = cb->args[3]; - for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(head + h)) - continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { - cb->args[3] = h; - return -1; - } - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - } - cb->args[3] = h; - return skb->len; -} - -int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m = 0, s_m; - struct fn_zone *fz; - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - - s_m = cb->args[2]; - rcu_read_lock(); - for (fz = rcu_dereference(table->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next), m++) { - if (m < s_m) - continue; - if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { - cb->args[2] = m; - rcu_read_unlock(); - return -1; - } - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); - } - rcu_read_unlock(); - cb->args[2] = m; - return skb->len; -} - -void __init fib_hash_init(void) -{ - fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); - - fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); - -} - -struct fib_table *fib_hash_table(u32 id) -{ - struct fib_table *tb; - - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), - GFP_KERNEL); - if (tb == NULL) - return NULL; - - tb->tb_id = id; - tb->tb_default = -1; - - memset(tb->tb_data, 0, sizeof(struct fn_hash)); - return tb; -} - -/* ------------------------------------------------------------------------ */ -#ifdef CONFIG_PROC_FS - -struct fib_iter_state { - struct seq_net_private p; - struct fn_zone *zone; - int bucket; - struct hlist_head *hash_head; - struct fib_node *fn; - struct fib_alias *fa; - loff_t pos; - unsigned int genid; - int valid; -}; - -static struct fib_alias *fib_get_first(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_table *main_table; - struct fn_hash *table; - - main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); - table = (struct fn_hash *)main_table->tb_data; - - iter->bucket = 0; - iter->hash_head = NULL; - iter->fn = NULL; - iter->fa = NULL; - iter->pos = 0; - iter->genid = fib_hash_genid; - iter->valid = 1; - - for (iter->zone = rcu_dereference(table->fn_zone_list); - iter->zone != NULL; - iter->zone = rcu_dereference(iter->zone->fz_next)) { - int maxslot; - - if (!iter->zone->fz_nent) - continue; - - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - maxslot = iter->zone->fz_divisor; - - for (iter->bucket = 0; iter->bucket < maxslot; - ++iter->bucket, ++iter->hash_head) { - struct hlist_node *node; - struct fib_node *fn; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - } -out: - return iter->fa; -} - -static struct fib_alias *fib_get_next(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_node *fn; - struct fib_alias *fa; - - /* Advance FA, if any. */ - fn = iter->fn; - fa = iter->fa; - if (fa) { - BUG_ON(!fn); - list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - - fa = iter->fa = NULL; - - /* Advance FN. */ - if (fn) { - struct hlist_node *node = &fn->fn_hash; - hlist_for_each_entry_continue(fn, node, fn_hash) { - iter->fn = fn; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - } - - fn = iter->fn = NULL; - - /* Advance hash chain. */ - if (!iter->zone) - goto out; - - for (;;) { - struct hlist_node *node; - int maxslot; - - maxslot = iter->zone->fz_divisor; - - while (++iter->bucket < maxslot) { - iter->hash_head++; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - - iter->zone = rcu_dereference(iter->zone->fz_next); - - if (!iter->zone) - goto out; - - iter->bucket = 0; - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } -out: - iter->pos++; - return fa; -} - -static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -{ - struct fib_iter_state *iter = seq->private; - struct fib_alias *fa; - - if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { - fa = iter->fa; - pos -= iter->pos; - } else - fa = fib_get_first(seq); - - if (fa) - while (pos && (fa = fib_get_next(seq))) - --pos; - return pos ? NULL : fa; -} - -static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - void *v = NULL; - - rcu_read_lock(); - if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) - v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; -} - -static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); -} - -static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -{ - static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, - [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; - - if (fi && fi->fib_nh->nh_gw) - flags |= RTF_GATEWAY; - if (mask == htonl(0xFFFFFFFF)) - flags |= RTF_HOST; - flags |= RTF_UP; - return flags; -} - -/* - * This outputs /proc/net/route. - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ -static int fib_seq_show(struct seq_file *seq, void *v) -{ - struct fib_iter_state *iter; - int len; - __be32 prefix, mask; - unsigned flags; - struct fib_node *f; - struct fib_alias *fa; - struct fib_info *fi; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " - "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" - "\tWindow\tIRTT"); - goto out; - } - - iter = seq->private; - f = iter->fn; - fa = iter->fa; - fi = fa->fa_info; - prefix = f->fn_key; - mask = FZ_MASK(iter->zone); - flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, - mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3, &len); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); -out: - return 0; -} - -static const struct seq_operations fib_seq_ops = { - .start = fib_seq_start, - .next = fib_seq_next, - .stop = fib_seq_stop, - .show = fib_seq_show, -}; - -static int fib_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_seq_ops, - sizeof(struct fib_iter_state)); -} - -static const struct file_operations fib_seq_fops = { - .owner = THIS_MODULE, - .open = fib_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -int __net_init fib_proc_init(struct net *net) -{ - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) - return -ENOMEM; - return 0; -} - -void __net_exit fib_proc_exit(struct net *net) -{ - proc_net_remove(net, "route"); -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c079cc0ec651..d5c40d8f6632 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,7 +25,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern int fib_semantic_match(struct list_head *head, +extern int fib_semantic_match(struct fib_table *tb, struct list_head *head, const struct flowi *flp, struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7b..3018efbaea77 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,13 +41,13 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE -u32 fib_rules_tclass(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_CLASSID +u32 fib_rules_tclass(const struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; } @@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3df1b7..562f34cd9303 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -49,7 +49,7 @@ static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_hash_size; +static unsigned int fib_info_hash_size; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 @@ -152,6 +152,8 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + if (fi->fib_metrics != (u32 *) dst_default_metrics) + kfree(fi->fib_metrics); kfree(fi); } @@ -200,7 +202,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -221,7 +223,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; val ^= fi->fib_protocol; @@ -422,7 +424,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +478,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -613,14 +615,14 @@ out: static inline unsigned int fib_laddr_hashfn(__be32 val) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; } -static struct hlist_head *fib_hash_alloc(int bytes) +static struct hlist_head *fib_info_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) return kzalloc(bytes, GFP_KERNEL); @@ -630,7 +632,7 @@ static struct hlist_head *fib_hash_alloc(int bytes) get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +static void fib_info_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -641,18 +643,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes) free_pages((unsigned long) hash, get_order(bytes)); } -static void fib_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) { struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_hash_size; + unsigned int old_size = fib_info_hash_size; unsigned int i, bytes; spin_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; - fib_hash_size = new_size; + fib_info_hash_size = new_size; for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; @@ -693,8 +695,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash, spin_unlock_bh(&fib_info_lock); bytes = old_size * sizeof(struct hlist_head *); - fib_hash_free(old_info_hash, bytes); - fib_hash_free(old_laddrhash, bytes); + fib_info_hash_free(old_info_hash, bytes); + fib_info_hash_free(old_laddrhash, bytes); } struct fib_info *fib_create_info(struct fib_config *cfg) @@ -718,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #endif err = -ENOBUFS; - if (fib_info_cnt >= fib_hash_size) { - unsigned int new_size = fib_hash_size << 1; + if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; unsigned int bytes; @@ -727,21 +729,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_size) new_size = 1; bytes = new_size * sizeof(struct hlist_head *); - new_info_hash = fib_hash_alloc(bytes); - new_laddrhash = fib_hash_alloc(bytes); + new_info_hash = fib_info_hash_alloc(bytes); + new_laddrhash = fib_info_hash_alloc(bytes); if (!new_info_hash || !new_laddrhash) { - fib_hash_free(new_info_hash, bytes); - fib_hash_free(new_laddrhash, bytes); + fib_info_hash_free(new_info_hash, bytes); + fib_info_hash_free(new_laddrhash, bytes); } else - fib_hash_move(new_info_hash, new_laddrhash, new_size); + fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - if (!fib_hash_size) + if (!fib_info_hash_size) goto failure; } fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + if (cfg->fc_mx) { + fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!fi->fib_metrics) + goto failure; + } else + fi->fib_metrics = (u32 *) dst_default_metrics; fib_info_cnt++; fi->fib_net = hold_net(net); @@ -779,7 +787,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -792,7 +800,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -881,8 +889,9 @@ failure: } /* Note! fib_semantic_match intentionally uses RCU list functions. */ -int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags) +int fib_semantic_match(struct fib_table *tb, struct list_head *head, + const struct flowi *flp, struct fib_result *res, + int prefixlen, int fib_flags) { struct fib_alias *fa; int nh_sel = 0; @@ -946,6 +955,8 @@ out_fill_res: res->type = fa->fa_type; res->scope = fa->fa_scope; res->fi = fa->fa_info; + res->table = tb; + res->fa_head = head; if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&res->fi->fib_clntref); return 0; @@ -1002,7 +1013,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1027,7 +1038,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif @@ -1125,6 +1136,62 @@ int fib_sync_down_dev(struct net_device *dev, int force) return ret; } +/* Must be invoked inside of an RCU protected region. */ +void fib_select_default(struct fib_result *res) +{ + struct fib_info *fi = NULL, *last_resort = NULL; + struct list_head *fa_head = res->fa_head; + struct fib_table *tb = res->table; + int order = -1, last_idx = -1; + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (fa->fa_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + + fib_alias_accessed(fa); + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + fi = next_fi; + order++; + } + + if (order <= 0 || fi == NULL) { + tb->tb_default = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: + return; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH /* diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0f280348e0fd..edf3b0997e01 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -95,7 +95,7 @@ typedef unsigned int t_key; #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) -struct node { +struct rt_trie_node { unsigned long parent; t_key key; }; @@ -126,7 +126,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct node *child[0]; + struct rt_trie_node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,16 +151,16 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct rt_trie_node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); -static struct node *resize(struct trie *t, struct tnode *tn); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ @@ -177,12 +177,12 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct node *node) +static inline struct tnode *node_parent(struct rt_trie_node *node) { return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct node *node) +static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) { struct tnode *ret = node_parent(node); @@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node) /* Same as rcu_assign_pointer * but that macro() assumes that value is a pointer. */ -static inline void node_set_parent(struct node *node, struct tnode *ptr) +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) { smp_wmb(); node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); return tn->child[i]; } -static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { - struct node *ret = tnode_get_child(tn, i); + struct rt_trie_node *ret = tnode_get_child(tn, i); return rcu_dereference_rtnl(ret); } @@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } -static inline t_key mask_pfx(t_key k, unsigned short l) +static inline t_key mask_pfx(t_key k, unsigned int l) { return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); } -static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) { if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); @@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); size_t size = sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn) tn->tnode_free = tnode_free_head; tnode_free_head = tn; tnode_free_size += sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); } static void tnode_free_flush(void) @@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); + size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { @@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct node) << bits); + sizeof(struct rt_trie_node) << bits); return tn; } @@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) { if (n == NULL || IS_LEAF(n)) return 0; @@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) } static inline void put_child(struct trie *t, struct tnode *tn, int i, - struct node *n) + struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct node *chi = tn->child[i]; + struct rt_trie_node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); @@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, } #define MAX_WORK 10 -static struct node *resize(struct trie *t, struct tnode *tn) +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) { int i; struct tnode *old_tn; @@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node *)tn)) { + if (!node_parent((struct rt_trie_node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; } else { @@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return (struct node *) tn; + return (struct rt_trie_node *) tn; /* * Halve as long as the number of empty children in this @@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) { one_child: for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; + struct rt_trie_node *n; n = tn->child[i]; if (!n) @@ -676,7 +676,7 @@ one_child: return n; } } - return (struct node *) tn; + return (struct rt_trie_node *) tn; } static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct node *) left); - put_child(t, tn, 2*i+1, (struct node *) right); + put_child(t, tn, 2*i, (struct rt_trie_node *) left); + put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); } } for (i = 0; i < olen; i++) { struct tnode *inode; - struct node *node = tnode_get_child(oldtnode, i); + struct rt_trie_node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -825,7 +825,7 @@ nomem: static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; - struct node *left, *right; + struct rt_trie_node *left, *right; int i; int olen = tnode_child_length(tn); @@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct node *)newn); + put_child(t, tn, i/2, (struct rt_trie_node *)newn); } } @@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key) { int pos; struct tnode *tn; - struct node *n; + struct rt_trie_node *n; pos = 0; n = rcu_dereference_rtnl(t->trie); @@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; - while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { + while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize(t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex, - (struct node *)tn, wasfull); + (struct rt_trie_node *)tn, wasfull); - tp = node_parent((struct node *) tn); + tp = node_parent((struct rt_trie_node *) tn); if (!tp) - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); if (!tp) @@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); } @@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; - struct node *n; + struct rt_trie_node *n; struct leaf *l; int missbit; struct list_head *fa_head = NULL; @@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - node_set_parent((struct node *)l, tp); + node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - node_set_parent((struct node *)tn, tp); + node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, missbit, (struct rt_trie_node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, - (struct node *)tn); + (struct rt_trie_node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; } } @@ -1340,7 +1340,7 @@ err: } /* should be called with rcu_read_lock */ -static int check_leaf(struct trie *t, struct leaf *l, +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, t_key key, const struct flowi *flp, struct fib_result *res, int fib_flags) { @@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l, if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); + err = fib_semantic_match(tb, &li->falh, flp, res, plen, fib_flags); #ifdef CONFIG_IP_FIB_TRIE_STATS if (err <= 0) @@ -1376,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, { struct trie *t = (struct trie *) tb->tb_data; int ret; - struct node *n; + struct rt_trie_node *n; struct tnode *pn; - int pos, bits; + unsigned int pos, bits; t_key key = ntohl(flp->fl4_dst); - int chopped_off; + unsigned int chopped_off; t_key cindex = 0; - int current_prefix_length = KEYLENGTH; + unsigned int current_prefix_length = KEYLENGTH; struct tnode *cn; t_key pref_mismatch; @@ -1398,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1423,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; @@ -1541,7 +1541,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent_rcu((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); if (!parent) goto failed; @@ -1568,7 +1568,7 @@ found: */ static void trie_leaf_remove(struct trie *t, struct leaf *l) { - struct tnode *tp = node_parent((struct node *) l); + struct tnode *tp = node_parent((struct rt_trie_node *) l); pr_debug("entering trie_leaf_remove(%p)\n", l); @@ -1706,7 +1706,7 @@ static int trie_flush_leaf(struct leaf *l) * Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) { do { t_key idx; @@ -1732,7 +1732,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) } /* Node empty, walk back up to parent */ - c = (struct node *) p; + c = (struct rt_trie_node *) p; } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ @@ -1753,7 +1753,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { - struct node *c = (struct node *) l; + struct rt_trie_node *c = (struct rt_trie_node *) l; struct tnode *p = node_parent_rcu(c); if (!p) @@ -1802,80 +1802,6 @@ void fib_free_table(struct fib_table *tb) kfree(tb); } -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) -{ - struct trie *t = (struct trie *) tb->tb_data; - int order, last_idx; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fib_alias *fa = NULL; - struct list_head *fa_head; - struct leaf *l; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - - l = fib_find_node(t, 0); - if (!l) - goto out; - - fa_head = get_fa_head(l, 0); - if (!fa_head) - goto out; - - if (list_empty(fa_head)) - goto out; - - list_for_each_entry_rcu(fa, fa_head, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) @@ -1990,7 +1916,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, return skb->len; } -void __init fib_hash_init(void) +void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), @@ -2003,8 +1929,7 @@ void __init fib_hash_init(void) } -/* Fix more generic FIB names for init later */ -struct fib_table *fib_hash_table(u32 id) +struct fib_table *fib_trie_table(u32 id) { struct fib_table *tb; struct trie *t; @@ -2036,7 +1961,7 @@ struct fib_trie_iter { unsigned int depth; }; -static struct node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; unsigned int cindex = iter->index; @@ -2050,7 +1975,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child_rcu(tn, cindex); + struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2069,7 +1994,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent_rcu((struct node *)tn); + p = node_parent_rcu((struct rt_trie_node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2081,10 +2006,10 @@ rescan: return NULL; } -static struct node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct node *n; + struct rt_trie_node *n; if (!t) return NULL; @@ -2108,7 +2033,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct node *n; + struct rt_trie_node *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -2181,7 +2106,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct node *) * pointers; + bytes += sizeof(struct rt_trie_node *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2262,7 +2187,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2275,7 +2200,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { - struct node *n; + struct rt_trie_node *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2304,7 +2229,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct node *n; + struct rt_trie_node *n; ++*pos; /* next node in same table */ @@ -2390,7 +2315,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct node *n = v; + struct rt_trie_node *n = v; if (!node_parent_rcu(n)) fib_table_print(seq, iter->tb); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4aa1b7f01ea0..ad2bcf1b69ae 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk) * Send an ICMP frame. */ -/* - * Check transmit rate limitation for given message. - * The rate information is held in the destination cache now. - * This function is generic and could be used for other purposes - * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - * Note that the same dst_entry fields are modified by functions in - * route.c too, but these work for packet destinations while xrlim_allow - * works for icmp destinations. This means the rate limiting information - * for one "ip object" is shared - and these ICMPs are twice limited: - * by source and by destination. - * - * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits - * - * Shared between ICMPv4 and ICMPv6. - */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ - unsigned long now, token = dst->rate_tokens; - int rc = 0; - - now = jiffies; - token += now - dst->rate_last; - dst->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; - if (token >= timeout) { - token -= timeout; - rc = 1; - } - dst->rate_tokens = token; - return rc; -} -EXPORT_SYMBOL(xrlim_allow); - -static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { struct dst_entry *dst = &rt->dst; - int rc = 1; + bool rc = true; if (type > NR_ICMP_TYPES) goto out; @@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) - rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + if (!rt->peer) + rt_bind_peer(rt, 1); + rc = inet_peer_xrlim_allow(rt->peer, + net->ipv4.sysctl_icmp_ratelimit); + } out: return rc; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index a96e65674ac3..48f8d4592ccd 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a, int i, n = (a->family == AF_INET ? 1 : 4); for (i = 0; i < n; i++) { - if (a->a6[i] == b->a6[i]) + if (a->addr.a6[i] == b->addr.a6[i]) continue; - if (a->a6[i] < b->a6[i]) + if (a->addr.a6[i] < b->addr.a6[i]) return -1; return 1; } @@ -510,8 +510,13 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); p->tcp_ts_stamp = 0; + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + p->rate_last = 0; + p->pmtu_expires = 0; + memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->unused); @@ -579,3 +584,44 @@ void inet_putpeer(struct inet_peer *p) local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_putpeer); + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the inet_peer entries now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same inet_peer fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) +{ + unsigned long now, token; + bool rc = false; + + if (!peer) + return true; + + token = peer->rate_tokens; + now = jiffies; + token += now - peer->rate_last; + peer->rate_last = now; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; + rc = true; + } + peer->rate_tokens = token; + return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb7..d7b2b0987a3b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5f..f926a310075d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e855fffaed95..e95054c690c6 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 652efea013dc..ef7d7b9680ea 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a4897655..403ca57f6011 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8fda2e9..d76d6c9ed946 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f293..aef5d1fbe77d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 63f60fc5d26a..5585980fce2e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f06df0..703f366fd235 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787ce1a71..21bcf471b25a 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; - int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; } - /* Place in source hash if this is the first time. */ - if (have_to_hash) { + if (maniptype == IP_NAT_MANIP_SRC) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), @@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } @@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } @@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (nat == NULL || nat->ct == NULL) return; - NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); @@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) struct nf_conn_nat *old_nat = old; struct nf_conn *ct = old_nat->ct; - if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) return; spin_lock_bh(&nf_nat_lock); - new_nat->ct = ct; hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } @@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a56..8812a02078ab 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6ed6603c2f6d..52b077d45208 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; - /* * Interface to generic destination cache. */ @@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { } +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->peer) + rt_bind_peer(rt, 1); + + peer = rt->peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else { + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + } + } + return p; +} + static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = { .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .default_mtu = ipv4_default_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, @@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; + (rth->peer && rth->peer->pmtu_expires); } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t if (atomic_read(&rth->dst.__refcnt)) goto out; - ret = 1; - if (rth->dst.expires && - time_after_eq(jiffies, rth->dst.expires)) - goto out; - age = jiffies - rth->dst.lastuse; - ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) goto out; @@ -793,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) return ONE; } -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Pertubation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -1272,6 +1208,13 @@ skip_hashing: return 0; } +static atomic_t __rt_peer_genid = ATOMIC_INIT(0); + +static u32 rt_peer_genid(void) +{ + return atomic_read(&__rt_peer_genid); +} + void rt_bind_peer(struct rtable *rt, int create) { struct inet_peer *peer; @@ -1280,6 +1223,8 @@ void rt_bind_peer(struct rtable *rt, int create) if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); + else + rt->rt_peer_genid = rt_peer_genid(); } /* @@ -1349,13 +1294,8 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { - int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth; - struct rtable __rcu **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; + struct inet_peer *peer; struct net *net; if (!in_dev) @@ -1367,9 +1307,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ipv4_is_zeronet(new_gw)) goto reject_redirect; - if (!rt_caching(net)) - goto reject_redirect; - if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -1380,91 +1317,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - rt_is_expired(rth) || - !net_eq(dev_net(rth->dst.dev), net)) { - rthp = &rth->dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->dst.error || - rth->rt_gateway != old_gw || - rth->dst.dev != dev) - break; - - dst_hold(&rth->dst); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - return; - } - - /* Copy all the information. */ - *rt = *rth; - rt->dst.__use = 1; - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.child = NULL; - if (rt->dst.dev) - dev_hold(rt->dst.dev); - rt->dst.obsolete = -1; - rt->dst.lastuse = jiffies; - rt->dst.path = &rt->dst; - rt->dst.neighbour = NULL; - rt->dst.hh = NULL; -#ifdef CONFIG_XFRM - rt->dst.xfrm = NULL; -#endif - rt->rt_genid = rt_genid(net); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } + peer = inet_getpeer_v4(daddr, 1); + if (peer) { + peer->redirect_learned.a4 = new_gw; - netevent.old = &rth->dst; - netevent.new = &rt->dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); + inet_putpeer(peer); - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) - ip_rt_put(rt); - goto do_next; - } - do_next: - ; - } + atomic_inc(&__rt_peer_genid); } return; @@ -1488,9 +1347,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->dst.expires && - time_after_eq(jiffies, rt->dst.expires))) { + } else if (rt->rt_flags & RTCF_REDIRECTED) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1500,6 +1357,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) #endif rt_del(hash, rt); ret = NULL; + } else if (rt->peer && + rt->peer->pmtu_expires && + time_after_eq(jiffies, rt->peer->pmtu_expires)) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(dst, RTAX_MTU, + rt->peer->pmtu_orig); } } return ret; @@ -1525,6 +1390,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; + struct inet_peer *peer; int log_martians; rcu_read_lock(); @@ -1536,33 +1402,41 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + return; + } + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) - rt->dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (rt->dst.rate_tokens >= ip_rt_redirect_number) { - rt->dst.rate_last = jiffies; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->dst.rate_last + - (ip_rt_redirect_load << rt->dst.rate_tokens)))) { + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->dst.rate_last = jiffies; - ++rt->dst.rate_tokens; + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->dst.rate_tokens == ip_rt_redirect_number && + peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1574,7 +1448,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + bool send; int code; switch (rt->dst.error) { @@ -1594,15 +1470,24 @@ static int ip_error(struct sk_buff *skb) break; } - now = jiffies; - rt->dst.rate_tokens += now - rt->dst.rate_last; - if (rt->dst.rate_tokens > ip_rt_error_burst) - rt->dst.rate_tokens = ip_rt_error_burst; - rt->dst.rate_last = now; - if (rt->dst.rate_tokens >= ip_rt_error_cost) { - rt->dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; @@ -1630,88 +1515,130 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { - int i, k; unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - int ikeys[2] = { dev->ifindex, 0 }; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; unsigned short est_mtu = 0; + struct inet_peer *peer; - for (k = 0; k < 2; k++) { - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - unsigned short mtu = new_mtu; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->rt_dst != daddr || - rth->rt_src != iph->saddr || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - dst_metric_locked(&rth->dst, RTAX_MTU) || - !net_eq(dev_net(rth->dst.dev), net) || - rt_is_expired(rth)) - continue; - - if (new_mtu < 68 || new_mtu >= old_mtu) { + peer = inet_getpeer_v4(iph->daddr, 1); + if (peer) { + unsigned short mtu = new_mtu; - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= dst_mtu(&rth->dst) && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; + if (new_mtu < 68 || new_mtu >= old_mtu) { + /* BSD 4.2 derived systems incorrectly adjust + * tot_len by the IP header length, and report + * a zero MTU in the ICMP message. + */ + if (mtu == 0 && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + mtu = guess_mtu(old_mtu); + } - mtu = guess_mtu(old_mtu); - } - if (mtu <= dst_mtu(&rth->dst)) { - if (mtu < dst_mtu(&rth->dst)) { - dst_confirm(&rth->dst); - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(&rth->dst, - RTAX_LOCK); - mtu = ip_rt_min_pmtu; - lock |= (1 << RTAX_MTU); - dst_metric_set(&rth->dst, RTAX_LOCK, - lock); - } - dst_metric_set(&rth->dst, RTAX_MTU, mtu); - dst_set_expires(&rth->dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - rcu_read_unlock(); + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + est_mtu = mtu; + peer->pmtu_learned = mtu; + peer->pmtu_expires = jiffies + ip_rt_mtu_expires; } + + inet_putpeer(peer); + + atomic_inc(&__rt_peer_genid); } return est_mtu ? : new_mtu; } +static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +{ + unsigned long expires = peer->pmtu_expires; + + if (time_before(expires, jiffies)) { + u32 orig_dst_mtu = dst_mtu(dst); + if (peer->pmtu_learned < orig_dst_mtu) { + if (!peer->pmtu_orig) + peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); + dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); + } + } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) + dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +} + static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst_mtu(dst) > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(dst, RTAX_LOCK); + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + + dst_confirm(dst); + + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (peer) { + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + peer->pmtu_learned = mtu; + peer->pmtu_expires = jiffies + ip_rt_mtu_expires; + + atomic_inc(&__rt_peer_genid); + rt->rt_peer_genid = rt_peer_genid(); + + check_peer_pmtu(dst, peer); } - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + inet_putpeer(peer); + } +} + +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + + dst_confirm(&rt->dst); + + neigh_release(rt->dst.neighbour); + rt->dst.neighbour = NULL; + + rt->rt_gateway = peer->redirect_learned.a4; + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, + rt->dst.neighbour); } + return 0; } static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - if (rt_is_expired((struct rtable *)dst)) + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) return NULL; + if (rt->rt_peer_genid != rt_peer_genid()) { + struct inet_peer *peer; + + if (!rt->peer) + rt_bind_peer(rt, 0); + + peer = rt->peer; + if (peer && peer->pmtu_expires) + check_peer_pmtu(dst, peer); + + if (peer && peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } + + rt->rt_peer_genid = rt_peer_genid(); + } return dst; } @@ -1720,6 +1647,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } if (peer) { rt->peer = NULL; inet_putpeer(peer); @@ -1734,8 +1665,14 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt) - dst_set_expires(&rt->dst, 0); + if (rt && + rt->peer && + rt->peer->pmtu_expires) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + } } static int ip_rt_bug(struct sk_buff *skb) @@ -1775,7 +1712,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1815,17 +1752,52 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst) return mtu; } -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) +{ + struct inet_peer *peer; + int create = 0; + + /* If a peer entry exists for this destination, we must hook + * it up in order to get at cached metrics. + */ + if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS) + create = 1; + + rt_bind_peer(rt, create); + peer = rt->peer; + if (peer) { + if (inet_metrics_new(peer)) + memcpy(peer->metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX); + dst_init_metrics(&rt->dst, peer->metrics, false); + + if (peer->pmtu_expires) + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + rt->rt_gateway = peer->redirect_learned.a4; + rt->rt_flags |= RTCF_REDIRECTED; + } + } else { + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } +} + +static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res, + struct fib_info *fi, u16 type, u32 itag) { struct dst_entry *dst = &rt->dst; - struct fib_info *fi = res->fi; if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - dst_import_metrics(dst, fi->fib_metrics); -#ifdef CONFIG_NET_CLS_ROUTE + rt_init_metrics(rt, fi); +#ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } @@ -1835,13 +1807,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; + rt->rt_type = type; +} + +static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm) +{ + struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1); + if (rt) { + rt->dst.obsolete = -1; + + rt->dst.flags = DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0); + } + return rt; } /* called in rcu_read_lock() section */ @@ -1874,24 +1859,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output = ip_rt_bug; - rth->dst.obsolete = -1; - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -1959,7 +1939,7 @@ static void ip_handle_martian_source(struct net_device *dev, /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, - struct fib_result *res, + const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) @@ -2013,19 +1993,13 @@ static int __mkroute_input(struct sk_buff *skb, } } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; @@ -2040,12 +2014,11 @@ static int __mkroute_input(struct sk_buff *skb, rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; - rth->dst.obsolete = -1; rth->dst.input = ip_forward; rth->dst.output = ip_output; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); - rt_set_nexthop(rth, res, itag); + rt_set_nexthop(rth, res, res->fi, res->type, itag); rth->rt_flags = flags; @@ -2190,25 +2163,20 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; rth->fl.fl4_dst = daddr; rth->rt_dst = daddr; rth->fl.fl4_tos = tos; rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -2351,38 +2319,39 @@ skip_cache: EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ -static int __mkroute_output(struct rtable **result, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; - struct in_device *in_dev; + struct fib_info *fi = res->fi; u32 tos = RT_FL_TOS(oldflp); + struct in_device *in_dev; + u16 type = res->type; + struct rtable *rth; if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl->fl4_dst)) - res->type = RTN_BROADCAST; + type = RTN_BROADCAST; else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; + type = RTN_MULTICAST; else if (ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) - return -EINVAL; + return ERR_PTR(-EINVAL); - if (res->type == RTN_BROADCAST) { + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - res->fi = NULL; - } else if (res->type == RTN_MULTICAST) { + fi = NULL; + } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto)) @@ -2391,21 +2360,14 @@ static int __mkroute_output(struct rtable **result, * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) - return -ENOBUFS; - - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; + return ERR_PTR(-ENOBUFS); rth->fl.fl4_dst = oldflp->fl4_dst; rth->fl.fl4_tos = tos; @@ -2423,7 +2385,6 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->dst.output=ip_output; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2440,7 +2401,7 @@ static int __mkroute_output(struct rtable **result, RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(oldflp->fl4_dst)) { rth->dst.input = ip_mr_input; @@ -2450,31 +2411,10 @@ static int __mkroute_output(struct rtable **result, #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, res, fi, type, 0); rth->rt_flags = flags; - *result = rth; - return 0; -} - -/* called with rcu_read_lock() */ -static int ip_mkroute_output(struct rtable **rp, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); - } - - return err; + return rth; } /* @@ -2497,6 +2437,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; + struct rtable *rth; int err; @@ -2505,6 +2446,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, res.r = NULL; #endif + rcu_read_lock(); if (oldflp->fl4_src) { err = -EINVAL; if (ipv4_is_multicast(oldflp->fl4_src) || @@ -2645,7 +2587,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, else #endif if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + fib_select_default(&res); if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); @@ -2655,17 +2597,27 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, &fl, oldflp, dev_out, flags); + if (IS_ERR(rth)) + err = PTR_ERR(rth); + else { + unsigned int hash; -out: return err; + hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, + rt_genid(dev_net(dev_out))); + err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); + } + +out: + rcu_read_unlock(); + return err; } int __ip_route_output_key(struct net *net, struct rtable **rp, const struct flowi *flp) { - unsigned int hash; - int res; struct rtable *rth; + unsigned int hash; if (!rt_caching(net)) goto slow_output; @@ -2695,10 +2647,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_unlock_bh(); slow_output: - rcu_read_lock(); - res = ip_route_output_slow(net, rp, flp); - rcu_read_unlock(); - return res; + return ip_route_output_slow(net, rp, flp); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2731,12 +2680,11 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi { struct rtable *ort = *rp; struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + dst_alloc(&ipv4_dst_blackhole_ops, 1); if (rt) { struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; @@ -2759,6 +2707,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi rt->peer = ort->peer; if (rt->peer) atomic_inc(&rt->peer->refcnt); + rt->fi = ort->fi; + if (rt->fi) + atomic_inc(&rt->fi->fib_clntref); dst_free(new); } @@ -2835,7 +2786,7 @@ static int rt_fill_info(struct net *net, } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif @@ -2854,7 +2805,8 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); error = rt->dst.error; - expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; + expires = (rt->peer && rt->peer->pmtu_expires) ? + rt->peer->pmtu_expires - jiffies : 0; if (rt->peer) { inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; @@ -3256,9 +3208,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3274,7 +3226,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); @@ -3311,14 +3263,6 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6c11eece262c..f9867d2dbef4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb7f82ebf4a3..2f692cefd3b0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 02f583b3744a..e2b9be27f226 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1341,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->daddr.a4 == saddr) { + peer->daddr.addr.a4 == saddr) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17959ee..d37baaa1dbe3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b057d40addec..19fbdec6baaa 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt.peer)) inet_putpeer(xdst->u.rt.peer); + xfrm_dst_destroy(xdst); } @@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, |