summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig42
-rw-r--r--net/ipv4/Makefile4
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/ah4.c25
-rw-r--r--net/ipv4/devinet.c78
-rw-r--r--net/ipv4/fib_frontend.c63
-rw-r--r--net/ipv4/fib_hash.c1133
-rw-r--r--net/ipv4/fib_lookup.h2
-rw-r--r--net/ipv4/fib_rules.c12
-rw-r--r--net/ipv4/fib_semantics.c125
-rw-r--r--net/ipv4/fib_trie.c217
-rw-r--r--net/ipv4/icmp.c49
-rw-r--r--net/ipv4/inetpeer.c52
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/netfilter/Kconfig3
-rw-r--r--net/ipv4/netfilter/arp_tables.c2
-rw-r--r--net/ipv4/netfilter/ip_tables.c2
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c7
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c3
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c17
-rw-r--r--net/ipv4/netfilter/nf_nat_amanda.c8
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c33
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c9
-rw-r--r--net/ipv4/route.c726
-rw-r--r--net/ipv4/tcp.c2
-rw-r--r--net/ipv4/tcp_input.c2
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/udp.c2
-rw-r--r--net/ipv4/xfrm4_policy.c4
30 files changed, 730 insertions, 1900 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..cbb505ba9324 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER
If unsure, say N here.
-choice
- prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
- depends on IP_ADVANCED_ROUTER
- default ASK_IP_FIB_HASH
-
-config ASK_IP_FIB_HASH
- bool "FIB_HASH"
- ---help---
- Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
- bool "FIB_TRIE"
- ---help---
- Use new experimental LC-trie as FIB lookup algorithm.
- This improves lookup performance if you have a large
- number of routes.
-
- LC-trie is a longest matching prefix lookup algorithm which
- performs better than FIB_HASH for large routing tables.
- But, it consumes more memory and is more complex.
-
- LC-trie is described in:
-
- IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
- IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
- June 1999
-
- An experimental study of compression methods for dynamic tries
- Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- <http://www.csc.kth.se/~snilsson/software/dyntrie2/>
-
-endchoice
-
-config IP_FIB_HASH
- def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
-
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
- depends on IP_FIB_TRIE
+ depends on IP_ADVANCED_ROUTER
---help---
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
@@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE
handled by the klogd daemon which is responsible for kernel messages
("man klogd").
+config IP_ROUTE_CLASSID
+ bool
+
config IP_PNP
bool "IP: kernel level autoconfiguration"
help
@@ -657,4 +624,3 @@ config TCP_MD5SIG
on the Internet.
If unsure, say N.
-
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 4978d22f9a75..0dc772d0d125 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_minisocks.o tcp_cong.o \
datagram.o raw.o udp.o udplite.o \
arp.o icmp.o devinet.o af_inet.o igmp.o \
- fib_frontend.o fib_semantics.o \
+ fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
-obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o
-obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
obj-$(CONFIG_IP_MROUTE) += ipmr.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 45b89d7bda5a..7ceb80447631 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1231,7 +1231,7 @@ out:
return err;
}
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features)
+static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct iphdr *iph;
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 86961bec70ab..325053df6e70 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -201,7 +201,10 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
+ else
+ ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
@@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
- if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
- ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
- goto out;
+ if (x->props.flags & XFRM_STATE_ALIGN4) {
+ if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ } else {
+ if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
+ ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
+ goto out;
+ }
if (!pskb_may_pull(skb, ah_hlen))
goto out;
@@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x)
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
- ahp->icv_trunc_len);
+ if (x->props.flags & XFRM_STATE_ALIGN4)
+ x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
+ else
+ x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
+ ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
x->props.header_len += sizeof(struct iphdr);
x->data = ahp;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index df4616fce929..90389281d97a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -51,6 +51,7 @@
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/slab.h>
+#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
};
+/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE
+ * value. So if you change this define, make appropriate changes to
+ * inet_addr_hash as well.
+ */
+#define IN4_ADDR_HSIZE 256
+static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
+static DEFINE_SPINLOCK(inet_addr_hash_lock);
+
+static inline unsigned int inet_addr_hash(struct net *net, __be32 addr)
+{
+ u32 val = (__force u32) addr ^ hash_ptr(net, 8);
+
+ return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) &
+ (IN4_ADDR_HSIZE - 1));
+}
+
+static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
+{
+ unsigned int hash = inet_addr_hash(net, ifa->ifa_address);
+
+ spin_lock(&inet_addr_hash_lock);
+ hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+static void inet_hash_remove(struct in_ifaddr *ifa)
+{
+ spin_lock(&inet_addr_hash_lock);
+ hlist_del_init_rcu(&ifa->hash);
+ spin_unlock(&inet_addr_hash_lock);
+}
+
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
+ */
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
+{
+ unsigned int hash = inet_addr_hash(net, addr);
+ struct net_device *result = NULL;
+ struct in_ifaddr *ifa;
+ struct hlist_node *node;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) {
+ struct net_device *dev = ifa->ifa_dev->dev;
+
+ if (!net_eq(dev_net(dev), net))
+ continue;
+ if (ifa->ifa_address == addr) {
+ result = dev;
+ break;
+ }
+ }
+ if (result && devref)
+ dev_hold(result);
+ rcu_read_unlock();
+ return result;
+}
+EXPORT_SYMBOL(__ip_dev_find);
+
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
@@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
}
if (!do_promote) {
+ inet_hash_remove(ifa);
*ifap1 = ifa->ifa_next;
rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid);
@@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
+ inet_hash_remove(ifa1);
/* 3. Announce address deletion */
@@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
ifa->ifa_next = *ifap;
*ifap = ifa;
+ inet_hash_insert(dev_net(in_dev->dev), ifa);
+
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
listeners of netlink will know about new ifaddr */
@@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh)
if (tb[IFA_ADDRESS] == NULL)
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
ifa->ifa_flags = ifm->ifa_flags;
@@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
ifa = inet_alloc_ifa();
+ INIT_HLIST_NODE(&ifa->hash);
if (!ifa)
break;
if (colon)
@@ -1084,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
struct in_ifaddr *ifa = inet_alloc_ifa();
if (ifa) {
+ INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1720,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = {
void __init devinet_init(void)
{
+ int i;
+
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet_addr_lst[i]);
+
register_pernet_subsys(&devinet_ops);
register_gifconf(PF_INET, inet_gifconf);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 1d2cdd43a878..ad0778a3fa53 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net)
{
struct fib_table *local_table, *main_table;
- local_table = fib_hash_table(RT_TABLE_LOCAL);
+ local_table = fib_trie_table(RT_TABLE_LOCAL);
if (local_table == NULL)
return -ENOMEM;
- main_table = fib_hash_table(RT_TABLE_MAIN);
+ main_table = fib_trie_table(RT_TABLE_MAIN);
if (main_table == NULL)
goto fail;
@@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
if (tb)
return tb;
- tb = fib_hash_table(id);
+ tb = fib_trie_table(id);
if (!tb)
return NULL;
h = id & (FIB_TABLE_HASHSZ - 1);
@@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
-void fib_select_default(struct net *net,
- const struct flowi *flp, struct fib_result *res)
-{
- struct fib_table *tb;
- int table = RT_TABLE_MAIN;
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (res->r == NULL || res->r->action != FR_ACT_TO_TBL)
- return;
- table = res->r->table;
-#endif
- tb = fib_get_table(net, table);
- if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- fib_table_select_default(tb, flp, res);
-}
-
static void fib_flush(struct net *net)
{
int flushed = 0;
@@ -147,46 +132,6 @@ static void fib_flush(struct net *net)
rt_cache_flush(net, -1);
}
-/**
- * __ip_dev_find - find the first device with a given source address.
- * @net: the net namespace
- * @addr: the source address
- * @devref: if true, take a reference on the found device
- *
- * If a caller uses devref=false, it should be protected by RCU, or RTNL
- */
-struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
-{
- struct flowi fl = {
- .fl4_dst = addr,
- };
- struct fib_result res = { 0 };
- struct net_device *dev = NULL;
- struct fib_table *local_table;
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- res.r = NULL;
-#endif
-
- rcu_read_lock();
- local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table ||
- fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
- rcu_read_unlock();
- return NULL;
- }
- if (res.type != RTN_LOCAL)
- goto out;
- dev = FIB_RES_DEV(res);
-
- if (dev && devref)
- dev_hold(dev);
-out:
- rcu_read_unlock();
- return dev;
-}
-EXPORT_SYMBOL(__ip_dev_find);
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -1101,5 +1046,5 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- fib_hash_init();
+ fib_trie_init();
}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
deleted file mode 100644
index b3acb0417b21..000000000000
--- a/net/ipv4/fib_hash.c
+++ /dev/null
@@ -1,1133 +0,0 @@
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * IPv4 FIB: lookup engine and maintenance routines.
- *
- * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <asm/uaccess.h>
-#include <asm/system.h>
-#include <linux/bitops.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/errno.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/proc_fs.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/protocol.h>
-#include <net/route.h>
-#include <net/tcp.h>
-#include <net/sock.h>
-#include <net/ip_fib.h>
-
-#include "fib_lookup.h"
-
-static struct kmem_cache *fn_hash_kmem __read_mostly;
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-
-struct fib_node {
- struct hlist_node fn_hash;
- struct list_head fn_alias;
- __be32 fn_key;
- struct fib_alias fn_embedded_alias;
-};
-
-#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
-
-struct fn_zone {
- struct fn_zone __rcu *fz_next; /* Next not empty zone */
- struct hlist_head __rcu *fz_hash; /* Hash table pointer */
- seqlock_t fz_lock;
- u32 fz_hashmask; /* (fz_divisor - 1) */
-
- u8 fz_order; /* Zone order (0..32) */
- u8 fz_revorder; /* 32 - fz_order */
- __be32 fz_mask; /* inet_make_mask(order) */
-#define FZ_MASK(fz) ((fz)->fz_mask)
-
- struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE];
-
- int fz_nent; /* Number of entries */
- int fz_divisor; /* Hash size (mask+1) */
-};
-
-struct fn_hash {
- struct fn_zone *fn_zones[33];
- struct fn_zone __rcu *fn_zone_list;
-};
-
-static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
-{
- u32 h = ntohl(key) >> fz->fz_revorder;
- h ^= (h>>20);
- h ^= (h>>10);
- h ^= (h>>5);
- h &= fz->fz_hashmask;
- return h;
-}
-
-static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
-{
- return dst & FZ_MASK(fz);
-}
-
-static unsigned int fib_hash_genid;
-
-#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
-
-static struct hlist_head *fz_hash_alloc(int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
-
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-}
-
-/* The fib hash lock must be held when this is called. */
-static inline void fn_rebuild_zone(struct fn_zone *fz,
- struct hlist_head *old_ht,
- int old_divisor)
-{
- int i;
-
- for (i = 0; i < old_divisor; i++) {
- struct hlist_node *node, *n;
- struct fib_node *f;
-
- hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
- struct hlist_head *new_head;
-
- hlist_del_rcu(&f->fn_hash);
-
- new_head = rcu_dereference_protected(fz->fz_hash, 1) +
- fn_hash(f->fn_key, fz);
- hlist_add_head_rcu(&f->fn_hash, new_head);
- }
- }
-}
-
-static void fz_hash_free(struct hlist_head *hash, int divisor)
-{
- unsigned long size = divisor * sizeof(struct hlist_head);
-
- if (size <= PAGE_SIZE)
- kfree(hash);
- else
- free_pages((unsigned long)hash, get_order(size));
-}
-
-static void fn_rehash_zone(struct fn_zone *fz)
-{
- struct hlist_head *ht, *old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- new_divisor = old_divisor = fz->fz_divisor;
-
- switch (old_divisor) {
- case EMBEDDED_HASH_SIZE:
- new_divisor *= EMBEDDED_HASH_SIZE;
- break;
- case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
- new_divisor *= (EMBEDDED_HASH_SIZE/2);
- break;
- default:
- if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
- printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
- return;
- }
- new_divisor = (old_divisor << 1);
- break;
- }
-
- new_hashmask = (new_divisor - 1);
-
-#if RT_CACHE_DEBUG >= 2
- printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n",
- fz->fz_order, old_divisor);
-#endif
-
- ht = fz_hash_alloc(new_divisor);
-
- if (ht) {
- struct fn_zone nfz;
-
- memcpy(&nfz, fz, sizeof(nfz));
-
- write_seqlock_bh(&fz->fz_lock);
- old_ht = rcu_dereference_protected(fz->fz_hash, 1);
- RCU_INIT_POINTER(nfz.fz_hash, ht);
- nfz.fz_hashmask = new_hashmask;
- nfz.fz_divisor = new_divisor;
- fn_rebuild_zone(&nfz, old_ht, old_divisor);
- fib_hash_genid++;
- rcu_assign_pointer(fz->fz_hash, ht);
- fz->fz_hashmask = new_hashmask;
- fz->fz_divisor = new_divisor;
- write_sequnlock_bh(&fz->fz_lock);
-
- if (old_ht != fz->fz_embedded_hash) {
- synchronize_rcu();
- fz_hash_free(old_ht, old_divisor);
- }
- }
-}
-
-static void fn_free_node_rcu(struct rcu_head *head)
-{
- struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
-
- kmem_cache_free(fn_hash_kmem, f);
-}
-
-static inline void fn_free_node(struct fib_node *f)
-{
- call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
-}
-
-static void fn_free_alias_rcu(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
-
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
-static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
-{
- fib_release_info(fa->fa_info);
- if (fa == &f->fn_embedded_alias)
- fa->fa_info = NULL;
- else
- call_rcu(&fa->rcu, fn_free_alias_rcu);
-}
-
-static struct fn_zone *
-fn_new_zone(struct fn_hash *table, int z)
-{
- int i;
- struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL);
- if (!fz)
- return NULL;
-
- seqlock_init(&fz->fz_lock);
- fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
- fz->fz_hashmask = fz->fz_divisor - 1;
- RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash);
- fz->fz_order = z;
- fz->fz_revorder = 32 - z;
- fz->fz_mask = inet_make_mask(z);
-
- /* Find the first not empty zone with more specific mask */
- for (i = z + 1; i <= 32; i++)
- if (table->fn_zones[i])
- break;
- if (i > 32) {
- /* No more specific masks, we are the first. */
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zone_list));
- rcu_assign_pointer(table->fn_zone_list, fz);
- } else {
- rcu_assign_pointer(fz->fz_next,
- rtnl_dereference(table->fn_zones[i]->fz_next));
- rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
- }
- table->fn_zones[z] = fz;
- fib_hash_genid++;
- return fz;
-}
-
-int fib_table_lookup(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res,
- int fib_flags)
-{
- int err;
- struct fn_zone *fz;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
-
- rcu_read_lock();
- for (fz = rcu_dereference(t->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next)) {
- struct hlist_head *head;
- struct hlist_node *node;
- struct fib_node *f;
- __be32 k;
- unsigned int seq;
-
- do {
- seq = read_seqbegin(&fz->fz_lock);
- k = fz_key(flp->fl4_dst, fz);
-
- head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key != k)
- continue;
-
- err = fib_semantic_match(&f->fn_alias,
- flp, res,
- fz->fz_order, fib_flags);
- if (err <= 0)
- goto out;
- }
- } while (read_seqretry(&fz->fz_lock, seq));
- }
- err = 1;
-out:
- rcu_read_unlock();
- return err;
-}
-
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp, struct fib_result *res)
-{
- int order, last_idx;
- struct hlist_node *node;
- struct fib_node *f;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fn_hash *t = (struct fn_hash *)tb->tb_data;
- struct fn_zone *fz = t->fn_zones[0];
- struct hlist_head *head;
-
- if (fz == NULL)
- return;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
- head = rcu_dereference(fz->fz_hash);
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- }
-
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
-
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
-/* Insert node F to FZ. */
-static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz);
-
- hlist_add_head_rcu(&f->fn_hash, head);
-}
-
-/* Return the node in FZ matching KEY. */
-static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz);
- struct hlist_node *node;
- struct fib_node *f;
-
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- if (f->fn_key == key)
- return f;
- }
-
- return NULL;
-}
-
-
-static struct fib_alias *fib_fast_alloc(struct fib_node *f)
-{
- struct fib_alias *fa = &f->fn_embedded_alias;
-
- if (fa->fa_info != NULL)
- fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
- return fa;
-}
-
-/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fib_node *new_f = NULL;
- struct fib_node *f;
- struct fib_alias *fa, *new_fa;
- struct fn_zone *fz;
- struct fib_info *fi;
- u8 tos = cfg->fc_tos;
- __be32 key;
- int err;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- fz = table->fn_zones[cfg->fc_dst_len];
- if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len)))
- return -ENOBUFS;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- fi = fib_create_info(cfg);
- if (IS_ERR(fi))
- return PTR_ERR(fi);
-
- if (fz->fz_nent > (fz->fz_divisor<<1) &&
- fz->fz_divisor < FZ_MAX_DIVISOR &&
- (cfg->fc_dst_len == 32 ||
- (1 << cfg->fc_dst_len) > fz->fz_divisor))
- fn_rehash_zone(fz);
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
-
- /* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
- * exists or to the node before which we will insert new one.
- *
- * If fa is NULL, we will need to allocate a new one and
- * insert to the head of f.
- *
- * If f is NULL, no fib node matched the destination key
- * and we need to allocate a new one of those as well.
- */
-
- if (fa && fa->fa_tos == tos &&
- fa->fa_info->fib_priority == fi->fib_priority) {
- struct fib_alias *fa_first, *fa_match;
-
- err = -EEXIST;
- if (cfg->fc_nlflags & NLM_F_EXCL)
- goto out;
-
- /* We have 2 goals:
- * 1. Find exact match for type, scope, fib_info to avoid
- * duplicate routes
- * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
- */
- fa_match = NULL;
- fa_first = fa;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- if (fa->fa_tos != tos)
- break;
- if (fa->fa_info->fib_priority != fi->fib_priority)
- break;
- if (fa->fa_type == cfg->fc_type &&
- fa->fa_scope == cfg->fc_scope &&
- fa->fa_info == fi) {
- fa_match = fa;
- break;
- }
- }
-
- if (cfg->fc_nlflags & NLM_F_REPLACE) {
- u8 state;
-
- fa = fa_first;
- if (fa_match) {
- if (fa == fa_match)
- err = 0;
- goto out;
- }
- err = -ENOBUFS;
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_tos = fa->fa_tos;
- new_fa->fa_info = fi;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- state = fa->fa_state;
- new_fa->fa_state = state & ~FA_S_ACCESSED;
- fib_hash_genid++;
- list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
- fn_free_alias(fa, f);
- if (state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
- return 0;
- }
-
- /* Error if we find a perfect match which
- * uses the same scope, type, and nexthop
- * information.
- */
- if (fa_match)
- goto out;
-
- if (!(cfg->fc_nlflags & NLM_F_APPEND))
- fa = fa_first;
- }
-
- err = -ENOENT;
- if (!(cfg->fc_nlflags & NLM_F_CREATE))
- goto out;
-
- err = -ENOBUFS;
-
- if (!f) {
- new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out;
-
- INIT_HLIST_NODE(&new_f->fn_hash);
- INIT_LIST_HEAD(&new_f->fn_alias);
- new_f->fn_key = key;
- f = new_f;
- }
-
- new_fa = fib_fast_alloc(f);
- if (new_fa == NULL)
- goto out;
-
- new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
- new_fa->fa_type = cfg->fc_type;
- new_fa->fa_scope = cfg->fc_scope;
- new_fa->fa_state = 0;
-
- /*
- * Insert new entry to the list.
- */
-
- if (new_f)
- fib_insert_node(fz, new_f);
- list_add_tail_rcu(&new_fa->fa_list,
- (fa ? &fa->fa_list : &f->fn_alias));
- fib_hash_genid++;
-
- if (new_f)
- fz->fz_nent++;
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-
- rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id,
- &cfg->fc_nlinfo, 0);
- return 0;
-
-out:
- if (new_f)
- kmem_cache_free(fn_hash_kmem, new_f);
- fib_release_info(fi);
- return err;
-}
-
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
-{
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
- struct fib_node *f;
- struct fib_alias *fa, *fa_to_delete;
- struct fn_zone *fz;
- __be32 key;
-
- if (cfg->fc_dst_len > 32)
- return -EINVAL;
-
- if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL)
- return -ESRCH;
-
- key = 0;
- if (cfg->fc_dst) {
- if (cfg->fc_dst & ~FZ_MASK(fz))
- return -EINVAL;
- key = fz_key(cfg->fc_dst, fz);
- }
-
- f = fib_find_node(fz, key);
-
- if (!f)
- fa = NULL;
- else
- fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0);
- if (!fa)
- return -ESRCH;
-
- fa_to_delete = NULL;
- fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
- list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fa->fa_tos != cfg->fc_tos)
- break;
-
- if ((!cfg->fc_type ||
- fa->fa_type == cfg->fc_type) &&
- (cfg->fc_scope == RT_SCOPE_NOWHERE ||
- fa->fa_scope == cfg->fc_scope) &&
- (!cfg->fc_protocol ||
- fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi) == 0) {
- fa_to_delete = fa;
- break;
- }
- }
-
- if (fa_to_delete) {
- int kill_fn;
-
- fa = fa_to_delete;
- rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len,
- tb->tb_id, &cfg->fc_nlinfo, 0);
-
- kill_fn = 0;
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_fn = 1;
- }
- fib_hash_genid++;
-
- if (fa->fa_state & FA_S_ACCESSED)
- rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
- fn_free_alias(fa, f);
- if (kill_fn) {
- fn_free_node(f);
- fz->fz_nent--;
- }
-
- return 0;
- }
- return -ESRCH;
-}
-
-static int fn_flush_list(struct fn_zone *fz, int idx)
-{
- struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx;
- struct hlist_node *node, *n;
- struct fib_node *f;
- int found = 0;
-
- hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
- struct fib_alias *fa, *fa_node;
- int kill_f;
-
- kill_f = 0;
- list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
- list_del_rcu(&fa->fa_list);
- if (list_empty(&f->fn_alias)) {
- hlist_del_rcu(&f->fn_hash);
- kill_f = 1;
- }
- fib_hash_genid++;
-
- fn_free_alias(fa, f);
- found++;
- }
- }
- if (kill_f) {
- fn_free_node(f);
- fz->fz_nent--;
- }
- }
- return found;
-}
-
-/* caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz;
- int found = 0;
-
- for (fz = rtnl_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rtnl_dereference(fz->fz_next)) {
- int i;
-
- for (i = fz->fz_divisor - 1; i >= 0; i--)
- found += fn_flush_list(fz, i);
- }
- return found;
-}
-
-void fib_free_table(struct fib_table *tb)
-{
- struct fn_hash *table = (struct fn_hash *) tb->tb_data;
- struct fn_zone *fz, *next;
-
- next = table->fn_zone_list;
- while (next != NULL) {
- fz = next;
- next = fz->fz_next;
-
- if (fz->fz_hash != fz->fz_embedded_hash)
- fz_hash_free(fz->fz_hash, fz->fz_divisor);
-
- kfree(fz);
- }
-
- kfree(tb);
-}
-
-static inline int
-fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz,
- struct hlist_head *head)
-{
- struct hlist_node *node;
- struct fib_node *f;
- int i, s_i;
-
- s_i = cb->args[4];
- i = 0;
- hlist_for_each_entry_rcu(f, node, head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
- if (i < s_i)
- goto next;
-
- if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->tb_id,
- fa->fa_type,
- fa->fa_scope,
- f->fn_key,
- fz->fz_order,
- fa->fa_tos,
- fa->fa_info,
- NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
-next:
- i++;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static inline int
-fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
- struct fib_table *tb,
- struct fn_zone *fz)
-{
- int h, s_h;
- struct hlist_head *head = rcu_dereference(fz->fz_hash);
-
- if (head == NULL)
- return skb->len;
- s_h = cb->args[3];
- for (h = s_h; h < fz->fz_divisor; h++) {
- if (hlist_empty(head + h))
- continue;
- if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) {
- cb->args[3] = h;
- return -1;
- }
- memset(&cb->args[4], 0,
- sizeof(cb->args) - 4*sizeof(cb->args[0]));
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m = 0, s_m;
- struct fn_zone *fz;
- struct fn_hash *table = (struct fn_hash *)tb->tb_data;
-
- s_m = cb->args[2];
- rcu_read_lock();
- for (fz = rcu_dereference(table->fn_zone_list);
- fz != NULL;
- fz = rcu_dereference(fz->fz_next), m++) {
- if (m < s_m)
- continue;
- if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
- cb->args[2] = m;
- rcu_read_unlock();
- return -1;
- }
- memset(&cb->args[3], 0,
- sizeof(cb->args) - 3*sizeof(cb->args[0]));
- }
- rcu_read_unlock();
- cb->args[2] = m;
- return skb->len;
-}
-
-void __init fib_hash_init(void)
-{
- fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node),
- 0, SLAB_PANIC, NULL);
-
- fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
-
-}
-
-struct fib_table *fib_hash_table(u32 id)
-{
- struct fib_table *tb;
-
- tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
- GFP_KERNEL);
- if (tb == NULL)
- return NULL;
-
- tb->tb_id = id;
- tb->tb_default = -1;
-
- memset(tb->tb_data, 0, sizeof(struct fn_hash));
- return tb;
-}
-
-/* ------------------------------------------------------------------------ */
-#ifdef CONFIG_PROC_FS
-
-struct fib_iter_state {
- struct seq_net_private p;
- struct fn_zone *zone;
- int bucket;
- struct hlist_head *hash_head;
- struct fib_node *fn;
- struct fib_alias *fa;
- loff_t pos;
- unsigned int genid;
- int valid;
-};
-
-static struct fib_alias *fib_get_first(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_table *main_table;
- struct fn_hash *table;
-
- main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
- table = (struct fn_hash *)main_table->tb_data;
-
- iter->bucket = 0;
- iter->hash_head = NULL;
- iter->fn = NULL;
- iter->fa = NULL;
- iter->pos = 0;
- iter->genid = fib_hash_genid;
- iter->valid = 1;
-
- for (iter->zone = rcu_dereference(table->fn_zone_list);
- iter->zone != NULL;
- iter->zone = rcu_dereference(iter->zone->fz_next)) {
- int maxslot;
-
- if (!iter->zone->fz_nent)
- continue;
-
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
- maxslot = iter->zone->fz_divisor;
-
- for (iter->bucket = 0; iter->bucket < maxslot;
- ++iter->bucket, ++iter->hash_head) {
- struct hlist_node *node;
- struct fib_node *fn;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- struct fib_alias *fa;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
- }
-out:
- return iter->fa;
-}
-
-static struct fib_alias *fib_get_next(struct seq_file *seq)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_node *fn;
- struct fib_alias *fa;
-
- /* Advance FA, if any. */
- fn = iter->fn;
- fa = iter->fa;
- if (fa) {
- BUG_ON(!fn);
- list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
-
- fa = iter->fa = NULL;
-
- /* Advance FN. */
- if (fn) {
- struct hlist_node *node = &fn->fn_hash;
- hlist_for_each_entry_continue(fn, node, fn_hash) {
- iter->fn = fn;
-
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- fn = iter->fn = NULL;
-
- /* Advance hash chain. */
- if (!iter->zone)
- goto out;
-
- for (;;) {
- struct hlist_node *node;
- int maxslot;
-
- maxslot = iter->zone->fz_divisor;
-
- while (++iter->bucket < maxslot) {
- iter->hash_head++;
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-
- iter->zone = rcu_dereference(iter->zone->fz_next);
-
- if (!iter->zone)
- goto out;
-
- iter->bucket = 0;
- iter->hash_head = rcu_dereference(iter->zone->fz_hash);
-
- hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
- list_for_each_entry(fa, &fn->fn_alias, fa_list) {
- iter->fn = fn;
- iter->fa = fa;
- goto out;
- }
- }
- }
-out:
- iter->pos++;
- return fa;
-}
-
-static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
-{
- struct fib_iter_state *iter = seq->private;
- struct fib_alias *fa;
-
- if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
- fa = iter->fa;
- pos -= iter->pos;
- } else
- fa = fib_get_first(seq);
-
- if (fa)
- while (pos && (fa = fib_get_next(seq)))
- --pos;
- return pos ? NULL : fa;
-}
-
-static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- void *v = NULL;
-
- rcu_read_lock();
- if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
- v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
- return v;
-}
-
-static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
-}
-
-static void fib_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-{
- static const unsigned type2flags[RTN_MAX + 1] = {
- [7] = RTF_REJECT,
- [8] = RTF_REJECT,
- };
- unsigned flags = type2flags[type];
-
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
- if (mask == htonl(0xFFFFFFFF))
- flags |= RTF_HOST;
- flags |= RTF_UP;
- return flags;
-}
-
-/*
- * This outputs /proc/net/route.
- *
- * It always works in backward compatibility mode.
- * The format of the file is not supposed to be changed.
- */
-static int fib_seq_show(struct seq_file *seq, void *v)
-{
- struct fib_iter_state *iter;
- int len;
- __be32 prefix, mask;
- unsigned flags;
- struct fib_node *f;
- struct fib_alias *fa;
- struct fib_info *fi;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
- "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
- "\tWindow\tIRTT");
- goto out;
- }
-
- iter = seq->private;
- f = iter->fn;
- fa = iter->fa;
- fi = fa->fa_info;
- prefix = f->fn_key;
- mask = FZ_MASK(iter->zone);
- flags = fib_flag_trans(fa->fa_type, mask, fi);
- if (fi)
- seq_printf(seq,
- "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- fi->fib_dev ? fi->fib_dev->name : "*", prefix,
- fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
- mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
- fi->fib_window,
- fi->fib_rtt >> 3, &len);
- else
- seq_printf(seq,
- "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n",
- prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len);
-
- seq_printf(seq, "%*s\n", 127 - len, "");
-out:
- return 0;
-}
-
-static const struct seq_operations fib_seq_ops = {
- .start = fib_seq_start,
- .next = fib_seq_next,
- .stop = fib_seq_stop,
- .show = fib_seq_show,
-};
-
-static int fib_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &fib_seq_ops,
- sizeof(struct fib_iter_state));
-}
-
-static const struct file_operations fib_seq_fops = {
- .owner = THIS_MODULE,
- .open = fib_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-int __net_init fib_proc_init(struct net *net)
-{
- if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops))
- return -ENOMEM;
- return 0;
-}
-
-void __net_exit fib_proc_exit(struct net *net)
-{
- proc_net_remove(net, "route");
-}
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c079cc0ec651..d5c40d8f6632 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -25,7 +25,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
}
/* Exported by fib_semantics.c */
-extern int fib_semantic_match(struct list_head *head,
+extern int fib_semantic_match(struct fib_table *tb, struct list_head *head,
const struct flowi *flp,
struct fib_result *res, int prefixlen, int fib_flags);
extern void fib_release_info(struct fib_info *);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..3018efbaea77 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,13 +41,13 @@ struct fib4_rule {
__be32 srcmask;
__be32 dst;
__be32 dstmask;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
u32 tclassid;
#endif
};
-#ifdef CONFIG_NET_CLS_ROUTE
-u32 fib_rules_tclass(struct fib_result *res)
+#ifdef CONFIG_IP_ROUTE_CLASSID
+u32 fib_rules_tclass(const struct fib_result *res)
{
return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
}
@@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (frh->dst_len)
rule4->dst = nla_get_be32(tb[FRA_DST]);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW])
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
#endif
@@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->tos && (rule4->tos != frh->tos))
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
#endif
@@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
if (rule4->src_len)
NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rule4->tclassid)
NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..562f34cd9303 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -49,7 +49,7 @@
static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_hash_size;
+static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;
#define DEVINDEX_HASHBITS 8
@@ -152,6 +152,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
+ if (fi->fib_metrics != (u32 *) dst_default_metrics)
+ kfree(fi->fib_metrics);
kfree(fi);
}
@@ -200,7 +202,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
nh->nh_weight != onh->nh_weight ||
#endif
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -221,7 +223,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
unsigned int val = fi->fib_nhs;
val ^= fi->fib_protocol;
@@ -422,7 +424,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
@@ -476,7 +478,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla && nla_get_be32(nla) != nh->nh_gw)
return 1;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
if (nla && nla_get_u32(nla) != nh->nh_tclassid)
return 1;
@@ -613,14 +615,14 @@ out:
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
- unsigned int mask = (fib_hash_size - 1);
+ unsigned int mask = (fib_info_hash_size - 1);
return ((__force u32)val ^
((__force u32)val >> 7) ^
((__force u32)val >> 14)) & mask;
}
-static struct hlist_head *fib_hash_alloc(int bytes)
+static struct hlist_head *fib_info_hash_alloc(int bytes)
{
if (bytes <= PAGE_SIZE)
return kzalloc(bytes, GFP_KERNEL);
@@ -630,7 +632,7 @@ static struct hlist_head *fib_hash_alloc(int bytes)
get_order(bytes));
}
-static void fib_hash_free(struct hlist_head *hash, int bytes)
+static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
if (!hash)
return;
@@ -641,18 +643,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes)
free_pages((unsigned long) hash, get_order(bytes));
}
-static void fib_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+static void fib_info_hash_move(struct hlist_head *new_info_hash,
+ struct hlist_head *new_laddrhash,
+ unsigned int new_size)
{
struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_hash_size;
+ unsigned int old_size = fib_info_hash_size;
unsigned int i, bytes;
spin_lock_bh(&fib_info_lock);
old_info_hash = fib_info_hash;
old_laddrhash = fib_info_laddrhash;
- fib_hash_size = new_size;
+ fib_info_hash_size = new_size;
for (i = 0; i < old_size; i++) {
struct hlist_head *head = &fib_info_hash[i];
@@ -693,8 +695,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash,
spin_unlock_bh(&fib_info_lock);
bytes = old_size * sizeof(struct hlist_head *);
- fib_hash_free(old_info_hash, bytes);
- fib_hash_free(old_laddrhash, bytes);
+ fib_info_hash_free(old_info_hash, bytes);
+ fib_info_hash_free(old_laddrhash, bytes);
}
struct fib_info *fib_create_info(struct fib_config *cfg)
@@ -718,8 +720,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
#endif
err = -ENOBUFS;
- if (fib_info_cnt >= fib_hash_size) {
- unsigned int new_size = fib_hash_size << 1;
+ if (fib_info_cnt >= fib_info_hash_size) {
+ unsigned int new_size = fib_info_hash_size << 1;
struct hlist_head *new_info_hash;
struct hlist_head *new_laddrhash;
unsigned int bytes;
@@ -727,21 +729,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (!new_size)
new_size = 1;
bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_hash_alloc(bytes);
- new_laddrhash = fib_hash_alloc(bytes);
+ new_info_hash = fib_info_hash_alloc(bytes);
+ new_laddrhash = fib_info_hash_alloc(bytes);
if (!new_info_hash || !new_laddrhash) {
- fib_hash_free(new_info_hash, bytes);
- fib_hash_free(new_laddrhash, bytes);
+ fib_info_hash_free(new_info_hash, bytes);
+ fib_info_hash_free(new_laddrhash, bytes);
} else
- fib_hash_move(new_info_hash, new_laddrhash, new_size);
+ fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
- if (!fib_hash_size)
+ if (!fib_info_hash_size)
goto failure;
}
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (fi == NULL)
goto failure;
+ if (cfg->fc_mx) {
+ fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
+ if (!fi->fib_metrics)
+ goto failure;
+ } else
+ fi->fib_metrics = (u32 *) dst_default_metrics;
fib_info_cnt++;
fi->fib_net = hold_net(net);
@@ -779,7 +787,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto err_inval;
if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
goto err_inval;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
goto err_inval;
#endif
@@ -792,7 +800,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
nh->nh_oif = cfg->fc_oif;
nh->nh_gw = cfg->fc_gw;
nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -881,8 +889,9 @@ failure:
}
/* Note! fib_semantic_match intentionally uses RCU list functions. */
-int fib_semantic_match(struct list_head *head, const struct flowi *flp,
- struct fib_result *res, int prefixlen, int fib_flags)
+int fib_semantic_match(struct fib_table *tb, struct list_head *head,
+ const struct flowi *flp, struct fib_result *res,
+ int prefixlen, int fib_flags)
{
struct fib_alias *fa;
int nh_sel = 0;
@@ -946,6 +955,8 @@ out_fill_res:
res->type = fa->fa_type;
res->scope = fa->fa_scope;
res->fi = fa->fa_info;
+ res->table = tb;
+ res->fa_head = head;
if (!(fib_flags & FIB_LOOKUP_NOREF))
atomic_inc(&res->fi->fib_clntref);
return 0;
@@ -1002,7 +1013,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (fi->fib_nh->nh_oif)
NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (fi->fib_nh[0].nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
@@ -1027,7 +1038,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
if (nh->nh_gw)
NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (nh->nh_tclassid)
NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
@@ -1125,6 +1136,62 @@ int fib_sync_down_dev(struct net_device *dev, int force)
return ret;
}
+/* Must be invoked inside of an RCU protected region. */
+void fib_select_default(struct fib_result *res)
+{
+ struct fib_info *fi = NULL, *last_resort = NULL;
+ struct list_head *fa_head = res->fa_head;
+ struct fib_table *tb = res->table;
+ int order = -1, last_idx = -1;
+ struct fib_alias *fa;
+
+ list_for_each_entry_rcu(fa, fa_head, fa_list) {
+ struct fib_info *next_fi = fa->fa_info;
+
+ if (fa->fa_scope != res->scope ||
+ fa->fa_type != RTN_UNICAST)
+ continue;
+
+ if (next_fi->fib_priority > res->fi->fib_priority)
+ break;
+ if (!next_fi->fib_nh[0].nh_gw ||
+ next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+ continue;
+
+ fib_alias_accessed(fa);
+
+ if (fi == NULL) {
+ if (next_fi != res->fi)
+ break;
+ } else if (!fib_detect_death(fi, order, &last_resort,
+ &last_idx, tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+ fi = next_fi;
+ order++;
+ }
+
+ if (order <= 0 || fi == NULL) {
+ tb->tb_default = -1;
+ goto out;
+ }
+
+ if (!fib_detect_death(fi, order, &last_resort, &last_idx,
+ tb->tb_default)) {
+ fib_result_assign(res, fi);
+ tb->tb_default = order;
+ goto out;
+ }
+
+ if (last_idx >= 0)
+ fib_result_assign(res, last_resort);
+ tb->tb_default = last_idx;
+out:
+ return;
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0f280348e0fd..edf3b0997e01 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,7 +95,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) (!(n->parent & T_LEAF))
#define IS_LEAF(n) (n->parent & T_LEAF)
-struct node {
+struct rt_trie_node {
unsigned long parent;
t_key key;
};
@@ -126,7 +126,7 @@ struct tnode {
struct work_struct work;
struct tnode *tnode_free;
};
- struct node *child[0];
+ struct rt_trie_node *child[0];
};
#ifdef CONFIG_IP_FIB_TRIE_STATS
@@ -151,16 +151,16 @@ struct trie_stat {
};
struct trie {
- struct node *trie;
+ struct rt_trie_node *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats stats;
#endif
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
-static struct node *resize(struct trie *t, struct tnode *tn);
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -177,12 +177,12 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-static inline struct tnode *node_parent(struct node *node)
+static inline struct tnode *node_parent(struct rt_trie_node *node)
{
return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
}
-static inline struct tnode *node_parent_rcu(struct node *node)
+static inline struct tnode *node_parent_rcu(struct rt_trie_node *node)
{
struct tnode *ret = node_parent(node);
@@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node)
/* Same as rcu_assign_pointer
* but that macro() assumes that value is a pointer.
*/
-static inline void node_set_parent(struct node *node, struct tnode *ptr)
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
smp_wmb();
node->parent = (unsigned long)ptr | NODE_TYPE(node);
}
-static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i)
{
BUG_ON(i >= 1U << tn->bits);
return tn->child[i];
}
-static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
+static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
{
- struct node *ret = tnode_get_child(tn, i);
+ struct rt_trie_node *ret = tnode_get_child(tn, i);
return rcu_dereference_rtnl(ret);
}
@@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn)
return 1 << tn->bits;
}
-static inline t_key mask_pfx(t_key k, unsigned short l)
+static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
}
-static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
+static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
{
if (offset < KEYLENGTH)
return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
@@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head)
{
struct tnode *tn = container_of(head, struct tnode, rcu);
size_t size = sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
if (size <= PAGE_SIZE)
kfree(tn);
@@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn)
tn->tnode_free = tnode_free_head;
tnode_free_head = tn;
tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct node *) << tn->bits);
+ (sizeof(struct rt_trie_node *) << tn->bits);
}
static void tnode_free_flush(void)
@@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen)
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
+ size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
if (tn) {
@@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct node) << bits);
+ sizeof(struct rt_trie_node) << bits);
return tn;
}
@@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct node *n)
+static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
if (n == NULL || IS_LEAF(n))
return 0;
@@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n)
}
static inline void put_child(struct trie *t, struct tnode *tn, int i,
- struct node *n)
+ struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i,
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull)
{
- struct node *chi = tn->child[i];
+ struct rt_trie_node *chi = tn->child[i];
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
}
#define MAX_WORK 10
-static struct node *resize(struct trie *t, struct tnode *tn)
+static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
int i;
struct tnode *old_tn;
@@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Keep root node larger */
- if (!node_parent((struct node *)tn)) {
+ if (!node_parent((struct rt_trie_node *)tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
/*
* Halve as long as the number of empty children in this
@@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
if (tn->empty_children == tnode_child_length(tn) - 1) {
one_child:
for (i = 0; i < tnode_child_length(tn); i++) {
- struct node *n;
+ struct rt_trie_node *n;
n = tn->child[i];
if (!n)
@@ -676,7 +676,7 @@ one_child:
return n;
}
}
- return (struct node *) tn;
+ return (struct rt_trie_node *) tn;
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct node *) left);
- put_child(t, tn, 2*i+1, (struct node *) right);
+ put_child(t, tn, 2*i, (struct rt_trie_node *) left);
+ put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
}
}
for (i = 0; i < olen; i++) {
struct tnode *inode;
- struct node *node = tnode_get_child(oldtnode, i);
+ struct rt_trie_node *node = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
@@ -825,7 +825,7 @@ nomem:
static struct tnode *halve(struct trie *t, struct tnode *tn)
{
struct tnode *oldtnode = tn;
- struct node *left, *right;
+ struct rt_trie_node *left, *right;
int i;
int olen = tnode_child_length(tn);
@@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct node *)newn);
+ put_child(t, tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key)
{
int pos;
struct tnode *tn;
- struct node *n;
+ struct rt_trie_node *n;
pos = 0;
n = rcu_dereference_rtnl(t->trie);
@@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *) resize(t, (struct tnode *)tn);
tnode_put_child_reorg((struct tnode *)tp, cindex,
- (struct node *)tn, wasfull);
+ (struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct node *) tn);
+ tp = node_parent((struct rt_trie_node *) tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
if (!tp)
@@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
if (IS_TNODE(tn))
tn = (struct tnode *)resize(t, (struct tnode *)tn);
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tnode_free_flush();
}
@@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct node *n;
+ struct rt_trie_node *n;
struct leaf *l;
int missbit;
struct list_head *fa_head = NULL;
@@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct node *)l, tp);
+ node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
+ put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
return NULL;
}
- node_set_parent((struct node *)tn, tp);
+ node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct node *)l);
+ put_child(t, tn, missbit, (struct rt_trie_node *)l);
put_child(t, tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
put_child(t, (struct tnode *)tp, cindex,
- (struct node *)tn);
+ (struct rt_trie_node *)tn);
} else {
- rcu_assign_pointer(t->trie, (struct node *)tn);
+ rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
}
}
@@ -1340,7 +1340,7 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
t_key key, const struct flowi *flp,
struct fib_result *res, int fib_flags)
{
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
if (l->key != (key & ntohl(mask)))
continue;
- err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
+ err = fib_semantic_match(tb, &li->falh, flp, res, plen, fib_flags);
#ifdef CONFIG_IP_FIB_TRIE_STATS
if (err <= 0)
@@ -1376,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
- struct node *n;
+ struct rt_trie_node *n;
struct tnode *pn;
- int pos, bits;
+ unsigned int pos, bits;
t_key key = ntohl(flp->fl4_dst);
- int chopped_off;
+ unsigned int chopped_off;
t_key cindex = 0;
- int current_prefix_length = KEYLENGTH;
+ unsigned int current_prefix_length = KEYLENGTH;
struct tnode *cn;
t_key pref_mismatch;
@@ -1398,7 +1398,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
goto found;
}
@@ -1423,7 +1423,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
}
if (IS_LEAF(n)) {
- ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
@@ -1541,7 +1541,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct node *) pn);
+ struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
if (!parent)
goto failed;
@@ -1568,7 +1568,7 @@ found:
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct node *) l);
+ struct tnode *tp = node_parent((struct rt_trie_node *) l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
@@ -1706,7 +1706,7 @@ static int trie_flush_leaf(struct leaf *l)
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
+static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
{
do {
t_key idx;
@@ -1732,7 +1732,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
}
/* Node empty, walk back up to parent */
- c = (struct node *) p;
+ c = (struct rt_trie_node *) p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
@@ -1753,7 +1753,7 @@ static struct leaf *trie_firstleaf(struct trie *t)
static struct leaf *trie_nextleaf(struct leaf *l)
{
- struct node *c = (struct node *) l;
+ struct rt_trie_node *c = (struct rt_trie_node *) l;
struct tnode *p = node_parent_rcu(c);
if (!p)
@@ -1802,80 +1802,6 @@ void fib_free_table(struct fib_table *tb)
kfree(tb);
}
-void fib_table_select_default(struct fib_table *tb,
- const struct flowi *flp,
- struct fib_result *res)
-{
- struct trie *t = (struct trie *) tb->tb_data;
- int order, last_idx;
- struct fib_info *fi = NULL;
- struct fib_info *last_resort;
- struct fib_alias *fa = NULL;
- struct list_head *fa_head;
- struct leaf *l;
-
- last_idx = -1;
- last_resort = NULL;
- order = -1;
-
- rcu_read_lock();
-
- l = fib_find_node(t, 0);
- if (!l)
- goto out;
-
- fa_head = get_fa_head(l, 0);
- if (!fa_head)
- goto out;
-
- if (list_empty(fa_head))
- goto out;
-
- list_for_each_entry_rcu(fa, fa_head, fa_list) {
- struct fib_info *next_fi = fa->fa_info;
-
- if (fa->fa_scope != res->scope ||
- fa->fa_type != RTN_UNICAST)
- continue;
-
- if (next_fi->fib_priority > res->fi->fib_priority)
- break;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
- continue;
-
- fib_alias_accessed(fa);
-
- if (fi == NULL) {
- if (next_fi != res->fi)
- break;
- } else if (!fib_detect_death(fi, order, &last_resort,
- &last_idx, tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- fi = next_fi;
- order++;
- }
- if (order <= 0 || fi == NULL) {
- tb->tb_default = -1;
- goto out;
- }
-
- if (!fib_detect_death(fi, order, &last_resort, &last_idx,
- tb->tb_default)) {
- fib_result_assign(res, fi);
- tb->tb_default = order;
- goto out;
- }
- if (last_idx >= 0)
- fib_result_assign(res, last_resort);
- tb->tb_default = last_idx;
-out:
- rcu_read_unlock();
-}
-
static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
@@ -1990,7 +1916,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
return skb->len;
}
-void __init fib_hash_init(void)
+void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
@@ -2003,8 +1929,7 @@ void __init fib_hash_init(void)
}
-/* Fix more generic FIB names for init later */
-struct fib_table *fib_hash_table(u32 id)
+struct fib_table *fib_trie_table(u32 id)
{
struct fib_table *tb;
struct trie *t;
@@ -2036,7 +1961,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -2050,7 +1975,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct node *n = tnode_get_child_rcu(tn, cindex);
+ struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -2069,7 +1994,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct node *)tn);
+ p = node_parent_rcu((struct rt_trie_node *)tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2081,10 +2006,10 @@ rescan:
return NULL;
}
-static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct node *n;
+ struct rt_trie_node *n;
if (!t)
return NULL;
@@ -2108,7 +2033,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct node *n;
+ struct rt_trie_node *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2181,7 +2106,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct node *) * pointers;
+ bytes += sizeof(struct rt_trie_node *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2262,7 +2187,7 @@ static const struct file_operations fib_triestat_fops = {
.release = single_release_net,
};
-static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2275,7 +2200,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
- struct node *n;
+ struct rt_trie_node *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2304,7 +2229,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct node *n;
+ struct rt_trie_node *n;
++*pos;
/* next node in same table */
@@ -2390,7 +2315,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct node *n = v;
+ struct rt_trie_node *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4aa1b7f01ea0..ad2bcf1b69ae 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk)
* Send an ICMP frame.
*/
-/*
- * Check transmit rate limitation for given message.
- * The rate information is held in the destination cache now.
- * This function is generic and could be used for other purposes
- * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
- *
- * Note that the same dst_entry fields are modified by functions in
- * route.c too, but these work for packet destinations while xrlim_allow
- * works for icmp destinations. This means the rate limiting information
- * for one "ip object" is shared - and these ICMPs are twice limited:
- * by source and by destination.
- *
- * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
- * SHOULD allow setting of rate limits
- *
- * Shared between ICMPv4 and ICMPv6.
- */
-#define XRLIM_BURST_FACTOR 6
-int xrlim_allow(struct dst_entry *dst, int timeout)
-{
- unsigned long now, token = dst->rate_tokens;
- int rc = 0;
-
- now = jiffies;
- token += now - dst->rate_last;
- dst->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
- if (token >= timeout) {
- token -= timeout;
- rc = 1;
- }
- dst->rate_tokens = token;
- return rc;
-}
-EXPORT_SYMBOL(xrlim_allow);
-
-static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
+static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
int type, int code)
{
struct dst_entry *dst = &rt->dst;
- int rc = 1;
+ bool rc = true;
if (type > NR_ICMP_TYPES)
goto out;
@@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
goto out;
/* Limit if icmp type is enabled in ratemask. */
- if ((1 << type) & net->ipv4.sysctl_icmp_ratemask)
- rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit);
+ if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) {
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ rc = inet_peer_xrlim_allow(rt->peer,
+ net->ipv4.sysctl_icmp_ratelimit);
+ }
out:
return rc;
}
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index a96e65674ac3..48f8d4592ccd 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a,
int i, n = (a->family == AF_INET ? 1 : 4);
for (i = 0; i < n; i++) {
- if (a->a6[i] == b->a6[i])
+ if (a->addr.a6[i] == b->addr.a6[i])
continue;
- if (a->a6[i] < b->a6[i])
+ if (a->addr.a6[i] < b->addr.a6[i])
return -1;
return 1;
}
@@ -510,8 +510,13 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create)
p->daddr = *daddr;
atomic_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
- atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4));
+ atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4));
p->tcp_ts_stamp = 0;
+ p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
+ p->rate_tokens = 0;
+ p->rate_last = 0;
+ p->pmtu_expires = 0;
+ memset(&p->redirect_learned, 0, sizeof(p->redirect_learned));
INIT_LIST_HEAD(&p->unused);
@@ -579,3 +584,44 @@ void inet_putpeer(struct inet_peer *p)
local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_putpeer);
+
+/*
+ * Check transmit rate limitation for given message.
+ * The rate information is held in the inet_peer entries now.
+ * This function is generic and could be used for other purposes
+ * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ * Note that the same inet_peer fields are modified by functions in
+ * route.c too, but these work for packet destinations while xrlim_allow
+ * works for icmp destinations. This means the rate limiting information
+ * for one "ip object" is shared - and these ICMPs are twice limited:
+ * by source and by destination.
+ *
+ * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ * SHOULD allow setting of rate limits
+ *
+ * Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
+{
+ unsigned long now, token;
+ bool rc = false;
+
+ if (!peer)
+ return true;
+
+ token = peer->rate_tokens;
+ now = jiffies;
+ token += now - peer->rate_last;
+ peer->rate_last = now;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ if (token >= timeout) {
+ token -= timeout;
+ rc = true;
+ }
+ peer->rate_tokens = token;
+ return rc;
+}
+EXPORT_SYMBOL(inet_peer_xrlim_allow);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
}
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..f926a310075d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
- depends on NF_NAT
+ depends on NF_CONNTRACK_SNMP && NF_NAT
depends on NETFILTER_ADVANCED
+ default NF_NAT && NF_CONNTRACK_SNMP
---help---
This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..e95054c690c6 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
+ xt_compat_init_offsets(NFPROTO_ARP, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..ef7d7b9680ea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
if (ret != 0)
@@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net,
duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
+ xt_compat_init_offsets(AF_INET, number);
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter0, entry0, total_size) {
ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..403ca57f6011 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
* that the ->target() function isn't called after ->destroy() */
ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL) {
- pr_info("no conntrack!\n");
- /* FIXME: need to drop invalid ones, since replies
- * to outgoing connections of other nodes will be
- * marked as INVALID */
+ if (ct == NULL)
return NF_DROP;
- }
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
}
#endif
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
+ if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <linux/rculist_nulls.h>
struct ct_iter_state {
struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
}
return head;
}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
/* Try to get same port: if not, try to change it. */
for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
- int ret;
+ int res;
exp->tuple.dst.u.tcp.port = htons(port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
+ res = nf_ct_expect_related(exp);
+ if (res == 0)
break;
- else if (ret != -EBUSY) {
+ else if (res != -EBUSY) {
port = 0;
break;
}
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..21bcf471b25a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
manips not an issue. */
if (maniptype == IP_NAT_MANIP_SRC &&
!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
+ /* try the original tuple first */
+ if (in_range(orig_tuple, range)) {
+ if (!nf_nat_used_tuple(orig_tuple, ct)) {
+ *tuple = *orig_tuple;
+ return;
+ }
+ } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
+ range)) {
pr_debug("get_unique_tuple: Found current src map\n");
if (!nf_nat_used_tuple(tuple, ct))
return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
- int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
/* nat helper or nfctnetlink also setup binding */
nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
}
- /* Place in source hash if this is the first time. */
- if (have_to_hash) {
+ if (maniptype == IP_NAT_MANIP_SRC) {
unsigned int srchash;
srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
/* It's done. */
if (maniptype == IP_NAT_MANIP_DST)
- set_bit(IPS_DST_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_DST_NAT_DONE;
else
- set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status);
+ ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
int ret = 0;
spin_lock_bh(&nf_nat_lock);
- if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) {
+ if (rcu_dereference_protected(
+ nf_nat_protos[proto->protonum],
+ lockdep_is_held(&nf_nat_lock)
+ ) != &nf_nat_unknown_protocol) {
ret = -EBUSY;
goto out;
}
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (nat == NULL || nat->ct == NULL)
return;
- NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK);
+ NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
spin_lock_bh(&nf_nat_lock);
hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
struct nf_conn_nat *old_nat = old;
struct nf_conn *ct = old_nat->ct;
- if (!ct || !(ct->status & IPS_NAT_DONE_MASK))
+ if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
return;
spin_lock_bh(&nf_nat_lock);
- new_nat->ct = ct;
hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
spin_unlock_bh(&nf_nat_lock);
}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
{
/* Leave them the same for the moment. */
net->ipv4.nat_htable_size = net->ct.htable_size;
- net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
- &net->ipv4.nat_vmalloced, 0);
+ net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
{
nf_ct_iterate_cleanup(net, &clean_nat, NULL);
synchronize_rcu();
- nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
- net->ipv4.nat_htable_size);
+ nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
{
int ret = 0;
- ret = nf_conntrack_helper_register(&snmp_helper);
- if (ret < 0)
- return ret;
+ BUG_ON(nf_nat_snmp_hook != NULL);
+ rcu_assign_pointer(nf_nat_snmp_hook, help);
+
ret = nf_conntrack_helper_register(&snmp_trap_helper);
if (ret < 0) {
nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
static void __exit nf_nat_snmp_basic_fini(void)
{
- nf_conntrack_helper_unregister(&snmp_helper);
+ rcu_assign_pointer(nf_nat_snmp_hook, NULL);
nf_conntrack_helper_unregister(&snmp_trap_helper);
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ed6603c2f6d..52b077d45208 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
static int rt_chain_length_max __read_mostly = 20;
-static struct delayed_work expires_work;
-static unsigned long expires_ljiffies;
-
/*
* Interface to generic destination cache.
*/
@@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
{
}
+static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+ u32 *p = NULL;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+
+ peer = rt->peer;
+ if (peer) {
+ u32 *old_p = __DST_METRICS_PTR(old);
+ unsigned long prev, new;
+
+ p = peer->metrics;
+ if (inet_metrics_new(peer))
+ memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
+
+ new = (unsigned long) p;
+ prev = cmpxchg(&dst->_metrics, old, new);
+
+ if (prev != old) {
+ p = __DST_METRICS_PTR(prev);
+ if (prev & DST_METRICS_READ_ONLY)
+ p = NULL;
+ } else {
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
+ }
+ }
+ return p;
+}
+
static struct dst_ops ipv4_dst_ops = {
.family = AF_INET,
.protocol = cpu_to_be16(ETH_P_IP),
@@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = {
.check = ipv4_dst_check,
.default_advmss = ipv4_default_advmss,
.default_mtu = ipv4_default_mtu,
+ .cow_metrics = ipv4_cow_metrics,
.destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
@@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = {
.release = seq_release,
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
struct ip_rt_acct *dst, *src;
@@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
if (!pde)
goto err2;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
return 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
@@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
remove_proc_entry("rt_cache", net->proc_net_stat);
remove_proc_entry("rt_cache", net->proc_net);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
remove_proc_entry("rt_acct", net->proc_net);
#endif
}
@@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth)
static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
- rth->dst.expires;
+ (rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
@@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t
if (atomic_read(&rth->dst.__refcnt))
goto out;
- ret = 1;
- if (rth->dst.expires &&
- time_after_eq(jiffies, rth->dst.expires))
- goto out;
-
age = jiffies - rth->dst.lastuse;
- ret = 0;
if ((age <= tmo1 && !rt_fast_clean(rth)) ||
(age <= tmo2 && rt_valuable(rth)))
goto out;
@@ -793,97 +820,6 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
return ONE;
}
-static void rt_check_expire(void)
-{
- static unsigned int rover;
- unsigned int i = rover, goal;
- struct rtable *rth;
- struct rtable __rcu **rthp;
- unsigned long samples = 0;
- unsigned long sum = 0, sum2 = 0;
- unsigned long delta;
- u64 mult;
-
- delta = jiffies - expires_ljiffies;
- expires_ljiffies = jiffies;
- mult = ((u64)delta) << rt_hash_log;
- if (ip_rt_gc_timeout > 1)
- do_div(mult, ip_rt_gc_timeout);
- goal = (unsigned int)mult;
- if (goal > rt_hash_mask)
- goal = rt_hash_mask + 1;
- for (; goal > 0; goal--) {
- unsigned long tmo = ip_rt_gc_timeout;
- unsigned long length;
-
- i = (i + 1) & rt_hash_mask;
- rthp = &rt_hash_table[i].chain;
-
- if (need_resched())
- cond_resched();
-
- samples++;
-
- if (rcu_dereference_raw(*rthp) == NULL)
- continue;
- length = 0;
- spin_lock_bh(rt_hash_lock_addr(i));
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
- prefetch(rth->dst.rt_next);
- if (rt_is_expired(rth)) {
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- continue;
- }
- if (rth->dst.expires) {
- /* Entry is expired even if it is in use */
- if (time_before_eq(jiffies, rth->dst.expires)) {
-nofree:
- tmo >>= 1;
- rthp = &rth->dst.rt_next;
- /*
- * We only count entries on
- * a chain with equal hash inputs once
- * so that entries for different QOS
- * levels, and other non-hash input
- * attributes don't unfairly skew
- * the length computation
- */
- length += has_noalias(rt_hash_table[i].chain, rth);
- continue;
- }
- } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
- goto nofree;
-
- /* Cleanup aged off entries. */
- *rthp = rth->dst.rt_next;
- rt_free(rth);
- }
- spin_unlock_bh(rt_hash_lock_addr(i));
- sum += length;
- sum2 += length*length;
- }
- if (samples) {
- unsigned long avg = sum / samples;
- unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
- rt_chain_length_max = max_t(unsigned long,
- ip_rt_gc_elasticity,
- (avg + 4*sd) >> FRACT_BITS);
- }
- rover = i;
-}
-
-/*
- * rt_worker_func() is run in process context.
- * we call rt_check_expire() to scan part of the hash table
- */
-static void rt_worker_func(struct work_struct *work)
-{
- rt_check_expire();
- schedule_delayed_work(&expires_work, ip_rt_gc_interval);
-}
-
/*
* Pertubation of rt_genid by a small quantity [1..256]
* Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1272,6 +1208,13 @@ skip_hashing:
return 0;
}
+static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
+
+static u32 rt_peer_genid(void)
+{
+ return atomic_read(&__rt_peer_genid);
+}
+
void rt_bind_peer(struct rtable *rt, int create)
{
struct inet_peer *peer;
@@ -1280,6 +1223,8 @@ void rt_bind_peer(struct rtable *rt, int create)
if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
inet_putpeer(peer);
+ else
+ rt->rt_peer_genid = rt_peer_genid();
}
/*
@@ -1349,13 +1294,8 @@ static void rt_del(unsigned hash, struct rtable *rt)
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
__be32 saddr, struct net_device *dev)
{
- int i, k;
struct in_device *in_dev = __in_dev_get_rcu(dev);
- struct rtable *rth;
- struct rtable __rcu **rthp;
- __be32 skeys[2] = { saddr, 0 };
- int ikeys[2] = { dev->ifindex, 0 };
- struct netevent_redirect netevent;
+ struct inet_peer *peer;
struct net *net;
if (!in_dev)
@@ -1367,9 +1307,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
ipv4_is_zeronet(new_gw))
goto reject_redirect;
- if (!rt_caching(net))
- goto reject_redirect;
-
if (!IN_DEV_SHARED_MEDIA(in_dev)) {
if (!inet_addr_onlink(in_dev, new_gw, old_gw))
goto reject_redirect;
@@ -1380,91 +1317,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
goto reject_redirect;
}
- for (i = 0; i < 2; i++) {
- for (k = 0; k < 2; k++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rthp = &rt_hash_table[hash].chain;
-
- while ((rth = rcu_dereference(*rthp)) != NULL) {
- struct rtable *rt;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- rt_is_expired(rth) ||
- !net_eq(dev_net(rth->dst.dev), net)) {
- rthp = &rth->dst.rt_next;
- continue;
- }
-
- if (rth->rt_dst != daddr ||
- rth->rt_src != saddr ||
- rth->dst.error ||
- rth->rt_gateway != old_gw ||
- rth->dst.dev != dev)
- break;
-
- dst_hold(&rth->dst);
-
- rt = dst_alloc(&ipv4_dst_ops);
- if (rt == NULL) {
- ip_rt_put(rth);
- return;
- }
-
- /* Copy all the information. */
- *rt = *rth;
- rt->dst.__use = 1;
- atomic_set(&rt->dst.__refcnt, 1);
- rt->dst.child = NULL;
- if (rt->dst.dev)
- dev_hold(rt->dst.dev);
- rt->dst.obsolete = -1;
- rt->dst.lastuse = jiffies;
- rt->dst.path = &rt->dst;
- rt->dst.neighbour = NULL;
- rt->dst.hh = NULL;
-#ifdef CONFIG_XFRM
- rt->dst.xfrm = NULL;
-#endif
- rt->rt_genid = rt_genid(net);
- rt->rt_flags |= RTCF_REDIRECTED;
-
- /* Gateway is different ... */
- rt->rt_gateway = new_gw;
-
- /* Redirect received -> path was valid */
- dst_confirm(&rth->dst);
-
- if (rt->peer)
- atomic_inc(&rt->peer->refcnt);
-
- if (arp_bind_neighbour(&rt->dst) ||
- !(rt->dst.neighbour->nud_state &
- NUD_VALID)) {
- if (rt->dst.neighbour)
- neigh_event_send(rt->dst.neighbour, NULL);
- ip_rt_put(rth);
- rt_drop(rt);
- goto do_next;
- }
+ peer = inet_getpeer_v4(daddr, 1);
+ if (peer) {
+ peer->redirect_learned.a4 = new_gw;
- netevent.old = &rth->dst;
- netevent.new = &rt->dst;
- call_netevent_notifiers(NETEVENT_REDIRECT,
- &netevent);
+ inet_putpeer(peer);
- rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
- ip_rt_put(rt);
- goto do_next;
- }
- do_next:
- ;
- }
+ atomic_inc(&__rt_peer_genid);
}
return;
@@ -1488,9 +1347,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- (rt->dst.expires &&
- time_after_eq(jiffies, rt->dst.expires))) {
+ } else if (rt->rt_flags & RTCF_REDIRECTED) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1500,6 +1357,14 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
#endif
rt_del(hash, rt);
ret = NULL;
+ } else if (rt->peer &&
+ rt->peer->pmtu_expires &&
+ time_after_eq(jiffies, rt->peer->pmtu_expires)) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(dst, RTAX_MTU,
+ rt->peer->pmtu_orig);
}
}
return ret;
@@ -1525,6 +1390,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct in_device *in_dev;
+ struct inet_peer *peer;
int log_martians;
rcu_read_lock();
@@ -1536,33 +1402,41 @@ void ip_rt_send_redirect(struct sk_buff *skb)
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
rcu_read_unlock();
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (!peer) {
+ icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+ return;
+ }
+
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
- rt->dst.rate_tokens = 0;
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ peer->rate_tokens = 0;
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
- rt->dst.rate_last = jiffies;
+ if (peer->rate_tokens >= ip_rt_redirect_number) {
+ peer->rate_last = jiffies;
return;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (rt->dst.rate_tokens == 0 ||
+ if (peer->rate_tokens == 0 ||
time_after(jiffies,
- (rt->dst.rate_last +
- (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
+ (peer->rate_last +
+ (ip_rt_redirect_load << peer->rate_tokens)))) {
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
- rt->dst.rate_last = jiffies;
- ++rt->dst.rate_tokens;
+ peer->rate_last = jiffies;
+ ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (log_martians &&
- rt->dst.rate_tokens == ip_rt_redirect_number &&
+ peer->rate_tokens == ip_rt_redirect_number &&
net_ratelimit())
printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
&rt->rt_src, rt->rt_iif,
@@ -1574,7 +1448,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
static int ip_error(struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
+ struct inet_peer *peer;
unsigned long now;
+ bool send;
int code;
switch (rt->dst.error) {
@@ -1594,15 +1470,24 @@ static int ip_error(struct sk_buff *skb)
break;
}
- now = jiffies;
- rt->dst.rate_tokens += now - rt->dst.rate_last;
- if (rt->dst.rate_tokens > ip_rt_error_burst)
- rt->dst.rate_tokens = ip_rt_error_burst;
- rt->dst.rate_last = now;
- if (rt->dst.rate_tokens >= ip_rt_error_cost) {
- rt->dst.rate_tokens -= ip_rt_error_cost;
- icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+
+ send = true;
+ if (peer) {
+ now = jiffies;
+ peer->rate_tokens += now - peer->rate_last;
+ if (peer->rate_tokens > ip_rt_error_burst)
+ peer->rate_tokens = ip_rt_error_burst;
+ peer->rate_last = now;
+ if (peer->rate_tokens >= ip_rt_error_cost)
+ peer->rate_tokens -= ip_rt_error_cost;
+ else
+ send = false;
}
+ if (send)
+ icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
out: kfree_skb(skb);
return 0;
@@ -1630,88 +1515,130 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
unsigned short new_mtu,
struct net_device *dev)
{
- int i, k;
unsigned short old_mtu = ntohs(iph->tot_len);
- struct rtable *rth;
- int ikeys[2] = { dev->ifindex, 0 };
- __be32 skeys[2] = { iph->saddr, 0, };
- __be32 daddr = iph->daddr;
unsigned short est_mtu = 0;
+ struct inet_peer *peer;
- for (k = 0; k < 2; k++) {
- for (i = 0; i < 2; i++) {
- unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
- rt_genid(net));
-
- rcu_read_lock();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->dst.rt_next)) {
- unsigned short mtu = new_mtu;
-
- if (rth->fl.fl4_dst != daddr ||
- rth->fl.fl4_src != skeys[i] ||
- rth->rt_dst != daddr ||
- rth->rt_src != iph->saddr ||
- rth->fl.oif != ikeys[k] ||
- rt_is_input_route(rth) ||
- dst_metric_locked(&rth->dst, RTAX_MTU) ||
- !net_eq(dev_net(rth->dst.dev), net) ||
- rt_is_expired(rth))
- continue;
-
- if (new_mtu < 68 || new_mtu >= old_mtu) {
+ peer = inet_getpeer_v4(iph->daddr, 1);
+ if (peer) {
+ unsigned short mtu = new_mtu;
- /* BSD 4.2 compatibility hack :-( */
- if (mtu == 0 &&
- old_mtu >= dst_mtu(&rth->dst) &&
- old_mtu >= 68 + (iph->ihl << 2))
- old_mtu -= iph->ihl << 2;
+ if (new_mtu < 68 || new_mtu >= old_mtu) {
+ /* BSD 4.2 derived systems incorrectly adjust
+ * tot_len by the IP header length, and report
+ * a zero MTU in the ICMP message.
+ */
+ if (mtu == 0 &&
+ old_mtu >= 68 + (iph->ihl << 2))
+ old_mtu -= iph->ihl << 2;
+ mtu = guess_mtu(old_mtu);
+ }
- mtu = guess_mtu(old_mtu);
- }
- if (mtu <= dst_mtu(&rth->dst)) {
- if (mtu < dst_mtu(&rth->dst)) {
- dst_confirm(&rth->dst);
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(&rth->dst,
- RTAX_LOCK);
- mtu = ip_rt_min_pmtu;
- lock |= (1 << RTAX_MTU);
- dst_metric_set(&rth->dst, RTAX_LOCK,
- lock);
- }
- dst_metric_set(&rth->dst, RTAX_MTU, mtu);
- dst_set_expires(&rth->dst,
- ip_rt_mtu_expires);
- }
- est_mtu = mtu;
- }
- }
- rcu_read_unlock();
+ if (mtu < ip_rt_min_pmtu)
+ mtu = ip_rt_min_pmtu;
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ est_mtu = mtu;
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
}
+
+ inet_putpeer(peer);
+
+ atomic_inc(&__rt_peer_genid);
}
return est_mtu ? : new_mtu;
}
+static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
+{
+ unsigned long expires = peer->pmtu_expires;
+
+ if (time_before(expires, jiffies)) {
+ u32 orig_dst_mtu = dst_mtu(dst);
+ if (peer->pmtu_learned < orig_dst_mtu) {
+ if (!peer->pmtu_orig)
+ peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
+ }
+ } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
+ dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
+}
+
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
- if (dst_mtu(dst) > mtu && mtu >= 68 &&
- !(dst_metric_locked(dst, RTAX_MTU))) {
- if (mtu < ip_rt_min_pmtu) {
- u32 lock = dst_metric(dst, RTAX_LOCK);
+ struct rtable *rt = (struct rtable *) dst;
+ struct inet_peer *peer;
+
+ dst_confirm(dst);
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 1);
+ peer = rt->peer;
+ if (peer) {
+ if (mtu < ip_rt_min_pmtu)
mtu = ip_rt_min_pmtu;
- dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
+ if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
+ peer->pmtu_learned = mtu;
+ peer->pmtu_expires = jiffies + ip_rt_mtu_expires;
+
+ atomic_inc(&__rt_peer_genid);
+ rt->rt_peer_genid = rt_peer_genid();
+
+ check_peer_pmtu(dst, peer);
}
- dst_metric_set(dst, RTAX_MTU, mtu);
- dst_set_expires(dst, ip_rt_mtu_expires);
- call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
+ inet_putpeer(peer);
+ }
+}
+
+static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+{
+ struct rtable *rt = (struct rtable *) dst;
+ __be32 orig_gw = rt->rt_gateway;
+
+ dst_confirm(&rt->dst);
+
+ neigh_release(rt->dst.neighbour);
+ rt->dst.neighbour = NULL;
+
+ rt->rt_gateway = peer->redirect_learned.a4;
+ if (arp_bind_neighbour(&rt->dst) ||
+ !(rt->dst.neighbour->nud_state & NUD_VALID)) {
+ if (rt->dst.neighbour)
+ neigh_event_send(rt->dst.neighbour, NULL);
+ rt->rt_gateway = orig_gw;
+ return -EAGAIN;
+ } else {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE,
+ rt->dst.neighbour);
}
+ return 0;
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- if (rt_is_expired((struct rtable *)dst))
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (rt_is_expired(rt))
return NULL;
+ if (rt->rt_peer_genid != rt_peer_genid()) {
+ struct inet_peer *peer;
+
+ if (!rt->peer)
+ rt_bind_peer(rt, 0);
+
+ peer = rt->peer;
+ if (peer && peer->pmtu_expires)
+ check_peer_pmtu(dst, peer);
+
+ if (peer && peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ if (check_peer_redir(dst, peer))
+ return NULL;
+ }
+
+ rt->rt_peer_genid = rt_peer_genid();
+ }
return dst;
}
@@ -1720,6 +1647,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
struct rtable *rt = (struct rtable *) dst;
struct inet_peer *peer = rt->peer;
+ if (rt->fi) {
+ fib_info_put(rt->fi);
+ rt->fi = NULL;
+ }
if (peer) {
rt->peer = NULL;
inet_putpeer(peer);
@@ -1734,8 +1665,14 @@ static void ipv4_link_failure(struct sk_buff *skb)
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
rt = skb_rtable(skb);
- if (rt)
- dst_set_expires(&rt->dst, 0);
+ if (rt &&
+ rt->peer &&
+ rt->peer->pmtu_expires) {
+ unsigned long orig = rt->peer->pmtu_expires;
+
+ if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig)
+ dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
+ }
}
static int ip_rt_bug(struct sk_buff *skb)
@@ -1775,7 +1712,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
memcpy(addr, &src, 4);
}
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
if (!(rt->dst.tclassid & 0xFFFF))
@@ -1815,17 +1752,52 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
return mtu;
}
-static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+static void rt_init_metrics(struct rtable *rt, struct fib_info *fi)
+{
+ struct inet_peer *peer;
+ int create = 0;
+
+ /* If a peer entry exists for this destination, we must hook
+ * it up in order to get at cached metrics.
+ */
+ if (rt->fl.flags & FLOWI_FLAG_PRECOW_METRICS)
+ create = 1;
+
+ rt_bind_peer(rt, create);
+ peer = rt->peer;
+ if (peer) {
+ if (inet_metrics_new(peer))
+ memcpy(peer->metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX);
+ dst_init_metrics(&rt->dst, peer->metrics, false);
+
+ if (peer->pmtu_expires)
+ check_peer_pmtu(&rt->dst, peer);
+ if (peer->redirect_learned.a4 &&
+ peer->redirect_learned.a4 != rt->rt_gateway) {
+ rt->rt_gateway = peer->redirect_learned.a4;
+ rt->rt_flags |= RTCF_REDIRECTED;
+ }
+ } else {
+ if (fi->fib_metrics != (u32 *) dst_default_metrics) {
+ rt->fi = fi;
+ atomic_inc(&fi->fib_clntref);
+ }
+ dst_init_metrics(&rt->dst, fi->fib_metrics, true);
+ }
+}
+
+static void rt_set_nexthop(struct rtable *rt, const struct fib_result *res,
+ struct fib_info *fi, u16 type, u32 itag)
{
struct dst_entry *dst = &rt->dst;
- struct fib_info *fi = res->fi;
if (fi) {
if (FIB_RES_GW(*res) &&
FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = FIB_RES_GW(*res);
- dst_import_metrics(dst, fi->fib_metrics);
-#ifdef CONFIG_NET_CLS_ROUTE
+ rt_init_metrics(rt, fi);
+#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
}
@@ -1835,13 +1807,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
set_class_tag(rt, fib_rules_tclass(res));
#endif
set_class_tag(rt, itag);
#endif
- rt->rt_type = res->type;
+ rt->rt_type = type;
+}
+
+static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm)
+{
+ struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1);
+ if (rt) {
+ rt->dst.obsolete = -1;
+
+ rt->dst.flags = DST_HOST |
+ (nopolicy ? DST_NOPOLICY : 0) |
+ (noxfrm ? DST_NOXFRM : 0);
+ }
+ return rt;
}
/* called in rcu_read_lock() section */
@@ -1874,24 +1859,19 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (err < 0)
goto e_err;
}
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output = ip_rt_bug;
- rth->dst.obsolete = -1;
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
rth->rt_iif =
@@ -1959,7 +1939,7 @@ static void ip_handle_martian_source(struct net_device *dev,
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
- struct fib_result *res,
+ const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
@@ -2013,19 +1993,13 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
- if (IN_DEV_CONF_GET(out_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
@@ -2040,12 +2014,11 @@ static int __mkroute_input(struct sk_buff *skb,
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
- rth->dst.obsolete = -1;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
- rt_set_nexthop(rth, res, itag);
+ rt_set_nexthop(rth, res, res->fi, res->type, itag);
rth->rt_flags = flags;
@@ -2190,25 +2163,20 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.output= ip_rt_bug;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
rth->fl.fl4_dst = daddr;
rth->rt_dst = daddr;
rth->fl.fl4_tos = tos;
rth->fl.mark = skb->mark;
rth->fl.fl4_src = saddr;
rth->rt_src = saddr;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
rth->rt_iif =
@@ -2351,38 +2319,39 @@ skip_cache:
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
-static int __mkroute_output(struct rtable **result,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static struct rtable *__mkroute_output(const struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned int flags)
{
- struct rtable *rth;
- struct in_device *in_dev;
+ struct fib_info *fi = res->fi;
u32 tos = RT_FL_TOS(oldflp);
+ struct in_device *in_dev;
+ u16 type = res->type;
+ struct rtable *rth;
if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (ipv4_is_lbcast(fl->fl4_dst))
- res->type = RTN_BROADCAST;
+ type = RTN_BROADCAST;
else if (ipv4_is_multicast(fl->fl4_dst))
- res->type = RTN_MULTICAST;
+ type = RTN_MULTICAST;
else if (ipv4_is_zeronet(fl->fl4_dst))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
in_dev = __in_dev_get_rcu(dev_out);
if (!in_dev)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- if (res->type == RTN_BROADCAST) {
+ if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- res->fi = NULL;
- } else if (res->type == RTN_MULTICAST) {
+ fi = NULL;
+ } else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
oldflp->proto))
@@ -2391,21 +2360,14 @@ static int __mkroute_output(struct rtable **result,
* default one, but do not gateway in this case.
* Yes, it is hack.
*/
- if (res->fi && res->prefixlen < 4)
- res->fi = NULL;
+ if (fi && res->prefixlen < 4)
+ fi = NULL;
}
-
- rth = dst_alloc(&ipv4_dst_ops);
+ rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(in_dev, NOXFRM));
if (!rth)
- return -ENOBUFS;
-
- atomic_set(&rth->dst.__refcnt, 1);
- rth->dst.flags= DST_HOST;
- if (IN_DEV_CONF_GET(in_dev, NOXFRM))
- rth->dst.flags |= DST_NOXFRM;
- if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
- rth->dst.flags |= DST_NOPOLICY;
+ return ERR_PTR(-ENOBUFS);
rth->fl.fl4_dst = oldflp->fl4_dst;
rth->fl.fl4_tos = tos;
@@ -2423,7 +2385,6 @@ static int __mkroute_output(struct rtable **result,
rth->rt_spec_dst= fl->fl4_src;
rth->dst.output=ip_output;
- rth->dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2440,7 +2401,7 @@ static int __mkroute_output(struct rtable **result,
RT_CACHE_STAT_INC(out_slow_mc);
}
#ifdef CONFIG_IP_MROUTE
- if (res->type == RTN_MULTICAST) {
+ if (type == RTN_MULTICAST) {
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(oldflp->fl4_dst)) {
rth->dst.input = ip_mr_input;
@@ -2450,31 +2411,10 @@ static int __mkroute_output(struct rtable **result,
#endif
}
- rt_set_nexthop(rth, res, 0);
+ rt_set_nexthop(rth, res, fi, type, 0);
rth->rt_flags = flags;
- *result = rth;
- return 0;
-}
-
-/* called with rcu_read_lock() */
-static int ip_mkroute_output(struct rtable **rp,
- struct fib_result *res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
-{
- struct rtable *rth = NULL;
- int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
- unsigned hash;
- if (err == 0) {
- hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
- rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
- }
-
- return err;
+ return rth;
}
/*
@@ -2497,6 +2437,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
struct fib_result res;
unsigned int flags = 0;
struct net_device *dev_out = NULL;
+ struct rtable *rth;
int err;
@@ -2505,6 +2446,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
res.r = NULL;
#endif
+ rcu_read_lock();
if (oldflp->fl4_src) {
err = -EINVAL;
if (ipv4_is_multicast(oldflp->fl4_src) ||
@@ -2645,7 +2587,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
else
#endif
if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
- fib_select_default(net, &fl, &res);
+ fib_select_default(&res);
if (!fl.fl4_src)
fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2655,17 +2597,27 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
make_route:
- err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+ rth = __mkroute_output(&res, &fl, oldflp, dev_out, flags);
+ if (IS_ERR(rth))
+ err = PTR_ERR(rth);
+ else {
+ unsigned int hash;
-out: return err;
+ hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
+ rt_genid(dev_net(dev_out)));
+ err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
+ }
+
+out:
+ rcu_read_unlock();
+ return err;
}
int __ip_route_output_key(struct net *net, struct rtable **rp,
const struct flowi *flp)
{
- unsigned int hash;
- int res;
struct rtable *rth;
+ unsigned int hash;
if (!rt_caching(net))
goto slow_output;
@@ -2695,10 +2647,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
rcu_read_unlock_bh();
slow_output:
- rcu_read_lock();
- res = ip_route_output_slow(net, rp, flp);
- rcu_read_unlock();
- return res;
+ return ip_route_output_slow(net, rp, flp);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
@@ -2731,12 +2680,11 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
{
struct rtable *ort = *rp;
struct rtable *rt = (struct rtable *)
- dst_alloc(&ipv4_dst_blackhole_ops);
+ dst_alloc(&ipv4_dst_blackhole_ops, 1);
if (rt) {
struct dst_entry *new = &rt->dst;
- atomic_set(&new->__refcnt, 1);
new->__use = 1;
new->input = dst_discard;
new->output = dst_discard;
@@ -2759,6 +2707,9 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
rt->peer = ort->peer;
if (rt->peer)
atomic_inc(&rt->peer->refcnt);
+ rt->fi = ort->fi;
+ if (rt->fi)
+ atomic_inc(&rt->fi->fib_clntref);
dst_free(new);
}
@@ -2835,7 +2786,7 @@ static int rt_fill_info(struct net *net,
}
if (rt->dst.dev)
NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid)
NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
@@ -2854,7 +2805,8 @@ static int rt_fill_info(struct net *net,
NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
error = rt->dst.error;
- expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
+ expires = (rt->peer && rt->peer->pmtu_expires) ?
+ rt->peer->pmtu_expires - jiffies : 0;
if (rt->peer) {
inet_peer_refcheck(rt->peer);
id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
@@ -3256,9 +3208,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
};
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
-#endif /* CONFIG_NET_CLS_ROUTE */
+#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
@@ -3274,7 +3226,7 @@ int __init ip_rt_init(void)
{
int rc = 0;
-#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_ROUTE_CLASSID
ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
if (!ip_rt_acct)
panic("IP: failed to allocate ip_rt_acct\n");
@@ -3311,14 +3263,6 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- /* All the timers, started at system startup tend
- to synchronize. Perturb it a bit.
- */
- INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
- expires_ljiffies = jiffies;
- schedule_delayed_work(&expires_work,
- net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
-
if (ip_rt_proc_init())
printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..f9867d2dbef4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
EXPORT_SYMBOL(compat_tcp_getsockopt);
#endif
-struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
+struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct tcphdr *th;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index eb7f82ebf4a3..2f692cefd3b0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
if (!cwnd)
- cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
+ cwnd = TCP_INIT_CWND;
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 02f583b3744a..e2b9be27f226 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1341,7 +1341,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
tcp_death_row.sysctl_tw_recycle &&
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
- peer->daddr.a4 == saddr) {
+ peer->daddr.addr.a4 == saddr) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..d37baaa1dbe3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
return 0;
}
-struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
+struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
unsigned int mss;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index b057d40addec..19fbdec6baaa 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -196,8 +196,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ dst_destroy_metrics_generic(dst);
+
if (likely(xdst->u.rt.peer))
inet_putpeer(xdst->u.rt.peer);
+
xfrm_dst_destroy(xdst);
}
@@ -215,6 +218,7 @@ static struct dst_ops xfrm4_dst_ops = {
.protocol = cpu_to_be16(ETH_P_IP),
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
+ .cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
.ifdown = xfrm4_dst_ifdown,
.local_out = __ip_local_out,
OpenPOWER on IntegriCloud