summaryrefslogtreecommitdiffstats
path: root/net/ipv4/route.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r--net/ipv4/route.c318
1 files changed, 200 insertions, 118 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ccb25d80f679..bf4e4adc2d00 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -360,18 +360,6 @@ static int rt_acct_proc_show(struct seq_file *m, void *v)
kfree(dst);
return 0;
}
-
-static int rt_acct_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rt_acct_proc_show, NULL);
-}
-
-static const struct file_operations rt_acct_proc_fops = {
- .open = rt_acct_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -389,7 +377,8 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
goto err2;
#ifdef CONFIG_IP_ROUTE_CLASSID
- pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+ pde = proc_create_single("rt_acct", 0, net->proc_net,
+ rt_acct_proc_show);
if (!pde)
goto err3;
#endif
@@ -709,7 +698,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_mtu_locked = lock;
- fnhe->fnhe_expires = expires;
+ fnhe->fnhe_expires = max(1UL, expires);
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
@@ -1297,6 +1286,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
@@ -1310,12 +1329,49 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (fnhe->fnhe_daddr == daddr)
+ if (fnhe->fnhe_daddr == daddr) {
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires)) {
+ ip_del_fnhe(nh, daddr);
+ break;
+ }
return fnhe;
+ }
}
return NULL;
}
+/* MTU selection:
+ * 1. mtu on route is locked - use it
+ * 2. mtu from nexthop exception
+ * 3. mtu from egress device
+ */
+
+u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
+{
+ struct fib_info *fi = res->fi;
+ struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
+ struct net_device *dev = nh->nh_dev;
+ u32 mtu = 0;
+
+ if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+ fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
+ mtu = fi->fib_mtu;
+
+ if (likely(!mtu)) {
+ struct fib_nh_exception *fnhe;
+
+ fnhe = find_exception(nh, daddr);
+ if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
+ mtu = fnhe->fnhe_pmtu;
+ }
+
+ if (likely(!mtu))
+ mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
+
+ return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+}
+
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr, const bool do_cache)
{
@@ -1339,6 +1395,7 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
fnhe->fnhe_gw = 0;
fnhe->fnhe_pmtu = 0;
fnhe->fnhe_expires = 0;
+ fnhe->fnhe_mtu_locked = false;
fnhe_flush_routes(fnhe);
orig = NULL;
}
@@ -1636,36 +1693,6 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
-{
- struct fnhe_hash_bucket *hash;
- struct fib_nh_exception *fnhe, __rcu **fnhe_p;
- u32 hval = fnhe_hashfun(daddr);
-
- spin_lock_bh(&fnhe_lock);
-
- hash = rcu_dereference_protected(nh->nh_exceptions,
- lockdep_is_held(&fnhe_lock));
- hash += hval;
-
- fnhe_p = &hash->chain;
- fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
- while (fnhe) {
- if (fnhe->fnhe_daddr == daddr) {
- rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
- fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
- fnhe_flush_routes(fnhe);
- kfree_rcu(fnhe, rcu);
- break;
- }
- fnhe_p = &fnhe->fnhe_next;
- fnhe = rcu_dereference_protected(fnhe->fnhe_next,
- lockdep_is_held(&fnhe_lock));
- }
-
- spin_unlock_bh(&fnhe_lock);
-}
-
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1719,20 +1746,10 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe) {
+ if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(&FIB_RES_NH(*res), daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
- }
- }
-
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-
-rt_cache:
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1964,8 +1981,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.saddr = saddr;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
+ if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
flkeys = &_flkeys;
+ } else {
+ fl4.flowi4_proto = 0;
+ fl4.fl4_sport = 0;
+ fl4.fl4_dport = 0;
+ }
err = fib_lookup(net, &fl4, res, 0);
if (err != 0) {
@@ -2216,39 +2238,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
* the loopback interface and the IP_PKTINFO ipi_ifindex will
* be set to the loopback interface as well.
*/
- fi = NULL;
+ do_cache = false;
}
fnhe = NULL;
do_cache &= fi != NULL;
- if (do_cache) {
+ if (fi) {
struct rtable __rcu **prth;
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
+ if (!do_cache)
+ goto add;
if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- rth = rcu_dereference(*prth);
- if (rth && rth->dst.expires &&
- time_after(jiffies, rth->dst.expires)) {
- ip_del_fnhe(nh, fl4->daddr);
- fnhe = NULL;
- } else {
- goto rt_cache;
+ } else {
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
}
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
-
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
- }
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
-
-rt_cache:
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth;
}
@@ -2580,11 +2594,10 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
-static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
- struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
- u32 seq)
+static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
+ struct rtable *rt, u32 table_id, struct flowi4 *fl4,
+ struct sk_buff *skb, u32 portid, u32 seq)
{
- struct rtable *rt = skb_rtable(skb);
struct rtmsg *r;
struct nlmsghdr *nlh;
unsigned long expires = 0;
@@ -2680,7 +2693,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
}
} else
#endif
- if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
goto nla_put_failure;
}
@@ -2695,43 +2708,93 @@ nla_put_failure:
return -EMSGSIZE;
}
+static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
+ u8 ip_proto, __be16 sport,
+ __be16 dport)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ /* Reserve room for dummy headers, this skb can pass
+ * through good chunk of routing engine.
+ */
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ iph = skb_put(skb, sizeof(struct iphdr));
+ iph->protocol = ip_proto;
+ iph->saddr = src;
+ iph->daddr = dst;
+ iph->version = 0x4;
+ iph->frag_off = 0;
+ iph->ihl = 0x5;
+ skb_set_transport_header(skb, skb->len);
+
+ switch (iph->protocol) {
+ case IPPROTO_UDP: {
+ struct udphdr *udph;
+
+ udph = skb_put_zero(skb, sizeof(struct udphdr));
+ udph->source = sport;
+ udph->dest = dport;
+ udph->len = sizeof(struct udphdr);
+ udph->check = 0;
+ break;
+ }
+ case IPPROTO_TCP: {
+ struct tcphdr *tcph;
+
+ tcph = skb_put_zero(skb, sizeof(struct tcphdr));
+ tcph->source = sport;
+ tcph->dest = dport;
+ tcph->doff = sizeof(struct tcphdr) / 4;
+ tcph->rst = 1;
+ tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
+ src, dst, 0);
+ break;
+ }
+ case IPPROTO_ICMP: {
+ struct icmphdr *icmph;
+
+ icmph = skb_put_zero(skb, sizeof(struct icmphdr));
+ icmph->type = ICMP_ECHO;
+ icmph->code = 0;
+ }
+ }
+
+ return skb;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
- struct rtmsg *rtm;
struct nlattr *tb[RTA_MAX+1];
+ u32 table_id = RT_TABLE_MAIN;
+ __be16 sport = 0, dport = 0;
struct fib_result res = {};
+ u8 ip_proto = IPPROTO_UDP;
struct rtable *rt = NULL;
+ struct sk_buff *skb;
+ struct rtmsg *rtm;
struct flowi4 fl4;
__be32 dst = 0;
__be32 src = 0;
+ kuid_t uid;
u32 iif;
int err;
int mark;
- struct sk_buff *skb;
- u32 table_id = RT_TABLE_MAIN;
- kuid_t uid;
err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
extack);
if (err < 0)
- goto errout;
+ return err;
rtm = nlmsg_data(nlh);
-
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb) {
- err = -ENOBUFS;
- goto errout;
- }
-
- /* Reserve room for dummy headers, this skb can pass
- through good chunk of routing engine.
- */
- skb_reset_mac_header(skb);
- skb_reset_network_header(skb);
-
src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
@@ -2741,14 +2804,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
else
uid = (iif ? INVALID_UID : current_uid());
- /* Bugfix: need to give ip_route_input enough of an IP header to
- * not gag.
- */
- ip_hdr(skb)->protocol = IPPROTO_UDP;
- ip_hdr(skb)->saddr = src;
- ip_hdr(skb)->daddr = dst;
+ if (tb[RTA_IP_PROTO]) {
+ err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+ &ip_proto, extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[RTA_SPORT])
+ sport = nla_get_be16(tb[RTA_SPORT]);
- skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+ if (tb[RTA_DPORT])
+ dport = nla_get_be16(tb[RTA_DPORT]);
+
+ skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
+ if (!skb)
+ return -ENOBUFS;
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
@@ -2757,6 +2828,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
+ if (sport)
+ fl4.fl4_sport = sport;
+ if (dport)
+ fl4.fl4_dport = dport;
+ fl4.flowi4_proto = ip_proto;
rcu_read_lock();
@@ -2766,10 +2842,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
err = -ENODEV;
- goto errout_free;
+ goto errout_rcu;
}
- skb->protocol = htons(ETH_P_IP);
+ fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
@@ -2789,7 +2865,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
if (err)
- goto errout_free;
+ goto errout_rcu;
if (rtm->rtm_flags & RTM_F_NOTIFY)
rt->rt_flags |= RTCF_NOTIFY;
@@ -2797,34 +2873,40 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = res.table ? res.table->tb_id : 0;
+ /* reset skb for netlink reply msg */
+ skb_trim(skb, 0);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb_reset_mac_header(skb);
+
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
- goto errout_free;
+ goto errout_rcu;
}
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);
} else {
- err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
+ err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
}
if (err < 0)
- goto errout_free;
+ goto errout_rcu;
rcu_read_unlock();
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
-errout:
- return err;
errout_free:
+ return err;
+errout_rcu:
rcu_read_unlock();
kfree_skb(skb);
- goto errout;
+ goto errout_free;
}
void ip_rt_multicast_event(struct in_device *in_dev)
OpenPOWER on IntegriCloud