diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv6 | |
download | blackbird-op-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz blackbird-op-linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2v2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/ipv6')
60 files changed, 35124 insertions, 0 deletions
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig new file mode 100644 index 000000000000..e66ca9381cfd --- /dev/null +++ b/net/ipv6/Kconfig @@ -0,0 +1,79 @@ +# +# IPv6 configuration +# +config IPV6_PRIVACY + bool "IPv6: Privacy Extensions (RFC 3041) support" + depends on IPV6 + ---help--- + Privacy Extensions for Stateless Address Autoconfiguration in IPv6 + support. With this option, additional periodically-alter + pseudo-random global-scope unicast address(es) will assigned to + your interface(s). + + By default, kernel do not generate temporary addresses. + To use temporary addresses, do + + echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr + + See <file:Documentation/networking/ip-sysctl.txt> for details. + +config INET6_AH + tristate "IPv6: AH transformation" + depends on IPV6 + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET6_ESP + tristate "IPv6: ESP transformation" + depends on IPV6 + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET6_IPCOMP + tristate "IPv6: IPComp transformation" + depends on IPV6 + select XFRM + select INET6_TUNNEL + select CRYPTO + select CRYPTO_DEFLATE + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config INET6_TUNNEL + tristate "IPv6: tunnel transformation" + depends on IPV6 + select XFRM + ---help--- + Support for generic IPv6-in-IPv6 tunnel transformation, which is + required by the IPv6-in-IPv6 tunneling module as well as tunnel mode + IPComp. + + If unsure, say Y. + +config IPV6_TUNNEL + tristate "IPv6: IPv6-in-IPv6 tunnel" + depends on IPV6 + select INET6_TUNNEL + ---help--- + Support for IPv6-in-IPv6 tunnels described in RFC 2473. + + If unsure, say N. + diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile new file mode 100644 index 000000000000..b39e04940590 --- /dev/null +++ b/net/ipv6/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for the Linux TCP/IP (INET6) layer. +# + +obj-$(CONFIG_IPV6) += ipv6.o + +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ + route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ + protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ + ip6_flowlabel.o ipv6_syms.o + +ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ + xfrm6_output.o +ipv6-objs += $(ipv6-y) + +obj-$(CONFIG_INET6_AH) += ah6.o +obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o +obj-$(CONFIG_INET6_TUNNEL) += xfrm6_tunnel.o +obj-$(CONFIG_NETFILTER) += netfilter/ + +obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o + +obj-y += exthdrs_core.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c new file mode 100644 index 000000000000..5ffde14ddc09 --- /dev/null +++ b/net/ipv6/addrconf.c @@ -0,0 +1,3615 @@ +/* + * IPv6 Address [auto]configuration + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * $Id: addrconf.c,v 1.69 2001/10/31 21:55:54 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Janos Farkas : delete timer on ifdown + * <chexum@bankinf.banki.hu> + * Andi Kleen : kill double kfree on module + * unload. + * Maciej W. Rozycki : FDDI support + * sekiya@USAGI : Don't send too many RS + * packets. + * yoshfuji@USAGI : Fixed interval between DAD + * packets. + * YOSHIFUJI Hideaki @USAGI : improved accuracy of + * address validation timer. + * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) + * support. + * Yuji SEKIYA @USAGI : Don't assign a same IPv6 + * address on a same interface. + * YOSHIFUJI Hideaki @USAGI : ARCnet support + * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to + * seq_file. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/if_arcnet.h> +#include <linux/if_infiniband.h> +#include <linux/route.h> +#include <linux/inetdevice.h> +#include <linux/init.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <linux/delay.h> +#include <linux/notifier.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/tcp.h> +#include <net/ip.h> +#include <linux/if_tunnel.h> +#include <linux/rtnetlink.h> + +#ifdef CONFIG_IPV6_PRIVACY +#include <linux/random.h> +#include <linux/crypto.h> +#include <asm/scatterlist.h> +#endif + +#include <asm/uaccess.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* Set to 3 to get tracing... */ +#define ACONF_DEBUG 2 + +#if ACONF_DEBUG >= 3 +#define ADBG(x) printk x +#else +#define ADBG(x) +#endif + +#define INFINITY_LIFE_TIME 0xFFFFFFFF +#define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) + +#ifdef CONFIG_SYSCTL +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); +static void addrconf_sysctl_unregister(struct ipv6_devconf *p); +#endif + +#ifdef CONFIG_IPV6_PRIVACY +static int __ipv6_regen_rndid(struct inet6_dev *idev); +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_regen_rndid(unsigned long data); + +static int desync_factor = MAX_DESYNC_FACTOR * HZ; +static struct crypto_tfm *md5_tfm; +static DEFINE_SPINLOCK(md5_tfm_lock); +#endif + +static int ipv6_count_addresses(struct inet6_dev *idev); + +/* + * Configured unicast address hash table + */ +static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_RWLOCK(addrconf_hash_lock); + +/* Protects inet6 devices */ +DEFINE_RWLOCK(addrconf_lock); + +static void addrconf_verify(unsigned long); + +static struct timer_list addr_chk_timer = + TIMER_INITIALIZER(addrconf_verify, 0, 0); +static DEFINE_SPINLOCK(addrconf_verify_lock); + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp); +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); + +static int addrconf_ifdown(struct net_device *dev, int how); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags); +static void addrconf_dad_timer(unsigned long data); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_rs_timer(unsigned long data); +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo); +static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev); + +static struct notifier_block *inet6addr_chain; + +struct ipv6_devconf ipv6_devconf = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .force_mld_version = 0, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, +}; + +static struct ipv6_devconf ipv6_devconf_dflt = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, +}; + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +#if 0 +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +#endif +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; + +int ipv6_addr_type(const struct in6_addr *addr) +{ + int type; + u32 st; + + st = addr->s6_addr32[0]; + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + type = IPV6_ADDR_MULTICAST; + + switch((st & htonl(0x00FF0000))) { + case __constant_htonl(0x00010000): + type |= IPV6_ADDR_LOOPBACK; + break; + + case __constant_htonl(0x00020000): + type |= IPV6_ADDR_LINKLOCAL; + break; + + case __constant_htonl(0x00050000): + type |= IPV6_ADDR_SITELOCAL; + break; + }; + return type; + } + + type = IPV6_ADDR_UNICAST; + + /* Consider all addresses with the first three bits different of + 000 and 111 as finished. + */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return type; + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | type); + + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | type); + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | type); + + return (IPV6_ADDR_COMPATv4 | type); + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return IPV6_ADDR_MAPPED; + } + + st &= htonl(0xFF000000); + if (st == 0) + return IPV6_ADDR_RESERVED; + st &= htonl(0xFE000000); + if (st == htonl(0x02000000)) + return IPV6_ADDR_RESERVED; /* for NSAP */ + if (st == htonl(0x04000000)) + return IPV6_ADDR_RESERVED; /* for IPX */ + return type; +} + +static void addrconf_del_timer(struct inet6_ifaddr *ifp) +{ + if (del_timer(&ifp->timer)) + __in6_ifa_put(ifp); +} + +enum addrconf_timer_t +{ + AC_NONE, + AC_DAD, + AC_RS, +}; + +static void addrconf_mod_timer(struct inet6_ifaddr *ifp, + enum addrconf_timer_t what, + unsigned long when) +{ + if (!del_timer(&ifp->timer)) + in6_ifa_hold(ifp); + + switch (what) { + case AC_DAD: + ifp->timer.function = addrconf_dad_timer; + break; + case AC_RS: + ifp->timer.function = addrconf_rs_timer; + break; + default:; + } + ifp->timer.expires = jiffies + when; + add_timer(&ifp->timer); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + BUG_TRAP(idev->addr_list==NULL); + BUG_TRAP(idev->mc_list==NULL); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + printk("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree(idev); +} + +static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +{ + struct inet6_dev *ndev; + + ASSERT_RTNL(); + + if (dev->mtu < IPV6_MIN_MTU) + return NULL; + + ndev = kmalloc(sizeof(struct inet6_dev), GFP_KERNEL); + + if (ndev) { + memset(ndev, 0, sizeof(struct inet6_dev)); + + rwlock_init(&ndev->lock); + ndev->dev = dev; + memcpy(&ndev->cnf, &ipv6_devconf_dflt, sizeof(ndev->cnf)); + ndev->cnf.mtu6 = dev->mtu; + ndev->cnf.sysctl = NULL; + ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); + if (ndev->nd_parms == NULL) { + kfree(ndev); + return NULL; + } + /* We refer to the device */ + dev_hold(dev); + + if (snmp6_alloc_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot allocate memory for statistics; dev=%s.\n", + __FUNCTION__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + if (snmp6_register_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot create /proc/net/dev_snmp6/%s\n", + __FUNCTION__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + /* One reference from device. We must do this before + * we invoke __ipv6_regen_rndid(). + */ + in6_dev_hold(ndev); + +#ifdef CONFIG_IPV6_PRIVACY + get_random_bytes(ndev->rndid, sizeof(ndev->rndid)); + get_random_bytes(ndev->entropy, sizeof(ndev->entropy)); + init_timer(&ndev->regen_timer); + ndev->regen_timer.function = ipv6_regen_rndid; + ndev->regen_timer.data = (unsigned long) ndev; + if ((dev->flags&IFF_LOOPBACK) || + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_SIT) { + printk(KERN_INFO + "Disabled Privacy Extensions on device %p(%s)\n", + dev, dev->name); + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); + ipv6_regen_rndid((unsigned long) ndev); + } +#endif + + write_lock_bh(&addrconf_lock); + dev->ip6_ptr = ndev; + write_unlock_bh(&addrconf_lock); + + ipv6_mc_init_dev(ndev); + ndev->tstamp = jiffies; +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, ndev->nd_parms, NET_IPV6, + NET_IPV6_NEIGH, "ipv6", + &ndisc_ifinfo_sysctl_change, + NULL); + addrconf_sysctl_register(ndev, &ndev->cnf); +#endif + } + return ndev; +} + +static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + if ((idev = __in6_dev_get(dev)) == NULL) { + if ((idev = ipv6_add_dev(dev)) == NULL) + return NULL; + } + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); + return idev; +} + +#ifdef CONFIG_SYSCTL +static void dev_forward_change(struct inet6_dev *idev) +{ + struct net_device *dev; + struct inet6_ifaddr *ifa; + struct in6_addr addr; + + if (!idev) + return; + dev = idev->dev; + if (dev && (dev->flags & IFF_MULTICAST)) { + ipv6_addr_all_routers(&addr); + + if (idev->cnf.forwarding) + ipv6_dev_mc_inc(dev, &addr); + else + ipv6_dev_mc_dec(dev, &addr); + } + for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + if (idev->cnf.forwarding) + addrconf_join_anycast(ifa); + else + addrconf_leave_anycast(ifa); + } +} + + +static void addrconf_forward_change(void) +{ + struct net_device *dev; + struct inet6_dev *idev; + + read_lock(&dev_base_lock); + for (dev=dev_base; dev; dev=dev->next) { + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); + idev->cnf.forwarding = ipv6_devconf.forwarding; + if (changed) + dev_forward_change(idev); + } + read_unlock(&addrconf_lock); + } + read_unlock(&dev_base_lock); +} +#endif + +/* Nobody refers to this ifaddr, destroy it */ + +void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) +{ + BUG_TRAP(ifp->if_next==NULL); + BUG_TRAP(ifp->lst_next==NULL); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); +#endif + + in6_dev_put(ifp->idev); + + if (del_timer(&ifp->timer)) + printk("Timer is still running, when freeing ifa=%p\n", ifp); + + if (!ifp->dead) { + printk("Freeing alive inet6 address %p\n", ifp); + return; + } + dst_release(&ifp->rt->u.dst); + + kfree(ifp); +} + +/* On success it returns ifp with increased reference count */ + +static struct inet6_ifaddr * +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, + int scope, unsigned flags) +{ + struct inet6_ifaddr *ifa = NULL; + struct rt6_info *rt; + int hash; + int err = 0; + + read_lock_bh(&addrconf_lock); + if (idev->dead) { + err = -ENODEV; /*XXX*/ + goto out2; + } + + write_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(addr, idev->dev)) { + ADBG(("ipv6_add_addr: already assigned\n")); + err = -EEXIST; + goto out; + } + + ifa = kmalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + + if (ifa == NULL) { + ADBG(("ipv6_add_addr: malloc failed\n")); + err = -ENOBUFS; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 0); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; + } + + memset(ifa, 0, sizeof(struct inet6_ifaddr)); + ipv6_addr_copy(&ifa->addr, addr); + + spin_lock_init(&ifa->lock); + init_timer(&ifa->timer); + ifa->timer.data = (unsigned long) ifa; + ifa->scope = scope; + ifa->prefix_len = pfxlen; + ifa->flags = flags | IFA_F_TENTATIVE; + ifa->cstamp = ifa->tstamp = jiffies; + + ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ + in6_ifa_hold(ifa); + + /* Add to big hash table */ + hash = ipv6_addr_hash(addr); + + ifa->lst_next = inet6_addr_lst[hash]; + inet6_addr_lst[hash] = ifa; + in6_ifa_hold(ifa); + write_unlock(&addrconf_hash_lock); + + write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. */ + ifa->if_next = idev->addr_list; + idev->addr_list = ifa; + +#ifdef CONFIG_IPV6_PRIVACY + if (ifa->flags&IFA_F_TEMPORARY) { + ifa->tmp_next = idev->tempaddr_list; + idev->tempaddr_list = ifa; + in6_ifa_hold(ifa); + } +#endif + + ifa->rt = rt; + + in6_ifa_hold(ifa); + write_unlock(&idev->lock); +out2: + read_unlock_bh(&addrconf_lock); + + if (unlikely(err == 0)) + notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + else { + kfree(ifa); + ifa = ERR_PTR(err); + } + + return ifa; +out: + write_unlock(&addrconf_hash_lock); + goto out2; +} + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifa, **ifap; + struct inet6_dev *idev = ifp->idev; + int hash; + int deleted = 0, onlink = 0; + unsigned long expires = jiffies; + + hash = ipv6_addr_hash(&ifp->addr); + + ifp->dead = 1; + + write_lock_bh(&addrconf_hash_lock); + for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL; + ifap = &ifa->lst_next) { + if (ifa == ifp) { + *ifap = ifa->lst_next; + __in6_ifa_put(ifp); + ifa->lst_next = NULL; + break; + } + } + write_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (ifp->flags&IFA_F_TEMPORARY) { + for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL; + ifap = &ifa->tmp_next) { + if (ifa == ifp) { + *ifap = ifa->tmp_next; + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + __in6_ifa_put(ifp); + ifa->tmp_next = NULL; + break; + } + } + } +#endif + + for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; + ifap = &ifa->if_next) { + if (ifa == ifp) { + *ifap = ifa->if_next; + __in6_ifa_put(ifp); + ifa->if_next = NULL; + if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) + break; + deleted = 1; + } else if (ifp->flags & IFA_F_PERMANENT) { + if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) { + if (ifa->flags & IFA_F_PERMANENT) { + onlink = 1; + if (deleted) + break; + } else { + unsigned long lifetime; + + if (!onlink) + onlink = -1; + + spin_lock(&ifa->lock); + lifetime = min_t(unsigned long, + ifa->valid_lft, 0x7fffffffUL/HZ); + if (time_before(expires, + ifa->tstamp + lifetime * HZ)) + expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); + } + } + } + } + write_unlock_bh(&idev->lock); + + ipv6_ifa_notify(RTM_DELADDR, ifp); + + notifier_call_chain(&inet6addr_chain,NETDEV_DOWN,ifp); + + addrconf_del_timer(ifp); + + /* + * Purge or update corresponding prefix + * + * 1) we don't purge prefix here if address was not permanent. + * prefix is managed by its own lifetime. + * 2) if there're no addresses, delete prefix. + * 3) if there're still other permanent address(es), + * corresponding prefix is still permanent. + * 4) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. + * + * --yoshfuji + */ + if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { + struct in6_addr prefix; + struct rt6_info *rt; + + ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); + rt = rt6_lookup(&prefix, NULL, ifp->idev->dev->ifindex, 1); + + if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { + if (onlink == 0) { + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { + rt->rt6i_expires = expires; + rt->rt6i_flags |= RTF_EXPIRES; + } + } + dst_release(&rt->u.dst); + } + + in6_ifa_put(ifp); +} + +#ifdef CONFIG_IPV6_PRIVACY +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +{ + struct inet6_dev *idev = ifp->idev; + struct in6_addr addr, *tmpaddr; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + int tmp_plen; + int ret = 0; + int max_addresses; + + write_lock(&idev->lock); + if (ift) { + spin_lock_bh(&ift->lock); + memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); + spin_unlock_bh(&ift->lock); + tmpaddr = &addr; + } else { + tmpaddr = NULL; + } +retry: + in6_dev_hold(idev); + if (idev->cnf.use_tempaddr <= 0) { + write_unlock(&idev->lock); + printk(KERN_INFO + "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + spin_lock_bh(&ifp->lock); + if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { + idev->cnf.use_tempaddr = -1; /*XXX*/ + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + in6_ifa_hold(ifp); + memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); + if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + memcpy(&addr.s6_addr[8], idev->rndid, 8); + tmp_valid_lft = min_t(__u32, + ifp->valid_lft, + idev->cnf.temp_valid_lft); + tmp_prefered_lft = min_t(__u32, + ifp->prefered_lft, + idev->cnf.temp_prefered_lft - desync_factor / HZ); + tmp_plen = ifp->prefix_len; + max_addresses = idev->cnf.max_addresses; + tmp_cstamp = ifp->cstamp; + tmp_tstamp = ifp->tstamp; + spin_unlock_bh(&ifp->lock); + + write_unlock(&idev->lock); + ift = !max_addresses || + ipv6_count_addresses(idev) < max_addresses ? + ipv6_add_addr(idev, &addr, tmp_plen, + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : NULL; + if (!ift || IS_ERR(ift)) { + in6_ifa_put(ifp); + in6_dev_put(idev); + printk(KERN_INFO + "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + tmpaddr = &addr; + write_lock(&idev->lock); + goto retry; + } + + spin_lock_bh(&ift->lock); + ift->ifpub = ifp; + ift->valid_lft = tmp_valid_lft; + ift->prefered_lft = tmp_prefered_lft; + ift->cstamp = tmp_cstamp; + ift->tstamp = tmp_tstamp; + spin_unlock_bh(&ift->lock); + + addrconf_dad_start(ift, 0); + in6_ifa_put(ift); + in6_dev_put(idev); +out: + return ret; +} +#endif + +/* + * Choose an appropriate source address + * should do: + * i) get an address with an appropriate scope + * ii) see if there is a specific route for the destination and use + * an address of the attached interface + * iii) don't use deprecated addresses + */ +static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +{ + int pref; + pref = ifp->flags&IFA_F_DEPRECATED ? 0 : 2; +#ifdef CONFIG_IPV6_PRIVACY + pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; +#endif + return pref; +} + +#ifdef CONFIG_IPV6_PRIVACY +#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) +#else +#define IPV6_GET_SADDR_MAXSCORE(score) (score) +#endif + +int ipv6_dev_get_saddr(struct net_device *dev, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct inet6_ifaddr *ifp = NULL; + struct inet6_ifaddr *match = NULL; + struct inet6_dev *idev; + int scope; + int err; + int hiscore = -1, score; + + scope = ipv6_addr_scope(daddr); + + /* + * known dev + * search dev and walk through dev addresses + */ + + if (dev) { + if (dev->flags & IFF_LOOPBACK) + scope = IFA_HOST; + + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == scope) { + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { + read_unlock_bh(&idev->lock); + read_unlock(&addrconf_lock); + goto out; + } + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + } + + if (scope == IFA_LINK) + goto out; + + /* + * dev == NULL or search failed for specified dev + */ + + read_lock(&dev_base_lock); + read_lock(&addrconf_lock); + for (dev = dev_base; dev; dev=dev->next) { + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == scope) { + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { + read_unlock_bh(&idev->lock); + goto out_unlock_base; + } + } + } + read_unlock_bh(&idev->lock); + } + } + +out_unlock_base: + read_unlock(&addrconf_lock); + read_unlock(&dev_base_lock); + +out: + err = -EADDRNOTAVAIL; + if (match) { + ipv6_addr_copy(saddr, &match->addr); + err = 0; + in6_ifa_put(match); + } + + return err; +} + + +int ipv6_get_saddr(struct dst_entry *dst, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + return ipv6_dev_get_saddr(dst ? ((struct rt6_info *)dst)->rt6i_idev->dev : NULL, daddr, saddr); +} + + +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + read_lock(&addrconf_lock); + if ((idev = __in6_dev_get(dev)) != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + ipv6_addr_copy(addr, &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + return err; +} + +static int ipv6_count_addresses(struct inet6_dev *idev) +{ + int cnt = 0; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) + cnt++; + read_unlock_bh(&idev->lock); + return cnt; +} + +int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr) && + !(ifp->flags&IFA_F_TENTATIVE)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) + break; + } + } + read_unlock_bh(&addrconf_hash_lock); + return ifp != NULL; +} + +static +int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev) + break; + } + } + return ifp != NULL; +} + +struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *dev, int strict) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + in6_ifa_hold(ifp); + break; + } + } + } + read_unlock_bh(&addrconf_hash_lock); + + return ifp; +} + +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +{ + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; + u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + if (!sk2_rcv_saddr && !sk_ipv6only) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + if (addr_type == IPV6_ADDR_MAPPED && + !sk2_ipv6only && + (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr)) + return 1; + + return 0; +} + +/* Gets referenced address, destroys ifaddr */ + +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + if (net_ratelimit()) + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + if (ifp->flags&IFA_F_PERMANENT) { + spin_lock_bh(&ifp->lock); + addrconf_del_timer(ifp); + ifp->flags |= IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + in6_ifa_put(ifp); +#ifdef CONFIG_IPV6_PRIVACY + } else if (ifp->flags&IFA_F_TEMPORARY) { + struct inet6_ifaddr *ifpub; + spin_lock_bh(&ifp->lock); + ifpub = ifp->ifpub; + if (ifpub) { + in6_ifa_hold(ifpub); + spin_unlock_bh(&ifp->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + } else { + spin_unlock_bh(&ifp->lock); + } + ipv6_del_addr(ifp); +#endif + } else + ipv6_del_addr(ifp); +} + + +/* Join to solicited addr multicast group. */ + +void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + __ipv6_dev_mc_dec(idev, &maddr); +} + +void addrconf_join_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + ipv6_dev_ac_inc(ifp->idev->dev, &addr); +} + +void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + __ipv6_dev_ac_dec(ifp->idev, &addr); +} + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802_TR: + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + + /* + * The zSeries OSA network cards can be shared among various + * OS instances, but the OSA cards have only one MAC address. + * This leads to duplicate address conflicts in conjunction + * with IPv6 if more than one instance uses the same card. + * + * The driver for these cards can deliver a unique 16-bit + * identifier for each instance sharing the same card. It is + * placed instead of 0xFFFE in the interface identifier. The + * "u" bit of the interface identifier is not inverted in this + * case. Hence the resulting interface identifier has local + * scope according to RFC2373. + */ + if (dev->dev_id) { + eui[3] = (dev->dev_id >> 8) & 0xFF; + eui[4] = dev->dev_id & 0xFF; + } else { + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + } + return 0; + case ARPHRD_ARCNET: + /* XXX: inherit EUI-64 from other interface -- yoshfuji */ + if (dev->addr_len != ARCNET_ALEN) + return -1; + memset(eui, 0, 7); + eui[7] = *(u8*)dev->dev_addr; + return 0; + case ARPHRD_INFINIBAND: + if (dev->addr_len != INFINIBAND_ALEN) + return -1; + memcpy(eui, dev->dev_addr + 12, 8); + eui[0] |= 2; + return 0; + } + return -1; +} + +static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) +{ + int err = -1; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + memcpy(eui, ifp->addr.s6_addr+8, 8); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + return err; +} + +#ifdef CONFIG_IPV6_PRIVACY +/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ +static int __ipv6_regen_rndid(struct inet6_dev *idev) +{ + struct net_device *dev; + struct scatterlist sg[2]; + + sg[0].page = virt_to_page(idev->entropy); + sg[0].offset = offset_in_page(idev->entropy); + sg[0].length = 8; + sg[1].page = virt_to_page(idev->work_eui64); + sg[1].offset = offset_in_page(idev->work_eui64); + sg[1].length = 8; + + dev = idev->dev; + + if (ipv6_generate_eui64(idev->work_eui64, dev)) { + printk(KERN_INFO + "__ipv6_regen_rndid(idev=%p): cannot get EUI64 identifier; use random bytes.\n", + idev); + get_random_bytes(idev->work_eui64, sizeof(idev->work_eui64)); + } +regen: + spin_lock(&md5_tfm_lock); + if (unlikely(md5_tfm == NULL)) { + spin_unlock(&md5_tfm_lock); + return -1; + } + crypto_digest_init(md5_tfm); + crypto_digest_update(md5_tfm, sg, 2); + crypto_digest_final(md5_tfm, idev->work_digest); + spin_unlock(&md5_tfm_lock); + + memcpy(idev->rndid, &idev->work_digest[0], 8); + idev->rndid[0] &= ~0x02; + memcpy(idev->entropy, &idev->work_digest[8], 8); + + /* + * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>: + * check if generated address is not inappropriate + * + * - Reserved subnet anycast (RFC 2526) + * 11111101 11....11 1xxxxxxx + * - ISATAP (draft-ietf-ngtrans-isatap-13.txt) 5.1 + * 00-00-5E-FE-xx-xx-xx-xx + * - value 0 + * - XXX: already assigned to an address on the device + */ + if (idev->rndid[0] == 0xfd && + (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && + (idev->rndid[7]&0x80)) + goto regen; + if ((idev->rndid[0]|idev->rndid[1]) == 0) { + if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) + goto regen; + if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) + goto regen; + } + + return 0; +} + +static void ipv6_regen_rndid(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *) data; + unsigned long expires; + + read_lock_bh(&addrconf_lock); + write_lock_bh(&idev->lock); + + if (idev->dead) + goto out; + + if (__ipv6_regen_rndid(idev) < 0) + goto out; + + expires = jiffies + + idev->cnf.temp_prefered_lft * HZ - + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor; + if (time_before(expires, jiffies)) { + printk(KERN_WARNING + "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", + idev->dev->name); + goto out; + } + + if (!mod_timer(&idev->regen_timer, expires)) + in6_dev_hold(idev); + +out: + write_unlock_bh(&idev->lock); + read_unlock_bh(&addrconf_lock); + in6_dev_put(idev); +} + +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { + int ret = 0; + + if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) + ret = __ipv6_regen_rndid(idev); + return ret; +} +#endif + +/* + * Add prefix route. + */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, + unsigned long expires, unsigned flags) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_copy(&rtmsg.rtmsg_dst, pfx); + rtmsg.rtmsg_dst_len = plen; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_info = expires; + rtmsg.rtmsg_flags = RTF_UP|flags; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. + */ + if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) + rtmsg.rtmsg_flags |= RTF_NONEXTHOP; + + ip6_route_add(&rtmsg, NULL, NULL); +} + +/* Create "default" multicast route to the interface */ + +static void addrconf_add_mroute(struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_set(&rtmsg.rtmsg_dst, + htonl(0xFF000000), 0, 0, 0); + rtmsg.rtmsg_dst_len = 8; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_flags = RTF_UP; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + ip6_route_add(&rtmsg, NULL, NULL); +} + +static void sit_route_add(struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + + /* prefix length - 96 bits "::d.d.d.d" */ + rtmsg.rtmsg_dst_len = 96; + rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; + rtmsg.rtmsg_ifindex = dev->ifindex; + + ip6_route_add(&rtmsg, NULL, NULL); +} + +static void addrconf_add_lroute(struct net_device *dev) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) + return NULL; + + /* Add default multicast route */ + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + return idev; +} + +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) +{ + struct prefix_info *pinfo; + __u32 valid_lft; + __u32 prefered_lft; + int addr_type; + unsigned long rt_expires; + struct inet6_dev *in6_dev; + + pinfo = (struct prefix_info *) opt; + + if (len < sizeof(struct prefix_info)) { + ADBG(("addrconf: prefix option too short\n")); + return; + } + + /* + * Validation checks ([ADDRCONF], page 19) + */ + + addr_type = ipv6_addr_type(&pinfo->prefix); + + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) + return; + + valid_lft = ntohl(pinfo->valid); + prefered_lft = ntohl(pinfo->prefered); + + if (prefered_lft > valid_lft) { + if (net_ratelimit()) + printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + return; + } + + in6_dev = in6_dev_get(dev); + + if (in6_dev == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + + /* + * Two things going on here: + * 1) Add routes for on-link prefixes + * 2) Configure prefixes with the auto flag set + */ + + /* Avoid arithmetic overflow. Really, we could + save rt_expires in seconds, likely valid_lft, + but it would require division in fib gc, that it + not good. + */ + if (valid_lft >= 0x7FFFFFFF/HZ) + rt_expires = 0; + else + rt_expires = jiffies + valid_lft * HZ; + + if (pinfo->onlink) { + struct rt6_info *rt; + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); + + if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { + if (rt->rt6i_flags&RTF_EXPIRES) { + if (valid_lft == 0) { + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } else { + rt->rt6i_expires = rt_expires; + } + } + } else if (valid_lft) { + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + } + if (rt) + dst_release(&rt->u.dst); + } + + /* Try to figure out our local address for this prefix */ + + if (pinfo->autoconf && in6_dev->cnf.autoconf) { + struct inet6_ifaddr * ifp; + struct in6_addr addr; + int create = 0, update_lft = 0; + + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + in6_dev_put(in6_dev); + return; + } + goto ok; + } + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); + in6_dev_put(in6_dev); + return; + +ok: + + ifp = ipv6_get_ifaddr(&addr, dev, 1); + + if (ifp == NULL && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, 0); + + if (!ifp || IS_ERR(ifp)) { + in6_dev_put(in6_dev); + return; + } + + update_lft = create = 1; + ifp->cstamp = jiffies; + addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + } + + if (ifp) { + int flags; + unsigned long now; +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *ift; +#endif + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && stored_lft) { + if (valid_lft > MIN_VALID_LIFETIME || + valid_lft > stored_lft) + update_lft = 1; + else if (stored_lft <= MIN_VALID_LIFETIME) { + /* valid_lft <= stored_lft is always true */ + /* XXX: IPsec */ + update_lft = 0; + } else { + valid_lft = MIN_VALID_LIFETIME; + if (valid_lft < prefered_lft) + prefered_lft = valid_lft; + update_lft = 1; + } + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock(&ifp->lock); + +#ifdef CONFIG_IPV6_PRIVACY + read_lock_bh(&in6_dev->lock); + /* update all temporary addresses in the list */ + for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) { + /* + * When adjusting the lifetimes of an existing + * temporary address, only lower the lifetimes. + * Implementations must not increase the + * lifetimes of an existing temporary address + * when processing a Prefix Information Option. + */ + spin_lock(&ift->lock); + flags = ift->flags; + if (ift->valid_lft > valid_lft && + ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) + ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; + if (ift->prefered_lft > prefered_lft && + ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) + ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } + + if (create && in6_dev->cnf.use_tempaddr > 0) { + /* + * When a new public address is created as described in [ADDRCONF], + * also create a new temporary address. + */ + read_unlock_bh(&in6_dev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&in6_dev->lock); + } +#endif + in6_ifa_put(ifp); + addrconf_verify(0); + } + } + inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); + in6_dev_put(in6_dev); +} + +/* + * Set destination address. + * Special case for SIT interfaces where we create a new "virtual" + * device. + */ +int addrconf_set_dstaddr(void __user *arg) +{ + struct in6_ifreq ireq; + struct net_device *dev; + int err = -EINVAL; + + rtnl_lock(); + + err = -EFAULT; + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + goto err_exit; + + dev = __dev_get_by_index(ireq.ifr6_ifindex); + + err = -ENODEV; + if (dev == NULL) + goto err_exit; + + if (dev->type == ARPHRD_SIT) { + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + + err = -EADDRNOTAVAIL; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) + goto err_exit; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (void __user *)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0) { + err = -ENOBUFS; + if ((dev = __dev_get_by_name(p.name)) == NULL) + goto err_exit; + err = dev_open(dev); + } + } + +err_exit: + rtnl_unlock(); + return err; +} + +/* + * Manual configuration of address on an interface + */ +static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + int scope; + + ASSERT_RTNL(); + + if ((dev = __dev_get_by_index(ifindex)) == NULL) + return -ENODEV; + + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + + if ((idev = addrconf_add_dev(dev)) == NULL) + return -ENOBUFS; + + scope = ipv6_addr_scope(pfx); + + ifp = ipv6_add_addr(idev, pfx, plen, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + return 0; + } + + return PTR_ERR(ifp); +} + +static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + + if ((dev = __dev_get_by_index(ifindex)) == NULL) + return -ENODEV; + + if ((idev = __in6_dev_get(dev)) == NULL) + return -ENXIO; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->prefix_len == plen && + ipv6_addr_equal(pfx, &ifp->addr)) { + in6_ifa_hold(ifp); + read_unlock_bh(&idev->lock); + + ipv6_del_addr(ifp); + + /* If the last address is deleted administratively, + disable IPv6 on this interface. + */ + if (idev->addr_list == NULL) + addrconf_ifdown(idev->dev, 1); + return 0; + } + } + read_unlock_bh(&idev->lock); + return -EADDRNOTAVAIL; +} + + +int addrconf_add_ifaddr(void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +int addrconf_del_ifaddr(void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_del(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +static void sit_add_v4_addrs(struct inet6_dev *idev) +{ + struct inet6_ifaddr * ifp; + struct in6_addr addr; + struct net_device *dev; + int scope; + + ASSERT_RTNL(); + + memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + + if (idev->dev->flags&IFF_POINTOPOINT) { + addr.s6_addr32[0] = htonl(0xfe800000); + scope = IFA_LINK; + } else { + scope = IPV6_ADDR_COMPATv4; + } + + if (addr.s6_addr32[3]) { + ifp = ipv6_add_addr(idev, &addr, 128, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } + return; + } + + for (dev = dev_base; dev != NULL; dev = dev->next) { + struct in_device * in_dev = __in_dev_get(dev); + if (in_dev && (dev->flags & IFF_UP)) { + struct in_ifaddr * ifa; + + int flag = scope; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + int plen; + + addr.s6_addr32[3] = ifa->ifa_local; + + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if (idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } + if (idev->dev->flags&IFF_POINTOPOINT) + plen = 64; + else + plen = 96; + + ifp = ipv6_add_addr(idev, &addr, plen, flag, + IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } + } + } + } +} + +static void init_loopback(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr * ifp; + + /* ::1 */ + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init loopback: add_dev failed\n"); + return; + } + + ifp = ipv6_add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + +static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + + ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + } +} + +static void addrconf_dev_config(struct net_device *dev) +{ + struct in6_addr addr; + struct inet6_dev * idev; + + ASSERT_RTNL(); + + if ((dev->type != ARPHRD_ETHER) && + (dev->type != ARPHRD_FDDI) && + (dev->type != ARPHRD_IEEE802_TR) && + (dev->type != ARPHRD_ARCNET) && + (dev->type != ARPHRD_INFINIBAND)) { + /* Alas, we support only Ethernet autoconfiguration. */ + return; + } + + idev = addrconf_add_dev(dev); + if (idev == NULL) + return; + + memset(&addr, 0, sizeof(struct in6_addr)); + addr.s6_addr32[0] = htonl(0xFE800000); + + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0) + addrconf_add_linklocal(idev, &addr); +} + +static void addrconf_sit_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + /* + * Configure the tunnel with one of our IPv4 + * addresses... we should configure all of + * our v4 addrs in the tunnel + */ + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init sit: add_dev failed\n"); + return; + } + + sit_add_v4_addrs(idev); + + if (dev->flags&IFF_POINTOPOINT) { + addrconf_add_mroute(dev); + addrconf_add_lroute(dev); + } else + sit_route_add(dev); +} + +static inline int +ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev) +{ + struct in6_addr lladdr; + + if (!ipv6_get_lladdr(link_dev, &lladdr)) { + addrconf_add_linklocal(idev, &lladdr); + return 0; + } + return -1; +} + +static void ip6_tnl_add_linklocal(struct inet6_dev *idev) +{ + struct net_device *link_dev; + + /* first try to inherit the link-local address from the link device */ + if (idev->dev->iflink && + (link_dev = __dev_get_by_index(idev->dev->iflink))) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + /* then try to inherit it from any device */ + for (link_dev = dev_base; link_dev; link_dev = link_dev->next) { + if (!ipv6_inherit_linklocal(idev, link_dev)) + return; + } + printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n"); +} + +/* + * Autoconfigure tunnel with a link-local address so routing protocols, + * DHCPv6, MLD etc. can be run over the virtual link + */ + +static void addrconf_ip6_tnl_config(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + if ((idev = addrconf_add_dev(dev)) == NULL) { + printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n"); + return; + } + ip6_tnl_add_linklocal(idev); + addrconf_add_mroute(dev); +} + +static int addrconf_notify(struct notifier_block *this, unsigned long event, + void * data) +{ + struct net_device *dev = (struct net_device *) data; + struct inet6_dev *idev = __in6_dev_get(dev); + + switch(event) { + case NETDEV_UP: + switch(dev->type) { + case ARPHRD_SIT: + addrconf_sit_config(dev); + break; + case ARPHRD_TUNNEL6: + addrconf_ip6_tnl_config(dev); + break; + case ARPHRD_LOOPBACK: + init_loopback(dev); + break; + + default: + addrconf_dev_config(dev); + break; + }; + if (idev) { + /* If the MTU changed during the interface down, when the + interface up, the changed MTU must be reflected in the + idev as well as routers. + */ + if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + } + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + /* If the changed mtu during down is lower than IPV6_MIN_MTU + stop IPv6 on this interface. + */ + if (dev->mtu < IPV6_MIN_MTU) + addrconf_ifdown(dev, event != NETDEV_DOWN); + } + break; + + case NETDEV_CHANGEMTU: + if ( idev && dev->mtu >= IPV6_MIN_MTU) { + rt6_mtu_change(dev, dev->mtu); + idev->cnf.mtu6 = dev->mtu; + break; + } + + /* MTU falled under IPV6_MIN_MTU. Stop IPv6 on this interface. */ + + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + /* + * Remove all addresses from this interface. + */ + addrconf_ifdown(dev, event != NETDEV_DOWN); + break; + case NETDEV_CHANGE: + break; + case NETDEV_CHANGENAME: +#ifdef CONFIG_SYSCTL + if (idev) { + addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); + neigh_sysctl_register(dev, idev->nd_parms, + NET_IPV6, NET_IPV6_NEIGH, "ipv6", + &ndisc_ifinfo_sysctl_change, + NULL); + addrconf_sysctl_register(idev, &idev->cnf); + } +#endif + break; + }; + + return NOTIFY_OK; +} + +/* + * addrconf module should be notified of a device going up + */ +static struct notifier_block ipv6_dev_notf = { + .notifier_call = addrconf_notify, + .priority = 0 +}; + +static int addrconf_ifdown(struct net_device *dev, int how) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa, **bifa; + int i; + + ASSERT_RTNL(); + + if (dev == &loopback_dev && how == 1) + how = 0; + + rt6_ifdown(dev); + neigh_ifdown(&nd_tbl, dev); + + idev = __in6_dev_get(dev); + if (idev == NULL) + return -ENODEV; + + /* Step 1: remove reference to ipv6 device from parent device. + Do not dev_put! + */ + if (how == 1) { + write_lock_bh(&addrconf_lock); + dev->ip6_ptr = NULL; + idev->dead = 1; + write_unlock_bh(&addrconf_lock); + + /* Step 1.5: remove snmp6 entry */ + snmp6_unregister_dev(idev); + + } + + /* Step 2: clear hash table */ + for (i=0; i<IN6_ADDR_HSIZE; i++) { + bifa = &inet6_addr_lst[i]; + + write_lock_bh(&addrconf_hash_lock); + while ((ifa = *bifa) != NULL) { + if (ifa->idev == idev) { + *bifa = ifa->lst_next; + ifa->lst_next = NULL; + addrconf_del_timer(ifa); + in6_ifa_put(ifa); + continue; + } + bifa = &ifa->lst_next; + } + write_unlock_bh(&addrconf_hash_lock); + } + + write_lock_bh(&idev->lock); + + /* Step 3: clear flags for stateless addrconf */ + if (how != 1) + idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD); + + /* Step 4: clear address list */ +#ifdef CONFIG_IPV6_PRIVACY + if (how == 1 && del_timer(&idev->regen_timer)) + in6_dev_put(idev); + + /* clear tempaddr list */ + while ((ifa = idev->tempaddr_list) != NULL) { + idev->tempaddr_list = ifa->tmp_next; + ifa->tmp_next = NULL; + ifa->dead = 1; + write_unlock_bh(&idev->lock); + spin_lock_bh(&ifa->lock); + + if (ifa->ifpub) { + in6_ifa_put(ifa->ifpub); + ifa->ifpub = NULL; + } + spin_unlock_bh(&ifa->lock); + in6_ifa_put(ifa); + write_lock_bh(&idev->lock); + } +#endif + while ((ifa = idev->addr_list) != NULL) { + idev->addr_list = ifa->if_next; + ifa->if_next = NULL; + ifa->dead = 1; + addrconf_del_timer(ifa); + write_unlock_bh(&idev->lock); + + __ipv6_ifa_notify(RTM_DELADDR, ifa); + in6_ifa_put(ifa); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); + + /* Step 5: Discard multicast list */ + + if (how == 1) + ipv6_mc_destroy_dev(idev); + else + ipv6_mc_down(idev); + + /* Step 5: netlink notification of this interface */ + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + + /* Shot the device (if unregistered) */ + + if (how == 1) { +#ifdef CONFIG_SYSCTL + addrconf_sysctl_unregister(&idev->cnf); + neigh_sysctl_unregister(idev->nd_parms); +#endif + neigh_parms_release(&nd_tbl, idev->nd_parms); + neigh_ifdown(&nd_tbl, dev); + in6_dev_put(idev); + } + return 0; +} + +static void addrconf_rs_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + + if (ifp->idev->cnf.forwarding) + goto out; + + if (ifp->idev->if_flags & IF_RA_RCVD) { + /* + * Announcement received after solicitation + * was sent + */ + goto out; + } + + spin_lock(&ifp->lock); + if (ifp->probes++ < ifp->idev->cnf.rtr_solicits) { + struct in6_addr all_routers; + + /* The wait after the last probe can be shorter */ + addrconf_mod_timer(ifp, AC_RS, + (ifp->probes == ifp->idev->cnf.rtr_solicits) ? + ifp->idev->cnf.rtr_solicit_delay : + ifp->idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + + ipv6_addr_all_routers(&all_routers); + + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); + } else { + spin_unlock(&ifp->lock); + /* + * Note: we do not support deprecated "all on-link" + * assumption any longer. + */ + printk(KERN_DEBUG "%s: no IPv6 routers present\n", + ifp->idev->dev->name); + } + +out: + in6_ifa_put(ifp); +} + +/* + * Duplicate Address Detection + */ +static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags) +{ + struct inet6_dev *idev = ifp->idev; + struct net_device *dev = idev->dev; + unsigned long rand_num; + + addrconf_join_solict(dev, &ifp->addr); + + if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT)) + addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0, + flags); + + net_srandom(ifp->addr.s6_addr32[3]); + rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); + + read_lock_bh(&idev->lock); + if (ifp->dead) + goto out; + spin_lock_bh(&ifp->lock); + + if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || + !(ifp->flags&IFA_F_TENTATIVE)) { + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); + + addrconf_dad_completed(ifp); + return; + } + + ifp->probes = idev->cnf.dad_transmits; + addrconf_mod_timer(ifp, AC_DAD, rand_num); + + spin_unlock_bh(&ifp->lock); +out: + read_unlock_bh(&idev->lock); +} + +static void addrconf_dad_timer(unsigned long data) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; + struct inet6_dev *idev = ifp->idev; + struct in6_addr unspec; + struct in6_addr mcaddr; + + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); + goto out; + } + spin_lock_bh(&ifp->lock); + if (ifp->probes == 0) { + /* + * DAD was successful + */ + + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); + + addrconf_dad_completed(ifp); + + goto out; + } + + ifp->probes--; + addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); + spin_unlock_bh(&ifp->lock); + read_unlock_bh(&idev->lock); + + /* send a neighbour solicitation for our addr */ + memset(&unspec, 0, sizeof(unspec)); + addrconf_addr_solict_mult(&ifp->addr, &mcaddr); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); +out: + in6_ifa_put(ifp); +} + +static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +{ + struct net_device * dev = ifp->idev->dev; + + /* + * Configure the address for reception. Now it is valid. + */ + + ipv6_ifa_notify(RTM_NEWADDR, ifp); + + /* If added prefix is link local and forwarding is off, + start sending router solicitations. + */ + + if (ifp->idev->cnf.forwarding == 0 && + ifp->idev->cnf.rtr_solicits > 0 && + (dev->flags&IFF_LOOPBACK) == 0 && + (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { + struct in6_addr all_routers; + + ipv6_addr_all_routers(&all_routers); + + /* + * If a host as already performed a random delay + * [...] as part of DAD [...] there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); + + spin_lock_bh(&ifp->lock); + ifp->probes = 1; + ifp->idev->if_flags |= IF_RS_SENT; + addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); + spin_unlock_bh(&ifp->lock); + } +} + +#ifdef CONFIG_PROC_FS +struct if6_iter_state { + int bucket; +}; + +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) +{ + struct inet6_ifaddr *ifa = NULL; + struct if6_iter_state *state = seq->private; + + for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + ifa = inet6_addr_lst[state->bucket]; + if (ifa) + break; + } + return ifa; +} + +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa) +{ + struct if6_iter_state *state = seq->private; + + ifa = ifa->lst_next; +try_again: + if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { + ifa = inet6_addr_lst[state->bucket]; + goto try_again; + } + return ifa; +} + +static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct inet6_ifaddr *ifa = if6_get_first(seq); + + if (ifa) + while(pos && (ifa = if6_get_next(seq, ifa)) != NULL) + --pos; + return pos ? NULL : ifa; +} + +static void *if6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&addrconf_hash_lock); + return if6_get_idx(seq, *pos); +} + +static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct inet6_ifaddr *ifa; + + ifa = if6_get_next(seq, v); + ++*pos; + return ifa; +} + +static void if6_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&addrconf_hash_lock); +} + +static int if6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; + seq_printf(seq, + "%04x%04x%04x%04x%04x%04x%04x%04x %02x %02x %02x %02x %8s\n", + NIP6(ifp->addr), + ifp->idev->dev->ifindex, + ifp->prefix_len, + ifp->scope, + ifp->flags, + ifp->idev->dev->name); + return 0; +} + +static struct seq_operations if6_seq_ops = { + .start = if6_seq_start, + .next = if6_seq_next, + .show = if6_seq_show, + .stop = if6_seq_stop, +}; + +static int if6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct if6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + memset(s, 0, sizeof(*s)); + + rc = seq_open(file, &if6_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations if6_fops = { + .owner = THIS_MODULE, + .open = if6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init if6_proc_init(void) +{ + if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops)) + return -ENOMEM; + return 0; +} + +void if6_proc_exit(void) +{ + proc_net_remove("if_inet6"); +} +#endif /* CONFIG_PROC_FS */ + +/* + * Periodic address status verification + */ + +static void addrconf_verify(unsigned long foo) +{ + struct inet6_ifaddr *ifp; + unsigned long now, next; + int i; + + spin_lock_bh(&addrconf_verify_lock); + now = jiffies; + next = now + ADDR_CHECK_FREQUENCY; + + del_timer(&addr_chk_timer); + + for (i=0; i < IN6_ADDR_HSIZE; i++) { + +restart: + write_lock(&addrconf_hash_lock); + for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { + unsigned long age; +#ifdef CONFIG_IPV6_PRIVACY + unsigned long regen_advance; +#endif + + if (ifp->flags & IFA_F_PERMANENT) + continue; + + spin_lock(&ifp->lock); + age = (now - ifp->tstamp) / HZ; + +#ifdef CONFIG_IPV6_PRIVACY + regen_advance = ifp->idev->cnf.regen_max_retry * + ifp->idev->cnf.dad_transmits * + ifp->idev->nd_parms->retrans_time / HZ; +#endif + + if (age >= ifp->valid_lft) { + spin_unlock(&ifp->lock); + in6_ifa_hold(ifp); + write_unlock(&addrconf_hash_lock); + ipv6_del_addr(ifp); + goto restart; + } else if (age >= ifp->prefered_lft) { + /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ + int deprecate = 0; + + if (!(ifp->flags&IFA_F_DEPRECATED)) { + deprecate = 1; + ifp->flags |= IFA_F_DEPRECATED; + } + + if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)) + next = ifp->tstamp + ifp->valid_lft * HZ; + + spin_unlock(&ifp->lock); + + if (deprecate) { + in6_ifa_hold(ifp); + write_unlock(&addrconf_hash_lock); + + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); + goto restart; + } +#ifdef CONFIG_IPV6_PRIVACY + } else if ((ifp->flags&IFA_F_TEMPORARY) && + !(ifp->flags&IFA_F_TENTATIVE)) { + if (age >= ifp->prefered_lft - regen_advance) { + struct inet6_ifaddr *ifpub = ifp->ifpub; + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + if (!ifp->regen_count && ifpub) { + ifp->regen_count++; + in6_ifa_hold(ifp); + in6_ifa_hold(ifpub); + spin_unlock(&ifp->lock); + write_unlock(&addrconf_hash_lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ; + spin_unlock(&ifp->lock); +#endif + } else { + /* ifp->prefered_lft <= ifp->valid_lft */ + if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next)) + next = ifp->tstamp + ifp->prefered_lft * HZ; + spin_unlock(&ifp->lock); + } + } + write_unlock(&addrconf_hash_lock); + } + + addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next; + add_timer(&addr_chk_timer); + spin_unlock_bh(&addrconf_verify_lock); +} + +static int +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = ifa->prefix_len; + ifm->ifa_flags = ifa->flags; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ifa->scope&IFA_HOST) + ifm->ifa_scope = RT_SCOPE_HOST; + else if (ifa->scope&IFA_LINK) + ifm->ifa_scope = RT_SCOPE_LINK; + else if (ifa->scope&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifa->idev->dev->ifindex; + RTA_PUT(skb, IFA_ADDRESS, 16, &ifa->addr); + if (!(ifa->flags&IFA_F_PERMANENT)) { + ci.ifa_prefered = ifa->prefered_lft; + ci.ifa_valid = ifa->valid_lft; + if (ci.ifa_prefered != INFINITY_LIFE_TIME) { + long tval = (jiffies - ifa->tstamp)/HZ; + ci.ifa_prefered -= tval; + if (ci.ifa_valid != INFINITY_LIFE_TIME) + ci.ifa_valid -= tval; + } + } else { + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + } + ci.cstamp = (__u32)(TIME_DELTA(ifa->cstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(ifa->cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifa->tstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(ifa->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = 128; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ipv6_addr_scope(&ifmca->mca_addr)&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifmca->idev->dev->ifindex; + RTA_PUT(skb, IFA_MULTICAST, 16, &ifmca->mca_addr); + ci.cstamp = (__u32)(TIME_DELTA(ifmca->mca_cstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifmca->mca_cstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifmca->mca_tstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifmca->mca_tstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = 128; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ipv6_addr_scope(&ifaca->aca_addr)&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifaca->aca_idev->dev->ifindex; + RTA_PUT(skb, IFA_ANYCAST, 16, &ifaca->aca_addr); + ci.cstamp = (__u32)(TIME_DELTA(ifaca->aca_cstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifaca->aca_cstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifaca->aca_tstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifaca->aca_tstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +enum addr_type_t +{ + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + enum addr_type_t type) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + int err = 1; + struct net_device *dev; + struct inet6_dev *idev = NULL; + struct inet6_ifaddr *ifa; + struct ifmcaddr6 *ifmca; + struct ifacaddr6 *ifaca; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); + + for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + ip_idx = 0; + if ((idev = in6_dev_get(dev)) == NULL) + continue; + read_lock_bh(&idev->lock); + switch (type) { + case UNICAST_ADDR: + /* unicast address */ + for (ifa = idev->addr_list; ifa; + ifa = ifa->if_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) + goto done; + } + /* temp addr */ +#ifdef CONFIG_IPV6_PRIVACY + for (ifa = idev->tempaddr_list; ifa; + ifa = ifa->tmp_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) + goto done; + } +#endif + break; + case MULTICAST_ADDR: + /* multicast address */ + for (ifmca = idev->mc_list; ifmca; + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0) + goto done; + } + break; + case ANYCAST_ADDR: + /* anycast address */ + for (ifaca = idev->ac_list; ifaca; + ifaca = ifaca->aca_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0) + goto done; + } + break; + default: + break; + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } +done: + if (err <= 0) { + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + read_unlock(&dev_base_lock); + cb->args[0] = idx; + cb->args[1] = ip_idx; + return skb->len; +} + +static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = UNICAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + +static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = MULTICAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + + +static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = ANYCAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); + return; + } + if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); +} + +static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, + __s32 *array, int bytes) +{ + memset(array, 0, bytes); + array[DEVCONF_FORWARDING] = cnf->forwarding; + array[DEVCONF_HOPLIMIT] = cnf->hop_limit; + array[DEVCONF_MTU6] = cnf->mtu6; + array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; + array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; + array[DEVCONF_AUTOCONF] = cnf->autoconf; + array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; + array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; + array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; +#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; + array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; + array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; + array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; + array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; +#endif + array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; +} + +static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, + u32 pid, u32 seq, int event) +{ + struct net_device *dev = idev->dev; + __s32 *array = NULL; + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *subattr; + __u32 mtu = dev->mtu; + struct ifla_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_INET6; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + if (dev->addr_len) + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + + subattr = (struct rtattr*)skb->tail; + + RTA_PUT(skb, IFLA_PROTINFO, 0, NULL); + + /* return the device flags */ + RTA_PUT(skb, IFLA_INET6_FLAGS, sizeof(__u32), &idev->if_flags); + + /* return interface cacheinfo */ + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.reachable_time = idev->nd_parms->reachable_time; + ci.retrans_time = idev->nd_parms->retrans_time; + RTA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); + + /* return the device sysctl params */ + if ((array = kmalloc(DEVCONF_MAX * sizeof(*array), GFP_ATOMIC)) == NULL) + goto rtattr_failure; + ipv6_store_devconf(&idev->cnf, array, DEVCONF_MAX * sizeof(*array)); + RTA_PUT(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(*array), array); + + /* XXX - Statistics/MC not implemented */ + subattr->rta_len = skb->tail - (u8*)subattr; + + nlh->nlmsg_len = skb->tail - b; + kfree(array); + return skb->len; + +nlmsg_failure: +rtattr_failure: + if (array) + kfree(array); + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, err; + int s_idx = cb->args[0]; + struct net_device *dev; + struct inet6_dev *idev; + + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if ((idev = in6_dev_get(dev)) == NULL) + continue; + err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK); + in6_dev_put(idev); + if (err <= 0) + break; + } + read_unlock(&dev_base_lock); + cb->args[0] = idx; + + return skb->len; +} + +void inet6_ifinfo_notify(int event, struct inet6_dev *idev) +{ + struct sk_buff *skb; + /* 128 bytes ?? */ + int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); + return; + } + if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); +} + +static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, + struct prefix_info *pinfo, u32 pid, u32 seq, int event) +{ + struct prefixmsg *pmsg; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct prefix_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg)); + + if (pid) + nlh->nlmsg_flags |= NLM_F_MULTI; + + pmsg = NLMSG_DATA(nlh); + pmsg->prefix_family = AF_INET6; + pmsg->prefix_ifindex = idev->dev->ifindex; + pmsg->prefix_len = pinfo->prefix_len; + pmsg->prefix_type = pinfo->type; + + pmsg->prefix_flags = 0; + if (pinfo->onlink) + pmsg->prefix_flags |= IF_PREFIX_ONLINK; + if (pinfo->autoconf) + pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + + RTA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); + + ci.preferred_time = ntohl(pinfo->prefered); + ci.valid_time = ntohl(pinfo->valid); + RTA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); + return; + } + if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); +} + +static struct rtnetlink_link inet6_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { + [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, }, + [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, }, + [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = inet6_dump_ifaddr, }, + [RTM_GETMULTICAST - RTM_BASE] = { .dumpit = inet6_dump_ifmcaddr, }, + [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr, }, + [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute, }, + [RTM_DELROUTE - RTM_BASE] = { .doit = inet6_rtm_delroute, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = inet6_rtm_getroute, + .dumpit = inet6_dump_fib, }, +}; + +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); + + switch (event) { + case RTM_NEWADDR: + dst_hold(&ifp->rt->u.dst); + if (ip6_ins_rt(ifp->rt, NULL, NULL)) + dst_release(&ifp->rt->u.dst); + if (ifp->idev->cnf.forwarding) + addrconf_join_anycast(ifp); + break; + case RTM_DELADDR: + if (ifp->idev->cnf.forwarding) + addrconf_leave_anycast(ifp); + addrconf_leave_solict(ifp->idev, &ifp->addr); + dst_hold(&ifp->rt->u.dst); + if (ip6_del_rt(ifp->rt, NULL, NULL)) + dst_free(&ifp->rt->u.dst); + else + dst_release(&ifp->rt->u.dst); + break; + } +} + +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + read_lock_bh(&addrconf_lock); + if (likely(ifp->idev->dead == 0)) + __ipv6_ifa_notify(event, ifp); + read_unlock_bh(&addrconf_lock); +} + +#ifdef CONFIG_SYSCTL + +static +int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && valp != &ipv6_devconf_dflt.forwarding) { + if (valp != &ipv6_devconf.forwarding) { + if ((!*valp) ^ (!val)) { + struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; + if (idev == NULL) + return ret; + dev_forward_change(idev); + } + } else { + ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; + addrconf_forward_change(); + } + if (*valp) + rt6_purge_dflt_routers(); + } + + return ret; +} + +static int addrconf_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + if (new == *valp) + return 0; + if (oldval && oldlenp) { + size_t len; + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (valp != &ipv6_devconf_dflt.forwarding) { + if (valp != &ipv6_devconf.forwarding) { + struct inet6_dev *idev = (struct inet6_dev *)table->extra1; + int changed; + if (unlikely(idev == NULL)) + return -ENODEV; + changed = (!*valp) ^ (!new); + *valp = new; + if (changed) + dev_forward_change(idev); + } else { + *valp = new; + addrconf_forward_change(); + } + + if (*valp) + rt6_purge_dflt_routers(); + } else + *valp = new; + + return 1; +} + +static struct addrconf_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table addrconf_vars[__NET_IPV6_MAX]; + ctl_table addrconf_dev[2]; + ctl_table addrconf_conf_dir[2]; + ctl_table addrconf_proto_dir[2]; + ctl_table addrconf_root_dir[2]; +} addrconf_sysctl = { + .sysctl_header = NULL, + .addrconf_vars = { + { + .ctl_name = NET_IPV6_FORWARDING, + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &addrconf_sysctl_forward, + .strategy = &addrconf_sysctl_forward_strategy, + }, + { + .ctl_name = NET_IPV6_HOP_LIMIT, + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_IPV6_MTU, + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ACCEPT_RA, + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_AUTOCONF, + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_DAD_TRANSMITS, + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICITS, + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_FORCE_MLD_VERSION, + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IPV6_PRIVACY + { + .ctl_name = NET_IPV6_USE_TEMPADDR, + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_TEMP_VALID_LFT, + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_REGEN_MAX_RETRY, + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = NET_IPV6_MAX_ADDRESSES, + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_dev = { + { + .ctl_name = NET_PROTO_CONF_ALL, + .procname = "all", + .mode = 0555, + .child = addrconf_sysctl.addrconf_vars, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_conf_dir = { + { + .ctl_name = NET_IPV6_CONF, + .procname = "conf", + .mode = 0555, + .child = addrconf_sysctl.addrconf_dev, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_proto_dir = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = addrconf_sysctl.addrconf_conf_dir, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = addrconf_sysctl.addrconf_proto_dir, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, +}; + +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +{ + int i; + struct net_device *dev = idev ? idev->dev : NULL; + struct addrconf_sysctl_table *t; + char *dev_name = NULL; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + memcpy(t, &addrconf_sysctl, sizeof(*t)); + for (i=0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + t->addrconf_vars[i].de = NULL; + t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ + } + if (dev) { + dev_name = dev->name; + t->addrconf_dev[0].ctl_name = dev->ifindex; + } else { + dev_name = "default"; + t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + dev_name = net_sysctl_strdup(dev_name); + if (!dev_name) + goto free; + + t->addrconf_dev[0].procname = dev_name; + + t->addrconf_dev[0].child = t->addrconf_vars; + t->addrconf_dev[0].de = NULL; + t->addrconf_conf_dir[0].child = t->addrconf_dev; + t->addrconf_conf_dir[0].de = NULL; + t->addrconf_proto_dir[0].child = t->addrconf_conf_dir; + t->addrconf_proto_dir[0].de = NULL; + t->addrconf_root_dir[0].child = t->addrconf_proto_dir; + t->addrconf_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); + if (t->sysctl_header == NULL) + goto free_procname; + else + p->sysctl = t; + return; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + + return; +} + +static void addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + if (p->sysctl) { + struct addrconf_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->addrconf_dev[0].procname); + kfree(t); + } +} + + +#endif + +/* + * Device notifier + */ + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inet6addr_chain, nb); +} + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inet6addr_chain,nb); +} + +/* + * Init / cleanup code + */ + +int __init addrconf_init(void) +{ + int err = 0; + + /* The addrconf netdev notifier requires that loopback_dev + * has it's ipv6 private information allocated and setup + * before it can bring up and give link-local addresses + * to other devices which are up. + * + * Unfortunately, loopback_dev is not necessarily the first + * entry in the global dev_base list of net devices. In fact, + * it is likely to be the very last entry on that list. + * So this causes the notifier registry below to try and + * give link-local addresses to all devices besides loopback_dev + * first, then loopback_dev, which cases all the non-loopback_dev + * devices to fail to get a link-local address. + * + * So, as a temporary fix, allocate the ipv6 structure for + * loopback_dev first by hand. + * Longer term, all of the dependencies ipv6 has upon the loopback + * device and it being up should be removed. + */ + rtnl_lock(); + if (!ipv6_add_dev(&loopback_dev)) + err = -ENOMEM; + rtnl_unlock(); + if (err) + return err; + + register_netdevice_notifier(&ipv6_dev_notf); + +#ifdef CONFIG_IPV6_PRIVACY + md5_tfm = crypto_alloc_tfm("md5", 0); + if (unlikely(md5_tfm == NULL)) + printk(KERN_WARNING + "failed to load transform for md5\n"); +#endif + + addrconf_verify(0); + rtnetlink_links[PF_INET6] = inet6_rtnetlink_table; +#ifdef CONFIG_SYSCTL + addrconf_sysctl.sysctl_header = + register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0); + addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); +#endif + + return 0; +} + +void __exit addrconf_cleanup(void) +{ + struct net_device *dev; + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int i; + + unregister_netdevice_notifier(&ipv6_dev_notf); + + rtnetlink_links[PF_INET6] = NULL; +#ifdef CONFIG_SYSCTL + addrconf_sysctl_unregister(&ipv6_devconf_dflt); + addrconf_sysctl_unregister(&ipv6_devconf); +#endif + + rtnl_lock(); + + /* + * clean dev list. + */ + + for (dev=dev_base; dev; dev=dev->next) { + if ((idev = __in6_dev_get(dev)) == NULL) + continue; + addrconf_ifdown(dev, 1); + } + addrconf_ifdown(&loopback_dev, 2); + + /* + * Check hash table. + */ + + write_lock_bh(&addrconf_hash_lock); + for (i=0; i < IN6_ADDR_HSIZE; i++) { + for (ifa=inet6_addr_lst[i]; ifa; ) { + struct inet6_ifaddr *bifa; + + bifa = ifa; + ifa = ifa->lst_next; + printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa); + /* Do not free it; something is wrong. + Now we can investigate it with debugger. + */ + } + } + write_unlock_bh(&addrconf_hash_lock); + + del_timer(&addr_chk_timer); + + rtnl_unlock(); + +#ifdef CONFIG_IPV6_PRIVACY + if (likely(md5_tfm != NULL)) { + crypto_free_tfm(md5_tfm); + md5_tfm = NULL; + } +#endif + +#ifdef CONFIG_PROC_FS + proc_net_remove("if_inet6"); +#endif +} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c new file mode 100644 index 000000000000..768b11703daf --- /dev/null +++ b/net/ipv6/af_inet6.c @@ -0,0 +1,867 @@ +/* + * PF_INET6 socket protocol family + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Adapted from linux/net/ipv4/af_inet.c + * + * $Id: af_inet6.c,v 1.66 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * piggy, Karl Knutson : Socket protocol table + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, cleanups + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/string.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmpv6.h> +#include <linux/smp_lock.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/ipip.h> +#include <net/protocol.h> +#include <net/inet_common.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#ifdef CONFIG_IPV6_TUNNEL +#include <net/ip6_tunnel.h> +#endif + +#include <asm/uaccess.h> +#include <asm/system.h> + +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); +MODULE_LICENSE("GPL"); + +/* IPv6 procfs goodies... */ + +#ifdef CONFIG_PROC_FS +extern int raw6_proc_init(void); +extern void raw6_proc_exit(void); +extern int tcp6_proc_init(void); +extern void tcp6_proc_exit(void); +extern int udp6_proc_init(void); +extern void udp6_proc_exit(void); +extern int ipv6_misc_proc_init(void); +extern void ipv6_misc_proc_exit(void); +extern int ac6_proc_init(void); +extern void ac6_proc_exit(void); +extern int if6_proc_init(void); +extern void if6_proc_exit(void); +#endif + +int sysctl_ipv6_bindv6only; + +#ifdef INET_REFCNT_DEBUG +atomic_t inet6_sock_nr; +#endif + +/* The inetsw table contains everything that inet_create needs to + * build a new socket. + */ +static struct list_head inetsw6[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw6_lock); + +static void inet6_sock_destruct(struct sock *sk) +{ + inet_sock_destruct(sk); + +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet6_sock_nr); +#endif +} + +static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +{ + const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} + +static int inet6_create(struct socket *sock, int protocol) +{ + struct inet_sock *inet; + struct ipv6_pinfo *np; + struct sock *sk; + struct list_head *p; + struct inet_protosw *answer; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int rc; + + /* Look for the requested type/protocol pair. */ + answer = NULL; + rcu_read_lock(); + list_for_each_rcu(p, &inetsw6[sock->type]) { + answer = list_entry(p, struct inet_protosw, list); + + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. */ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + answer = NULL; + } + + rc = -ESOCKTNOSUPPORT; + if (!answer) + goto out_rcu_unlock; + rc = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; + rc = -EPROTONOSUPPORT; + if (!protocol) + goto out_rcu_unlock; + + sock->ops = answer->ops; + + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + BUG_TRAP(answer_prot->slab != NULL); + + rc = -ENOBUFS; + sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1); + if (sk == NULL) + goto out; + + sock_init_data(sock, sk); + + rc = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + + if (SOCK_RAW == sock->type) { + inet->num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet6_sock_destruct; + sk->sk_family = PF_INET6; + sk->sk_protocol = protocol; + + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + + inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); + np->hop_limit = -1; + np->mcast_hops = -1; + np->mc_loop = 1; + np->pmtudisc = IPV6_PMTUDISC_WANT; + np->ipv6only = sysctl_ipv6_bindv6only; + + /* Init the ipv4 part of the socket since we can have sockets + * using v6 API for ipv4. + */ + inet->uc_ttl = -1; + + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); + atomic_inc(&inet_sock_nr); +#endif + if (inet->num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically shares. + */ + inet->sport = ntohs(inet->num); + sk->sk_prot->hash(sk); + } + if (sk->sk_prot->init) { + rc = sk->sk_prot->init(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } +out: + return rc; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + __u32 v4addr = 0; + unsigned short snum; + int addr_type = 0; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; + + snum = ntohs(addr->sin6_port); + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + if (sk->sk_state != TCP_CLOSE || inet->num) { + err = -EINVAL; + goto out; + } + + /* Check if the address belongs to the host. */ + if (addr_type == IPV6_ADDR_MAPPED) { + v4addr = addr->sin6_addr.s6_addr32[3]; + if (inet_addr_type(v4addr) != RTN_LOCAL) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another one + * is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (dev) + dev_put(dev); + err = -EADDRNOTAVAIL; + goto out; + } + } + if (dev) + dev_put(dev); + } + } + + inet->rcv_saddr = v4addr; + inet->saddr = v4addr; + + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + + if (addr_type != IPV6_ADDR_ANY) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->sport = ntohs(inet->num); + inet->dport = 0; + inet->daddr = 0; +out: + release_sock(sk); + return err; +} + +int inet6_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Free ac lists */ + ipv6_sock_ac_close(sk); + + return inet_release(sock); +} + +int inet6_destroy_sock(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* + * Release destination entry + */ + + sk_dst_reset(sk); + + /* Release rx options */ + + if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + kfree_skb(skb); + + /* Free flowlabels */ + fl6_free_socklist(sk); + + /* Free tx options */ + + if ((opt = xchg(&np->opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); + + return 0; +} + +/* + * This does both peername and sockname. + */ + +int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (peer) { + if (!inet->dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin6_port = inet->dport; + ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { + if (ipv6_addr_any(&np->rcv_saddr)) + ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + else + ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + + sin->sin6_port = inet->sport; + } + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = sk->sk_bound_dev_if; + *uaddr_len = sizeof(*sin); + return(0); +} + +int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + switch(cmd) + { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + case SIOCADDRT: + case SIOCDELRT: + + return(ipv6_route_ioctl(cmd,(void __user *)arg)); + + case SIOCSIFADDR: + return addrconf_add_ifaddr((void __user *) arg); + case SIOCDIFADDR: + return addrconf_del_ifaddr((void __user *) arg); + case SIOCSIFDSTADDR: + return addrconf_set_dstaddr((void __user *) arg); + default: + if (!sk->sk_prot->ioctl || + (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) + return(dev_ioctl(cmd,(void __user *) arg)); + return err; + } + /*NOTREACHED*/ + return(0); +} + +struct proto_ops inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = inet_accept, /* ok */ + .getname = inet6_getname, + .poll = tcp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = inet_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = tcp_sendpage +}; + +struct proto_ops inet6_dgram_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = udp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct net_proto_family inet6_family_ops = { + .family = PF_INET6, + .create = inet6_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_SYSCTL +extern void ipv6_sysctl_register(void); +extern void ipv6_sysctl_unregister(void); +#endif + +/* Same as inet6_dgram_ops, sans udp_poll. */ +static struct proto_ops inet6_sockraw_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = datagram_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw rawv6_protosw = { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &rawv6_prot, + .ops = &inet6_sockraw_ops, + .capability = CAP_NET_RAW, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + +void +inet6_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + int protocol = p->protocol; + struct list_head *last_perm; + + spin_lock_bh(&inetsw6_lock); + + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + last_perm = &inetsw6[p->type]; + list_for_each(lh, &inetsw6[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. + */ + list_add_rcu(&p->list, last_perm); +out: + spin_unlock_bh(&inetsw6_lock); + return; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +void +inet6_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw6_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw6_lock); + + synchronize_net(); + } +} + +int +snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +{ + if (ptr == NULL) + return -EINVAL; + + ptr[0] = __alloc_percpu(mibsize, mibalign); + if (!ptr[0]) + goto err0; + + ptr[1] = __alloc_percpu(mibsize, mibalign); + if (!ptr[1]) + goto err1; + + return 0; + +err1: + free_percpu(ptr[0]); + ptr[0] = NULL; +err0: + return -ENOMEM; +} + +void +snmp6_mib_free(void *ptr[2]) +{ + if (ptr == NULL) + return; + if (ptr[0]) + free_percpu(ptr[0]); + if (ptr[1]) + free_percpu(ptr[1]); + ptr[0] = ptr[1] = NULL; +} + +static int __init init_ipv6_mibs(void) +{ + if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp_mib; + if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + return 0; + +err_udp_mib: + snmp6_mib_free((void **)icmpv6_statistics); +err_icmp_mib: + snmp6_mib_free((void **)ipv6_statistics); +err_ip_mib: + return -ENOMEM; + +} + +static void cleanup_ipv6_mibs(void) +{ + snmp6_mib_free((void **)ipv6_statistics); + snmp6_mib_free((void **)icmpv6_statistics); + snmp6_mib_free((void **)udp_stats_in6); +} + +extern int ipv6_misc_proc_init(void); + +static int __init inet6_init(void) +{ + struct sk_buff *dummy_skb; + struct list_head *r; + int err; + +#ifdef MODULE +#if 0 /* FIXME --RR */ + if (!mod_member_present(&__this_module, can_unload)) + return -EINVAL; + + __this_module.can_unload = &ipv6_unload; +#endif +#endif + + if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "inet6_proto_init: size fault\n"); + return -EINVAL; + } + + err = proto_register(&tcpv6_prot, 1); + if (err) + goto out; + + err = proto_register(&udpv6_prot, 1); + if (err) + goto out_unregister_tcp_proto; + + err = proto_register(&rawv6_prot, 1); + if (err) + goto out_unregister_udp_proto; + + + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + /* We MUST register RAW sockets before we create the ICMP6, + * IGMP6, or NDISC control sockets. + */ + inet6_register_protosw(&rawv6_protosw); + + /* Register the family here so that the init calls below will + * be able to create sockets. (?? is this dangerous ??) + */ + (void) sock_register(&inet6_family_ops); + + /* Initialise ipv6 mibs */ + err = init_ipv6_mibs(); + if (err) + goto out_unregister_raw_proto; + + /* + * ipngwg API draft makes clear that the correct semantics + * for TCP and UDP is to consider one TCP and UDP instance + * in a host availiable by both INET and INET6 APIs and + * able to communicate via both network protocols. + */ + +#ifdef CONFIG_SYSCTL + ipv6_sysctl_register(); +#endif + err = icmpv6_init(&inet6_family_ops); + if (err) + goto icmp_fail; + err = ndisc_init(&inet6_family_ops); + if (err) + goto ndisc_fail; + err = igmp6_init(&inet6_family_ops); + if (err) + goto igmp_fail; + /* Create /proc/foo6 entries. */ +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (raw6_proc_init()) + goto proc_raw6_fail; + if (tcp6_proc_init()) + goto proc_tcp6_fail; + if (udp6_proc_init()) + goto proc_udp6_fail; + if (ipv6_misc_proc_init()) + goto proc_misc6_fail; + + if (ac6_proc_init()) + goto proc_anycast6_fail; + if (if6_proc_init()) + goto proc_if6_fail; +#endif + ipv6_packet_init(); + ip6_route_init(); + ip6_flowlabel_init(); + err = addrconf_init(); + if (err) + goto addrconf_fail; + sit_init(); + + /* Init v6 extension headers. */ + ipv6_rthdr_init(); + ipv6_frag_init(); + ipv6_nodata_init(); + ipv6_destopt_init(); + + /* Init v6 transport protocols. */ + udpv6_init(); + tcpv6_init(); + err = 0; +out: + return err; + +addrconf_fail: + ip6_flowlabel_cleanup(); + ip6_route_cleanup(); + ipv6_packet_cleanup(); +#ifdef CONFIG_PROC_FS + if6_proc_exit(); +proc_if6_fail: + ac6_proc_exit(); +proc_anycast6_fail: + ipv6_misc_proc_exit(); +proc_misc6_fail: + udp6_proc_exit(); +proc_udp6_fail: + tcp6_proc_exit(); +proc_tcp6_fail: + raw6_proc_exit(); +proc_raw6_fail: +#endif + igmp6_cleanup(); +igmp_fail: + ndisc_cleanup(); +ndisc_fail: + icmpv6_cleanup(); +icmp_fail: +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + cleanup_ipv6_mibs(); +out_unregister_raw_proto: + proto_unregister(&rawv6_prot); +out_unregister_udp_proto: + proto_unregister(&udpv6_prot); +out_unregister_tcp_proto: + proto_unregister(&tcpv6_prot); + goto out; +} +module_init(inet6_init); + +static void __exit inet6_exit(void) +{ + /* First of all disallow new sockets creation. */ + sock_unregister(PF_INET6); +#ifdef CONFIG_PROC_FS + if6_proc_exit(); + ac6_proc_exit(); + ipv6_misc_proc_exit(); + udp6_proc_exit(); + tcp6_proc_exit(); + raw6_proc_exit(); +#endif + /* Cleanup code parts. */ + sit_cleanup(); + ip6_flowlabel_cleanup(); + addrconf_cleanup(); + ip6_route_cleanup(); + ipv6_packet_cleanup(); + igmp6_cleanup(); + ndisc_cleanup(); + icmpv6_cleanup(); +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + cleanup_ipv6_mibs(); + proto_unregister(&rawv6_prot); + proto_unregister(&udpv6_prot); + proto_unregister(&tcpv6_prot); +} +module_exit(inet6_exit); + +MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c new file mode 100644 index 000000000000..e3ecf626cbf7 --- /dev/null +++ b/net/ipv6/ah6.c @@ -0,0 +1,478 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * + * This file is derived from net/ipv4/ah.c. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/ah.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/string.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <net/xfrm.h> +#include <asm/scatterlist.h> + +static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +{ + u8 *opt = (u8 *)opthdr; + int len = ipv6_optlen(opthdr); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + if (opt[off] & 0x20) + memset(&opt[off+2], 0, opt[off+1]); + break; + } + + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; + +bad: + return 0; +} + +/** + * ipv6_rearrange_rthdr - rearrange IPv6 routing header + * @iph: IPv6 header + * @rthdr: routing header + * + * Rearrange the destination address in @iph and the addresses in @rthdr + * so that they appear in the order they will at the final destination. + * See Appendix A2 of RFC 2402 for details. + */ +static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) +{ + int segments, segments_left; + struct in6_addr *addrs; + struct in6_addr final_addr; + + segments_left = rthdr->segments_left; + if (segments_left == 0) + return; + rthdr->segments_left = 0; + + /* The value of rthdr->hdrlen has been verified either by the system + * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming + * packets. So we can assume that it is even and that segments is + * greater than or equal to segments_left. + * + * For the same reason we can assume that this option is of type 0. + */ + segments = rthdr->hdrlen >> 1; + + addrs = ((struct rt0_hdr *)rthdr)->addr; + ipv6_addr_copy(&final_addr, addrs + segments - 1); + + addrs += segments - segments_left; + memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); + + ipv6_addr_copy(addrs, &iph->daddr); + ipv6_addr_copy(&iph->daddr, &final_addr); +} + +static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) +{ + union { + struct ipv6hdr *iph; + struct ipv6_opt_hdr *opth; + struct ipv6_rt_hdr *rth; + char *raw; + } exthdr = { .iph = iph }; + char *end = exthdr.raw + len; + int nexthdr = iph->nexthdr; + + exthdr.iph++; + + while (exthdr.raw < end) { + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!zero_out_mutable_opts(exthdr.opth)) { + LIMIT_NETDEBUG(printk( + KERN_WARNING "overrun %sopts\n", + nexthdr == NEXTHDR_HOP ? + "hop" : "dest")); + return -EINVAL; + } + break; + + case NEXTHDR_ROUTING: + ipv6_rearrange_rthdr(iph, exthdr.rth); + break; + + default : + return 0; + } + + nexthdr = exthdr.opth->nexthdr; + exthdr.raw += ipv6_optlen(exthdr.opth); + } + + return 0; +} + +static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int extlen; + struct ipv6hdr *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + u8 nexthdr; + char tmp_base[8]; + struct { + struct in6_addr daddr; + char hdrs[0]; + } *tmp_ext; + + top_iph = (struct ipv6hdr *)skb->data; + top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); + + nexthdr = *skb->nh.raw; + *skb->nh.raw = IPPROTO_AH; + + /* When there are no extension headers, we only need to save the first + * 8 bytes of the base IP header. + */ + memcpy(tmp_base, top_iph, sizeof(tmp_base)); + + tmp_ext = NULL; + extlen = skb->h.raw - (unsigned char *)(top_iph + 1); + if (extlen) { + extlen += sizeof(*tmp_ext); + tmp_ext = kmalloc(extlen, GFP_ATOMIC); + if (!tmp_ext) { + err = -ENOMEM; + goto error; + } + memcpy(tmp_ext, &top_iph->daddr, extlen); + err = ipv6_clear_mutable_options(top_iph, + extlen - sizeof(*tmp_ext) + + sizeof(*top_iph)); + if (err) + goto error_free_iph; + } + + ah = (struct ip_auth_hdr *)skb->h.raw; + ah->nexthdr = nexthdr; + + top_iph->priority = 0; + top_iph->flow_lbl[0] = 0; + top_iph->flow_lbl[1] = 0; + top_iph->flow_lbl[2] = 0; + top_iph->hop_limit = 0; + + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + + err = 0; + + memcpy(top_iph, tmp_base, sizeof(tmp_base)); + if (tmp_ext) { + memcpy(&top_iph->daddr, tmp_ext, extlen); +error_free_iph: + kfree(tmp_ext); + } + +error: + return err; +} + +static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + /* + * Before process AH + * [IPv6][Ext1][Ext2][AH][Dest][Payload] + * |<-------------->| hdr_len + * + * To erase AH: + * Keeping copy of cleared headers. After AH processing, + * Moving the pointer of skb->nh.raw by using skb_pull as long as AH + * header length. Then copy back the copy as long as hdr_len + * If destination header following AH exists, copy it into after [Ext2]. + * + * |<>|[IPv6][Ext1][Ext2][Dest][Payload] + * There is offset of AH before IPv6 header after the process. + */ + + struct ipv6_auth_hdr *ah; + struct ah_data *ahp; + unsigned char *tmp_hdr = NULL; + u16 hdr_len; + u16 ah_hlen; + int nexthdr; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. */ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + hdr_len = skb->data - skb->nh.raw; + ah = (struct ipv6_auth_hdr*)skb->data; + ahp = x->data; + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) + goto out; + memcpy(tmp_hdr, skb->nh.raw, hdr_len); + if (ipv6_clear_mutable_options(skb->nh.ipv6h, hdr_len)) + goto out; + skb->nh.ipv6h->priority = 0; + skb->nh.ipv6h->flow_lbl[0] = 0; + skb->nh.ipv6h->flow_lbl[1] = 0; + skb->nh.ipv6h->flow_lbl[2] = 0; + skb->nh.ipv6h->hop_limit = 0; + + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { + LIMIT_NETDEBUG( + printk(KERN_WARNING "ipsec ah authentication error\n")); + x->stats.integrity_failed++; + goto free_out; + } + } + + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_pull(skb, hdr_len); + skb->h.raw = skb->data; + + + kfree(tmp_hdr); + + return nexthdr; + +free_out: + kfree(tmp_hdr); +out: + return -EINVAL; +} + +static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + if (!x) + return; + + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(ah->spi), NIP6(iph->daddr))); + + xfrm_state_put(x); +} + +static int ah6_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + if (x->encap) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). + */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah6_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + +static struct xfrm_type ah6_type = +{ + .description = "AH6", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah6_init_state, + .destructor = ah6_destroy, + .input = ah6_input, + .output = ah6_output +}; + +static struct inet6_protocol ah6_protocol = { + .handler = xfrm6_rcv, + .err_handler = ah6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ah6_init(void) +{ + if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + xfrm_unregister_type(&ah6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit ah6_fini(void) +{ + if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + + if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + +} + +module_init(ah6_init); +module_exit(ah6_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c new file mode 100644 index 000000000000..5d22ca3cca2e --- /dev/null +++ b/net/ipv6/anycast.c @@ -0,0 +1,594 @@ +/* + * Anycast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * David L Stevens (dlstevens@us.ibm.com) + * + * based heavily on net/ipv6/mcast.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/random.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/if_inet6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> + +#include <net/checksum.h> + +static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr); + +/* Big ac list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_ac_lock); + +static int +ip6_onlink(struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int onlink; + + onlink = 0; + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + onlink = ipv6_prefix_equal(addr, &ifa->addr, + ifa->prefix_len); + if (onlink) + break; + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + return onlink; +} + +/* + * socket join an anycast group + */ + +int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct inet6_dev *idev; + struct ipv6_ac_socklist *pac; + int ishost = !ipv6_devconf.forwarding; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (ipv6_addr_is_multicast(addr)) + return -EINVAL; + if (ipv6_chk_addr(addr, NULL, 0)) + return -EINVAL; + + pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); + if (pac == NULL) + return -ENOMEM; + pac->acl_next = NULL; + ipv6_addr_copy(&pac->acl_addr, addr); + + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } else if (ishost) { + err = -EADDRNOTAVAIL; + goto out_free_pac; + } else { + /* router, no matching interface: just pick one */ + + dev = dev_get_by_flags(IFF_UP, IFF_UP|IFF_LOOPBACK); + } + } else + dev = dev_get_by_index(ifindex); + + if (dev == NULL) { + err = -ENODEV; + goto out_free_pac; + } + + idev = in6_dev_get(dev); + if (!idev) { + if (ifindex) + err = -ENODEV; + else + err = -EADDRNOTAVAIL; + goto out_dev_put; + } + /* reset ishost, now that we have a specific device */ + ishost = !idev->cnf.forwarding; + in6_dev_put(idev); + + pac->acl_ifindex = dev->ifindex; + + /* XXX + * For hosts, allow link-local or matching prefix anycasts. + * This obviates the need for propagating anycast routes while + * still allowing some non-router anycast participation. + */ + if (!ip6_onlink(addr, dev)) { + if (ishost) + err = -EADDRNOTAVAIL; + if (err) + goto out_dev_put; + } + + err = ipv6_dev_ac_inc(dev, addr); + if (err) + goto out_dev_put; + + write_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + write_unlock_bh(&ipv6_sk_ac_lock); + + dev_put(dev); + + return 0; + +out_dev_put: + dev_put(dev); +out_free_pac: + sock_kfree_s(sk, pac, sizeof(*pac)); + return err; +} + +/* + * socket leave an anycast group + */ +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev; + struct ipv6_ac_socklist *pac, *prev_pac; + + write_lock_bh(&ipv6_sk_ac_lock); + prev_pac = NULL; + for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { + if ((ifindex == 0 || pac->acl_ifindex == ifindex) && + ipv6_addr_equal(&pac->acl_addr, addr)) + break; + prev_pac = pac; + } + if (!pac) { + write_unlock_bh(&ipv6_sk_ac_lock); + return -ENOENT; + } + if (prev_pac) + prev_pac->acl_next = pac->acl_next; + else + np->ipv6_ac_list = pac->acl_next; + + write_unlock_bh(&ipv6_sk_ac_lock); + + dev = dev_get_by_index(pac->acl_ifindex); + if (dev) { + ipv6_dev_ac_dec(dev, &pac->acl_addr); + dev_put(dev); + } + sock_kfree_s(sk, pac, sizeof(*pac)); + return 0; +} + +void ipv6_sock_ac_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct ipv6_ac_socklist *pac; + int prev_index; + + write_lock_bh(&ipv6_sk_ac_lock); + pac = np->ipv6_ac_list; + np->ipv6_ac_list = NULL; + write_unlock_bh(&ipv6_sk_ac_lock); + + prev_index = 0; + while (pac) { + struct ipv6_ac_socklist *next = pac->acl_next; + + if (pac->acl_ifindex != prev_index) { + if (dev) + dev_put(dev); + dev = dev_get_by_index(pac->acl_ifindex); + prev_index = pac->acl_ifindex; + } + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + sock_kfree_s(sk, pac, sizeof(*pac)); + pac = next; + } + if (dev) + dev_put(dev); +} + +#if 0 +/* The function is not used, which is funny. Apparently, author + * supposed to use it to filter out datagrams inside udp/raw but forgot. + * + * It is OK, anycasts are not special comparing to delivery to unicasts. + */ + +int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) +{ + struct ipv6_ac_socklist *pac; + struct ipv6_pinfo *np = inet6_sk(sk); + int found; + + found = 0; + read_lock(&ipv6_sk_ac_lock); + for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) { + if (ifindex && pac->acl_ifindex != ifindex) + continue; + found = ipv6_addr_equal(&pac->acl_addr, addr); + if (found) + break; + } + read_unlock(&ipv6_sk_ac_lock); + + return found; +} + +#endif + +static void aca_put(struct ifacaddr6 *ac) +{ + if (atomic_dec_and_test(&ac->aca_refcnt)) { + in6_dev_put(ac->aca_idev); + dst_release(&ac->aca_rt->u.dst); + kfree(ac); + } +} + +/* + * device anycast group inc (add if not found) + */ +int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) +{ + struct ifacaddr6 *aca; + struct inet6_dev *idev; + struct rt6_info *rt; + int err; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + err = -ENODEV; + goto out; + } + + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) { + aca->aca_users++; + err = 0; + goto out; + } + } + + /* + * not found: create a new one. + */ + + aca = kmalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); + + if (aca == NULL) { + err = -ENOMEM; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 1); + if (IS_ERR(rt)) { + kfree(aca); + err = PTR_ERR(rt); + goto out; + } + + memset(aca, 0, sizeof(struct ifacaddr6)); + + ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 2); + spin_lock_init(&aca->aca_lock); + + aca->aca_next = idev->ac_list; + idev->ac_list = aca; + write_unlock_bh(&idev->lock); + + dst_hold(&rt->u.dst); + if (ip6_ins_rt(rt, NULL, NULL)) + dst_release(&rt->u.dst); + + addrconf_join_solict(dev, &aca->aca_addr); + + aca_put(aca); + return 0; +out: + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return err; +} + +/* + * device anycast group decrement + */ +int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct ifacaddr6 *aca, *prev_aca; + + write_lock_bh(&idev->lock); + prev_aca = NULL; + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + prev_aca = aca; + } + if (!aca) { + write_unlock_bh(&idev->lock); + return -ENOENT; + } + if (--aca->aca_users > 0) { + write_unlock_bh(&idev->lock); + return 0; + } + if (prev_aca) + prev_aca->aca_next = aca->aca_next; + else + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->u.dst); + if (ip6_del_rt(aca->aca_rt, NULL, NULL)) + dst_free(&aca->aca_rt->u.dst); + else + dst_release(&aca->aca_rt->u.dst); + + aca_put(aca); + return 0; +} + +static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) +{ + int ret; + struct inet6_dev *idev = in6_dev_get(dev); + if (idev == NULL) + return -ENODEV; + ret = __ipv6_dev_ac_dec(idev, addr); + in6_dev_put(idev); + return ret; +} + +/* + * check if the interface has this anycast address + */ +static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev; + struct ifacaddr6 *aca; + + idev = in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (aca = idev->ac_list; aca; aca = aca->aca_next) + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + return aca != 0; + } + return 0; +} + +/* + * check if given interface (or any, if dev==0) has this anycast address + */ +int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr) +{ + if (dev) + return ipv6_chk_acast_dev(dev, addr); + read_lock(&dev_base_lock); + for (dev=dev_base; dev; dev=dev->next) + if (ipv6_chk_acast_dev(dev, addr)) + break; + read_unlock(&dev_base_lock); + return dev != 0; +} + + +#ifdef CONFIG_PROC_FS +struct ac6_iter_state { + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) + +static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) +{ + struct ifacaddr6 *im = NULL; + struct ac6_iter_state *state = ac6_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->ac_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + } + return im; +} + +static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + im = im->aca_next; + while (!im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->ac_list; + } + return im; +} + +static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifacaddr6 *im = ac6_get_first(seq); + if (im) + while (pos && (im = ac6_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return ac6_get_idx(seq, *pos); +} + +static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifacaddr6 *im; + im = ac6_get_next(seq, v); + ++*pos; + return im; +} + +static void ac6_seq_stop(struct seq_file *seq, void *v) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + read_unlock(&dev_base_lock); +} + +static int ac6_seq_show(struct seq_file *seq, void *v) +{ + struct ifacaddr6 *im = (struct ifacaddr6 *)v; + struct ac6_iter_state *state = ac6_seq_private(seq); + + seq_printf(seq, + "%-4d %-15s " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%5d\n", + state->dev->ifindex, state->dev->name, + NIP6(im->aca_addr), + im->aca_users); + return 0; +} + +static struct seq_operations ac6_seq_ops = { + .start = ac6_seq_start, + .next = ac6_seq_next, + .stop = ac6_seq_stop, + .show = ac6_seq_show, +}; + +static int ac6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ac6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ac6_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ac6_seq_fops = { + .owner = THIS_MODULE, + .open = ac6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init ac6_proc_init(void) +{ + if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops)) + return -ENOMEM; + + return 0; +} + +void ac6_proc_exit(void) +{ + proc_net_remove("anycast6"); +} +#endif + diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c new file mode 100644 index 000000000000..65b9375df57d --- /dev/null +++ b/net/ipv6/datagram.c @@ -0,0 +1,600 @@ +/* + * common UDP/RAW code + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: datagram.c,v 1.24 2002/02/01 22:01:04 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/route.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> + +#include <linux/errqueue.h> +#include <asm/uaccess.h> + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p = NULL, final; + struct dst_entry *dst; + struct flowi fl; + struct ip6_flowlabel *flowlabel = NULL; + int addr_type; + int err; + + if (usin->sin6_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + err = ip4_datagram_connect(sk, uaddr, addr_len); + goto ipv4_connected; + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + } + } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if (addr_type == IPV6_ADDR_ANY) { + /* + * connect to self + */ + usin->sin6_addr.s6_addr[15] = 0x01; + } + + daddr = &usin->sin6_addr; + + if (addr_type == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + + if (__ipv6_only_sock(sk)) { + err = -ENETUNREACH; + goto out; + } + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + sin.sin_port = usin->sin6_port; + + err = ip4_datagram_connect(sk, + (struct sockaddr*) &sin, + sizeof(sin)); + +ipv4_connected: + if (err) + goto out; + + ipv6_addr_set(&np->daddr, 0, 0, htonl(0x0000ffff), inet->daddr); + + if (ipv6_addr_any(&np->saddr)) { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000ffff), + inet->saddr); + } + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000ffff), + inet->rcv_saddr); + } + goto out; + } + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) { + err = -EINVAL; + goto out; + } + sk->sk_bound_dev_if = usin->sin6_scope_id; + if (!sk->sk_bound_dev_if && + (addr_type & IPV6_ADDR_MULTICAST)) + fl.oif = np->mcast_oif; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + } + + ipv6_addr_copy(&np->daddr, daddr); + np->flow_label = fl.fl6_flowlabel; + + inet->dport = usin->sin6_port; + + /* + * Check for a route to destination an obtain the + * destination cache for it. + */ + + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl.oif = np->mcast_oif; + + if (flowlabel) { + if (flowlabel->opt && flowlabel->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + } else if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + /* source address lookup done in ip6_dst_lookup */ + + if (ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&np->saddr, &fl.fl6_src); + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); + inet->rcv_saddr = LOOPBACK4_IPV6; + } + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? + &np->daddr : NULL); + + sk->sk_state = TCP_ESTABLISHED; +out: + fl6_sock_release(flowlabel); + return err; +} + +void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr *icmph = (struct icmp6hdr *)skb->h.raw; + struct sock_exterr_skb *serr; + + if (!np->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; + serr->ee.ee_type = icmph->icmp6_type; + serr->ee.ee_code = icmph->icmp6_code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct ipv6hdr*)(icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + __skb_pull(skb, payload - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct ipv6hdr *iph; + struct sk_buff *skb; + + if (!np->recverr) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr)); + skb->nh.ipv6h = iph; + ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = fl->fl_ip_dport; + + skb->h.raw = skb->tail; + __skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in6 *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in6 offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = serr->port; + sin->sin6_scope_id = 0; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { + ipv6_addr_copy(&sin->sin6_addr, + (struct in6_addr *)(skb->nh.raw + serr->addr_offset)); + if (np->sndflow) + sin->sin6_flowinfo = *(u32*)(skb->nh.raw + serr->addr_offset - 24) & IPV6_FLOWINFO_MASK; + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + ipv6_addr_set(&sin->sin6_addr, 0, 0, + htonl(0xffff), + *(u32*)(skb->nh.raw + serr->addr_offset)); + } + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin6_family = AF_UNSPEC; + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { + ipv6_addr_copy(&sin->sin6_addr, &skb->nh.ipv6h->saddr); + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + struct inet_sock *inet = inet_sk(sk); + + ipv6_addr_set(&sin->sin6_addr, 0, 0, + htonl(0xffff), + skb->nh.iph->saddr); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + } + + put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_irq(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_irq(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + spin_unlock_irq(&sk->sk_error_queue.lock); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + + +int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + + if (np->rxopt.bits.rxhlim) { + int hlim = skb->nh.ipv6h->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + + if (np->rxopt.bits.rxflow && (*(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK)) { + u32 flowinfo = *(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK; + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = skb->nh.raw + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst0) { + u8 *ptr = skb->nh.raw + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.srcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.dstopts && opt->dst1) { + u8 *ptr = skb->nh.raw + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); + } + return 0; +} + +int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, + struct ipv6_txoptions *opt, + int *hlimit) +{ + struct in6_pktinfo *src_info; + struct cmsghdr *cmsg; + struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; + int len; + int err = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + int addr_type; + struct net_device *dev = NULL; + + if (!CMSG_OK(msg, cmsg)) { + err = -EINVAL; + goto exit_f; + } + + if (cmsg->cmsg_level != SOL_IPV6) + continue; + + switch (cmsg->cmsg_type) { + case IPV6_PKTINFO: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { + err = -EINVAL; + goto exit_f; + } + + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + + if (src_info->ipi6_ifindex) { + if (fl->oif && src_info->ipi6_ifindex != fl->oif) + return -EINVAL; + fl->oif = src_info->ipi6_ifindex; + } + + addr_type = ipv6_addr_type(&src_info->ipi6_addr); + + if (addr_type == IPV6_ADDR_ANY) + break; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (!src_info->ipi6_ifindex) + return -EINVAL; + else { + dev = dev_get_by_index(src_info->ipi6_ifindex); + if (!dev) + return -ENODEV; + } + } + if (!ipv6_chk_addr(&src_info->ipi6_addr, dev, 0)) { + if (dev) + dev_put(dev); + err = -EINVAL; + goto exit_f; + } + if (dev) + dev_put(dev); + + ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); + break; + + case IPV6_FLOWINFO: + if (cmsg->cmsg_len < CMSG_LEN(4)) { + err = -EINVAL; + goto exit_f; + } + + if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl->fl6_flowlabel^*(u32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + err = -EINVAL; + goto exit_f; + } + } + fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(u32 *)CMSG_DATA(cmsg); + break; + + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_RTHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); + + /* + * TYPE 0 + */ + if (rthdr->type) { + err = -EINVAL; + goto exit_f; + } + + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + + /* segments left must also match */ + if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { + err = -EINVAL; + goto exit_f; + } + + opt->opt_nflen += len; + opt->srcrt = rthdr; + + if (opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + + break; + + case IPV6_HOPLIMIT: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + err = -EINVAL; + goto exit_f; + } + + *hlimit = *(int *)CMSG_DATA(cmsg); + break; + + default: + LIMIT_NETDEBUG( + printk(KERN_DEBUG "invalid cmsg type: %d\n", cmsg->cmsg_type)); + err = -EINVAL; + break; + }; + } + +exit_f: + return err; +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c new file mode 100644 index 000000000000..be7095d6babe --- /dev/null +++ b/net/ipv6/esp6.c @@ -0,0 +1,424 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * + * This file is derived from net/ipv4/esp.c + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/esp.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/random.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <linux/icmpv6.h> + +static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int hdr_len; + struct ipv6hdr *top_iph; + struct ipv6_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + int blksize; + int clen; + int alen; + int nfrags; + + esp = x->data; + hdr_len = skb->h.raw - skb->data + + sizeof(*esph) + esp->conf.ivlen; + + /* Strip IP+ESP header. */ + __skb_pull(skb, hdr_len); + + /* Now skb is pure payload to encrypt */ + err = -ENOMEM; + + /* Round to block size */ + clen = skb->len; + + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) { + goto error; + } + + /* Fill padding... */ + do { + int i; + for (i=0; i<clen-skb->len - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len); + esph = (struct ipv6_esp_hdr *)skb->h.raw; + top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph)); + *(u8*)(trailer->tail - 1) = *skb->nh.raw; + *skb->nh.raw = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist *sg = &esp->sgbuf[0]; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + err = 0; + +error: + return err; +} + +static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + struct ipv6_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen; + + int hdr_len = skb->h.raw - skb->nh.raw; + int nfrags; + unsigned char *tmp_hdr = NULL; + int ret = 0; + + if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { + ret = -EINVAL; + goto out_nofree; + } + + if (elen <= 0 || (elen & (blksize-1))) { + ret = -EINVAL; + goto out_nofree; + } + + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) { + ret = -ENOMEM; + goto out_nofree; + } + memcpy(tmp_hdr, skb->nh.raw, hdr_len); + + /* If integrity check is required, do this. */ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + ret = -EINVAL; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ipv6_esp_hdr*)skb->data; + iph = skb->nh.ipv6h; + + /* Get ivec. This can be wrong, check against another impls. */ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist *sg = &esp->sgbuf[0]; + u8 padlen; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) { + ret = -ENOMEM; + goto out; + } + } + skb_to_sgvec(skb, sg, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) { + LIMIT_NETDEBUG( + printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen)); + ret = -EINVAL; + goto out; + } + /* ... check padding bits here. Silly. :-) */ + + pskb_trim(skb, skb->len - alen - padlen - 2); + skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ret = nexthdr[1]; + } + +out: + kfree(tmp_hdr); +out_nofree: + return ret; +} + +static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. */ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_full_len; +} + +static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(esph->spi), NIP6(iph->daddr)); + xfrm_state_put(x); +} + +static void esp6_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +static int esp6_init_state(struct xfrm_state *x, void *args) +{ + struct esp_data *esp = NULL; + + /* null auth and encryption can have zero length keys */ + if (x->aalg) { + if (x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + if (x->encap) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { + printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen) { + esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL); + if (unlikely(esp->conf.ivec == NULL)) + goto error; + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + } + if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len)) + goto error; + x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = esp; + return 0; + +error: + x->data = esp; + esp6_destroy(x); + x->data = NULL; + return -EINVAL; +} + +static struct xfrm_type esp6_type = +{ + .description = "ESP6", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp6_init_state, + .destructor = esp6_destroy, + .get_max_size = esp6_get_max_size, + .input = esp6_input, + .output = esp6_output +}; + +static struct inet6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .err_handler = esp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init esp6_init(void) +{ + if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + xfrm_unregister_type(&esp6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit esp6_fini(void) +{ + if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); +} + +module_init(esp6_init); +module_exit(esp6_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c new file mode 100644 index 000000000000..e0839eafc3a9 --- /dev/null +++ b/net/ipv6/exthdrs.c @@ -0,0 +1,575 @@ +/* + * Extension Header handling for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Andi Kleen <ak@muc.de> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * $Id: exthdrs.c,v 1.13 2001/06/19 15:58:56 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * yoshfuji : ensure not to overrun while parsing + * tlv options. + * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). + * YOSHIFUJI Hideaki @USAGI Register inbound extension header + * handlers as inet6_protocol{}. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> + +#include <asm/uaccess.h> + +/* + * Parsing tlv encoded headers. + * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. + */ + +struct tlvtype_proc { + int type; + int (*func)(struct sk_buff *skb, int offset); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +{ + switch ((skb->nh.raw[optoff] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); + return 0; + }; + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +{ + struct tlvtype_proc *curr; + int off = skb->h.raw - skb->nh.raw; + int len = ((skb->h.raw[1]+1)<<3); + + if ((skb->h.raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = skb->nh.raw[off+1]+2; + + switch (skb->nh.raw[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + if (optlen > len) + goto bad; + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == skb->nh.raw[off]) { + /* type specific length/alignment + checks will be performed in the + func(). */ + if (curr->func(skb, off) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, off) == 0) + return 0; + } + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; +bad: + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. + *****************************/ + +static struct tlvtype_proc tlvprocdestopt_lst[] = { + /* No destination options are defined now */ + {-1, NULL} +}; + +static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct inet6_skb_parm *opt = IP6CB(skb); + + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || + !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + opt->dst1 = skb->h.raw - skb->nh.raw; + + if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { + skb->h.raw += ((skb->h.raw[1]+1)<<3); + *nhoffp = opt->dst1; + return 1; + } + + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + return -1; +} + +static struct inet6_protocol destopt_protocol = { + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_destopt_init(void) +{ + if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0) + printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n"); +} + +/******************************** + NONE header. No data in packet. + ********************************/ + +static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + + kfree_skb(skb); + return 0; +} + +static struct inet6_protocol nodata_protocol = { + .handler = ipv6_nodata_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_nodata_init(void) +{ + if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0) + printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n"); +} + +/******************************** + Routing header. + ********************************/ + +static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct inet6_skb_parm *opt = IP6CB(skb); + struct in6_addr *addr; + struct in6_addr daddr; + int n, i; + + struct ipv6_rt_hdr *hdr; + struct rt0_hdr *rthdr; + + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || + !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + hdr = (struct ipv6_rt_hdr *) skb->h.raw; + + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr) || + skb->pkt_type != PACKET_HOST) { + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + +looped_back: + if (hdr->segments_left == 0) { + opt->srcrt = skb->h.raw - skb->nh.raw; + skb->h.raw += (hdr->hdrlen + 1) << 3; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + *nhoffp = (&hdr->nexthdr) - skb->nh.raw; + return 1; + } + + if (hdr->type != IPV6_SRCRT_TYPE_0) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb->nh.raw); + return -1; + } + + if (hdr->hdrlen & 0x01) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw); + return -1; + } + + /* + * This is the routing header forwarding algorithm from + * RFC 2460, page 16. + */ + + n = hdr->hdrlen >> 1; + + if (hdr->segments_left > n) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->segments_left) - skb->nh.raw); + return -1; + } + + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. + */ + if (skb_cloned(skb)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); + kfree_skb(skb); + /* the copy is a forwarded packet */ + if (skb2 == NULL) { + IP6_INC_STATS_BH(IPSTATS_MIB_OUTDISCARDS); + return -1; + } + *skbp = skb = skb2; + opt = IP6CB(skb2); + hdr = (struct ipv6_rt_hdr *) skb2->h.raw; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + + i = n - --hdr->segments_left; + + rthdr = (struct rt0_hdr *) hdr; + addr = rthdr->addr; + addr += i - 1; + + if (ipv6_addr_is_multicast(addr)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + + ipv6_addr_copy(&daddr, addr); + ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr); + + dst_release(xchg(&skb->dst, NULL)); + ip6_route_input(skb); + if (skb->dst->error) { + skb_push(skb, skb->data - skb->nh.raw); + dst_input(skb); + return -1; + } + + if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb->nh.ipv6h->hop_limit <= 1) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + kfree_skb(skb); + return -1; + } + skb->nh.ipv6h->hop_limit--; + goto looped_back; + } + + skb_push(skb, skb->data - skb->nh.raw); + dst_input(skb); + return -1; +} + +static struct inet6_protocol rthdr_protocol = { + .handler = ipv6_rthdr_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_rthdr_init(void) +{ + if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0) + printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n"); +}; + +/* + This function inverts received rthdr. + NOTE: specs allow to make it automatically only if + packet authenticated. + + I will not discuss it here (though, I am really pissed off at + this stupid requirement making rthdr idea useless) + + Actually, it creates severe problems for us. + Embryonic requests has no associated sockets, + so that user have no control over it and + cannot not only to set reply options, but + even to know, that someone wants to connect + without success. :-( + + For now we need to test the engine, so that I created + temporary (or permanent) backdoor. + If listening socket set IPV6_RTHDR to 2, then we invert header. + --ANK (980729) + */ + +struct ipv6_txoptions * +ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) +{ + /* Received rthdr: + + [ H1 -> H2 -> ... H_prev ] daddr=ME + + Inverted result: + [ H_prev -> ... -> H1 ] daddr =sender + + Note, that IP output engine will rewrite this rthdr + by rotating it left by one addr. + */ + + int n, i; + struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr; + struct rt0_hdr *irthdr; + struct ipv6_txoptions *opt; + int hdrlen = ipv6_optlen(hdr); + + if (hdr->segments_left || + hdr->type != IPV6_SRCRT_TYPE_0 || + hdr->hdrlen & 0x01) + return NULL; + + n = hdr->hdrlen >> 1; + opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC); + if (opt == NULL) + return NULL; + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + hdrlen; + opt->srcrt = (void*)(opt+1); + opt->opt_nflen = hdrlen; + + memcpy(opt->srcrt, hdr, sizeof(*hdr)); + irthdr = (struct rt0_hdr*)opt->srcrt; + /* Obsolete field, MBZ, when originated by us */ + irthdr->bitmap = 0; + opt->srcrt->segments_left = n; + for (i=0; i<n; i++) + memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16); + return opt; +} + +/********************************** + Hop-by-hop options. + **********************************/ + +/* Router Alert as of RFC 2711 */ + +static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +{ + if (skb->nh.raw[optoff+1] == 2) { + IP6CB(skb)->ra = optoff; + return 1; + } + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); + kfree_skb(skb); + return 0; +} + +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +{ + u32 pkt_len; + + if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + pkt_len = ntohl(*(u32*)(skb->nh.raw+optoff+2)); + if (pkt_len <= IPV6_MAXPLEN) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); + return 0; + } + if (skb->nh.ipv6h->payload_len) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { + __pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +static struct tlvtype_proc tlvprochopopt_lst[] = { + { + .type = IPV6_TLV_ROUTERALERT, + .func = ipv6_hop_ra, + }, + { + .type = IPV6_TLV_JUMBO, + .func = ipv6_hop_jumbo, + }, + { -1, } +}; + +int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff) +{ + IP6CB(skb)->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) + return sizeof(struct ipv6hdr); + return -1; +} + +/* + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. + */ + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; + + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c new file mode 100644 index 000000000000..6dda815c013f --- /dev/null +++ b/net/ipv6/exthdrs_core.c @@ -0,0 +1,109 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ +#include <net/ipv6.h> + +/* + * find out if nexthdr is a well-known extension header or a protocol + */ + +int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return ( (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST) ); +} + +/* + * Skip any extension headers. This is used by the ICMP module. + * + * Note that strictly speaking this conflicts with RFC 2460 4.0: + * ...The contents and semantics of each extension header determine whether + * or not to proceed to the next header. Therefore, extension headers must + * be processed strictly in the order they appear in the packet; a + * receiver must not, for example, scan through a packet looking for a + * particular kind of extension header and process that header prior to + * processing all preceding ones. + * + * We do exactly this. This is a protocol bug. We can't decide after a + * seeing an unknown discard-with-error flavour TLV option if it's a + * ICMP error message or not (errors should never be send in reply to + * ICMP error messages). + * + * But I see no other way to do this. This might need to be reexamined + * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + * + * --ANK (980726) + */ + +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + return -1; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + BUG(); + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -1; + + if (ntohs(*fp) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ipv6_skip_exthdr); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c new file mode 100644 index 000000000000..87b9082ceab2 --- /dev/null +++ b/net/ipv6/icmp.c @@ -0,0 +1,822 @@ +/* + * Internet Control Message Protocol (ICMPv6) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: icmp.c,v 1.38 2002/02/08 03:57:19 davem Exp $ + * + * Based on net/ipv4/icmp.c + * + * RFC 1885 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + * Andi Kleen add rate limits. never reply to a icmp. + * add more length checks and other fixes. + * yoshfuji : ensure to sent parameter problem for + * fragments. + * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. + * Randy Dunlap and + * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support + * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data + */ + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/in.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/init.h> + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/icmpv6.h> + +#include <net/ip.h> +#include <net/sock.h> + +#include <net/ipv6.h> +#include <net/ip6_checksum.h> +#include <net/protocol.h> +#include <net/raw.h> +#include <net/rawv6.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/icmp.h> + +#include <asm/uaccess.h> +#include <asm/system.h> + +DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. + */ +static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL; +#define icmpv6_socket __get_cpu_var(__icmpv6_socket) + +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); + +static struct inet6_protocol icmpv6_protocol = { + .handler = icmpv6_rcv, + .flags = INET6_PROTO_FINAL, +}; + +static __inline__ int icmpv6_xmit_lock(void) +{ + local_bh_disable(); + + if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.slock))) { + /* This can happen if the output path (f.e. SIT or + * ip6ip6 tunnel) signals dst_link_failure() for an + * outgoing ICMP6 packet. + */ + local_bh_enable(); + return 1; + } + return 0; +} + +static __inline__ void icmpv6_xmit_unlock(void) +{ + spin_unlock_bh(&icmpv6_socket->sk->sk_lock.slock); +} + +/* + * Slightly more convenient version of icmpv6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) +{ + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); + kfree_skb(skb); +} + +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. in the middle of some exthdr) + * + * --ANK (980726) + */ + +static int is_ineligible(struct sk_buff *skb) +{ + int ptr = (u8*)(skb->nh.ipv6h+1) - skb->data; + int len = skb->len - ptr; + __u8 nexthdr = skb->nh.ipv6h->nexthdr; + + if (len < 0) + return 1; + + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, len); + if (ptr < 0) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + u8 _type, *tp; + tp = skb_header_pointer(skb, + ptr+offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + if (tp == NULL || + !(*tp & ICMPV6_INFOMSG_MASK)) + return 1; + } + return 0; +} + +static int sysctl_icmpv6_time = 1*HZ; + +/* + * Check the ICMP output rate limit + */ +static inline int icmpv6_xrlim_allow(struct sock *sk, int type, + struct flowi *fl) +{ + struct dst_entry *dst; + int res = 0; + + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return 1; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return 1; + + /* + * Look up the output route. + * XXX: perhaps the expire for routing entries cloned by + * this lookup should be more aggressive (not longer than timeout). + */ + dst = ip6_route_output(sk, fl); + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + res = 1; + } else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = sysctl_icmpv6_time; + + /* Give more bandwidth to wider prefixes. */ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + res = xrlim_allow(dst, tmo); + } + dst_release(dst); + return res; +} + +/* + * an inline helper for the "simple" if statement below + * checks if parameter problem report is caused by an + * unrecognized IPv6 option that has the Option Type + * highest-order two bits set to 10 + */ + +static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +{ + u8 _optval, *op; + + offset += skb->nh.raw - skb->data; + op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); + if (op == NULL) + return 1; + return (*op & 0xC0) == 0x80; +} + +static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) +{ + struct sk_buff *skb; + struct icmp6hdr *icmp6h; + int err = 0; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + icmp6h = (struct icmp6hdr*) skb->h.raw; + memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); + icmp6h->icmp6_cksum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), skb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, + skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + icmp6h->icmp6_cksum = tmp_csum; + } + if (icmp6h->icmp6_cksum == 0) + icmp6h->icmp6_cksum = -1; + ip6_push_pending_frames(sk); +out: + return err; +} + +struct icmpv6_msg { + struct sk_buff *skb; + int offset; +}; + +static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct icmpv6_msg *msg = (struct icmpv6_msg *) from; + struct sk_buff *org_skb = msg->skb; + __u32 csum = 0; + + csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, + to, len, csum); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + +/* + * Send an ICMP message in response to a packet in error + */ +void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, + struct net_device *dev) +{ + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct sock *sk = icmpv6_socket->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *saddr = NULL; + struct dst_entry *dst; + struct icmp6hdr tmp_hdr; + struct flowi fl; + struct icmpv6_msg msg; + int iif = 0; + int addr_type = 0; + int len; + int hlimit; + int err = 0; + + if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail) + return; + + /* + * Make sure we respect the rules + * i.e. RFC 1885 2.4(e) + * Rule (e.1) is enforced by not using icmpv6_send + * in any code that processes icmp errors. + */ + addr_type = ipv6_addr_type(&hdr->daddr); + + if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0)) + saddr = &hdr->daddr; + + /* + * Dest addr check + */ + + if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) { + if (type != ICMPV6_PKT_TOOBIG && + !(type == ICMPV6_PARAMPROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) + return; + + saddr = NULL; + } + + addr_type = ipv6_addr_type(&hdr->saddr); + + /* + * Source addr check + */ + + if (addr_type & IPV6_ADDR_LINKLOCAL) + iif = skb->dev->ifindex; + + /* + * Must not send if we know that source is Anycast also. + * for now we don't know that. + */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n")); + return; + } + + /* + * Never answer to a ICMP packet. + */ + if (is_ineligible(skb)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); + return; + } + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); + fl.oif = iif; + fl.fl_icmp_type = type; + fl.fl_icmp_code = code; + + if (icmpv6_xmit_lock()) + return; + + if (!icmpv6_xrlim_allow(sk, type, &fl)) + goto out; + + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; + tmp_hdr.icmp6_cksum = 0; + tmp_hdr.icmp6_pointer = htonl(info); + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out_dst_release; + + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + msg.skb = skb; + msg.offset = skb->nh.raw - skb->data; + + len = skb->len - msg.offset; + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + if (len < 0) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmp: len problem\n")); + goto out_dst_release; + } + + idev = in6_dev_get(skb->dev); + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + hlimit, NULL, &fl, (struct rt6_info*)dst, + MSG_DONTWAIT); + if (err) { + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr)); + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_OUTDESTUNREACHS, type - ICMPV6_DEST_UNREACH); + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(); +} + +static void icmpv6_echo_reply(struct sk_buff *skb) +{ + struct sock *sk = icmpv6_socket->sk; + struct inet6_dev *idev; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *saddr = NULL; + struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw; + struct icmp6hdr tmp_hdr; + struct flowi fl; + struct icmpv6_msg msg; + struct dst_entry *dst; + int err = 0; + int hlimit; + + saddr = &skb->nh.ipv6h->daddr; + + if (!ipv6_unicast_destination(skb)) + saddr = NULL; + + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); + tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); + fl.oif = skb->dev->ifindex; + fl.fl_icmp_type = ICMPV6_ECHO_REPLY; + + if (icmpv6_xmit_lock()) + return; + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out_dst_release; + + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + idev = in6_dev_get(skb->dev); + + msg.skb = skb; + msg.offset = 0; + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, NULL, &fl, + (struct rt6_info*)dst, MSG_DONTWAIT); + + if (err) { + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTECHOREPLIES); + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(); +} + +static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) +{ + struct in6_addr *saddr, *daddr; + struct inet6_protocol *ipprot; + struct sock *sk; + int inner_offset; + int hash; + u8 nexthdr; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return; + + nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; + if (ipv6_ext_hdr(nexthdr)) { + /* now skip over extension headers */ + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, skb->len - sizeof(struct ipv6hdr)); + if (inner_offset<0) + return; + } else { + inner_offset = sizeof(struct ipv6hdr); + } + + /* Checkin header including 8 bytes of inner protocol header. */ + if (!pskb_may_pull(skb, inner_offset+8)) + return; + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. + --ANK (980726) + */ + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + rcu_read_lock(); + ipprot = rcu_dereference(inet6_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, NULL, type, code, inner_offset, info); + rcu_read_unlock(); + + read_lock(&raw_v6_lock); + if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { + while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { + rawv6_err(sk, skb, NULL, type, code, inner_offset, info); + sk = sk_next(sk); + } + } + read_unlock(&raw_v6_lock); +} + +/* + * Handle icmp messages + */ + +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); + struct in6_addr *saddr, *daddr; + struct ipv6hdr *orig_hdr; + struct icmp6hdr *hdr; + int type; + + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS); + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + + /* Perform checksum. */ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ICMPv6 hw checksum failed\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed == CHECKSUM_NONE) { + if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb_checksum(skb, 0, skb->len, 0))) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", + NIP6(*saddr), NIP6(*daddr))); + goto discard_it; + } + } + + if (!pskb_pull(skb, sizeof(struct icmp6hdr))) + goto discard_it; + + hdr = (struct icmp6hdr *) skb->h.raw; + + type = hdr->icmp6_type; + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INDESTUNREACHS, type - ICMPV6_DEST_UNREACH); + else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INECHOS, type - ICMPV6_ECHO_REQUEST); + + switch (type) { + case ICMPV6_ECHO_REQUEST: + icmpv6_echo_reply(skb); + break; + + case ICMPV6_ECHO_REPLY: + /* we couldn't care less */ + break; + + case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto discard_it; + hdr = (struct icmp6hdr *) skb->h.raw; + orig_hdr = (struct ipv6hdr *) (hdr + 1); + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, + ntohl(hdr->icmp6_mtu)); + + /* + * Drop through to notify + */ + + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + break; + + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + ndisc_rcv(skb); + break; + + case ICMPV6_MGM_QUERY: + igmp6_event_query(skb); + break; + + case ICMPV6_MGM_REPORT: + igmp6_event_report(skb); + break; + + case ICMPV6_MGM_REDUCTION: + case ICMPV6_NI_QUERY: + case ICMPV6_NI_REPLY: + case ICMPV6_MLD2_REPORT: + case ICMPV6_DHAAD_REQUEST: + case ICMPV6_DHAAD_REPLY: + case ICMPV6_MOBILE_PREFIX_SOL: + case ICMPV6_MOBILE_PREFIX_ADV: + break; + + default: + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6: msg of unknown type\n")); + + /* informational */ + if (type & ICMPV6_INFOMSG_MASK) + break; + + /* + * error of unknown type. + * must pass to upper level + */ + + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + }; + kfree_skb(skb); + return 0; + +discard_it: + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); + kfree_skb(skb); + return 0; +} + +int __init icmpv6_init(struct net_proto_family *ops) +{ + struct sock *sk; + int err, i, j; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, + &per_cpu(__icmpv6_socket, i)); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the ICMP6 control socket " + "(err %d).\n", + err); + goto fail; + } + + sk = per_cpu(__icmpv6_socket, i)->sk; + sk->sk_allocation = GFP_ATOMIC; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. + */ + sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + + sk->sk_prot->unhash(sk); + } + + + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) { + printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + err = -EAGAIN; + goto fail; + } + + return 0; + + fail: + for (j = 0; j < i; j++) { + if (!cpu_possible(j)) + continue; + sock_release(per_cpu(__icmpv6_socket, j)); + } + + return err; +} + +void icmpv6_cleanup(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + sock_release(per_cpu(__icmpv6_socket, i)); + } + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); +} + +static struct icmp6_err { + int err; + int fatal; +} tab_unreach[] = { + { /* NOROUTE */ + .err = ENETUNREACH, + .fatal = 0, + }, + { /* ADM_PROHIBITED */ + .err = EACCES, + .fatal = 1, + }, + { /* Was NOT_NEIGHBOUR, now reserved */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* ADDR_UNREACH */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* PORT_UNREACH */ + .err = ECONNREFUSED, + .fatal = 1, + }, +}; + +int icmpv6_err_convert(int type, int code, int *err) +{ + int fatal = 0; + + *err = EPROTO; + + switch (type) { + case ICMPV6_DEST_UNREACH: + fatal = 1; + if (code <= ICMPV6_PORT_UNREACH) { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } + break; + + case ICMPV6_PKT_TOOBIG: + *err = EMSGSIZE; + break; + + case ICMPV6_PARAMPROB: + *err = EPROTO; + fatal = 1; + break; + + case ICMPV6_TIME_EXCEED: + *err = EHOSTUNREACH; + break; + }; + + return fatal; +} + +#ifdef CONFIG_SYSCTL +ctl_table ipv6_icmp_table[] = { + { + .ctl_name = NET_IPV6_ICMP_RATELIMIT, + .procname = "ratelimit", + .data = &sysctl_icmpv6_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 }, +}; +#endif + diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c new file mode 100644 index 000000000000..405740b75abb --- /dev/null +++ b/net/ipv6/ip6_fib.c @@ -0,0 +1,1225 @@ +/* + * Linux INET6 implementation + * Forwarding Information Database + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + */ +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/init.h> + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#endif + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> + +#include <net/ip6_fib.h> +#include <net/ip6_route.h> + +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + +struct rt6_statistics rt6_stats; + +static kmem_cache_t * fib6_node_kmem; + +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +DEFINE_RWLOCK(fib6_walker_lock); + + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#define SUBTREE(fn) ((fn)->subtree) +#else +#define FWS_INIT FWS_L +#define SUBTREE(fn) NULL +#endif + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt); +static struct fib6_node * fib6_repair_tree(struct fib6_node *fn); + +/* + * A routing update causes an increase of the serial number on the + * affected subtree. This allows for cached routes to be asynchronously + * tested when modifications are made to the destination cache as a + * result of redirects, path MTU changes, etc. + */ + +static __u32 rt_sernum; + +static struct timer_list ip6_fib_timer = TIMER_INITIALIZER(fib6_run_gc, 0, 0); + +struct fib6_walker_t fib6_walker_list = { + .prev = &fib6_walker_list, + .next = &fib6_walker_list, +}; + +#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) + +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if ((__s32)n <= 0) + rt_sernum = n = 1; + return n; +} + +/* + * Auxiliary address test functions for the radix tree. + * + * These assume a 32bit processor (although it will work on + * 64bit processors) + */ + +/* + * test bit + */ + +static __inline__ int addr_bit_set(void *token, int fn_bit) +{ + __u32 *addr = token; + + return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; +} + +/* + * find the first different bit between two addresses + * length of address must be a multiple of 32bits + */ + +static __inline__ int addr_diff(void *token1, void *token2, int addrlen) +{ + __u32 *a1 = token1; + __u32 *a2 = token2; + int i; + + addrlen >>= 2; + + for (i = 0; i < addrlen; i++) { + __u32 xb; + + xb = a1[i] ^ a2[i]; + + if (xb) { + int j = 31; + + xb = ntohl(xb); + + while ((xb & (1 << j)) == 0) + j--; + + return (i * 32 + 31 - j); + } + } + + /* + * we should *never* get to this point since that + * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. + * --ANK (980803) + */ + + return addrlen<<5; +} + +static __inline__ struct fib6_node * node_alloc(void) +{ + struct fib6_node *fn; + + if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL) + memset(fn, 0, sizeof(struct fib6_node)); + + return fn; +} + +static __inline__ void node_free(struct fib6_node * fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static __inline__ void rt6_release(struct rt6_info *rt) +{ + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->u.dst); +} + + +/* + * Routing Table + * + * return the appropriate node for a routing tree "add" operation + * by either creating and inserting or by returning an existing + * node. + */ + +static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, + int addrlen, int plen, + int offset) +{ + struct fib6_node *fn, *in, *ln; + struct fib6_node *pn = NULL; + struct rt6key *key; + int bit; + int dir = 0; + __u32 sernum = fib6_new_sernum(); + + RT6_TRACE("fib6_add_1\n"); + + /* insert node in tree */ + + fn = root; + + do { + key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + goto insert_above; + + /* + * Exact match ? + */ + + if (plen == fn->fn_bit) { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_release(fn->leaf); + fn->leaf = NULL; + } + + fn->fn_sernum = sernum; + + return fn; + } + + /* + * We have more bits to go + */ + + /* Try to walk down on tree. */ + fn->fn_sernum = sernum; + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + } while (fn); + + /* + * We walked to the bottom of tree. + * Create new leaf node without children. + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + ln->fn_bit = plen; + + ln->parent = pn; + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + return ln; + + +insert_above: + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. + * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; + + /* find 1st bit in difference between the 2 addrs. + + See comment in addr_diff: bit may be an invalid value, + but if it is >= plen, the value is ignored in any case. + */ + + bit = addr_diff(addr, &key->addr, addrlen); + + /* + * (intermediate)[in] + * / \ + * (new leaf node)[ln] (old node)[fn] + */ + if (plen > bit) { + in = node_alloc(); + ln = node_alloc(); + + if (in == NULL || ln == NULL) { + if (in) + node_free(in); + if (ln) + node_free(ln); + return NULL; + } + + /* + * new intermediate node. + * RTN_RTINFO will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * in the other branch + */ + + in->fn_bit = bit; + + in->parent = pn; + in->leaf = fn->leaf; + atomic_inc(&in->leaf->rt6i_ref); + + in->fn_sernum = sernum; + + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; + + ln->fn_bit = plen; + + ln->parent = in; + fn->parent = in; + + ln->fn_sernum = sernum; + + if (addr_bit_set(addr, bit)) { + in->right = ln; + in->left = fn; + } else { + in->left = ln; + in->right = fn; + } + } else { /* plen <= bit */ + + /* + * (new leaf node)[ln] + * / \ + * (old node)[fn] NULL + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + + ln->fn_bit = plen; + + ln->parent = pn; + + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + if (addr_bit_set(&key->addr, plen)) + ln->right = fn; + else + ln->left = fn; + + fn->parent = ln; + } + return ln; +} + +/* + * Insert routing information in a node. + */ + +static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + struct nlmsghdr *nlh) +{ + struct rt6_info *iter = NULL; + struct rt6_info **ins; + + ins = &fn->leaf; + + if (fn->fn_flags&RTN_TL_ROOT && + fn->leaf == &ip6_null_entry && + !(rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ){ + fn->leaf = rt; + rt->u.next = NULL; + goto out; + } + + for (iter = fn->leaf; iter; iter=iter->u.next) { + /* + * Search for duplicates + */ + + if (iter->rt6i_metric == rt->rt6i_metric) { + /* + * Same priority level + */ + + if (iter->rt6i_dev == rt->rt6i_dev && + iter->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&iter->rt6i_gateway, + &rt->rt6i_gateway)) { + if (!(iter->rt6i_flags&RTF_EXPIRES)) + return -EEXIST; + iter->rt6i_expires = rt->rt6i_expires; + if (!(rt->rt6i_flags&RTF_EXPIRES)) { + iter->rt6i_flags &= ~RTF_EXPIRES; + iter->rt6i_expires = 0; + } + return -EEXIST; + } + } + + if (iter->rt6i_metric > rt->rt6i_metric) + break; + + ins = &iter->u.next; + } + + /* + * insert node + */ + +out: + rt->u.next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, nlh); + rt6_stats.fib_rt_entries++; + + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_stats.fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } + + return 0; +} + +static __inline__ void fib6_start_gc(struct rt6_info *rt) +{ + if (ip6_fib_timer.expires == 0 && + (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + +void fib6_force_start_gc(void) +{ + if (ip6_fib_timer.expires == 0) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + +/* + * Add routing information to the routing tree. + * <destination addr>/<source addr> + * with source addr info in sub-trees + */ + +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn; + int err = -ENOMEM; + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); + + if (fn == NULL) + goto out; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen) { + struct fib6_node *sn; + + if (fn->subtree == NULL) { + struct fib6_node *sfn; + + /* + * Create subtree. + * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + + /* Create subtree root node */ + sfn = node_alloc(); + if (sfn == NULL) + goto st_failure; + + sfn->leaf = &ip6_null_entry; + atomic_inc(&ip6_null_entry.rt6i_ref); + sfn->fn_flags = RTN_ROOT; + sfn->fn_sernum = fib6_new_sernum(); + + /* Now add the first leaf node to new subtree */ + + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. + */ + node_free(sfn); + goto st_failure; + } + + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + if (fn->leaf == NULL) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) + goto st_failure; + } + + fn = sn; + } +#endif + + err = fib6_add_rt2node(fn, rt, nlh); + + if (err == 0) { + fib6_start_gc(rt); + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + } + +out: + if (err) + dst_free(&rt->u.dst); + return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shoot it. + */ +st_failure: + if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + fib6_repair_tree(fn); + dst_free(&rt->u.dst); + return err; +#endif +} + +/* + * Routing tree lookup + * + */ + +struct lookup_args { + int offset; /* key offset on rt6_info */ + struct in6_addr *addr; /* search key */ +}; + +static struct fib6_node * fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) +{ + struct fib6_node *fn; + int dir; + + /* + * Descend on a tree + */ + + fn = root; + + for (;;) { + struct fib6_node *next; + + dir = addr_bit_set(args->addr, fn->fn_bit); + + next = dir ? fn->right : fn->left; + + if (next) { + fn = next; + continue; + } + + break; + } + + while ((fn->fn_flags & RTN_ROOT) == 0) { +#ifdef CONFIG_IPV6_SUBTREES + if (fn->subtree) { + struct fib6_node *st; + struct lookup_args *narg; + + narg = args + 1; + + if (narg->addr) { + st = fib6_lookup_1(fn->subtree, narg); + + if (st && !(st->fn_flags & RTN_ROOT)) + return st; + } + } +#endif + + if (fn->fn_flags & RTN_RTINFO) { + struct rt6key *key; + + key = (struct rt6key *) ((u8 *) fn->leaf + + args->offset); + + if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) + return fn; + } + + fn = fn->parent; + } + + return NULL; +} + +struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct lookup_args args[2]; + struct fib6_node *fn; + + args[0].offset = offsetof(struct rt6_info, rt6i_dst); + args[0].addr = daddr; + +#ifdef CONFIG_IPV6_SUBTREES + args[1].offset = offsetof(struct rt6_info, rt6i_src); + args[1].addr = saddr; +#endif + + fn = fib6_lookup_1(root, args); + + if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + fn = root; + + return fn; +} + +/* + * Get node with specified destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + struct in6_addr *daddr, int dst_len, + struct in6_addr *saddr, int src_len) +{ + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + offsetof(struct rt6_info, rt6i_dst)); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + BUG_TRAP(saddr!=NULL); + if (fn == NULL) + fn = fn->subtree; + if (fn) + fn = fib6_locate_1(fn, saddr, src_len, + offsetof(struct rt6_info, rt6i_src)); + } +#endif + + if (fn && fn->fn_flags&RTN_RTINFO) + return fn; + + return NULL; +} + + +/* + * Deletion + * + */ + +static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) +{ + if (fn->fn_flags&RTN_ROOT) + return &ip6_null_entry; + + while(fn) { + if(fn->left) + return fn->left->leaf; + + if(fn->right) + return fn->right->leaf; + + fn = SUBTREE(fn); + } + return NULL; +} + +/* + * Called to trim the tree of intermediate nodes when possible. "fn" + * is the node we want to try and remove. + */ + +static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) +{ + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; + + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; + + BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); + BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); + BUG_TRAP(fn->leaf==NULL); + + children = 0; + child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; + + if (children == 3 || SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. fn) may have one child */ + || (children && fn->fn_flags&RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(fn); +#if RT6_DEBUG >= 2 + if (fn->leaf==NULL) { + BUG_TRAP(fn->leaf); + fn->leaf = &ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return fn->parent; + } + + pn = fn->parent; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else BUG_TRAP(0); +#endif + if (child) + child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif + + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (child == NULL) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + } + } + } + } + read_unlock(&fib6_walker_lock); + + node_free(fn); + if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn)) + return pn; + + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } +} + +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, + struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + + RT6_TRACE("fib6_del_route\n"); + + /* Unlink it */ + *rtp = rt->u.next; + rt->rt6i_node = NULL; + rt6_stats.fib_rt_entries--; + rt6_stats.fib_discarded_routes++; + + /* Adjust walkers */ + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->u.next; + if (w->leaf == NULL) + w->state = FWS_U; + } + } + read_unlock(&fib6_walker_lock); + + rt->u.next = NULL; + + if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT) + fn->leaf = &ip6_null_entry; + + /* If it was last route, expunge its radix tree node */ + if (fn->leaf == NULL) { + fn->fn_flags &= ~RTN_RTINFO; + rt6_stats.fib_route_nodes--; + fn = fib6_repair_tree(fn); + } + + if (atomic_read(&rt->rt6i_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(fn); + atomic_inc(&fn->leaf->rt6i_ref); + rt6_release(rt); + } + fn = fn->parent; + } + /* No more references are possible at this point. */ + if (atomic_read(&rt->rt6i_ref) != 1) BUG(); + } + + inet6_rt_notify(RTM_DELROUTE, rt, nlh); + rt6_release(rt); +} + +int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; + +#if RT6_DEBUG >= 2 + if (rt->u.dst.obsolete>0) { + BUG_TRAP(fn==NULL); + return -ENOENT; + } +#endif + if (fn == NULL || rt == &ip6_null_entry) + return -ENOENT; + + BUG_TRAP(fn->fn_flags&RTN_RTINFO); + + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp, nlh, _rtattr); + return 0; + } + } + return -ENOENT; +} + +/* + * Tree traversal function. + * + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. + */ + +int fib6_walk_continue(struct fib6_walker_t *w) +{ + struct fib6_node *fn, *pn; + + for (;;) { + fn = w->node; + if (fn == NULL) + return 0; + + if (w->prune && fn != w->root && + fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + w->state = FWS_C; + w->leaf = fn->leaf; + } + switch (w->state) { +#ifdef CONFIG_IPV6_SUBTREES + case FWS_S: + if (SUBTREE(fn)) { + w->node = SUBTREE(fn); + continue; + } + w->state = FWS_L; +#endif + case FWS_L: + if (fn->left) { + w->node = fn->left; + w->state = FWS_INIT; + continue; + } + w->state = FWS_R; + case FWS_R: + if (fn->right) { + w->node = fn->right; + w->state = FWS_INIT; + continue; + } + w->state = FWS_C; + w->leaf = fn->leaf; + case FWS_C: + if (w->leaf && fn->fn_flags&RTN_RTINFO) { + int err = w->func(w); + if (err) + return err; + continue; + } + w->state = FWS_U; + case FWS_U: + if (fn == w->root) + return 0; + pn = fn->parent; + w->node = pn; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + w->state = FWS_L; + continue; + } +#endif + if (pn->left == fn) { + w->state = FWS_R; + continue; + } + if (pn->right == fn) { + w->state = FWS_C; + w->leaf = w->node->leaf; + continue; + } +#if RT6_DEBUG >= 2 + BUG_TRAP(0); +#endif + } + } +} + +int fib6_walk(struct fib6_walker_t *w) +{ + int res; + + w->state = FWS_INIT; + w->node = w->root; + + fib6_walker_link(w); + res = fib6_walk_continue(w); + if (res <= 0) + fib6_walker_unlink(w); + return res; +} + +static int fib6_clean_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = c->func(rt, c->arg); + if (res < 0) { + w->leaf = rt; + res = fib6_del(rt, NULL, NULL); + if (res) { +#if RT6_DEBUG >= 2 + printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); +#endif + continue; + } + return 0; + } + BUG_TRAP(res==0); + } + w->leaf = rt; + return 0; +} + +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. + */ + +void fib6_clean_tree(struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; + + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.func = func; + c.arg = arg; + + fib6_walk(&c.w); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone %p\n", rt); + return -1; + } + + return 0; +} + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) +{ + fib6_clean_tree(fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ + +static struct fib6_gc_args +{ + int timeout; + int more; +} gc_args; + +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; + + /* + * check addrconf expiration here. + * Routes are expired even if they are in use. + * + * Also age clones. Note, that clones are aged out + * only if they are not in use now. + */ + + if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if (time_after(now, rt->rt6i_expires)) { + RT6_TRACE("expiring %p\n", rt); + rt6_reset_dflt_pointer(rt); + return -1; + } + gc_args.more++; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->u.dst.__refcnt) == 0 && + time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { + RT6_TRACE("aging clone %p\n", rt); + return -1; + } else if ((rt->rt6i_flags & RTF_GATEWAY) && + (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } + gc_args.more++; + } + + return 0; +} + +static DEFINE_SPINLOCK(fib6_gc_lock); + +void fib6_run_gc(unsigned long dummy) +{ + if (dummy != ~0UL) { + spin_lock_bh(&fib6_gc_lock); + gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval; + } else { + local_bh_disable(); + if (!spin_trylock(&fib6_gc_lock)) { + mod_timer(&ip6_fib_timer, jiffies + HZ); + local_bh_enable(); + return; + } + gc_args.timeout = ip6_rt_gc_interval; + } + gc_args.more = 0; + + + write_lock_bh(&rt6_lock); + ndisc_dst_gc(&gc_args.more); + fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); + write_unlock_bh(&rt6_lock); + + if (gc_args.more) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); + else { + del_timer(&ip6_fib_timer); + ip6_fib_timer.expires = 0; + } + spin_unlock_bh(&fib6_gc_lock); +} + +void __init fib6_init(void) +{ + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!fib6_node_kmem) + panic("cannot create fib6_nodes cache"); +} + +void fib6_gc_cleanup(void) +{ + del_timer(&ip6_fib_timer); + kmem_cache_destroy(fib6_node_kmem); +} diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c new file mode 100644 index 000000000000..a93f6dc51979 --- /dev/null +++ b/net/ipv6/ip6_flowlabel.c @@ -0,0 +1,706 @@ +/* + * ip6_flowlabel.c IPv6 flowlabel manager. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in6.h> +#include <linux/route.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <net/sock.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/rawv6.h> +#include <net/icmp.h> +#include <net/transp_v6.h> + +#include <asm/uaccess.h> + +#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified + in old IPv6 RFC. Well, it was reasonable value. + */ +#define FL_MAX_LINGER 60 /* Maximal linger timeout */ + +/* FL hash table */ + +#define FL_MAX_PER_SOCK 32 +#define FL_MAX_SIZE 4096 +#define FL_HASH_MASK 255 +#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) + +static atomic_t fl_size = ATOMIC_INIT(0); +static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; + +static void ip6_fl_gc(unsigned long dummy); +static struct timer_list ip6_fl_gc_timer = TIMER_INITIALIZER(ip6_fl_gc, 0, 0); + +/* FL hash table lock: it protects only of GC */ + +static DEFINE_RWLOCK(ip6_fl_lock); + +/* Big socket sock */ + +static DEFINE_RWLOCK(ip6_sk_fl_lock); + + +static __inline__ struct ip6_flowlabel * __fl_lookup(u32 label) +{ + struct ip6_flowlabel *fl; + + for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + if (fl->label == label) + return fl; + } + return NULL; +} + +static struct ip6_flowlabel * fl_lookup(u32 label) +{ + struct ip6_flowlabel *fl; + + read_lock_bh(&ip6_fl_lock); + fl = __fl_lookup(label); + if (fl) + atomic_inc(&fl->users); + read_unlock_bh(&ip6_fl_lock); + return fl; +} + + +static void fl_free(struct ip6_flowlabel *fl) +{ + if (fl) + kfree(fl->opt); + kfree(fl); +} + +static void fl_release(struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_fl_lock); + + fl->lastuse = jiffies; + if (atomic_dec_and_test(&fl->users)) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (fl->opt && fl->share == IPV6_FL_S_EXCL) { + struct ipv6_txoptions *opt = fl->opt; + fl->opt = NULL; + kfree(opt); + } + if (!timer_pending(&ip6_fl_gc_timer) || + time_after(ip6_fl_gc_timer.expires, ttd)) + mod_timer(&ip6_fl_gc_timer, ttd); + } + + write_unlock_bh(&ip6_fl_lock); +} + +static void ip6_fl_gc(unsigned long dummy) +{ + int i; + unsigned long now = jiffies; + unsigned long sched = 0; + + write_lock(&ip6_fl_lock); + + for (i=0; i<=FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl=*flp) != NULL) { + if (atomic_read(&fl->users) == 0) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (time_after_eq(now, ttd)) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + if (!sched || time_before(ttd, sched)) + sched = ttd; + } + flp = &fl->next; + } + } + if (!sched && atomic_read(&fl_size)) + sched = now + FL_MAX_LINGER; + if (sched) { + ip6_fl_gc_timer.expires = sched; + add_timer(&ip6_fl_gc_timer); + } + write_unlock(&ip6_fl_lock); +} + +static int fl_intern(struct ip6_flowlabel *fl, __u32 label) +{ + fl->label = label & IPV6_FLOWLABEL_MASK; + + write_lock_bh(&ip6_fl_lock); + if (label == 0) { + for (;;) { + fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + if (fl->label) { + struct ip6_flowlabel *lfl; + lfl = __fl_lookup(fl->label); + if (lfl == NULL) + break; + } + } + } + + fl->lastuse = jiffies; + fl->next = fl_ht[FL_HASH(fl->label)]; + fl_ht[FL_HASH(fl->label)] = fl; + atomic_inc(&fl_size); + write_unlock_bh(&ip6_fl_lock); + return 0; +} + + + +/* Socket flowlabel lists */ + +struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) +{ + struct ipv6_fl_socklist *sfl; + struct ipv6_pinfo *np = inet6_sk(sk); + + label &= IPV6_FLOWLABEL_MASK; + + for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + struct ip6_flowlabel *fl = sfl->fl; + if (fl->label == label) { + fl->lastuse = jiffies; + atomic_inc(&fl->users); + return fl; + } + } + return NULL; +} + +void fl6_free_socklist(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + + while ((sfl = np->ipv6_fl_list) != NULL) { + np->ipv6_fl_list = sfl->next; + fl_release(sfl->fl); + kfree(sfl); + } +} + +/* Service routines */ + + +/* + It is the only difficult place. flowlabel enforces equal headers + before and including routing header, however user may supply options + following rthdr. + */ + +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, + struct ip6_flowlabel * fl, + struct ipv6_txoptions * fopt) +{ + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + + if (fl_opt != NULL) { + opt_space->hopopt = fl_opt->hopopt; + opt_space->dst0opt = fl_opt->dst0opt; + opt_space->srcrt = fl_opt->srcrt; + opt_space->opt_nflen = fl_opt->opt_nflen; + } else { + if (fopt->opt_nflen == 0) + return fopt; + opt_space->hopopt = NULL; + opt_space->dst0opt = NULL; + opt_space->srcrt = NULL; + opt_space->opt_nflen = 0; + } + opt_space->dst1opt = fopt->dst1opt; + opt_space->auth = fopt->auth; + opt_space->opt_flen = fopt->opt_flen; + return opt_space; +} + +static unsigned long check_linger(unsigned long ttl) +{ + if (ttl < FL_MIN_LINGER) + return FL_MIN_LINGER*HZ; + if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) + return 0; + return ttl*HZ; +} + +static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) +{ + linger = check_linger(linger); + if (!linger) + return -EPERM; + expires = check_linger(expires); + if (!expires) + return -EPERM; + fl->lastuse = jiffies; + if (time_before(fl->linger, linger)) + fl->linger = linger; + if (time_before(expires, fl->linger)) + expires = fl->linger; + if (time_before(fl->expires, fl->lastuse + expires)) + fl->expires = fl->lastuse + expires; + return 0; +} + +static struct ip6_flowlabel * +fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *err_p) +{ + struct ip6_flowlabel *fl; + int olen; + int addr_type; + int err; + + err = -ENOMEM; + fl = kmalloc(sizeof(*fl), GFP_KERNEL); + if (fl == NULL) + goto done; + memset(fl, 0, sizeof(*fl)); + + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + if (olen > 0) { + struct msghdr msg; + struct flowi flowi; + int junk; + + err = -ENOMEM; + fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); + if (fl->opt == NULL) + goto done; + + memset(fl->opt, 0, sizeof(*fl->opt)); + fl->opt->tot_len = sizeof(*fl->opt) + olen; + err = -EFAULT; + if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + goto done; + + msg.msg_controllen = olen; + msg.msg_control = (void*)(fl->opt+1); + flowi.oif = 0; + + err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk); + if (err) + goto done; + err = -EINVAL; + if (fl->opt->opt_flen) + goto done; + if (fl->opt->opt_nflen == 0) { + kfree(fl->opt); + fl->opt = NULL; + } + } + + fl->expires = jiffies; + err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); + if (err) + goto done; + fl->share = freq->flr_share; + addr_type = ipv6_addr_type(&freq->flr_dst); + if ((addr_type&IPV6_ADDR_MAPPED) + || addr_type == IPV6_ADDR_ANY) + goto done; + ipv6_addr_copy(&fl->dst, &freq->flr_dst); + atomic_set(&fl->users, 1); + switch (fl->share) { + case IPV6_FL_S_EXCL: + case IPV6_FL_S_ANY: + break; + case IPV6_FL_S_PROCESS: + fl->owner = current->pid; + break; + case IPV6_FL_S_USER: + fl->owner = current->euid; + break; + default: + err = -EINVAL; + goto done; + } + return fl; + +done: + fl_free(fl); + *err_p = err; + return NULL; +} + +static int mem_check(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + int room = FL_MAX_SIZE - atomic_read(&fl_size); + int count = 0; + + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) + return 0; + + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + count++; + + if (room <= 0 || + ((count >= FL_MAX_PER_SOCK || + (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) + && !capable(CAP_NET_ADMIN))) + return -ENOBUFS; + + return 0; +} + +static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +{ + if (h1 == h2) + return 0; + if (h1 == NULL || h2 == NULL) + return 1; + if (h1->hdrlen != h2->hdrlen) + return 1; + return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1)); +} + +static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +{ + if (o1 == o2) + return 0; + if (o1 == NULL || o2 == NULL) + return 1; + if (o1->opt_nflen != o2->opt_nflen) + return 1; + if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) + return 1; + if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) + return 1; + if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) + return 1; + return 0; +} + +int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +{ + int err; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_flowlabel_req freq; + struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl, **sflp; + struct ip6_flowlabel *fl; + + if (optlen < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + write_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + write_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree(sfl); + return 0; + } + } + write_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; + + case IPV6_FL_A_RENEW: + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); + read_unlock_bh(&ip6_sk_fl_lock); + return err; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) { + fl = fl_lookup(freq.flr_label); + if (fl) { + err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); + fl_release(fl); + return err; + } + } + return -ESRCH; + + case IPV6_FL_A_GET: + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + + fl = fl_create(&freq, optval, optlen, &err); + if (fl == NULL) + return err; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq.flr_label) { + struct ip6_flowlabel *fl1 = NULL; + + err = -EEXIST; + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_flags&IPV6_FL_F_EXCL) { + read_unlock_bh(&ip6_sk_fl_lock); + goto done; + } + fl1 = sfl->fl; + atomic_inc(&fl->users); + break; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (fl1 == NULL) + fl1 = fl_lookup(freq.flr_label); + if (fl1) { + err = -EEXIST; + if (freq.flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + fl1->owner != fl->owner) + goto release; + + err = -EINVAL; + if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || + ipv6_opt_cmp(fl1->opt, fl->opt)) + goto release; + + err = -ENOMEM; + if (sfl1 == NULL) + goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + write_lock_bh(&ip6_sk_fl_lock); + sfl1->fl = fl1; + sfl1->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl1; + write_unlock_bh(&ip6_sk_fl_lock); + fl_free(fl); + return 0; + +release: + fl_release(fl1); + goto done; + } + } + err = -ENOENT; + if (!(freq.flr_flags&IPV6_FL_F_CREATE)) + goto done; + + err = -ENOMEM; + if (sfl1 == NULL || (err = mem_check(sk)) != 0) + goto done; + + err = fl_intern(fl, freq.flr_label); + if (err) + goto done; + + /* Do not check for fault */ + if (!freq.flr_label) + copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label)); + + sfl1->fl = fl; + sfl1->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl1; + return 0; + + default: + return -EINVAL; + } + +done: + fl_free(fl); + kfree(sfl1); + return err; +} + +#ifdef CONFIG_PROC_FS + +struct ip6fl_iter_state { + int bucket; +}; + +#define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) + +static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) +{ + struct ip6_flowlabel *fl = NULL; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + + for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { + if (fl_ht[state->bucket]) { + fl = fl_ht[state->bucket]; + break; + } + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + + fl = fl->next; + while (!fl) { + if (++state->bucket <= FL_HASH_MASK) + fl = fl_ht[state->bucket]; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_flowlabel *fl = ip6fl_get_first(seq); + if (fl) + while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) + --pos; + return pos ? NULL : fl; +} + +static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&ip6_fl_lock); + return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_flowlabel *fl; + + if (v == SEQ_START_TOKEN) + fl = ip6fl_get_first(seq); + else + fl = ip6fl_get_next(seq, v); + ++*pos; + return fl; +} + +static void ip6fl_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ip6_fl_lock); +} + +static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + while(fl) { + seq_printf(seq, + "%05X %-1d %-6d %-6d %-6ld %-8ld " + "%02x%02x%02x%02x%02x%02x%02x%02x " + "%-4d\n", + (unsigned)ntohl(fl->label), + fl->share, + (unsigned)fl->owner, + atomic_read(&fl->users), + fl->linger/HZ, + (long)(fl->expires - jiffies)/HZ, + NIP6(fl->dst), + fl->opt ? fl->opt->opt_nflen : 0); + fl = fl->next; + } +} + +static int ip6fl_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Label S Owner Users Linger Expires " + "Dst Opt\n"); + else + ip6fl_fl_seq_show(seq, v); + return 0; +} + +static struct seq_operations ip6fl_seq_ops = { + .start = ip6fl_seq_start, + .next = ip6fl_seq_next, + .stop = ip6fl_seq_stop, + .show = ip6fl_seq_show, +}; + +static int ip6fl_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ip6fl_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ip6fl_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ip6fl_seq_fops = { + .owner = THIS_MODULE, + .open = ip6fl_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + + +void ip6_flowlabel_init(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_fops_create("ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops); +#endif +} + +void ip6_flowlabel_cleanup(void) +{ + del_timer(&ip6_fl_gc_timer); +#ifdef CONFIG_PROC_FS + proc_net_remove("ip6_flowlabel"); +#endif +} diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c new file mode 100644 index 000000000000..866f10726c58 --- /dev/null +++ b/net/ipv6/ip6_input.c @@ -0,0 +1,269 @@ +/* + * IPv6 input + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Ian P. Morris <I.P.Morris@soton.ac.uk> + * + * $Id: ip6_input.c,v 1.19 2000/12/13 18:31:50 davem Exp $ + * + * Based in linux/net/ipv4/ip_input.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* Changes + * + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/icmpv6.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/xfrm.h> + + + +static inline int ip6_rcv_finish( struct sk_buff *skb) +{ + if (skb->dst == NULL) + ip6_route_input(skb); + + return dst_input(skb); +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + IP6_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto out; + } + + /* + * Store incoming device index. When the packet will + * be queued, we cannot refer to skb->dev anymore. + * + * BTW, when we send a packet for our own local address on a + * non-loopback interface (e.g. ethX), it is being delivered + * via the loopback interface (lo) here; skb->dev = &loopback_dev. + * It, however, should be considered as if it is being + * arrived via the sending interface (ethX), because of the + * nature of scoping architecture. --yoshfuji + */ + IP6CB(skb)->iif = skb->dst ? ((struct rt6_info *)skb->dst)->rt6i_idev->dev->ifindex : dev->ifindex; + + if (skb->len < sizeof(struct ipv6hdr)) + goto err; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + hdr = skb->nh.ipv6h; + + if (hdr->version != 6) + goto err; + + pkt_len = ntohs(hdr->payload_len); + + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + goto truncated; + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + hdr = skb->nh.ipv6h; + } + + if (hdr->nexthdr == NEXTHDR_HOP) { + skb->h.raw = (u8*)(hdr+1); + if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + return 0; + } + hdr = skb->nh.ipv6h; + } + + return NF_HOOK(PF_INET6,NF_IP6_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish); +truncated: + IP6_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); +err: + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); +out: + return 0; +} + +/* + * Deliver the packet to the host + */ + + +static inline int ip6_input_finish(struct sk_buff *skb) +{ + struct inet6_protocol *ipprot; + struct sock *raw_sk; + unsigned int nhoff; + int nexthdr; + u8 hash; + + skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); + + /* + * Parse extension headers + */ + + nexthdr = skb->nh.ipv6h->nexthdr; + nhoff = offsetof(struct ipv6hdr, nexthdr); + + /* Skip hop-by-hop options, they are already parsed. */ + if (nexthdr == NEXTHDR_HOP) { + nhoff = sizeof(struct ipv6hdr); + nexthdr = skb->h.raw[0]; + skb->h.raw += (skb->h.raw[1]+1)<<3; + } + + rcu_read_lock(); +resubmit: + if (!pskb_pull(skb, skb->h.raw - skb->data)) + goto discard; + nexthdr = skb->nh.raw[nhoff]; + + raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); + if (raw_sk) + ipv6_raw_deliver(skb, nexthdr); + + hash = nexthdr & (MAX_INET_PROTOS - 1); + if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + int ret; + + if (ipprot->flags & INET6_PROTO_FINAL) { + struct ipv6hdr *hdr; + + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + hdr = skb->nh.ipv6h; + if (ipv6_addr_is_multicast(&hdr->daddr) && + !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, + &hdr->saddr) && + !ipv6_is_mld(skb, nexthdr)) + goto discard; + } + if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && + !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + ret = ipprot->handler(&skb, &nhoff); + if (ret > 0) + goto resubmit; + else if (ret == 0) + IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + } else { + if (!raw_sk) { + if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + } + } else { + IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + } + rcu_read_unlock(); + return 0; + +discard: + IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + + +int ip6_input(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET6,NF_IP6_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish); +} + +int ip6_mc_input(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + int deliver; + + IP6_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + + hdr = skb->nh.ipv6h; + deliver = likely(!(skb->dev->flags & (IFF_PROMISC|IFF_ALLMULTI))) || + ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + + /* + * IPv6 multicast router mode isnt currently supported. + */ +#if 0 + if (ipv6_config.multicast_route) { + int addr_type; + + addr_type = ipv6_addr_type(&hdr->daddr); + + if (!(addr_type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL))) { + struct sk_buff *skb2; + struct dst_entry *dst; + + dst = skb->dst; + + if (deliver) { + skb2 = skb_clone(skb, GFP_ATOMIC); + dst_output(skb2); + } else { + dst_output(skb); + return 0; + } + } + } +#endif + + if (likely(deliver)) { + ip6_input(skb); + return 0; + } + /* discard */ + kfree_skb(skb); + + return 0; +} diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c new file mode 100644 index 000000000000..49208ba75094 --- /dev/null +++ b/net/ipv6/ip6_output.c @@ -0,0 +1,1197 @@ +/* + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $ + * + * Based on linux/net/ipv4/ip_output.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * + * H. von Brand : Added missing #include <linux/string.h> + * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/in6.h> +#include <linux/tcp.h> +#include <linux/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/rawv6.h> +#include <net/icmp.h> +#include <net/xfrm.h> +#include <net/checksum.h> + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + +static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr) +{ + static u32 ipv6_fragmentation_id = 1; + static DEFINE_SPINLOCK(ip6_id_lock); + + spin_lock_bh(&ip6_id_lock); + fhdr->identification = htonl(ipv6_fragmentation_id); + if (++ipv6_fragmentation_id == 0) + ipv6_fragmentation_id = 1; + spin_unlock_bh(&ip6_id_lock); +} + +static inline int ip6_output_finish(struct sk_buff *skb) +{ + + struct dst_entry *dst = skb->dst; + struct hh_cache *hh = dst->hh; + + if (hh) { + int hh_alen; + + read_lock_bh(&hh->hh_lock); + hh_alen = HH_DATA_ALIGN(hh->hh_len); + memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); + return hh->hh_output(skb); + } else if (dst->neighbour) + return dst->neighbour->output(skb); + + IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; + +} + +/* dev_loopback_xmit for use with netfilter. */ +static int ip6_dev_loopback_xmit(struct sk_buff *newskb) +{ + newskb->mac.raw = newskb->data; + __skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + BUG_TRAP(newskb->dst); + + netif_rx(newskb); + return 0; +} + + +static int ip6_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct net_device *dev = dst->dev; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) { + struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL; + + if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && + ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr)) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + + /* Do not check for IFF_ALLMULTI; multicast routing + is not supported in any case. + */ + if (newskb) + NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL, + newskb->dev, + ip6_dev_loopback_xmit); + + if (skb->nh.ipv6h->hop_limit == 0) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + } + + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } + + return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); +} + +int ip6_output(struct sk_buff *skb) +{ + if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) + return ip6_fragment(skb, ip6_output2); + else + return ip6_output2(skb); +} + +#ifdef CONFIG_NETFILTER +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct ipv6hdr *iph = skb->nh.ipv6h; + struct dst_entry *dst; + struct flowi fl = { + .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; + + dst = ip6_route_output(skb->sk, &fl); + + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + dst_release(skb->dst); + + skb->dst = dst; + return 0; +} +#endif + +static inline int ip6_maybe_reroute(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER + if (skb->nfcache & NFC_ALTERED){ + if (ip6_route_me_harder(skb) != 0){ + kfree_skb(skb); + return -EINVAL; + } + } +#endif /* CONFIG_NETFILTER */ + return dst_output(skb); +} + +/* + * xmit an sk_buff (used by TCP) + */ + +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + struct ipv6_txoptions *opt, int ipfragok) +{ + struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL; + struct in6_addr *first_hop = &fl->fl6_dst; + struct dst_entry *dst = skb->dst; + struct ipv6hdr *hdr; + u8 proto = fl->proto; + int seg_len = skb->len; + int hlimit; + u32 mtu; + + if (opt) { + int head_room; + + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. + */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + kfree_skb(skb); + skb = skb2; + if (skb == NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return -ENOBUFS; + } + if (sk) + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } + + hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); + + /* + * Fill in the IPv6 header + */ + + *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel; + hlimit = -1; + if (np) + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + hdr->payload_len = htons(seg_len); + hdr->nexthdr = proto; + hdr->hop_limit = hlimit; + + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, first_hop); + + mtu = dst_mtu(dst); + if ((skb->len <= mtu) || ipfragok) { + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); + } + + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; +} + +/* + * To avoid extra problems ND packets are send through this + * routine. It's code duplication but I really want to avoid + * extra checks since ipv6_build_header is used by TCP (which + * is for us performance critical) + */ + +int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + int totlen; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + totlen = len + sizeof(struct ipv6hdr); + + hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + skb->nh.ipv6h = hdr; + + *(u32*)hdr = htonl(0x60000000); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = np->hop_limit; + + ipv6_addr_copy(&hdr->saddr, saddr); + ipv6_addr_copy(&hdr->daddr, daddr); + + return 0; +} + +static int ip6_call_ra_chain(struct sk_buff *skb, int sel) +{ + struct ip6_ra_chain *ra; + struct sock *last = NULL; + + read_lock(&ip6_ra_lock); + for (ra = ip6_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && ra->sel == sel) { + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + rawv6_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + rawv6_rcv(last, skb); + read_unlock(&ip6_ra_lock); + return 1; + } + read_unlock(&ip6_ra_lock); + return 0; +} + +static inline int ip6_forward_finish(struct sk_buff *skb) +{ + return dst_output(skb); +} + +int ip6_forward(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct inet6_skb_parm *opt = IP6CB(skb); + + if (ipv6_devconf.forwarding == 0) + goto error; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + IP6_INC_STATS(IPSTATS_MIB_INDISCARDS); + goto drop; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* + * We DO NOT make any processing on + * RA packets, pushing them to user level AS IS + * without ane WARRANTY that application will be able + * to interpret them. The reason is that we + * cannot make anything clever here. + * + * We are not end-node, so that if packet contains + * AH/ESP, we cannot make anything. + * Defragmentation also would be mistake, RA packets + * cannot be fragmented, because there is no warranty + * that different fragments will go along one path. --ANK + */ + if (opt->ra) { + u8 *ptr = skb->nh.raw + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + return 0; + } + + /* + * check and decrement ttl + */ + if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + + kfree_skb(skb); + return -ETIMEDOUT; + } + + if (!xfrm6_route_forward(skb)) { + IP6_INC_STATS(IPSTATS_MIB_INDISCARDS); + goto drop; + } + dst = skb->dst; + + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. + */ + if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) { + struct in6_addr *target = NULL; + struct rt6_info *rt; + struct neighbour *n = dst->neighbour; + + /* + * incoming and outgoing devices are the same + * send a redirect. + */ + + rt = (struct rt6_info *) dst; + if ((rt->rt6i_flags & RTF_GATEWAY)) + target = (struct in6_addr*)&n->primary_key; + else + target = &hdr->daddr; + + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (xrlim_allow(dst, 1*HZ)) + ndisc_send_redirect(skb, n, target); + } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK + |IPV6_ADDR_LINKLOCAL)) { + /* This check is security critical. */ + goto error; + } + + if (skb->len > dst_mtu(dst)) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev); + IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + goto drop; + } + + hdr = skb->nh.ipv6h; + + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); + +error: + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + dst_release(to->dst); + to->dst = dst_clone(from->dst); + to->dev = from->dev; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + /* Connection association is same as pre-frag packet */ + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); + to->nfctinfo = from->nfctinfo; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(to->nf_bridge); + to->nf_bridge = from->nf_bridge; + nf_bridge_get(to->nf_bridge); +#endif +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif +} + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); + unsigned int packet_len = skb->tail - skb->nh.raw; + int found_rhdr = 0; + *nexthdr = &skb->nh.ipv6h->nexthdr; + + while (offset + 1 <= packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: + if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1; + if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset; + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + default : + return offset; + } + } + + return offset; +} + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct net_device *dev; + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info*)skb->dst; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + u32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + + dev = rt->u.dst.dev; + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr); + + if (skb_shinfo(skb)->frag_list) { + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Correct socket ownership. */ + if (frag->sk == NULL) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; + /* BUILD HEADER */ + + tmp_hdr = kmalloc(hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return -ENOMEM; + } + + *prevhdr = NEXTHDR_FRAGMENT; + memcpy(tmp_hdr, skb->nh.raw, hlen); + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + skb->nh.raw = __skb_push(skb, hlen); + memcpy(skb->nh.raw, tmp_hdr, hlen); + + ipv6_select_ident(skb, fh); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + frag->h.raw = frag->data; + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, tmp_hdr, hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = output(skb); + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (tmp_hdr) + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + + /* + * Keep copying data until we run out. + */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + frag->nh.raw = frag->data; + fh = (struct frag_hdr*)(frag->data + hlen); + frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + memcpy(frag->nh.raw, skb->data, hlen); + + /* + * Build fragment header. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + if (frag_id) { + ipv6_select_ident(skb, fh); + frag_id = fh->identification; + } else + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, frag->h.raw, len)) + BUG(); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES); + + err = output(frag); + if (err) + goto fail; + } + kfree_skb(skb); + IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); + return err; + +fail: + kfree_skb(skb); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; +} + +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +{ + int err = 0; + + *dst = NULL; + if (sk) { + struct ipv6_pinfo *np = inet6_sk(sk); + + *dst = sk_dst_check(sk, np->dst_cookie); + if (*dst) { + struct rt6_info *rt = (struct rt6_info*)*dst; + + /* Yes, checking route validity in not connected + case is not very simple. Take into account, + that we do not support routing by source, TOS, + and MSG_DONTROUTE --ANK (980726) + + 1. If route was host route, check that + cached destination is current. + If it is network route, we still may + check its validity using saved pointer + to the last used address: daddr_cache. + We do not want to save whole address now, + (because main consumer of this service + is tcp, which has not this problem), + so that the last trick works only on connected + sockets. + 2. oif also should be the same. + */ + + if (((rt->rt6i_dst.plen != 128 || + !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != (*dst)->dev->ifindex)) { + dst_release(*dst); + *dst = NULL; + } + } + } + + if (*dst == NULL) + *dst = ip6_route_output(sk, fl); + + if ((err = (*dst)->error)) + goto out_err_release; + + if (ipv6_addr_any(&fl->fl6_src)) { + err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); + + if (err) { +#if IP6_DEBUG >= 2 + printk(KERN_DEBUG "ip6_dst_lookup: " + "no available source address\n"); +#endif + goto out_err_release; + } + } + + return 0; + +out_err_release: + dst_release(*dst); + *dst = NULL; + return err; +} + +int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + unsigned int maxfraglen, fragheaderlen; + int exthdrlen; + int hh_len; + int mtu; + int copy; + int err; + int offset = 0; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking + */ + if (opt) { + if (np->cork.opt == NULL) { + np->cork.opt = kmalloc(opt->tot_len, + sk->sk_allocation); + if (unlikely(np->cork.opt == NULL)) + return -ENOBUFS; + } else if (np->cork.opt->tot_len < opt->tot_len) { + printk(KERN_DEBUG "ip6_append_data: invalid option length\n"); + return -EINVAL; + } + memcpy(np->cork.opt, opt, opt->tot_len); + inet->cork.flags |= IPCORK_OPT; + /* need source address above miyazawa*/ + } + dst_hold(&rt->u.dst); + np->cork.rt = rt; + inet->cork.fl = *fl; + np->cork.hop_limit = hlimit; + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + if (dst_allfrag(rt->u.dst.path)) + inet->cork.flags |= IPCORK_ALLFRAG; + inet->cork.length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0); + length += exthdrlen; + transhdrlen += exthdrlen; + } else { + rt = np->cork.rt; + fl = &inet->cork.fl; + if (inet->cork.flags & IPCORK_OPT) + opt = np->cork.opt; + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + + if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { + if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen); + return -EMSGSIZE; + } + } + + /* + * Let's try using as much space as possible. + * Use MTU if total length of the message fits into the MTU. + * Otherwise, we need to reserve fragment header and + * fragment alignment (= 8-15 octects, in total). + * + * Note that we may need to "move" the data from the tail of + * of the buffer to the new fragment when we split + * the message. + * + * FIXME: It may be fragmented into multiple chunks + * at once if non-fragmentable extension headers + * are too large. + * --yoshfuji + */ + + inet->cork.length += length; + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; + struct sk_buff *skb_prev; +alloc_new_skb: + skb_prev = skb; + + /* There's no room in the current skb */ + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen; + + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + /* + * The last fragment gets additional space at tail. + * Note: we overallocate on fragments with MSG_MODE + * because we have no idea if we're the last one. + */ + if (datalen == length + fraggap) + alloclen += rt->u.dst.trailer_len; + + /* + * We just reserve space for fragment header. + * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloclen += sizeof(struct frag_hdr); + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + /* reserve for fragmentation */ + skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + + /* + * Find where to start putting bytes + */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + skb_trim(skb_prev, maxfraglen); + } + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + kfree_skb(skb); + goto error; + } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if(i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + return 0; +error: + inet->cork.length -= length; + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +int ip6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + struct ipv6_txoptions *opt = np->cork.opt; + struct rt6_info *rt = np->cork.rt; + struct flowi *fl = &inet->cork.fl; + unsigned char proto = fl->proto; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; +#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; +#endif + } + + ipv6_addr_copy(final_dst, &fl->fl6_dst); + __skb_pull(skb, skb->h.raw - skb->nh.raw); + if (opt && opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt && opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); + + skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr)); + + *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000); + + if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + else + hdr->payload_len = 0; + hdr->hop_limit = np->cork.hop_limit; + hdr->nexthdr = proto; + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, final_dst); + + skb->dst = dst_clone(&rt->u.dst); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + dst_release(&np->cork.rt->u.dst); + np->cork.rt = NULL; + inet->cork.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); + return err; +error: + goto out; +} + +void ip6_flush_pending_frames(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + } + + inet->cork.flags &= ~IPCORK_OPT; + + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + dst_release(&np->cork.rt->u.dst); + np->cork.rt = NULL; + inet->cork.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); +} diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c new file mode 100644 index 000000000000..3b1c9fa184ae --- /dev/null +++ b/net/ipv6/ip6_tunnel.c @@ -0,0 +1,1163 @@ +/* + * IPv6 over IPv6 tunnel device + * Linux INET6 implementation + * + * Authors: + * Ville Nuorvala <vnuorval@tcs.hut.fi> + * + * $Id$ + * + * Based on: + * linux/net/ipv6/sit.c + * + * RFC 2473 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/sockios.h> +#include <linux/if.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/if_tunnel.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <linux/route.h> +#include <linux/rtnetlink.h> +#include <linux/netfilter_ipv6.h> + +#include <asm/uaccess.h> +#include <asm/atomic.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/ip6_tunnel.h> +#include <net/xfrm.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> + +MODULE_AUTHOR("Ville Nuorvala"); +MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel"); +MODULE_LICENSE("GPL"); + +#define IPV6_TLV_TEL_DST_SIZE 8 + +#ifdef IP6_TNL_DEBUG +#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__) +#else +#define IP6_TNL_TRACE(x...) do {;} while(0) +#endif + +#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) + +#define HASH_SIZE 32 + +#define HASH(addr) (((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ + (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ + (HASH_SIZE - 1)) + +static int ip6ip6_fb_tnl_dev_init(struct net_device *dev); +static int ip6ip6_tnl_dev_init(struct net_device *dev); +static void ip6ip6_tnl_dev_setup(struct net_device *dev); + +/* the IPv6 tunnel fallback device */ +static struct net_device *ip6ip6_fb_tnl_dev; + + +/* lists for storing tunnels in use */ +static struct ip6_tnl *tnls_r_l[HASH_SIZE]; +static struct ip6_tnl *tnls_wc[1]; +static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l }; + +/* lock for the tunnel lists */ +static DEFINE_RWLOCK(ip6ip6_lock); + +static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +{ + struct dst_entry *dst = t->dst_cache; + + if (dst && dst->obsolete && + dst->ops->check(dst, t->dst_cookie) == NULL) { + t->dst_cache = NULL; + dst_release(dst); + return NULL; + } + + return dst; +} + +static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +{ + dst_release(t->dst_cache); + t->dst_cache = NULL; +} + +static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + dst_release(t->dst_cache); + t->dst_cache = dst; +} + +/** + * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ + +static struct ip6_tnl * +ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip6_tnl *t; + + for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * ip6ip6_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * ip6ip6_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ + +static struct ip6_tnl ** +ip6ip6_bucket(struct ip6_tnl_parm *p) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + unsigned h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote) ^ HASH(local); + } + return &tnls[prio][h]; +} + +/** + * ip6ip6_tnl_link - add tunnel to hash table + * @t: tunnel to be added + **/ + +static void +ip6ip6_tnl_link(struct ip6_tnl *t) +{ + struct ip6_tnl **tp = ip6ip6_bucket(&t->parms); + + t->next = *tp; + write_lock_bh(&ip6ip6_lock); + *tp = t; + write_unlock_bh(&ip6ip6_lock); +} + +/** + * ip6ip6_tnl_unlink - remove tunnel from hash table + * @t: tunnel to be removed + **/ + +static void +ip6ip6_tnl_unlink(struct ip6_tnl *t) +{ + struct ip6_tnl **tp; + + for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ip6ip6_lock); + *tp = t->next; + write_unlock_bh(&ip6ip6_lock); + break; + } + } +} + +/** + * ip6_tnl_create() - create a new tunnel + * @p: tunnel parameters + * @pt: pointer to new tunnel + * + * Description: + * Create tunnel matching given parameters. + * + * Return: + * 0 on success + **/ + +static int +ip6_tnl_create(struct ip6_tnl_parm *p, struct ip6_tnl **pt) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) { + strlcpy(name, p->name, IFNAMSIZ); + } else { + int i; + for (i = 1; i < IP6_TNL_MAX; i++) { + sprintf(name, "ip6tnl%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i == IP6_TNL_MAX) + return -ENOBUFS; + } + dev = alloc_netdev(sizeof (*t), name, ip6ip6_tnl_dev_setup); + if (dev == NULL) + return -ENOMEM; + + t = dev->priv; + dev->init = ip6ip6_tnl_dev_init; + t->parms = *p; + + if ((err = register_netdevice(dev)) < 0) { + free_netdev(dev); + return err; + } + dev_hold(dev); + + ip6ip6_tnl_link(t); + *pt = t; + return 0; +} + +/** + * ip6ip6_tnl_locate - find or create tunnel matching given parameters + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * ip6ip6_tnl_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. + * + * Return: + * 0 if tunnel located or created, + * -EINVAL if parameters incorrect, + * -ENODEV if no matching tunnel available + **/ + +static int +ip6ip6_tnl_locate(struct ip6_tnl_parm *p, struct ip6_tnl **pt, int create) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + struct ip6_tnl *t; + + if (p->proto != IPPROTO_IPV6) + return -EINVAL; + + for (t = *ip6ip6_bucket(p); t; t = t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) { + *pt = t; + return (create ? -EEXIST : 0); + } + } + if (!create) + return -ENODEV; + + return ip6_tnl_create(p, pt); +} + +/** + * ip6ip6_tnl_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * ip6ip6_tnl_dev_uninit() removes tunnel from its list + **/ + +static void +ip6ip6_tnl_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = dev->priv; + + if (dev == ip6ip6_fb_tnl_dev) { + write_lock_bh(&ip6ip6_lock); + tnls_wc[0] = NULL; + write_unlock_bh(&ip6ip6_lock); + } else { + ip6ip6_tnl_unlink(t); + } + ip6_tnl_dst_reset(t); + dev_put(dev); +} + +/** + * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * @skb: received socket buffer + * + * Return: + * 0 if none was found, + * else index to encapsulation limit + **/ + +static __u16 +parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw; + __u8 nexthdr = ipv6h->nexthdr; + __u16 off = sizeof (*ipv6h); + + while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { + __u16 optlen = 0; + struct ipv6_opt_hdr *hdr; + if (raw + off + sizeof (*hdr) > skb->data && + !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + break; + + hdr = (struct ipv6_opt_hdr *) (raw + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; + if (frag_hdr->frag_off) + break; + optlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + optlen = (hdr->hdrlen + 2) << 2; + } else { + optlen = ipv6_optlen(hdr); + } + if (nexthdr == NEXTHDR_DEST) { + __u16 i = off + 2; + while (1) { + struct ipv6_tlv_tnl_enc_lim *tel; + + /* No more room for encapsulation limit */ + if (i + sizeof (*tel) > off + optlen) + break; + + tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + /* return index of option if found and valid */ + if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && + tel->length == 1) + return i; + /* else jump to next option */ + if (tel->type) + i += tel->length + 2; + else + i++; + } + } + nexthdr = hdr->nexthdr; + off += optlen; + } + return 0; +} + +/** + * ip6ip6_err - tunnel error handler + * + * Description: + * ip6ip6_err() should handle errors in the tunnel according + * to the specifications in RFC 2473. + **/ + +static void +ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; + struct ip6_tnl *t; + int rel_msg = 0; + int rel_type = ICMPV6_DEST_UNREACH; + int rel_code = ICMPV6_ADDR_UNREACH; + __u32 rel_info = 0; + __u16 len; + + /* If the packet doesn't contain the original IPv6 header we are + in trouble since we might need the source address for further + processing of the error. */ + + read_lock(&ip6ip6_lock); + if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + goto out; + + switch (type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Path to destination invalid " + "or inactive!\n", t->parms.name); + rel_msg = 1; + break; + case ICMPV6_TIME_EXCEED: + if (code == ICMPV6_EXC_HOPLIMIT) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small hop limit or " + "routing loop in tunnel!\n", + t->parms.name); + rel_msg = 1; + } + break; + case ICMPV6_PARAMPROB: + /* ignore if parameter problem not caused by a tunnel + encapsulation limit sub-option */ + if (code != ICMPV6_HDR_FIELD) { + break; + } + teli = parse_tlv_tnl_enc_lim(skb, skb->data); + + if (teli && teli == ntohl(info) - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small encapsulation " + "limit or routing loop in " + "tunnel!\n", t->parms.name); + rel_msg = 1; + } + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = ntohl(info) - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + + if ((len = sizeof (*ipv6h) + ipv6h->payload_len) > mtu) { + rel_type = ICMPV6_PKT_TOOBIG; + rel_code = 0; + rel_info = mtu; + rel_msg = 1; + } + break; + } + if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) { + struct rt6_info *rt; + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out; + + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, offset); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0); + + if (rt && rt->rt6i_dev) + skb2->dev = rt->rt6i_dev; + + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + + if (rt) + dst_release(&rt->u.dst); + + kfree_skb(skb2); + } +out: + read_unlock(&ip6ip6_lock); +} + +static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph, + struct sk_buff *skb) +{ + struct ipv6hdr *inner_iph = skb->nh.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/** + * ip6ip6_rcv - decapsulate IPv6 packet and retransmit it locally + * @skb: received socket buffer + * + * Return: 0 + **/ + +static int +ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct ipv6hdr *ipv6h; + struct ip6_tnl *t; + + if (!pskb_may_pull(skb, sizeof (*ipv6h))) + goto discard; + + ipv6h = skb->nh.ipv6h; + + read_lock(&ip6ip6_lock); + + if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return 0; + } + + if (!(t->parms.flags & IP6_TNL_F_CAP_RCV)) { + t->stat.rx_dropped++; + read_unlock(&ip6ip6_lock); + goto discard; + } + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + skb->dev = t->dev; + dst_release(skb->dst); + skb->dst = NULL; + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv6_copy_dscp(ipv6h, skb->nh.ipv6h); + ip6ip6_ecn_decapsulate(ipv6h, skb); + t->stat.rx_packets++; + t->stat.rx_bytes += skb->len; + netif_rx(skb); + read_unlock(&ip6ip6_lock); + return 0; + } + read_unlock(&ip6ip6_lock); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); +discard: + return 1; +} + +static inline struct ipv6_txoptions *create_tel(__u8 encap_limit) +{ + struct ipv6_tlv_tnl_enc_lim *tel; + struct ipv6_txoptions *opt; + __u8 *raw; + + int opt_len = sizeof(*opt) + 8; + + if (!(opt = kmalloc(opt_len, GFP_ATOMIC))) { + return NULL; + } + memset(opt, 0, opt_len); + opt->tot_len = opt_len; + opt->dst0opt = (struct ipv6_opt_hdr *) (opt + 1); + opt->opt_nflen = 8; + + tel = (struct ipv6_tlv_tnl_enc_lim *) (opt->dst0opt + 1); + tel->type = IPV6_TLV_TNL_ENCAP_LIMIT; + tel->length = 1; + tel->encap_limit = encap_limit; + + raw = (__u8 *) opt->dst0opt; + raw[5] = IPV6_TLV_PADN; + raw[6] = 1; + + return opt; +} + +/** + * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline int +ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +/** + * ip6ip6_tnl_xmit - encapsulate packet and send + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * + * Description: + * Build new header and do some sanity checks on the packet before sending + * it. + * + * Return: + * 0 + **/ + +static int +ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct net_device_stats *stats = &t->stat; + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + struct ipv6_txoptions *opt = NULL; + int encap_limit = -1; + __u16 offset; + struct flowi fl; + struct dst_entry *dst; + struct net_device *tdev; + int mtu; + int max_headroom = sizeof(struct ipv6hdr); + u8 proto; + int err; + int pkt_len; + int dsfield; + + if (t->recursion++) { + stats->collisions++; + goto tx_err; + } + if (skb->protocol != htons(ETH_P_IPV6) || + !(t->parms.flags & IP6_TNL_F_CAP_XMIT) || + ip6ip6_tnl_addr_conflict(t, ipv6h)) { + goto tx_err; + } + if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2, skb->dev); + goto tx_err; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; + } + memcpy(&fl, &t->fl, sizeof (fl)); + proto = fl.proto; + + dsfield = ipv6_get_dsfield(ipv6h); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_TCLASS_MASK); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_FLOWLABEL_MASK); + + if (encap_limit >= 0 && (opt = create_tel(encap_limit)) == NULL) + goto tx_err; + + if ((dst = ip6_tnl_dst_check(t)) != NULL) + dst_hold(dst); + else + dst = ip6_route_output(NULL, &fl); + + if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + mtu = dst_mtu(dst) - sizeof (*ipv6h); + if (opt) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb->dst && mtu < dst_mtu(skb->dst)) { + struct rt6_info *rt = (struct rt6_info *) skb->dst; + rt->rt6i_flags |= RTF_MODIFIED; + rt->u.dst.metrics[RTAX_MTU-1] = mtu; + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + goto tx_err_dst_release; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom += LL_RESERVED_SPACE(tdev); + + if (skb_headroom(skb) < max_headroom || + skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb; + + if (!(new_skb = skb_realloc_headroom(skb, max_headroom))) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + kfree_skb(skb); + skb = new_skb; + } + dst_release(skb->dst); + skb->dst = dst_clone(dst); + + skb->h.raw = skb->nh.raw; + + if (opt) + ipv6_push_nfrag_opts(skb, opt, &proto, NULL); + + skb->nh.raw = skb_push(skb, sizeof(struct ipv6hdr)); + ipv6h = skb->nh.ipv6h; + *(u32*)ipv6h = fl.fl6_flowlabel | htonl(0x60000000); + dsfield = INET_ECN_encapsulate(0, dsfield); + ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6_addr_copy(&ipv6h->saddr, &fl.fl6_src); + ipv6_addr_copy(&ipv6h->daddr, &fl.fl6_dst); + nf_reset(skb); + pkt_len = skb->len; + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, + skb->dst->dev, dst_output); + + if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { + stats->tx_bytes += pkt_len; + stats->tx_packets++; + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + ip6_tnl_dst_store(t, dst); + + if (opt) + kfree(opt); + + t->recursion--; + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + if (opt) + kfree(opt); +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + t->recursion--; + return 0; +} + +static void ip6_tnl_set_cap(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + struct in6_addr *laddr = &p->laddr; + struct in6_addr *raddr = &p->raddr; + int ltype = ipv6_addr_type(laddr); + int rtype = ipv6_addr_type(raddr); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); + + if (ltype != IPV6_ADDR_ANY && rtype != IPV6_ADDR_ANY && + ((ltype|rtype) & + (IPV6_ADDR_UNICAST| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL| + IPV6_ADDR_MAPPED|IPV6_ADDR_RESERVED)) == IPV6_ADDR_UNICAST) { + struct net_device *ldev = NULL; + int l_ok = 1; + int r_ok = 1; + + if (p->link) + ldev = dev_get_by_index(p->link); + + if (ltype&IPV6_ADDR_UNICAST && !ipv6_chk_addr(laddr, ldev, 0)) + l_ok = 0; + + if (rtype&IPV6_ADDR_UNICAST && ipv6_chk_addr(raddr, NULL, 0)) + r_ok = 0; + + if (l_ok && r_ok) { + if (ltype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_RCV; + } + if (ldev) + dev_put(ldev); + } +} + +static void ip6ip6_tnl_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct ip6_tnl_parm *p = &t->parms; + struct flowi *fl = &t->fl; + + memcpy(&dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(&dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + /* Set up flowi template */ + ipv6_addr_copy(&fl->fl6_src, &p->laddr); + ipv6_addr_copy(&fl->fl6_dst, &p->raddr); + fl->oif = p->link; + fl->fl6_flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + ip6_tnl_set_cap(t); + + if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) + dev->flags |= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr, + p->link, 0); + + if (rt == NULL) + return; + + if (rt->rt6i_dev) { + dev->hard_header_len = rt->rt6i_dev->hard_header_len + + sizeof (struct ipv6hdr); + + dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dst_release(&rt->u.dst); + } +} + +/** + * ip6ip6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * @active: != 0 if tunnel is ready for use + * + * Description: + * ip6ip6_tnl_change() updates the tunnel parameters + **/ + +static int +ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +{ + ipv6_addr_copy(&t->parms.laddr, &p->laddr); + ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6tnl0", created during module + * initialization, can be used for creating other tunnel devices. + * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ + +static int +ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + int create; + struct ip6_tnl_parm p; + struct ip6_tnl *t = NULL; + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, + ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + if ((err = ip6ip6_tnl_locate(&p, &t, 0)) == -ENODEV) + t = (struct ip6_tnl *) dev->priv; + else if (err) + break; + } else + t = (struct ip6_tnl *) dev->priv; + + memcpy(&p, &t->parms, sizeof (p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + err = -EFAULT; + } + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + create = (cmd == SIOCADDTUNNEL); + if (!capable(CAP_NET_ADMIN)) + break; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + err = -EFAULT; + break; + } + if (!create && dev != ip6ip6_fb_tnl_dev) { + t = (struct ip6_tnl *) dev->priv; + } + if (!t && (err = ip6ip6_tnl_locate(&p, &t, create))) { + break; + } + if (cmd == SIOCCHGTUNNEL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + ip6ip6_tnl_unlink(t); + err = ip6ip6_tnl_change(t, &p); + ip6ip6_tnl_link(t); + netdev_state_change(dev); + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, + &t->parms, sizeof (p))) { + err = -EFAULT; + } else { + err = 0; + } + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + + if (dev == ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + err = ip6ip6_tnl_locate(&p, &t, 0); + if (err) + break; + if (t == ip6ip6_fb_tnl_dev->priv) { + err = -EPERM; + break; + } + } else { + t = (struct ip6_tnl *) dev->priv; + } + err = unregister_netdevice(t->dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * ip6ip6_tnl_get_stats - return the stats for tunnel device + * @dev: virtual device associated with tunnel + * + * Return: stats for device + **/ + +static struct net_device_stats * +ip6ip6_tnl_get_stats(struct net_device *dev) +{ + return &(((struct ip6_tnl *) dev->priv)->stat); +} + +/** + * ip6ip6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ + +static int +ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) { + return -EINVAL; + } + dev->mtu = new_mtu; + return 0; +} + +/** + * ip6ip6_tnl_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ + +static void ip6ip6_tnl_dev_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ip6ip6_tnl_dev_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ip6ip6_tnl_xmit; + dev->get_stats = ip6ip6_tnl_get_stats; + dev->do_ioctl = ip6ip6_tnl_ioctl; + dev->change_mtu = ip6ip6_tnl_change_mtu; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + dev->flags |= IFF_NOARP; + dev->addr_len = sizeof(struct in6_addr); +} + + +/** + * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static inline void +ip6ip6_tnl_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + t->fl.proto = IPPROTO_IPV6; + t->dev = dev; + strcpy(t->parms.name, dev->name); +} + +/** + * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static int +ip6ip6_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + ip6ip6_tnl_dev_init_gen(dev); + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ + +static int +ip6ip6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = dev->priv; + ip6ip6_tnl_dev_init_gen(dev); + dev_hold(dev); + tnls_wc[0] = t; + return 0; +} + +static struct xfrm6_tunnel ip6ip6_handler = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, +}; + +/** + * ip6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ + +static int __init ip6_tunnel_init(void) +{ + int err; + + if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) { + printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); + return -EAGAIN; + } + ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6ip6_tnl_dev_setup); + + if (!ip6ip6_fb_tnl_dev) { + err = -ENOMEM; + goto fail; + } + ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init; + + if ((err = register_netdev(ip6ip6_fb_tnl_dev))) { + free_netdev(ip6ip6_fb_tnl_dev); + goto fail; + } + return 0; +fail: + xfrm6_tunnel_deregister(&ip6ip6_handler); + return err; +} + +/** + * ip6_tunnel_cleanup - free resources and unregister protocol + **/ + +static void __exit ip6_tunnel_cleanup(void) +{ + if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0) + printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); + + unregister_netdev(ip6ip6_fb_tnl_dev); +} + +module_init(ip6_tunnel_init); +module_exit(ip6_tunnel_cleanup); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c new file mode 100644 index 000000000000..6cde5310cd76 --- /dev/null +++ b/net/ipv6/ipcomp6.c @@ -0,0 +1,524 @@ +/* + * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Author Mitsuru KANDA <mk@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * [Memo] + * + * Outbound: + * The compression of IP datagram MUST be done before AH/ESP processing, + * fragmentation, and the addition of Hop-by-Hop/Routing header. + * + * Inbound: + * The decompression of IP datagram MUST be done after the reassembly, + * AH/ESP processing. + */ +#include <linux/config.h> +#include <linux/module.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/ipcomp.h> +#include <asm/scatterlist.h> +#include <asm/semaphore.h> +#include <linux/crypto.h> +#include <linux/pfkeyv2.h> +#include <linux/random.h> +#include <linux/percpu.h> +#include <linux/smp.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/rtnetlink.h> +#include <net/icmp.h> +#include <net/ipv6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> + +struct ipcomp6_tfms { + struct list_head list; + struct crypto_tfm **tfms; + int users; +}; + +static DECLARE_MUTEX(ipcomp6_resource_sem); +static void **ipcomp6_scratches; +static int ipcomp6_scratch_users; +static LIST_HEAD(ipcomp6_tfms_list); + +static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int err = 0; + u8 nexthdr = 0; + int hdr_len = skb->h.raw - skb->nh.raw; + unsigned char *tmp_hdr = NULL; + struct ipv6hdr *iph; + int plen, dlen; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.ipv6h; + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) + goto out; + memcpy(tmp_hdr, iph, hdr_len); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ipv6_comp_hdr)); + skb->nh.raw += sizeof(struct ipv6_comp_hdr); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + iph = skb->nh.ipv6h; + iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); + skb->h.raw = skb->data; + + /* decompression */ + plen = skb->len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); + if (err) { + err = -EINVAL; + goto out_put_cpu; + } + + if (dlen < (plen + sizeof(struct ipv6_comp_hdr))) { + err = -EINVAL; + goto out_put_cpu; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) { + goto out_put_cpu; + } + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + + iph = skb->nh.ipv6h; + iph->payload_len = htons(skb->len); + +out_put_cpu: + put_cpu(); +out: + if (tmp_hdr) + kfree(tmp_hdr); + if (err) + goto error_out; + return nexthdr; +error_out: + return err; +} + +static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ipv6hdr *top_iph; + int hdr_len; + struct ipv6_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + int plen, dlen; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + hdr_len = skb->h.raw - skb->data; + + /* check whether datagram len is larger than threshold */ + if ((skb->len - hdr_len) < ipcd->threshold) { + goto out_ok; + } + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + goto out_ok; + } + + /* compression */ + plen = skb->len - hdr_len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->h.raw; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); + if (err || (dlen + sizeof(struct ipv6_comp_hdr)) >= plen) { + put_cpu(); + goto out_ok; + } + memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); + put_cpu(); + pskb_trim(skb, hdr_len + dlen + sizeof(struct ip_comp_hdr)); + + /* insert ipcomp header and replace datagram */ + top_iph = (struct ipv6hdr *)skb->data; + + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + ipch = (struct ipv6_comp_hdr *)start; + ipch->nexthdr = *skb->nh.raw; + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + *skb->nh.raw = IPPROTO_COMP; + +out_ok: + return 0; +} + +static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + u32 spi; + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ipv6_comp_hdr *ipcomph = (struct ipv6_comp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + return; + + spi = ntohl(ntohs(ipcomph->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + spi, NIP6(iph->daddr)); + xfrm_state_put(x); +} + +static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) +{ + struct xfrm_state *t = NULL; + + t = xfrm_state_alloc(); + if (!t) + goto out; + + t->id.proto = IPPROTO_IPV6; + t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr); + memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET6; + t->props.mode = 1; + memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + + t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family); + if (t->type == NULL) + goto error; + + if (t->type->init_state(t, NULL)) + goto error; + + t->km.state = XFRM_STATE_VALID; + atomic_set(&t->tunnel_users, 1); + +out: + return t; + +error: + xfrm_state_put(t); + goto out; +} + +static int ipcomp6_tunnel_attach(struct xfrm_state *x) +{ + int err = 0; + struct xfrm_state *t = NULL; + u32 spi; + + spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr); + if (spi) + t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr, + spi, IPPROTO_IPV6, AF_INET6); + if (!t) { + t = ipcomp6_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); + +out: + return err; +} + +static void ipcomp6_free_scratches(void) +{ + int i; + void **scratches; + + if (--ipcomp6_scratch_users) + return; + + scratches = ipcomp6_scratches; + if (!scratches) + return; + + for_each_cpu(i) { + void *scratch = *per_cpu_ptr(scratches, i); + if (scratch) + vfree(scratch); + } + + free_percpu(scratches); +} + +static void **ipcomp6_alloc_scratches(void) +{ + int i; + void **scratches; + + if (ipcomp6_scratch_users++) + return ipcomp6_scratches; + + scratches = alloc_percpu(void *); + if (!scratches) + return NULL; + + ipcomp6_scratches = scratches; + + for_each_cpu(i) { + void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); + if (!scratch) + return NULL; + *per_cpu_ptr(scratches, i) = scratch; + } + + return scratches; +} + +static void ipcomp6_free_tfms(struct crypto_tfm **tfms) +{ + struct ipcomp6_tfms *pos; + int cpu; + + list_for_each_entry(pos, &ipcomp6_tfms_list, list) { + if (pos->tfms == tfms) + break; + } + + BUG_TRAP(pos); + + if (--pos->users) + return; + + list_del(&pos->list); + kfree(pos); + + if (!tfms) + return; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); + if (tfm) + crypto_free_tfm(tfm); + } + free_percpu(tfms); +} + +static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name) +{ + struct ipcomp6_tfms *pos; + struct crypto_tfm **tfms; + int cpu; + + /* This can be any valid CPU ID so we don't need locking. */ + cpu = smp_processor_id(); + + list_for_each_entry(pos, &ipcomp6_tfms_list, list) { + struct crypto_tfm *tfm; + + tfms = pos->tfms; + tfm = *per_cpu_ptr(tfms, cpu); + + if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) { + pos->users++; + return tfms; + } + } + + pos = kmalloc(sizeof(*pos), GFP_KERNEL); + if (!pos) + return NULL; + + pos->users = 1; + INIT_LIST_HEAD(&pos->list); + list_add(&pos->list, &ipcomp6_tfms_list); + + pos->tfms = tfms = alloc_percpu(struct crypto_tfm *); + if (!tfms) + goto error; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0); + if (!tfm) + goto error; + *per_cpu_ptr(tfms, cpu) = tfm; + } + + return tfms; + +error: + ipcomp6_free_tfms(tfms); + return NULL; +} + +static void ipcomp6_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfms) + ipcomp6_free_tfms(ipcd->tfms); + ipcomp6_free_scratches(); +} + +static void ipcomp6_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + xfrm_state_delete_tunnel(x); + down(&ipcomp6_resource_sem); + ipcomp6_free_data(ipcd); + up(&ipcomp6_resource_sem); + kfree(ipcd); + + xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); +} + +static int ipcomp6_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + if (x->encap) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = 0; + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + + down(&ipcomp6_resource_sem); + if (!ipcomp6_alloc_scratches()) + goto error; + + ipcd->tfms = ipcomp6_alloc_tfms(x->calg->alg_name); + if (!ipcd->tfms) + goto error; + up(&ipcomp6_resource_sem); + + if (x->props.mode) { + err = ipcomp6_tunnel_attach(x); + if (err) + goto error_tunnel; + } + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; +error_tunnel: + down(&ipcomp6_resource_sem); +error: + ipcomp6_free_data(ipcd); + up(&ipcomp6_resource_sem); + kfree(ipcd); + + goto out; +} + +static struct xfrm_type ipcomp6_type = +{ + .description = "IPCOMP6", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp6_init_state, + .destructor = ipcomp6_destroy, + .input = ipcomp6_input, + .output = ipcomp6_output, +}; + +static struct inet6_protocol ipcomp6_protocol = +{ + .handler = xfrm6_rcv, + .err_handler = ipcomp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ipcomp6_init(void) +{ + if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp6_fini(void) +{ + if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp6_init); +module_exit(ipcomp6_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); +MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>"); + + diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c new file mode 100644 index 000000000000..279ab86be662 --- /dev/null +++ b/net/ipv6/ipv6_sockglue.c @@ -0,0 +1,704 @@ +/* + * IPv6 BSD socket options interface + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/net/ipv4/ip_sockglue.c + * + * $Id: ipv6_sockglue.c,v 1.41 2002/02/01 22:01:04 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * FIXME: Make the setsockopt code POSIX compliant: That is + * + * o Return -EINVAL for setsockopt of short lengths + * o Truncate getsockopt returns + * o Return an optlen of the truncated length if need be + * + * Changes: + * David L Stevens <dlstevens@us.ibm.com>: + * - added multicast source filtering API for MLDv2 + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/netfilter.h> + +#include <net/sock.h> +#include <net/snmp.h> +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/inet_common.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/xfrm.h> + +#include <asm/uaccess.h> + +DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); + +static struct packet_type ipv6_packet_type = { + .type = __constant_htons(ETH_P_IPV6), + .func = ipv6_rcv, +}; + +struct ip6_ra_chain *ip6_ra_chain; +DEFINE_RWLOCK(ip6_ra_lock); + +int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) +{ + struct ip6_ra_chain *ra, *new_ra, **rap; + + /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW) + return -EINVAL; + + new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip6_ra_lock); + for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (sel>=0) { + write_unlock_bh(&ip6_ra_lock); + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + + *rap = ra->next; + write_unlock_bh(&ip6_ra_lock); + + if (ra->destructor) + ra->destructor(sk); + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip6_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->sel = sel; + new_ra->destructor = destructor; + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip6_ra_lock); + return 0; +} + +extern int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr); +extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen); + + +int ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int val, valbool; + int retv = -ENOPROTOOPT; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + + if(level!=SOL_IPV6) + goto out; + + if (optval == NULL) + val=0; + else if (get_user(val, (int __user *) optval)) + return -EFAULT; + + valbool = (val!=0); + + lock_sock(sk); + + switch (optname) { + + case IPV6_ADDRFORM: + if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) + break; + + if (sk->sk_state != TCP_ESTABLISHED) { + retv = -ENOTCONN; + break; + } + + if (ipv6_only_sock(sk) || + !(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) { + retv = -EADDRNOTAVAIL; + break; + } + + fl6_free_socklist(sk); + ipv6_sock_mc_close(sk); + + if (sk->sk_protocol == IPPROTO_TCP) { + struct tcp_sock *tp = tcp_sk(sk); + + local_bh_disable(); + sock_prot_dec_use(sk->sk_prot); + sock_prot_inc_use(&tcp_prot); + local_bh_enable(); + sk->sk_prot = &tcp_prot; + tp->af_specific = &ipv4_specific; + sk->sk_socket->ops = &inet_stream_ops; + sk->sk_family = PF_INET; + tcp_sync_mss(sk, tp->pmtu_cookie); + } else { + local_bh_disable(); + sock_prot_dec_use(sk->sk_prot); + sock_prot_inc_use(&udp_prot); + local_bh_enable(); + sk->sk_prot = &udp_prot; + sk->sk_socket->ops = &inet_dgram_ops; + sk->sk_family = PF_INET; + } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + if (pktopt) + kfree_skb(pktopt); + + sk->sk_destruct = inet_sock_destruct; +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet6_sock_nr); +#endif + module_put(THIS_MODULE); + retv = 0; + break; + } + goto e_inval; + + case IPV6_V6ONLY: + if (inet_sk(sk)->num) + goto e_inval; + np->ipv6only = valbool; + retv = 0; + break; + + case IPV6_PKTINFO: + np->rxopt.bits.rxinfo = valbool; + retv = 0; + break; + + case IPV6_HOPLIMIT: + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_RTHDR: + if (val < 0 || val > 2) + goto e_inval; + np->rxopt.bits.srcrt = val; + retv = 0; + break; + + case IPV6_HOPOPTS: + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_DSTOPTS: + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_FLOWINFO: + np->rxopt.bits.rxflow = valbool; + retv = 0; + break; + + case IPV6_PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + struct flowi fl; + int junk; + + fl.fl6_flowlabel = 0; + fl.oif = sk->sk_bound_dev_if; + + if (optlen == 0) + goto update; + + /* 1K is probably excessive + * 1K is surely not enough, 2K per standard header is 16K. + */ + retv = -EINVAL; + if (optlen > 64*1024) + break; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(&msg, &fl, opt, &junk); + if (retv) + goto done; +update: + retv = 0; + if (sk->sk_type == SOCK_STREAM) { + if (opt) { + struct tcp_sock *tp = tcp_sk(sk); + if (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) + && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { + tp->ext_header_len = opt->opt_flen + opt->opt_nflen; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + opt = xchg(&np->opt, opt); + sk_dst_reset(sk); + } else { + write_lock(&sk->sk_dst_lock); + opt = xchg(&np->opt, opt); + write_unlock(&sk->sk_dst_lock); + sk_dst_reset(sk); + } + +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + case IPV6_UNICAST_HOPS: + if (val > 255 || val < -1) + goto e_inval; + np->hop_limit = val; + retv = 0; + break; + + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->mcast_hops = val; + retv = 0; + break; + + case IPV6_MULTICAST_LOOP: + np->mc_loop = valbool; + retv = 0; + break; + + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) + goto e_inval; + + if (__dev_get_by_index(val) == NULL) { + retv = -ENODEV; + break; + } + np->mcast_oif = val; + retv = 0; + break; + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + { + struct ipv6_mreq mreq; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_ADD_MEMBERSHIP) + retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + else + retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + break; + } + case IPV6_JOIN_ANYCAST: + case IPV6_LEAVE_ANYCAST: + { + struct ipv6_mreq mreq; + + if (optlen != sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_JOIN_ANYCAST) + retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + else + retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in6 *psin6; + + retv = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(struct group_req))) + break; + if (greq.gr_group.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + psin6 = (struct sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + retv = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + retv = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + retv = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, + &psin6->sin6_addr); + if (retv) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + retv = ip6_mc_source(add, omode, sk, &greqs); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_mld_max_msf; + struct group_filter *gsf; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + retv = -ENOBUFS; + break; + } + gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + retv = -ENOBUFS; + break; + } + retv = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + kfree(gsf); + break; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) { + kfree(gsf); + retv = -ENOBUFS; + break; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + kfree(gsf); + retv = -EINVAL; + break; + } + retv = ip6_mc_msfilter(sk, gsf); + kfree(gsf); + + break; + } + case IPV6_ROUTER_ALERT: + retv = ip6_ra_control(sk, val, NULL); + break; + case IPV6_MTU_DISCOVER: + if (val<0 || val>2) + goto e_inval; + np->pmtudisc = val; + retv = 0; + break; + case IPV6_MTU: + if (val && val < IPV6_MIN_MTU) + goto e_inval; + np->frag_size = val; + retv = 0; + break; + case IPV6_RECVERR: + np->recverr = valbool; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + retv = 0; + break; + case IPV6_FLOWINFO_SEND: + np->sndflow = valbool; + retv = 0; + break; + case IPV6_FLOWLABEL_MGR: + retv = ipv6_flowlabel_opt(sk, optval, optlen); + break; + case IPV6_IPSEC_POLICY: + case IPV6_XFRM_POLICY: + retv = xfrm_user_policy(sk, optname, optval, optlen); + break; + +#ifdef CONFIG_NETFILTER + default: + retv = nf_setsockopt(sk, PF_INET6, optname, optval, + optlen); + break; +#endif + + } + release_sock(sk); + +out: + return retv; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +int ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int len; + int val; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + if(level!=SOL_IPV6) + return -ENOPROTOOPT; + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_ADDRFORM: + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + val = sk->sk_family; + break; + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + return -EFAULT; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + + case IPV6_PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + + lock_sock(sk); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + release_sock(sk); + + if (skb) { + int err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + } else { + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif; + ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr); + put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxhlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IPV6_MTU: + { + struct dst_entry *dst; + val = 0; + lock_sock(sk); + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); + } + release_sock(sk); + if (!val) + return -ENOTCONN; + break; + } + + case IPV6_V6ONLY: + val = np->ipv6only; + break; + + case IPV6_PKTINFO: + val = np->rxopt.bits.rxinfo; + break; + + case IPV6_HOPLIMIT: + val = np->rxopt.bits.rxhlim; + break; + + case IPV6_RTHDR: + val = np->rxopt.bits.srcrt; + break; + + case IPV6_HOPOPTS: + val = np->rxopt.bits.hopopts; + break; + + case IPV6_DSTOPTS: + val = np->rxopt.bits.dstopts; + break; + + case IPV6_FLOWINFO: + val = np->rxopt.bits.rxflow; + break; + + case IPV6_UNICAST_HOPS: + val = np->hop_limit; + break; + + case IPV6_MULTICAST_HOPS: + val = np->mcast_hops; + break; + + case IPV6_MULTICAST_LOOP: + val = np->mc_loop; + break; + + case IPV6_MULTICAST_IF: + val = np->mcast_oif; + break; + + case IPV6_MTU_DISCOVER: + val = np->pmtudisc; + break; + + case IPV6_RECVERR: + val = np->recverr; + break; + + case IPV6_FLOWINFO_SEND: + val = np->sndflow; + break; + + default: +#ifdef CONFIG_NETFILTER + lock_sock(sk); + val = nf_getsockopt(sk, PF_INET6, optname, optval, + &len); + release_sock(sk); + if (val >= 0) + val = put_user(len, optlen); + return val; +#else + return -EINVAL; +#endif + } + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +void __init ipv6_packet_init(void) +{ + dev_add_pack(&ipv6_packet_type); +} + +void ipv6_packet_cleanup(void) +{ + dev_remove_pack(&ipv6_packet_type); +} diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c new file mode 100644 index 000000000000..2f4c91ddc9a3 --- /dev/null +++ b/net/ipv6/ipv6_syms.c @@ -0,0 +1,41 @@ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/protocol.h> +#include <net/ipv6.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/xfrm.h> + +EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(icmpv6_send); +EXPORT_SYMBOL(icmpv6_statistics); +EXPORT_SYMBOL(icmpv6_err_convert); +EXPORT_SYMBOL(ndisc_mc_map); +EXPORT_SYMBOL(register_inet6addr_notifier); +EXPORT_SYMBOL(unregister_inet6addr_notifier); +EXPORT_SYMBOL(ip6_route_output); +#ifdef CONFIG_NETFILTER +EXPORT_SYMBOL(ip6_route_me_harder); +#endif +EXPORT_SYMBOL(addrconf_lock); +EXPORT_SYMBOL(ipv6_setsockopt); +EXPORT_SYMBOL(ipv6_getsockopt); +EXPORT_SYMBOL(inet6_register_protosw); +EXPORT_SYMBOL(inet6_unregister_protosw); +EXPORT_SYMBOL(inet6_add_protocol); +EXPORT_SYMBOL(inet6_del_protocol); +EXPORT_SYMBOL(ip6_xmit); +EXPORT_SYMBOL(inet6_release); +EXPORT_SYMBOL(inet6_bind); +EXPORT_SYMBOL(inet6_getname); +EXPORT_SYMBOL(inet6_ioctl); +EXPORT_SYMBOL(ipv6_get_saddr); +EXPORT_SYMBOL(ipv6_chk_addr); +EXPORT_SYMBOL(in6_dev_finish_destroy); +#ifdef CONFIG_XFRM +EXPORT_SYMBOL(xfrm6_rcv); +#endif +EXPORT_SYMBOL(rt6_lookup); +EXPORT_SYMBOL(fl6_sock_lookup); +EXPORT_SYMBOL(ipv6_push_nfrag_opts); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c new file mode 100644 index 000000000000..393b6e6f50a9 --- /dev/null +++ b/net/ipv6/mcast.c @@ -0,0 +1,2499 @@ +/* + * Multicast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: mcast.c,v 1.40 2002/02/08 03:57:19 davem Exp $ + * + * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * yoshfuji : fix format of router-alert option + * YOSHIFUJI Hideaki @USAGI: + * Fixed source address for MLD message based on + * <draft-ietf-magma-mld-source-05.txt>. + * YOSHIFUJI Hideaki @USAGI: + * - Ignore Queries for invalid addresses. + * - MLD for link-local addresses. + * David L Stevens <dlstevens@us.ibm.com>: + * - MLDv2 support + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/times.h> +#include <linux/net.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/if_inet6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> + +#include <net/ip6_checksum.h> + +/* Set to 3 to get tracing... */ +#define MCAST_DEBUG 2 + +#if MCAST_DEBUG >= 3 +#define MDBG(x) printk x +#else +#define MDBG(x) +#endif + +/* + * These header formats should be in a separate include file, but icmpv6.h + * doesn't have in6_addr defined in all cases, there is no __u128, and no + * other files reference these. + * + * +-DLS 4/14/03 + */ + +/* Multicast Listener Discovery version 2 headers */ + +struct mld2_grec { + __u8 grec_type; + __u8 grec_auxwords; + __u16 grec_nsrcs; + struct in6_addr grec_mca; + struct in6_addr grec_src[0]; +}; + +struct mld2_report { + __u8 type; + __u8 resv1; + __u16 csum; + __u16 resv2; + __u16 ngrec; + struct mld2_grec grec[0]; +}; + +struct mld2_query { + __u8 type; + __u8 code; + __u16 csum; + __u16 mrc; + __u16 resv1; + struct in6_addr mca; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 qrv:3, + suppress:1, + resv2:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 resv2:4, + suppress:1, + qrv:3; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __u8 qqic; + __u16 nsrcs; + struct in6_addr srcs[0]; +}; + +static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; + +/* Big mc list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_mc_lock); + +static struct socket *igmp6_socket; + +int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr); + +static void igmp6_join_group(struct ifmcaddr6 *ma); +static void igmp6_leave_group(struct ifmcaddr6 *ma); +static void igmp6_timer_handler(unsigned long data); + +static void mld_gq_timer_expire(unsigned long data); +static void mld_ifc_timer_expire(unsigned long data); +static void mld_ifc_event(struct inet6_dev *idev); +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); +static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *addr); +static void mld_clear_delrec(struct inet6_dev *idev); +static int sf_setstate(struct ifmcaddr6 *pmc); +static void sf_markstate(struct ifmcaddr6 *pmc); +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); +static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta); +static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta); +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev); + + +#define IGMP6_UNSOLICITED_IVAL (10*HZ) +#define MLD_QRV_DEFAULT 2 + +#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ + (idev)->cnf.force_mld_version == 1 || \ + ((idev)->mc_v1_seen && \ + time_before(jiffies, (idev)->mc_v1_seen))) + +#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) +#define MLDV2_EXP(thresh, nbmant, nbexp, value) \ + ((value) < (thresh) ? (value) : \ + ((MLDV2_MASK(value, nbmant) | (1<<(nbmant+nbexp))) << \ + (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) + +#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) +#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) + +#define IPV6_MLD_MAX_MSF 10 + +int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF; + +/* + * socket join on multicast group + */ + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct net_device *dev = NULL; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_pinfo *np = inet6_sk(sk); + int err; + + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + + if (mc_lst == NULL) + return -ENOMEM; + + mc_lst->next = NULL; + ipv6_addr_copy(&mc_lst->addr, addr); + + if (ifindex == 0) { + struct rt6_info *rt; + rt = rt6_lookup(addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } + } else + dev = dev_get_by_index(ifindex); + + if (dev == NULL) { + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return -ENODEV; + } + + mc_lst->ifindex = dev->ifindex; + mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sflist = NULL; + + /* + * now add/increase the group membership on the device + */ + + err = ipv6_dev_mc_inc(dev, addr); + + if (err) { + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + dev_put(dev); + return err; + } + + write_lock_bh(&ipv6_sk_mc_lock); + mc_lst->next = np->ipv6_mc_list; + np->ipv6_mc_list = mc_lst; + write_unlock_bh(&ipv6_sk_mc_lock); + + dev_put(dev); + + return 0; +} + +/* + * socket leave on multicast group + */ +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst, **lnk; + + write_lock_bh(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + struct net_device *dev; + + *lnk = mc_lst->next; + write_unlock_bh(&ipv6_sk_mc_lock); + + if ((dev = dev_get_by_index(mc_lst->ifindex)) != NULL) { + struct inet6_dev *idev = in6_dev_get(dev); + + if (idev) { + (void) ip6_mc_leave_src(sk,mc_lst,idev); + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + dev_put(dev); + } + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return 0; + } + } + write_unlock_bh(&ipv6_sk_mc_lock); + + return -ENOENT; +} + +static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(group, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } + } else + dev = dev_get_by_index(ifindex); + + if (!dev) + return NULL; + idev = in6_dev_get(dev); + if (!idev) { + dev_put(dev); + return NULL; + } + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return NULL; + } + return idev; +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + + write_lock_bh(&ipv6_sk_mc_lock); + while ((mc_lst = np->ipv6_mc_list) != NULL) { + struct net_device *dev; + + np->ipv6_mc_list = mc_lst->next; + write_unlock_bh(&ipv6_sk_mc_lock); + + dev = dev_get_by_index(mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = in6_dev_get(dev); + + if (idev) { + (void) ip6_mc_leave_src(sk, mc_lst, idev); + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + dev_put(dev); + } + + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + + write_lock_bh(&ipv6_sk_mc_lock); + } + write_unlock_bh(&ipv6_sk_mc_lock); +} + +int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr) +{ + struct in6_addr *source, *group; + struct ipv6_mc_socklist *pmc; + struct net_device *dev; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + int i, j, rv; + int err; + + if (pgsr->gsr_group.ss_family != AF_INET6 || + pgsr->gsr_source.ss_family != AF_INET6) + return -EINVAL; + + source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; + group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + idev = ip6_mc_find_dev(group, pgsr->gsr_interface); + if (!idev) + return -ENODEV; + dev = idev->dev; + + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) + goto done; + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip6_mc_add_src(idev, group, omode, 0, NULL, 0); + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sfmode = omode; + } + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; + rv = !0; + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, + sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; + + /* update the interface filter */ + ip6_mc_del_src(idev, group, omode, 1, source, 1); + + for (j=i+1; j<psl->sl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_mld_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip6_sf_socklist *newpsl; + int count = IP6_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, + IP6_SFLSIZE(count), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP6_SFBLOCK; + if (psl) { + for (i=0; i<psl->sl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = *source; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip6_mc_add_src(idev, group, omode, 1, source, 1); +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +{ + struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct net_device *dev; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *newpsl, *psl; + int i, err; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + if (gsf->gf_fmode != MCAST_INCLUDE && + gsf->gf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + idev = ip6_mc_find_dev(group, gsf->gf_interface); + + if (!idev) + return -ENODEV; + dev = idev->dev; + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + if (gsf->gf_numsrc) { + newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, + IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; + for (i=0; i<newpsl->sl_count; ++i) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + newpsl->sl_addr[i] = psin6->sin6_addr; + } + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, + newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else + newpsl = NULL; + psl = pmc->sflist; + if (psl) { + (void) ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } else + (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = gsf->gf_fmode; +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct net_device *dev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + idev = ip6_mc_find_dev(group, gsf->gf_interface); + + if (!idev) + return -ENODEV; + + dev = idev->dev; + + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(group, &pmc->addr)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + count = psl ? psl->sl_count : 0; + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + for (i=0; i<copycount; i++) { + struct sockaddr_in6 *psin6; + struct sockaddr_storage ss; + + psin6 = (struct sockaddr_in6 *)&ss; + memset(&ss, 0, sizeof(ss)); + psin6->sin6_family = AF_INET6; + psin6->sin6_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, + struct in6_addr *src_addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc; + struct ip6_sf_socklist *psl; + int rv = 1; + + read_lock(&ipv6_sk_mc_lock); + for (mc = np->ipv6_mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->addr, mc_addr)) + break; + } + if (!mc) { + read_unlock(&ipv6_sk_mc_lock); + return 1; + } + psl = mc->sflist; + if (!psl) { + rv = mc->sfmode == MCAST_EXCLUDE; + } else { + int i; + + for (i=0; i<psl->sl_count; i++) { + if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) + break; + } + if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + rv = 0; + if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + rv = 0; + } + read_unlock(&ipv6_sk_mc_lock); + + return rv; +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static void igmp6_group_added(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (!(mc->mca_flags&MAF_LOADED)) { + mc->mca_flags |= MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_add(dev, buf, dev->addr_len, 0); + } + spin_unlock_bh(&mc->mca_lock); + + if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) + return; + + if (MLD_V1_SEEN(mc->idev)) { + igmp6_join_group(mc); + return; + } + /* else v2 */ + + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); +} + +static void igmp6_group_dropped(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (mc->mca_flags&MAF_LOADED) { + mc->mca_flags &= ~MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_delete(dev, buf, dev->addr_len, 0); + } + + if (mc->mca_flags & MAF_NOREPORT) + goto done; + spin_unlock_bh(&mc->mca_lock); + + if (!mc->idev->dead) + igmp6_leave_group(mc); + + spin_lock_bh(&mc->mca_lock); + if (del_timer(&mc->mca_timer)) + atomic_dec(&mc->mca_refcnt); +done: + ip6_mc_clear_src(mc); + spin_unlock_bh(&mc->mca_lock); +} + +/* + * deleted ifmcaddr6 manipulation + */ +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) +{ + struct ifmcaddr6 *pmc; + + /* this is an "ifmcaddr6" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = (struct ifmcaddr6 *)kmalloc(sizeof(*pmc), GFP_ATOMIC); + if (!pmc) + return; + memset(pmc, 0, sizeof(*pmc)); + spin_lock_bh(&im->mca_lock); + spin_lock_init(&pmc->mca_lock); + pmc->idev = im->idev; + in6_dev_hold(idev); + pmc->mca_addr = im->mca_addr; + pmc->mca_crcount = idev->mc_qrv; + pmc->mca_sfmode = im->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + struct ip6_sf_list *psf; + + pmc->mca_tomb = im->mca_tomb; + pmc->mca_sources = im->mca_sources; + im->mca_tomb = im->mca_sources = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->mca_crcount; + } + spin_unlock_bh(&im->mca_lock); + + write_lock_bh(&idev->mc_lock); + pmc->next = idev->mc_tomb; + idev->mc_tomb = pmc; + write_unlock_bh(&idev->mc_lock); +} + +static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) +{ + struct ifmcaddr6 *pmc, *pmc_prev; + struct ip6_sf_list *psf, *psf_next; + + write_lock_bh(&idev->mc_lock); + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(&pmc->mca_addr, pmca)) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + idev->mc_tomb = pmc->next; + } + write_unlock_bh(&idev->mc_lock); + if (pmc) { + for (psf=pmc->mca_tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in6_dev_put(pmc->idev); + kfree(pmc); + } +} + +static void mld_clear_delrec(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *nextpmc; + + write_lock_bh(&idev->mc_lock); + pmc = idev->mc_tomb; + idev->mc_tomb = NULL; + write_unlock_bh(&idev->mc_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree(pmc); + } + + /* clear dead sources, too */ + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->mca_lock); + psf = pmc->mca_tomb; + pmc->mca_tomb = NULL; + spin_unlock_bh(&pmc->mca_lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock_bh(&idev->lock); +} + + +/* + * device multicast group inc (add if not found) + */ +int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENODEV; + } + + for (mc = idev->mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, addr)) { + mc->mca_users++; + write_unlock_bh(&idev->lock); + ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, + NULL, 0); + in6_dev_put(idev); + return 0; + } + } + + /* + * not found: create a new one. + */ + + mc = kmalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); + + if (mc == NULL) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENOMEM; + } + + memset(mc, 0, sizeof(struct ifmcaddr6)); + init_timer(&mc->mca_timer); + mc->mca_timer.function = igmp6_timer_handler; + mc->mca_timer.data = (unsigned long) mc; + + ipv6_addr_copy(&mc->mca_addr, addr); + mc->idev = idev; + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 2); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + mc->next = idev->mc_list; + idev->mc_list = mc; + write_unlock_bh(&idev->lock); + + mld_del_delrec(idev, &mc->mca_addr); + igmp6_group_added(mc); + ma_put(mc); + return 0; +} + +/* + * device multicast group del + */ +int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct ifmcaddr6 *ma, **map; + + write_lock_bh(&idev->lock); + for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addr)) { + if (--ma->mca_users == 0) { + *map = ma->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(ma); + + ma_put(ma); + return 0; + } + write_unlock_bh(&idev->lock); + return 0; + } + } + write_unlock_bh(&idev->lock); + + return -ENOENT; +} + +int ipv6_dev_mc_dec(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev = in6_dev_get(dev); + int err; + + if (!idev) + return -ENODEV; + + err = __ipv6_dev_mc_dec(idev, addr); + + in6_dev_put(idev); + + return err; +} + +/* + * identify MLD packets for MLD filter exceptions + */ +int ipv6_is_mld(struct sk_buff *skb, int nexthdr) +{ + struct icmp6hdr *pic; + + if (nexthdr != IPPROTO_ICMPV6) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) + return 0; + + pic = (struct icmp6hdr *)skb->h.raw; + + switch (pic->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + return 1; + default: + break; + } + return 0; +} + +/* + * check if the interface/address pair is valid + */ +int ipv6_chk_mcast_addr(struct net_device *dev, struct in6_addr *group, + struct in6_addr *src_addr) +{ + struct inet6_dev *idev; + struct ifmcaddr6 *mc; + int rv = 0; + + idev = in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (mc = idev->mc_list; mc; mc=mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (mc) { + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; + + spin_lock_bh(&mc->mca_lock); + for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + mc->mca_sfcount[MCAST_EXCLUDE]; + else + rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + spin_unlock_bh(&mc->mca_lock); + } else + rv = 1; /* don't filter unspecified source */ + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return rv; +} + +static void mld_gq_start_timer(struct inet6_dev *idev) +{ + int tv = net_random() % idev->mc_maxdelay; + + idev->mc_gq_running = 1; + if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +{ + int tv = net_random() % delay; + + if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +/* + * IGMP handling (alias multicast ICMPv6 messages) + */ + +static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) +{ + unsigned long delay = resptime; + + /* Do not start timer for these addresses */ + if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || + IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + return; + + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } + ma->mca_timer.expires = jiffies + delay; + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING; +} + +static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + psf->sf_gsresp = 1; + scount++; + break; + } + } +} + +int igmp6_event_query(struct sk_buff *skb) +{ + struct mld2_query *mlh2 = (struct mld2_query *) skb->h.raw; + struct ifmcaddr6 *ma; + struct in6_addr *group; + unsigned long max_delay; + struct inet6_dev *idev; + struct icmp6hdr *hdr; + int group_type; + int mark = 0; + int len; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return -EINVAL; + + /* compute payload length excluding extension headers */ + len = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + len -= (char *)skb->h.raw - (char *)skb->nh.ipv6h; + + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = in6_dev_get(skb->dev); + + if (idev == NULL) + return 0; + + hdr = (struct icmp6hdr *) skb->h.raw; + group = (struct in6_addr *) (hdr + 1); + group_type = ipv6_addr_type(group); + + if (group_type != IPV6_ADDR_ANY && + !(group_type&IPV6_ADDR_MULTICAST)) { + in6_dev_put(idev); + return -EINVAL; + } + + if (len == 24) { + int switchback; + /* MLDv1 router present */ + + /* Translate milliseconds to jiffies */ + max_delay = (ntohs(hdr->icmp6_maxdelay)*HZ)/1000; + + switchback = (idev->mc_qrv + 1) * max_delay; + idev->mc_v1_seen = jiffies + switchback; + + /* cancel the interface change timer */ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + } else if (len >= 28) { + max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; + if (!max_delay) + max_delay = 1; + idev->mc_maxdelay = max_delay; + if (mlh2->qrv) + idev->mc_qrv = mlh2->qrv; + if (group_type == IPV6_ADDR_ANY) { /* general query */ + if (mlh2->nsrcs) { + in6_dev_put(idev); + return -EINVAL; /* no sources allowed */ + } + mld_gq_start_timer(idev); + in6_dev_put(idev); + return 0; + } + /* mark sources to include, if group & source-specific */ + mark = mlh2->nsrcs != 0; + } else { + in6_dev_put(idev); + return -EINVAL; + } + + read_lock_bh(&idev->lock); + if (group_type == IPV6_ADDR_ANY) { + for (ma = idev->mc_list; ma; ma=ma->next) { + spin_lock_bh(&ma->mca_lock); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + } + } else { + for (ma = idev->mc_list; ma; ma=ma->next) { + if (group_type != IPV6_ADDR_ANY && + !ipv6_addr_equal(group, &ma->mca_addr)) + continue; + spin_lock_bh(&ma->mca_lock); + if (ma->mca_flags & MAF_TIMER_RUNNING) { + /* gsquery <- gsquery && mark */ + if (!mark) + ma->mca_flags &= ~MAF_GSQUERY; + } else { + /* gsquery <- mark */ + if (mark) + ma->mca_flags |= MAF_GSQUERY; + else + ma->mca_flags &= ~MAF_GSQUERY; + } + if (ma->mca_flags & MAF_GSQUERY) + mld_marksources(ma, ntohs(mlh2->nsrcs), + mlh2->srcs); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + if (group_type != IPV6_ADDR_ANY) + break; + } + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + + return 0; +} + + +int igmp6_event_report(struct sk_buff *skb) +{ + struct ifmcaddr6 *ma; + struct in6_addr *addrp; + struct inet6_dev *idev; + struct icmp6hdr *hdr; + int addr_type; + + /* Our own report looped back. Ignore it. */ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return -EINVAL; + + hdr = (struct icmp6hdr*) skb->h.raw; + + /* Drop reports with not link local source */ + addr_type = ipv6_addr_type(&skb->nh.ipv6h->saddr); + if (addr_type != IPV6_ADDR_ANY && + !(addr_type&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + addrp = (struct in6_addr *) (hdr + 1); + + idev = in6_dev_get(skb->dev); + if (idev == NULL) + return -ENODEV; + + /* + * Cancel the timer for this group + */ + + read_lock_bh(&idev->lock); + for (ma = idev->mc_list; ma; ma=ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addrp)) { + spin_lock(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) + atomic_dec(&ma->mca_refcnt); + ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); + spin_unlock(&ma->mca_lock); + break; + } + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + return 0; +} + +static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case MLD2_MODE_IS_INCLUDE: + case MLD2_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp); + case MLD2_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case MLD2_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case MLD2_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; + case MLD2_BLOCK_OLD_SOURCES: + if (pmc->mca_sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip6_sf_list *psf; + int scount = 0; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *mld_newpack(struct net_device *dev, int size) +{ + struct sock *sk = igmp6_socket->sk; + struct sk_buff *skb; + struct mld2_report *pmr; + struct in6_addr addr_buf; + int err; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + + /* we assume size > sizeof(ra) here */ + skb = sock_alloc_send_skb(sk, size + LL_RESERVED_SPACE(dev), 1, &err); + + if (skb == 0) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + + ndisc_mc_map(&mld2_all_mcr, ha, dev, 1); + if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) { + kfree_skb(skb); + return NULL; + } + } + + if (ipv6_get_lladdr(dev, &addr_buf)) { + /* <draft-ietf-magma-mld-source-05.txt>: + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + memset(&addr_buf, 0, sizeof(addr_buf)); + } + + ip6_nd_hdr(sk, skb, dev, &addr_buf, &mld2_all_mcr, NEXTHDR_HOP, 0); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + pmr =(struct mld2_report *)skb_put(skb, sizeof(*pmr)); + skb->h.raw = (unsigned char *)pmr; + pmr->type = ICMPV6_MLD2_REPORT; + pmr->resv1 = 0; + pmr->csum = 0; + pmr->resv2 = 0; + pmr->ngrec = 0; + return skb; +} + +static void mld_sendpack(struct sk_buff *skb) +{ + struct ipv6hdr *pip6 = skb->nh.ipv6h; + struct mld2_report *pmr = (struct mld2_report *)skb->h.raw; + int payload_len, mldlen; + struct inet6_dev *idev = in6_dev_get(skb->dev); + int err; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + payload_len = skb->tail - (unsigned char *)skb->nh.ipv6h - + sizeof(struct ipv6hdr); + mldlen = skb->tail - skb->h.raw; + pip6->payload_len = htons(payload_len); + + pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); + if (!err) { + ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } else + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct mld2_grec) + 4*mld_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, struct mld2_grec **ppgr) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr; + + if (!skb) + skb = mld_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->mca_addr; /* structure copy */ + pmr = (struct mld2_report *)skb->h.raw; + pmr->ngrec = htons(ntohs(pmr->ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr = NULL; + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, first, isquery, truncate; + + if (pmc->mca_flags & MAF_NOREPORT) + return skb; + + isquery = type == MLD2_MODE_IS_INCLUDE || + type == MLD2_MODE_IS_EXCLUDE; + truncate = type == MLD2_MODE_IS_EXCLUDE || + type == MLD2_CHANGE_TO_EXCLUDE; + + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; + + if (!*psf_list) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header and at + * least one source. + */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+ + sizeof(struct in6_addr)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + return skb; + } + pmr = skb ? (struct mld2_report *)skb->h.raw : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pmr && pmr->ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + } + } + first = 1; + scount = 0; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + struct in6_addr *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(*psrc) + + first*sizeof(struct mld2_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); + *psrc = psf->sf_addr; + scount++; + if ((type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ + return skb; +} + +static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (pmc->mca_flags & MAF_NOREPORT) + continue; + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + } else { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + if (skb) + mld_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void mld_clear_zeros(struct ip6_sf_list **ppsf) +{ + struct ip6_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void mld_send_cr(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock_bh(&idev->lock); + write_lock_bh(&idev->mc_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->mca_crcount) { + pmc->mca_crcount--; + if (pmc->mca_sfmode == MCAST_EXCLUDE) { + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + if (pmc->mca_crcount == 0) { + mld_clear_zeros(&pmc->mca_tomb); + mld_clear_zeros(&pmc->mca_sources); + } + } + if (pmc->mca_crcount == 0 && !pmc->mca_tomb && + !pmc->mca_sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + idev->mc_tomb = pmc_next; + in6_dev_put(pmc->idev); + kfree(pmc); + } else + pmc_prev = pmc; + } + write_unlock_bh(&idev->mc_lock); + + /* change recs */ + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_ALLOW_NEW_SOURCES; + } else { + type = MLD2_ALLOW_NEW_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->mca_crcount) { + pmc->mca_crcount--; + if (pmc->mca_sfmode == MCAST_EXCLUDE) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + } + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (!skb) + return; + (void) mld_sendpack(skb); +} + +static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) +{ + struct sock *sk = igmp6_socket->sk; + struct inet6_dev *idev; + struct sk_buff *skb; + struct icmp6hdr *hdr; + struct in6_addr *snd_addr; + struct in6_addr *addrp; + struct in6_addr addr_buf; + struct in6_addr all_routers; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + snd_addr = addr; + if (type == ICMPV6_MGM_REDUCTION) { + snd_addr = &all_routers; + ipv6_addr_all_routers(&all_routers); + } + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; + + skb = sock_alloc_send_skb(sk, LL_RESERVED_SPACE(dev) + full_len, 1, &err); + + if (skb == NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + ndisc_mc_map(snd_addr, ha, dev, 1); + if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0) + goto out; + } + + if (ipv6_get_lladdr(dev, &addr_buf)) { + /* <draft-ietf-magma-mld-source-05.txt>: + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + memset(&addr_buf, 0, sizeof(addr_buf)); + } + + ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); + memset(hdr, 0, sizeof(struct icmp6hdr)); + hdr->icmp6_type = type; + + addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); + ipv6_addr_copy(addrp, addr); + + hdr->icmp6_cksum = csum_ipv6_magic(&addr_buf, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, len, 0)); + + idev = in6_dev_get(skb->dev); + + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); + if (!err) { + if (type == ICMPV6_MGM_REDUCTION) + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); + else + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBRESPONSES); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } else + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + + if (likely(idev != NULL)) + in6_dev_put(idev); + return; + +out: + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); +} + +static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, + struct in6_addr *psfsrc) +{ + struct ip6_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { + struct inet6_dev *idev = pmc->idev; + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->mca_sources = psf->sf_next; + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && + !MLD_V1_SEEN(idev)) { + psf->sf_crcount = idev->mc_qrv; + psf->sf_next = pmc->mca_tomb; + pmc->mca_tomb = psf; + rv = 1; + } else + kfree(psf); + } + return rv; +} + +static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int changerec = 0; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); + if (!delta) { + if (!pmc->mca_sfcount[sfmode]) { + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return -EINVAL; + } + pmc->mca_sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->mca_sfmode == MCAST_EXCLUDE && + pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && + pmc->mca_sfcount[MCAST_INCLUDE]) { + struct ip6_sf_list *psf; + + /* filter mode change */ + pmc->mca_sfmode = MCAST_INCLUDE; + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(pmc->idev); + } else if (sf_setstate(pmc) || changerec) + mld_ifc_event(pmc->idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, + struct in6_addr *psfsrc, int delta) +{ + struct ip6_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf) { + psf = (struct ip6_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + memset(psf, 0, sizeof(*psf)); + psf->sf_addr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->mca_sources = psf; + } + psf->sf_count[sfmode]++; + return 0; +} + +static void sf_markstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + int qrv = pmc->idev->mc_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in != psf->sf_oldin) { + psf->sf_crcount = qrv; + rv++; + } + } + return rv; +} + +/* + * Add multicast source filter list to the interface list + */ +static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int isexclude; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + + sf_markstate(pmc); + isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->mca_sfcount[sfmode]++; + err = 0; + for (i=0; i<sfcount; i++) { + err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i], delta); + if (err) + break; + } + if (err) { + int j; + + if (!delta) + pmc->mca_sfcount[sfmode]--; + for (j=0; j<i; j++) + (void) ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { + struct inet6_dev *idev = pmc->idev; + struct ip6_sf_list *psf; + + /* filter mode change */ + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + pmc->mca_sfmode = MCAST_EXCLUDE; + else if (pmc->mca_sfcount[MCAST_INCLUDE]) + pmc->mca_sfmode = MCAST_INCLUDE; + /* else no filters; keep old mode for reports */ + + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(idev); + } else if (sf_setstate(pmc)) + mld_ifc_event(idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf, *nextpsf; + + for (psf=pmc->mca_tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_tomb = NULL; + for (psf=pmc->mca_sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_sources = NULL; + pmc->mca_sfmode = MCAST_EXCLUDE; + pmc->mca_sfcount[MCAST_EXCLUDE] = 0; + pmc->mca_sfcount[MCAST_EXCLUDE] = 1; +} + + +static void igmp6_join_group(struct ifmcaddr6 *ma) +{ + unsigned long delay; + + if (ma->mca_flags & MAF_NOREPORT) + return; + + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + + delay = net_random() % IGMP6_UNSOLICITED_IVAL; + + spin_lock_bh(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; + spin_unlock_bh(&ma->mca_lock); +} + +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev) +{ + int err; + + if (iml->sflist == 0) { + /* any-source empty exclude case */ + return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; + return err; +} + +static void igmp6_leave_group(struct ifmcaddr6 *ma) +{ + if (MLD_V1_SEEN(ma->idev)) { + if (ma->mca_flags & MAF_LAST_REPORTER) + igmp6_send(&ma->mca_addr, ma->idev->dev, + ICMPV6_MGM_REDUCTION); + } else { + mld_add_delrec(ma->idev, ma); + mld_ifc_event(ma->idev); + } +} + +static void mld_gq_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + idev->mc_gq_running = 0; + mld_send_report(idev, NULL); + __in6_dev_put(idev); +} + +static void mld_ifc_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + mld_send_cr(idev); + if (idev->mc_ifc_count) { + idev->mc_ifc_count--; + if (idev->mc_ifc_count) + mld_ifc_start_timer(idev, idev->mc_maxdelay); + } + __in6_dev_put(idev); +} + +static void mld_ifc_event(struct inet6_dev *idev) +{ + if (MLD_V1_SEEN(idev)) + return; + idev->mc_ifc_count = idev->mc_qrv; + mld_ifc_start_timer(idev, 1); +} + + +static void igmp6_timer_handler(unsigned long data) +{ + struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + + if (MLD_V1_SEEN(ma->idev)) + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + else + mld_send_report(ma->idev, ma); + + spin_lock(&ma->mca_lock); + ma->mca_flags |= MAF_LAST_REPORTER; + ma->mca_flags &= ~MAF_TIMER_RUNNING; + spin_unlock(&ma->mca_lock); + ma_put(ma); +} + +/* Device going down */ + +void ipv6_mc_down(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Withdraw multicast list */ + + read_lock_bh(&idev->lock); + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); + + for (i = idev->mc_list; i; i=i->next) + igmp6_group_dropped(i); + read_unlock_bh(&idev->lock); + + mld_clear_delrec(idev); +} + + +/* Device going up */ + +void ipv6_mc_up(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Install multicast list, except for all-nodes (already installed) */ + + read_lock_bh(&idev->lock); + for (i = idev->mc_list; i; i=i->next) + igmp6_group_added(i); + read_unlock_bh(&idev->lock); +} + +/* IPv6 device initialization. */ + +void ipv6_mc_init_dev(struct inet6_dev *idev) +{ + struct in6_addr maddr; + + write_lock_bh(&idev->lock); + rwlock_init(&idev->mc_lock); + idev->mc_gq_running = 0; + init_timer(&idev->mc_gq_timer); + idev->mc_gq_timer.data = (unsigned long) idev; + idev->mc_gq_timer.function = &mld_gq_timer_expire; + idev->mc_tomb = NULL; + idev->mc_ifc_count = 0; + init_timer(&idev->mc_ifc_timer); + idev->mc_ifc_timer.data = (unsigned long) idev; + idev->mc_ifc_timer.function = &mld_ifc_timer_expire; + idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_v1_seen = 0; + write_unlock_bh(&idev->lock); + + /* Add all-nodes address. */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(idev->dev, &maddr); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ipv6_mc_destroy_dev(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + struct in6_addr maddr; + + /* Deactivate timers */ + ipv6_mc_down(idev); + + /* Delete all-nodes address. */ + ipv6_addr_all_nodes(&maddr); + + /* We cannot call ipv6_dev_mc_dec() directly, our caller in + * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will + * fail. + */ + __ipv6_dev_mc_dec(idev, &maddr); + + if (idev->cnf.forwarding) { + ipv6_addr_all_routers(&maddr); + __ipv6_dev_mc_dec(idev, &maddr); + } + + write_lock_bh(&idev->lock); + while ((i = idev->mc_list) != NULL) { + idev->mc_list = i->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(i); + ma_put(i); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + +#ifdef CONFIG_PROC_FS +struct igmp6_mc_iter_state { + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) + +static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) +{ + struct ifmcaddr6 *im = NULL; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + im = im->next; + while (!im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->mc_list; + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifmcaddr6 *im = igmp6_mc_get_first(seq); + if (im) + while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return igmp6_mc_get_idx(seq, *pos); +} + +static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifmcaddr6 *im; + im = igmp6_mc_get_next(seq, v); + ++*pos; + return im; +} + +static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp6_mc_seq_show(struct seq_file *seq, void *v) +{ + struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + seq_printf(seq, + "%-4d %-15s %04x%04x%04x%04x%04x%04x%04x%04x %5d %08X %ld\n", + state->dev->ifindex, state->dev->name, + NIP6(im->mca_addr), + im->mca_users, im->mca_flags, + (im->mca_flags&MAF_TIMER_RUNNING) ? + jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); + return 0; +} + +static struct seq_operations igmp6_mc_seq_ops = { + .start = igmp6_mc_seq_start, + .next = igmp6_mc_seq_next, + .stop = igmp6_mc_seq_stop, + .show = igmp6_mc_seq_show, +}; + +static int igmp6_mc_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp6_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &igmp6_mc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp6_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct igmp6_mcf_iter_state { + struct net_device *dev; + struct inet6_dev *idev; + struct ifmcaddr6 *im; +}; + +#define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) + +static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) +{ + struct ip6_sf_list *psf = NULL; + struct ifmcaddr6 *im = NULL; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL, state->im = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (likely(im != NULL)) { + spin_lock_bh(&im->mca_lock); + psf = im->mca_sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->mca_lock); + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->mca_lock); + state->im = state->im->next; + while (!state->im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + state->im = state->idev->mc_list; + } + if (!state->im) + break; + spin_lock_bh(&state->im->mca_lock); + psf = state->im->mca_sources; + } +out: + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp6_mcf_get_first(seq); + else + psf = igmp6_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->mca_lock); + state->im = NULL; + } + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip6_sf_list *psf = (struct ip6_sf_list *)v; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%32s %32s %6s %6s\n", "Idx", + "Device", "Multicast Address", + "Source Address", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%6lu %6lu\n", + state->dev->ifindex, state->dev->name, + NIP6(state->im->mca_addr), + NIP6(psf->sf_addr), + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static struct seq_operations igmp6_mcf_seq_ops = { + .start = igmp6_mcf_seq_start, + .next = igmp6_mcf_seq_next, + .stop = igmp6_mcf_seq_stop, + .show = igmp6_mcf_seq_show, +}; + +static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp6_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &igmp6_mcf_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp6_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +int __init igmp6_init(struct net_proto_family *ops) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &igmp6_socket); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the IGMP6 control socket (err %d).\n", + err); + igmp6_socket = NULL; /* For safety. */ + return err; + } + + sk = igmp6_socket->sk; + sk->sk_allocation = GFP_ATOMIC; + sk->sk_prot->unhash(sk); + + np = inet6_sk(sk); + np->hop_limit = 1; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops); + proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); +#endif + + return 0; +} + +void igmp6_cleanup(void) +{ + sock_release(igmp6_socket); + igmp6_socket = NULL; /* for safety */ + +#ifdef CONFIG_PROC_FS + proc_net_remove("mcfilter6"); + proc_net_remove("igmp6"); +#endif +} diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c new file mode 100644 index 000000000000..7c291f4e9edc --- /dev/null +++ b/net/ipv6/ndisc.c @@ -0,0 +1,1690 @@ +/* + * Neighbour Discovery for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Mike Shaver <shaver@ingenia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Lars Fenneberg : fixed MTU setting on receipt + * of an RA. + * + * Janos Farkas : kmalloc failure checks + * Alexey Kuznetsov : state machine reworked + * and moved to net/core. + * Pekka Savola : RFC2461 validation + * YOSHIFUJI Hideaki @USAGI : Verify ND options properly + */ + +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) +#define ND_NOPRINTK(x...) do { ; } while(0) +#define ND_PRINTK0 ND_PRINTK +#define ND_PRINTK1 ND_NOPRINTK +#define ND_PRINTK2 ND_NOPRINTK +#define ND_PRINTK3 ND_NOPRINTK +#if ND_DEBUG >= 1 +#undef ND_PRINTK1 +#define ND_PRINTK1 ND_PRINTK +#endif +#if ND_DEBUG >= 2 +#undef ND_PRINTK2 +#define ND_PRINTK2 ND_PRINTK +#endif +#if ND_DEBUG >= 3 +#undef ND_PRINTK3 +#define ND_PRINTK3 ND_PRINTK +#endif + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/route.h> +#include <linux/init.h> +#include <linux/rcupdate.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/jhash.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/ndisc.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/icmp.h> + +#include <net/flow.h> +#include <net/ip6_checksum.h> +#include <linux/proc_fs.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> + +static struct socket *ndisc_socket; + +static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +static int ndisc_constructor(struct neighbour *neigh); +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); +static int pndisc_constructor(struct pneigh_entry *n); +static void pndisc_destructor(struct pneigh_entry *n); +static void pndisc_redo(struct sk_buff *skb); + +static struct neigh_ops ndisc_generic_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops ndisc_hh_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + + +static struct neigh_ops ndisc_direct_ops = { + .family = AF_INET6, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table nd_tbl = { + .family = AF_INET6, + .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), + .key_len = sizeof(struct in6_addr), + .hash = ndisc_hash, + .constructor = ndisc_constructor, + .pconstructor = pndisc_constructor, + .pdestructor = pndisc_destructor, + .proxy_redo = pndisc_redo, + .id = "ndisc_cache", + .parms = { + .tbl = &nd_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[__ND_OPT_MAX]; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) + +/* + * Return the padding between the option length and the start of the + * link addr. Currently only IP-over-InfiniBand needs this, although + * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may + * also need a pad of 2. + */ +static int ndisc_addr_option_pad(unsigned short type) +{ + switch (type) { + case ARPHRD_INFINIBAND: return 2; + default: return 0; + } +} + +static inline int ndisc_opt_addr_space(struct net_device *dev) +{ + return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); +} + +static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, + unsigned short addr_type) +{ + int space = NDISC_OPT_SPACE(data_len); + int pad = ndisc_addr_option_pad(addr_type); + + opt[0] = type; + opt[1] = space>>3; + + memset(opt + 2, 0, pad); + opt += pad; + space -= pad; + + memcpy(opt+2, data, data_len); + data_len += 2; + opt += data_len; + if ((space -= data_len) > 0) + memset(opt, 0, space); + return opt + space; +} + +static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + int type; + if (!cur || !end || cur >= end) + return NULL; + type = cur->nd_opt_type; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && cur->nd_opt_type != type); + return (cur <= end && cur->nd_opt_type == type ? cur : NULL); +} + +static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) +{ + struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; + + if (!nd_opt || opt_len < 0 || !ndopts) + return NULL; + memset(ndopts, 0, sizeof(*ndopts)); + while (opt_len) { + int l; + if (opt_len < sizeof(struct nd_opt_hdr)) + return NULL; + l = nd_opt->nd_opt_len << 3; + if (opt_len < l || l == 0) + return NULL; + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + case ND_OPT_MTU: + case ND_OPT_REDIRECT_HDR: + if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { + ND_PRINTK2(KERN_WARNING + "%s(): duplicated ND6 option found: type=%d\n", + __FUNCTION__, + nd_opt->nd_opt_type); + } else { + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + } + break; + case ND_OPT_PREFIX_INFO: + ndopts->nd_opts_pi_end = nd_opt; + if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + break; + default: + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the protocol. + */ + ND_PRINTK2(KERN_NOTICE + "%s(): ignored unsupported option; type=%d, len=%d\n", + __FUNCTION__, + nd_opt->nd_opt_type, nd_opt->nd_opt_len); + } + opt_len -= l; + nd_opt = ((void *)nd_opt) + l; + } + return ndopts; +} + +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + u8 *lladdr = (u8 *)(p + 1); + int lladdrlen = p->nd_opt_len << 3; + int prepad = ndisc_addr_option_pad(dev->type); + if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) + return NULL; + return (lladdr + prepad); +} + +int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ + case ARPHRD_FDDI: + ipv6_eth_mc_map(addr, buf); + return 0; + case ARPHRD_IEEE802_TR: + ipv6_tr_mc_map(addr,buf); + return 0; + case ARPHRD_ARCNET: + ipv6_arcnet_mc_map(addr, buf); + return 0; + case ARPHRD_INFINIBAND: + ipv6_ib_mc_map(addr, buf); + return 0; + default: + if (dir) { + memcpy(buf, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + +static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +{ + const u32 *p32 = pkey; + u32 addr_hash, i; + + addr_hash = 0; + for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) + addr_hash ^= *p32++; + + return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); +} + +static int ndisc_constructor(struct neighbour *neigh) +{ + struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct net_device *dev = neigh->dev; + struct inet6_dev *in6_dev; + struct neigh_parms *parms; + int is_multicast = ipv6_addr_is_multicast(addr); + + rcu_read_lock(); + in6_dev = in6_dev_get(dev); + if (in6_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in6_dev->nd_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST; + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &ndisc_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + if (is_multicast) { + neigh->nud_state = NUD_NOARP; + ndisc_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags&IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } else if (dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->hard_header_cache) + neigh->ops = &ndisc_hh_ops; + else + neigh->ops = &ndisc_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + in6_dev_put(in6_dev); + return 0; +} + +static int pndisc_constructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + return 0; +} + +static void pndisc_destructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +} + +/* + * Send a Neighbour Advertisement + */ + +static inline void ndisc_flow_init(struct flowi *fl, u8 type, + struct in6_addr *saddr, struct in6_addr *daddr) +{ + memset(fl, 0, sizeof(*fl)); + ipv6_addr_copy(&fl->fl6_src, saddr); + ipv6_addr_copy(&fl->fl6_dst, daddr); + fl->proto = IPPROTO_ICMPV6; + fl->fl_icmp_type = type; + fl->fl_icmp_code = 0; +} + +static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + struct in6_addr *daddr, struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct in6_addr tmpaddr; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct flowi fl; + struct dst_entry* dst; + struct sock *sk = ndisc_socket->sk; + struct in6_addr *src_addr; + struct nd_msg *msg; + int len; + struct sk_buff *skb; + int err; + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(solicited_addr, dev, 1); + if (ifp) { + src_addr = solicited_addr; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) + return; + src_addr = &tmpaddr; + } + + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + if (inc_opt) { + if (dev->addr_len) + len += ndisc_opt_addr_space(dev); + else + inc_opt = 0; + } + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NA: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len); + + msg = (struct nd_msg *)skb_put(skb, len); + skb->h.raw = (unsigned char*)msg; + + msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + msg->icmph.icmp6_code = 0; + msg->icmph.icmp6_cksum = 0; + + msg->icmph.icmp6_unused = 0; + msg->icmph.icmp6_router = router; + msg->icmph.icmp6_solicited = solicited; + msg->icmph.icmp6_override = !!override; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicited_addr); + + if (inc_opt) + ndisc_fill_addr_option(msg->opt, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + msg->icmph.icmp6_cksum = csum_ipv6_magic(src_addr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, + struct in6_addr *solicit, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct flowi fl; + struct dst_entry* dst; + struct inet6_dev *idev; + struct sock *sk = ndisc_socket->sk; + struct sk_buff *skb; + struct nd_msg *msg; + struct in6_addr addr_buf; + int len; + int err; + int send_llinfo; + + if (saddr == NULL) { + if (ipv6_get_lladdr(dev, &addr_buf)) + return; + saddr = &addr_buf; + } + + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + send_llinfo = dev->addr_len && !ipv6_addr_any(saddr); + if (send_llinfo) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NA: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + msg = (struct nd_msg *)skb_put(skb, len); + skb->h.raw = (unsigned char*)msg; + msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION; + msg->icmph.icmp6_code = 0; + msg->icmph.icmp6_cksum = 0; + msg->icmph.icmp6_unused = 0; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicit); + + if (send_llinfo) + ndisc_fill_addr_option(msg->opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + msg->icmph.icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, + daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + /* send it! */ + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct flowi fl; + struct dst_entry* dst; + struct inet6_dev *idev; + struct sock *sk = ndisc_socket->sk; + struct sk_buff *skb; + struct icmp6hdr *hdr; + __u8 * opt; + int len; + int err; + + ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, NULL, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + len = sizeof(struct icmp6hdr); + if (dev->addr_len) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RS: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + hdr = (struct icmp6hdr *)skb_put(skb, len); + skb->h.raw = (unsigned char*)hdr; + hdr->icmp6_type = NDISC_ROUTER_SOLICITATION; + hdr->icmp6_code = 0; + hdr->icmp6_cksum = 0; + hdr->icmp6_unused = 0; + + opt = (u8*) (hdr + 1); + + if (dev->addr_len) + ndisc_fill_addr_option(opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + hdr->icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, len, 0)); + + /* send it! */ + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + + +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + dst_link_failure(skb); + kfree_skb(skb); +} + +/* Called with locked neigh: either read or both */ + +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + struct in6_addr *saddr = NULL; + struct in6_addr mcaddr; + struct net_device *dev = neigh->dev; + struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; + int probes = atomic_read(&neigh->probes); + + if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 1)) + saddr = &skb->nh.ipv6h->saddr; + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state & NUD_VALID)) { + ND_PRINTK1(KERN_DEBUG + "%s(): trying to ucast probe in NUD_INVALID: " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, + NIP6(*target)); + } + ndisc_send_ns(dev, neigh, target, target, saddr); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + } else { + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + } +} + +static void ndisc_recv_ns(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb->h.raw; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - msg->opt; + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev = NULL; + struct neighbour *neigh; + int dad = ipv6_addr_any(saddr); + int inc; + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: multicast target address"); + return; + } + + /* + * RFC2461 7.1.1: + * DAD has to be destined for solicited node multicast address. + */ + if (dad && + !(daddr->s6_addr32[0] == htonl(0xff020000) && + daddr->s6_addr32[1] == htonl(0x00000000) && + daddr->s6_addr32[2] == htonl(0x00000001) && + daddr->s6_addr [12] == 0xff )) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND options\n"); + return; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid link-layer address length\n"); + return; + } + + /* RFC2461 7.1.1: + * If the IP source address is the unspecified address, + * there MUST NOT be source link-layer address option + * in the message. + */ + if (dad) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + return; + } + } + + inc = ipv6_addr_is_multicast(daddr); + + if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1)) != NULL) { + if (ifp->flags & IFA_F_TENTATIVE) { + /* Address is tentative. If the source + is unspecified address, it is someone + does DAD, otherwise we ignore solicitations + until DAD timer expires. + */ + if (!dad) + goto out; + if (dev->type == ARPHRD_IEEE802_TR) { + unsigned char *sadr = skb->mac.raw; + if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && + sadr[9] == dev->dev_addr[1] && + sadr[10] == dev->dev_addr[2] && + sadr[11] == dev->dev_addr[3] && + sadr[12] == dev->dev_addr[4] && + sadr[13] == dev->dev_addr[5]) { + /* looped-back to us */ + goto out; + } + } + addrconf_dad_failure(ifp); + return; + } + + idev = ifp->idev; + } else { + idev = in6_dev_get(dev); + if (!idev) { + /* XXX: count this drop? */ + return; + } + + if (ipv6_chk_acast_addr(dev, &msg->target) || + (idev->cnf.forwarding && + pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { + if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && + skb->pkt_type != PACKET_HOST && + inc != 0 && + idev->nd_parms->proxy_delay != 0) { + /* + * for anycast or proxy, + * sender should delay its response + * by a random time between 0 and + * MAX_ANYCAST_DELAY_TIME seconds. + * (RFC2461) -- yoshfuji + */ + struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + if (n) + pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + goto out; + } + } else + goto out; + } + + if (dad) { + struct in6_addr maddr; + + ipv6_addr_all_nodes(&maddr); + ndisc_send_na(dev, NULL, &maddr, &msg->target, + idev->cnf.forwarding, 0, (ifp != NULL), 1); + goto out; + } + + if (inc) + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + else + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + + /* + * update / create cache entry + * for the source address + */ + neigh = __neigh_lookup(&nd_tbl, saddr, dev, + !inc || lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE); + if (neigh || !dev->hard_header) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + idev->cnf.forwarding, + 1, (ifp != NULL && inc), inc); + if (neigh) + neigh_release(neigh); + } + +out: + if (ifp) + in6_ifa_put(ifp); + else + in6_dev_put(idev); + + return; +} + +static void ndisc_recv_na(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb->h.raw; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - msg->opt; + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct neighbour *neigh; + + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: packet too short\n"); + return; + } + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: target address is multicast.\n"); + return; + } + + if (ipv6_addr_is_multicast(daddr) && + msg->icmph.icmp6_solicited) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: solicited NA is multicasted.\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND option\n"); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: invalid link-layer address length\n"); + return; + } + } + if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1))) { + if (ifp->flags & IFA_F_TENTATIVE) { + addrconf_dad_failure(ifp); + return; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. It could be misconfiguration, or + an smart proxy agent tries to help us :-) + */ + ND_PRINTK1(KERN_WARNING + "ICMPv6 NA: someone advertises our address on %s!\n", + ifp->idev->dev->name); + in6_ifa_put(ifp); + return; + } + neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + + if (neigh) { + u8 old_flags = neigh->flags; + + if (neigh->nud_state & NUD_FAILED) + goto out; + + neigh_update(neigh, lladdr, + msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + + if ((old_flags & ~neigh->flags) & NTF_ROUTER) { + /* + * Change: router to host + */ + struct rt6_info *rt; + rt = rt6_get_dflt_router(saddr, dev); + if (rt) + ip6_del_rt(rt, NULL, NULL); + } + +out: + neigh_release(neigh); + } +} + +static void ndisc_recv_rs(struct sk_buff *skb) +{ + struct rs_msg *rs_msg = (struct rs_msg *) skb->h.raw; + unsigned long ndoptlen = skb->len - sizeof(*rs_msg); + struct neighbour *neigh; + struct inet6_dev *idev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct ndisc_options ndopts; + u8 *lladdr = NULL; + + if (skb->len < sizeof(*rs_msg)) + return; + + idev = in6_dev_get(skb->dev); + if (!idev) { + if (net_ratelimit()) + ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + return; + } + + /* Don't accept RS if we're not in router mode */ + if (!idev->cnf.forwarding) + goto out; + + /* + * Don't update NCE if src = ::; + * this implies that the source node has no ip address assigned yet. + */ + if (ipv6_addr_any(saddr)) + goto out; + + /* Parse ND options */ + if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (net_ratelimit()) + ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + goto out; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) + goto out; + } + + neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + neigh_release(neigh); + } +out: + in6_dev_put(idev); +} + +static void ndisc_router_discovery(struct sk_buff *skb) +{ + struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw; + struct neighbour *neigh = NULL; + struct inet6_dev *in6_dev; + struct rt6_info *rt; + int lifetime; + struct ndisc_options ndopts; + int optlen; + + __u8 * opt = (__u8 *)(ra_msg + 1); + + optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg); + + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: source address is not link-local.\n"); + return; + } + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: packet too short\n"); + return; + } + + /* + * set the RA_RECV flag in the interface + */ + + in6_dev = in6_dev_get(skb->dev); + if (in6_dev == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: can't find inet6 device for %s.\n", + skb->dev->name); + return; + } + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) { + in6_dev_put(in6_dev); + return; + } + + if (!ndisc_parse_options(opt, optlen, &ndopts)) { + in6_dev_put(in6_dev); + ND_PRINTK2(KERN_WARNING + "ICMP6 RA: invalid ND options\n"); + return; + } + + if (in6_dev->if_flags & IF_RS_SENT) { + /* + * flag that an RA was received after an RS was sent + * out on this interface. + */ + in6_dev->if_flags |= IF_RA_RCVD; + } + + /* + * Remember the managed/otherconf flags from most recently + * received RA message (RFC 2462) -- yoshfuji + */ + in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | + IF_RA_OTHERCONF)) | + (ra_msg->icmph.icmp6_addrconf_managed ? + IF_RA_MANAGED : 0) | + (ra_msg->icmph.icmp6_addrconf_other ? + IF_RA_OTHERCONF : 0); + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + + rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); + + if (rt) + neigh = rt->rt6i_nexthop; + + if (rt && lifetime == 0) { + neigh_clone(neigh); + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } + + if (rt == NULL && lifetime) { + ND_PRINTK3(KERN_DEBUG + "ICMPv6 RA: adding default router.\n"); + + rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); + if (rt == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() failed to add default route.\n", + __FUNCTION__); + in6_dev_put(in6_dev); + return; + } + + neigh = rt->rt6i_nexthop; + if (neigh == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() got default router without neighbour.\n", + __FUNCTION__); + dst_release(&rt->u.dst); + in6_dev_put(in6_dev); + return; + } + neigh->flags |= NTF_ROUTER; + } + + if (rt) + rt->rt6i_expires = jiffies + (HZ * lifetime); + + if (ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; + } + + /* + * Update Reachable Time and Retrans Timer + */ + + if (in6_dev->nd_parms) { + unsigned long rtime = ntohl(ra_msg->retrans_timer); + + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { + rtime = (rtime*HZ)/1000; + if (rtime < HZ/10) + rtime = HZ/10; + in6_dev->nd_parms->retrans_time = rtime; + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + + rtime = ntohl(ra_msg->reachable_time); + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { + rtime = (rtime*HZ)/1000; + + if (rtime < HZ/10) + rtime = HZ/10; + + if (rtime != in6_dev->nd_parms->base_reachable_time) { + in6_dev->nd_parms->base_reachable_time = rtime; + in6_dev->nd_parms->gc_staletime = 3 * rtime; + in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + } + } + + /* + * Process options. + */ + + if (!neigh) + neigh = __neigh_lookup(&nd_tbl, &skb->nh.ipv6h->saddr, + skb->dev, 1); + if (neigh) { + u8 *lladdr = NULL; + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid link-layer address length\n"); + goto out; + } + } + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER); + } + + if (ndopts.nd_opts_pi) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_pi; + p; + p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { + addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3); + } + } + + if (ndopts.nd_opts_mtu) { + u32 mtu; + + memcpy(&mtu, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + mtu = ntohl(mtu); + + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid mtu: %d\n", + mtu); + } else if (in6_dev->cnf.mtu6 != mtu) { + in6_dev->cnf.mtu6 = mtu; + + if (rt) + rt->u.dst.metrics[RTAX_MTU-1] = mtu; + + rt6_mtu_change(skb->dev, mtu); + } + } + + if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid RA options"); + } +out: + if (rt) + dst_release(&rt->u.dst); + else if (neigh) + neigh_release(neigh); + in6_dev_put(in6_dev); +} + +static void ndisc_redirect_rcv(struct sk_buff *skb) +{ + struct inet6_dev *in6_dev; + struct icmp6hdr *icmph; + struct in6_addr *dest; + struct in6_addr *target; /* new first hop to destination */ + struct neighbour *neigh; + int on_link = 0; + struct ndisc_options ndopts; + int optlen; + u8 *lladdr = NULL; + + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: source address is not link-local.\n"); + return; + } + + optlen = skb->tail - skb->h.raw; + optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: packet too short\n"); + return; + } + + icmph = (struct icmp6hdr *) skb->h.raw; + target = (struct in6_addr *) (icmph + 1); + dest = target + 1; + + if (ipv6_addr_is_multicast(dest)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination address is multicast.\n"); + return; + } + + if (ipv6_addr_equal(dest, target)) { + on_link = 1; + } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local.\n"); + return; + } + + in6_dev = in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) { + in6_dev_put(in6_dev); + return; + } + + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. + */ + + if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid ND options\n"); + in6_dev_put(in6_dev); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid link-layer address length\n"); + in6_dev_put(in6_dev); + return; + } + } + + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, lladdr, + on_link); + neigh_release(neigh); + } + in6_dev_put(in6_dev); +} + +void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, + struct in6_addr *target) +{ + struct sock *sk = ndisc_socket->sk; + int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct sk_buff *buff; + struct icmp6hdr *icmph; + struct in6_addr saddr_buf; + struct in6_addr *addrp; + struct net_device *dev; + struct rt6_info *rt; + struct dst_entry *dst; + struct inet6_dev *idev; + struct flowi fl; + u8 *opt; + int rd_len; + int err; + int hlen; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + + dev = skb->dev; + + if (ipv6_get_lladdr(dev, &saddr_buf)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: no link-local address on %s\n", + dev->name); + return; + } + + ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr); + + dst = ip6_route_output(NULL, &fl); + if (dst == NULL) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err) { + dst_release(dst); + return; + } + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_flags & RTF_GATEWAY) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination is not a neighbour.\n"); + dst_release(dst); + return; + } + if (!xrlim_allow(dst, 1*HZ)) { + dst_release(dst); + return; + } + + if (dev->addr_len) { + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) { + memcpy(ha_buf, neigh->ha, dev->addr_len); + read_unlock_bh(&neigh->lock); + ha = ha_buf; + len += ndisc_opt_addr_space(dev); + } else + read_unlock_bh(&neigh->lock); + } + + rd_len = min_t(unsigned int, + IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + rd_len &= ~0x7; + len += rd_len; + + buff = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (buff == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 Redirect: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + hlen = 0; + + skb_reserve(buff, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, + IPPROTO_ICMPV6, len); + + icmph = (struct icmp6hdr *)skb_put(buff, len); + buff->h.raw = (unsigned char*)icmph; + + memset(icmph, 0, sizeof(struct icmp6hdr)); + icmph->icmp6_type = NDISC_REDIRECT; + + /* + * copy target and destination addresses + */ + + addrp = (struct in6_addr *)(icmph + 1); + ipv6_addr_copy(addrp, target); + addrp++; + ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr); + + opt = (u8*) (addrp + 1); + + /* + * include target_address option + */ + + if (ha) + opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, + dev->addr_len, dev->type); + + /* + * build redirect option and copy skb over to the new packet. + */ + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, skb->nh.ipv6h, rd_len - 8); + + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &skb->nh.ipv6h->saddr, + len, IPPROTO_ICMPV6, + csum_partial((u8 *) icmph, len, 0)); + + buff->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +static void pndisc_redo(struct sk_buff *skb) +{ + ndisc_rcv(skb); + kfree_skb(skb); +} + +int ndisc_rcv(struct sk_buff *skb) +{ + struct nd_msg *msg; + + if (!pskb_may_pull(skb, skb->len)) + return 0; + + msg = (struct nd_msg *) skb->h.raw; + + __skb_push(skb, skb->data-skb->h.raw); + + if (skb->nh.ipv6h->hop_limit != 255) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid hop-limit: %d\n", + skb->nh.ipv6h->hop_limit); + return 0; + } + + if (msg->icmph.icmp6_code != 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); + return 0; + } + + switch (msg->icmph.icmp6_type) { + case NDISC_NEIGHBOUR_SOLICITATION: + ndisc_recv_ns(skb); + break; + + case NDISC_NEIGHBOUR_ADVERTISEMENT: + ndisc_recv_na(skb); + break; + + case NDISC_ROUTER_SOLICITATION: + ndisc_recv_rs(skb); + break; + + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; + + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + }; + + return 0; +} + +static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&nd_tbl, dev); + fib6_run_gc(~0UL); + break; + case NETDEV_DOWN: + neigh_ifdown(&nd_tbl, dev); + fib6_run_gc(~0UL); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ndisc_netdev_notifier = { + .notifier_call = ndisc_netdev_event, +}; + +#ifdef CONFIG_SYSCTL +static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, + const char *func, const char *dev_name) +{ + static char warncomm[TASK_COMM_LEN]; + static int warned; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING + "process `%s' is using deprecated sysctl (%s) " + "net.ipv6.neigh.%s.%s; " + "Use net.ipv6.neigh.%s.%s_ms " + "instead.\n", + warncomm, func, + dev_name, ctl->procname, + dev_name, ctl->procname); + warned++; + } +} + +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME) + ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); + + switch (ctl->ctl_name) { + case NET_NEIGH_RETRANS_TIME: + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + break; + case NET_NEIGH_REACHABLE_TIME: + ret = proc_dointvec_jiffies(ctl, write, + filp, buffer, lenp, ppos); + break; + case NET_NEIGH_RETRANS_TIME_MS: + case NET_NEIGH_REACHABLE_TIME_MS: + ret = proc_dointvec_ms_jiffies(ctl, write, + filp, buffer, lenp, ppos); + break; + default: + ret = -1; + } + + if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + return ret; +} + +static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME) + ndisc_warn_deprecated_sysctl(ctl, "procfs", dev ? dev->name : "default"); + + switch (ctl->ctl_name) { + case NET_NEIGH_REACHABLE_TIME: + ret = sysctl_jiffies(ctl, name, nlen, + oldval, oldlenp, newval, newlen, + context); + break; + case NET_NEIGH_RETRANS_TIME_MS: + case NET_NEIGH_REACHABLE_TIME_MS: + ret = sysctl_ms_jiffies(ctl, name, nlen, + oldval, oldlenp, newval, newlen, + context); + break; + default: + ret = 0; + } + + if (newval && newlen && ret > 0 && + dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + + return ret; +} + +#endif + +int __init ndisc_init(struct net_proto_family *ops) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &ndisc_socket); + if (err < 0) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", + err); + ndisc_socket = NULL; /* For safety. */ + return err; + } + + sk = ndisc_socket->sk; + np = inet6_sk(sk); + sk->sk_allocation = GFP_ATOMIC; + np->hop_limit = 255; + /* Do not loopback ndisc messages */ + np->mc_loop = 0; + sk->sk_prot->unhash(sk); + + /* + * Initialize the neighbour table + */ + + neigh_table_init(&nd_tbl); + +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, + "ipv6", + &ndisc_ifinfo_sysctl_change, + &ndisc_ifinfo_sysctl_strategy); +#endif + + register_netdevice_notifier(&ndisc_netdev_notifier); + return 0; +} + +void ndisc_cleanup(void) +{ +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +#endif + neigh_table_clear(&nd_tbl); + sock_release(ndisc_socket); + ndisc_socket = NULL; /* For safety. */ +} diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig new file mode 100644 index 000000000000..77ec704c9ee3 --- /dev/null +++ b/net/ipv6/netfilter/Kconfig @@ -0,0 +1,242 @@ +# +# IP netfilter configuration +# + +menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" + depends on INET && IPV6 && NETFILTER && EXPERIMENTAL + +#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK +#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK +#fi +config IP6_NF_QUEUE + tristate "Userspace queueing via NETLINK" + ---help--- + + This option adds a queue handler to the kernel for IPv6 + packets which lets us to receive the filtered packets + with QUEUE target using libiptc as we can do with + the IPv4 now. + + (C) Fernando Anton 2001 + IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + Universidad Carlos III de Madrid + Universidad Politecnica de Alcala de Henares + email: <fanton@it.uc3m.es>. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_IPTABLES + tristate "IP6 tables support (required for filtering/masq/NAT)" + help + ip6tables is a general, extensible packet identification framework. + Currently only the packet filtering and packet mangling subsystem + for IPv6 use this, but connection tracking is going to follow. + Say 'Y' or 'M' here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +# The simple matches. +config IP6_NF_MATCH_LIMIT + tristate "limit match support" + depends on IP6_NF_IPTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MAC + tristate "MAC address match support" + depends on IP6_NF_IPTABLES + help + mac matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_RT + tristate "Routing header match support" + depends on IP6_NF_IPTABLES + help + rt matching allows you to match packets based on the routing + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_OPTS + tristate "Hop-by-hop and Dst opts header match support" + depends on IP6_NF_IPTABLES + help + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_FRAG + tristate "Fragmentation header match support" + depends on IP6_NF_IPTABLES + help + frag matching allows you to match packets based on the fragmentation + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_HL + tristate "HL match support" + depends on IP6_NF_IPTABLES + help + HL matching allows you to match packets based on the hop + limit of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MULTIPORT + tristate "Multiple port match support" + depends on IP6_NF_IPTABLES + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_OWNER + tristate "Owner match support" + depends on IP6_NF_IPTABLES + help + Packet owner matching allows you to match locally-generated packets + based on who created them: the user, group, process or session. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES +config IP6_NF_MATCH_MARK + tristate "netfilter MARK match support" + depends on IP6_NF_IPTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_IPV6HEADER + tristate "IPv6 Extension Headers Match" + depends on IP6_NF_IPTABLES + help + This module allows one to match packets based upon + the ipv6 extension headers. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_AHESP + tristate "AH/ESP match support" + depends on IP6_NF_IPTABLES + help + This module allows one to match AH and ESP packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_LENGTH + tristate "Packet Length match support" + depends on IP6_NF_IPTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_EUI64 + tristate "EUI64 address check" + depends on IP6_NF_IPTABLES + help + This module performs checking on the IPv6 source address + Compares the last 64 bits with the EUI64 (delivered + from the MAC address) address + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_PHYSDEV + tristate "Physdev match support" + depends on IP6_NF_IPTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES +# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES +# fi +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES +# fi +# The targets +config IP6_NF_FILTER + tristate "Packet filtering" + depends on IP6_NF_IPTABLES + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_LOG + tristate "LOG target support" + depends on IP6_NF_FILTER + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then +# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER +# fi +# fi +config IP6_NF_MANGLE + tristate "Packet mangling" + depends on IP6_NF_IPTABLES + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE +config IP6_NF_TARGET_MARK + tristate "MARK target support" + depends on IP6_NF_MANGLE + help + This option adds a `MARK' target, which allows you to create rules + in the `mangle' table which alter the netfilter mark (nfmark) field + associated with the packet packet prior to routing. This can change + the routing method (see `Use netfilter MARK value as routing + key') and can also be used by other subsystems to change their + behavior. + + To compile it as a module, choose M here. If unsure, say N. + +#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES +config IP6_NF_RAW + tristate 'raw table support (required for TRACE)' + depends on IP6_NF_IPTABLES + help + This option adds a `raw' table to ip6tables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +endmenu + diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile new file mode 100644 index 000000000000..2e51714953b6 --- /dev/null +++ b/net/ipv6/netfilter/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the netfilter modules on top of IPv6. +# + +# Link order matters here. +obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_MATCH_LIMIT) += ip6t_limit.o +obj-$(CONFIG_IP6_NF_MATCH_MARK) += ip6t_mark.o +obj-$(CONFIG_IP6_NF_MATCH_LENGTH) += ip6t_length.o +obj-$(CONFIG_IP6_NF_MATCH_MAC) += ip6t_mac.o +obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o +obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o +obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o +obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o +obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o +obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o +obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o +obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o +obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o +obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o +obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o +obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o +obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c new file mode 100644 index 000000000000..c54830b89593 --- /dev/null +++ b/net/ipv6/netfilter/ip6_queue.c @@ -0,0 +1,741 @@ +/* + * This is a module which is used for queueing IPv6 packets and + * communicating with userspace via netlink. + * + * (C) 2001 Fernando Anton, this code is GPL. + * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain + * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain + * email: fanton@it.uc3m.es + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying + * to adapt it to IPv6 + * HEAVILY based in ipqueue.c by James Morris. It's just + * a little modified version of it, so he's nearly the + * real coder of this. + * Few changes needed, mainly the hard_routing code and + * the netlink socket protocol (we're NETLINK_IP6_FW). + * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c] + * 2005-02-04: Added /proc counter for dropped packets; fixed so + * packets aren't delivered to user space if they're going + * to be dropped. + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ipv6.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <net/sock.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> +#include <linux/netfilter_ipv4/ip_queue.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip6_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" + +struct ipq_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; +}; + +struct ipq_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + struct ipq_rt_info rt_info; +}; + +typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); + +static unsigned char copy_mode = IPQ_COPY_NONE; +static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT; +static DEFINE_RWLOCK(queue_lock); +static int peer_pid; +static unsigned int copy_range; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl; +static LIST_HEAD(queue_list); +static DECLARE_MUTEX(ipqnl_sem); + +static void +ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) +{ + nf_reinject(entry->skb, entry->info, verdict); + kfree(entry); +} + +static inline void +__ipq_enqueue_entry(struct ipq_queue_entry *entry) +{ + list_add(&entry->list, &queue_list); + queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct ipq_queue_entry * +__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue_list) { + struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__ipq_dequeue_entry(struct ipq_queue_entry *entry) +{ + list_del(&entry->list); + queue_total--; +} + +static inline struct ipq_queue_entry * +__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + entry = __ipq_find_entry(cmpfn, data); + if (entry == NULL) + return NULL; + + __ipq_dequeue_entry(entry); + return entry; +} + + +static inline void +__ipq_flush(int verdict) +{ + struct ipq_queue_entry *entry; + + while ((entry = __ipq_find_dequeue_entry(NULL, 0))) + ipq_issue_verdict(entry, verdict); +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + copy_mode = mode; + copy_range = range; + if (copy_range > 0xFFFF) + copy_range = 0xFFFF; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NF_DROP); +} + +static struct ipq_queue_entry * +ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + write_lock_bh(&queue_lock); + entry = __ipq_find_dequeue_entry(cmpfn, data); + write_unlock_bh(&queue_lock); + return entry; +} + +static void +ipq_flush(int verdict) +{ + write_lock_bh(&queue_lock); + __ipq_flush(verdict); + write_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + + read_lock_bh(&queue_lock); + + switch (copy_mode) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + data_len = 0; + break; + + case IPQ_COPY_PACKET: + if (copy_range == 0 || copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = copy_range; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + read_unlock_bh(&queue_lock); + return NULL; + } + + read_unlock_bh(&queue_lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + pmsg->timestamp_sec = entry->skb->stamp.tv_sec; + pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->mark = entry->skb->nfmark; + pmsg->hook = entry->info->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->info->indev) + strcpy(pmsg->indev_name, entry->info->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->info->outdev) + strcpy(pmsg->outdev_name, entry->info->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->info->indev && entry->skb->dev) { + pmsg->hw_type = entry->skb->dev->type; + if (entry->skb->dev->hard_header_parse) + pmsg->hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip6_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct ipq_queue_entry *entry; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + + if (entry->info->hook == NF_IP_LOCAL_OUT) { + struct ipv6hdr *iph = skb->nh.ipv6h; + + entry->rt_info.daddr = iph->daddr; + entry->rt_info.saddr = iph->saddr; + } + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + goto err_out_free; + + write_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip6_queue: fill at %d entries, " + "dropping packet(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + write_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + write_unlock_bh(&queue_lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +{ + int diff; + struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip6_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_ip_make_writable(&e->skb, v->data_len)) + return -ENOMEM; + memcpy(e->skb->data, v->payload, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + + /* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + * Not a nice way to cmp, but works + */ + if (e->info->hook == NF_IP_LOCAL_OUT) { + struct ipv6hdr *iph = e->skb->nh.ipv6h; + if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || + !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) + return ip6_route_me_harder(e->skb); + } + return 0; +} + +static inline int +id_cmp(struct ipq_queue_entry *e, unsigned long id) +{ + return (id == (unsigned long )e); +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct ipq_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv6(vmsg, entry) < 0) + verdict = NF_DROP; + + ipq_issue_verdict(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + write_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + write_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + struct ipq_queue_entry *entry; + + while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) + ipq_issue_verdict(entry, NF_DROP); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags, nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = (struct nlmsghdr *)skb->data; + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + write_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + write_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + skblen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +static void +ipq_rcv_sk(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (down_trylock(&ipqnl_sem)) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + ipq_rcv_skb(skb); + kfree_skb(skb); + } + + up(&ipqnl_sem); + + } while (ipqnl && ipqnl->sk_receive_queue.qlen); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_IP6_FW && n->pid) { + write_lock_bh(&queue_lock); + if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .ctl_name = NET_IPQ_QMAX, + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_dir_table[] = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = ipq_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipq_dir_table + }, + { .ctl_name = 0 } +}; + +static int +ipq_get_info(char *buffer, char **start, off_t offset, int length) +{ + int len; + + read_lock_bh(&queue_lock); + + len = sprintf(buffer, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. length : %u\n" + "Queue dropped : %u\n" + "Netfilter dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + read_unlock_bh(&queue_lock); + + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + else if (len < 0) + len = 0; + return len; +} + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc; + + if (!init) + goto cleanup; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); + if (ipqnl == NULL) { + printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + if (proc) + proc->owner = THIS_MODULE; + else { + printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } + + register_netdevice_notifier(&ipq_dev_notifier); + ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); + + status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); + if (status < 0) { + printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup: + nf_unregister_queue_handler(PF_INET6); + synchronize_net(); + ipq_flush(NF_DROP); + +cleanup_sysctl: + unregister_sysctl_table(ipq_sysctl_header); + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(IPQ_PROC_FS_NAME); + +cleanup_ipqnl: + sock_release(ipqnl->sk_socket); + down(&ipqnl_sem); + up(&ipqnl_sem); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("IPv6 packet queue handler"); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c new file mode 100644 index 000000000000..c735276fdd5f --- /dev/null +++ b/net/ipv6/netfilter/ip6_tables.c @@ -0,0 +1,1970 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2002 Netfilter core team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> + * - increase module usage count as soon as we have rules inside + * a table + * 06 Jun 2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * - new extension header parser code + */ +#include <linux/config.h> +#include <linux/skbuff.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmpv6.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> +#include <linux/proc_fs.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("IPv6 packet filter"); + +#define IPV6_HDR_LEN (sizeof(struct ipv6hdr)) +#define IPV6_OPTHDR_LEN (sizeof(struct ipv6_opt_hdr)) + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(ip6t_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +/* Locking is simple: we assume at worst case there will be one packet + in user context and one from bottom halves (or soft irq if Alexey's + softnet patch was applied). + + We keep a set of rules for each CPU, so we can avoid write-locking + them; doing a readlock_bh() stops packets coming through if we're + in user context. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ip6t_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. --RR */ + unsigned int number; + /* Initial number of entries. Needed for module usage count */ + unsigned int initial_entries; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP6_NUMHOOKS]; + unsigned int underflow[NF_IP6_NUMHOOKS]; + + /* ip6t_entry tables: one per CPU */ + char entries[0] ____cacheline_aligned; +}; + +static LIST_HEAD(ip6t_target); +static LIST_HEAD(ip6t_match); +static LIST_HEAD(ip6t_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +static int ip6_masked_addrcmp(struct in6_addr addr1, struct in6_addr mask, + struct in6_addr addr2) +{ + int i; + for( i = 0; i < 16; i++){ + if((addr1.s6_addr[i] & mask.s6_addr[i]) != + (addr2.s6_addr[i] & mask.s6_addr[i])) + return 1; + } + return 0; +} + +/* Check for an extension */ +int +ip6t_ext_hdr(u8 nexthdr) +{ + return ( (nexthdr == IPPROTO_HOPOPTS) || + (nexthdr == IPPROTO_ROUTING) || + (nexthdr == IPPROTO_FRAGMENT) || + (nexthdr == IPPROTO_ESP) || + (nexthdr == IPPROTO_AH) || + (nexthdr == IPPROTO_NONE) || + (nexthdr == IPPROTO_DSTOPTS) ); +} + +/* Returns whether matches rule or not. */ +static inline int +ip6_packet_match(const struct sk_buff *skb, + const char *indev, + const char *outdev, + const struct ip6t_ip6 *ip6info, + unsigned int *protoff, + int *fragoff) +{ + size_t i; + unsigned long ret; + const struct ipv6hdr *ipv6 = skb->nh.ipv6h; + +#define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) + + if (FWINV(ip6_masked_addrcmp(ipv6->saddr,ip6info->smsk,ip6info->src), + IP6T_INV_SRCIP) + || FWINV(ip6_masked_addrcmp(ipv6->daddr,ip6info->dmsk,ip6info->dst), + IP6T_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); +/* + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + ipinfo->smsk.s_addr, ipinfo->src.s_addr, + ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, + ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + return 0; + } + + /* Look for ifname matches; this should unroll nicely. */ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)ip6info->iniface)[i]) + & ((const unsigned long *)ip6info->iniface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ip6info->iniface, + ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)ip6info->outiface)[i]) + & ((const unsigned long *)ip6info->outiface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ip6info->outiface, + ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + return 0; + } + +/* ... might want to do something with class and flowlabel here ... */ + + /* look for the desired protocol header */ + if((ip6info->flags & IP6T_F_PROTO)) { + u_int8_t currenthdr = ipv6->nexthdr; + struct ipv6_opt_hdr _hdr, *hp; + u_int16_t ptr; /* Header offset in skb */ + u_int16_t hdrlen; /* Header */ + u_int16_t _fragoff = 0, *fp = NULL; + + ptr = IPV6_HDR_LEN; + + while (ip6t_ext_hdr(currenthdr)) { + /* Is there enough space for the next ext header? */ + if (skb->len - ptr < IPV6_OPTHDR_LEN) + return 0; + + /* NONE or ESP: there isn't protocol part */ + /* If we want to count these packets in '-p all', + * we will change the return 0 to 1*/ + if ((currenthdr == IPPROTO_NONE) || + (currenthdr == IPPROTO_ESP)) + break; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Size calculation */ + if (currenthdr == IPPROTO_FRAGMENT) { + fp = skb_header_pointer(skb, + ptr+offsetof(struct frag_hdr, + frag_off), + sizeof(_fragoff), + &_fragoff); + if (fp == NULL) + return 0; + + _fragoff = ntohs(*fp) & ~0x7; + hdrlen = 8; + } else if (currenthdr == IPPROTO_AH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + /* ptr is too large */ + if ( ptr > skb->len ) + return 0; + if (_fragoff) { + if (ip6t_ext_hdr(currenthdr)) + return 0; + break; + } + } + + *protoff = ptr; + *fragoff = _fragoff; + + /* currenthdr contains the protocol header */ + + dprintf("Packet protocol %hi ?= %s%hi.\n", + currenthdr, + ip6info->invflags & IP6T_INV_PROTO ? "!":"", + ip6info->proto); + + if (ip6info->proto == currenthdr) { + if(ip6info->invflags & IP6T_INV_PROTO) { + return 0; + } + return 1; + } + + /* We need match for the '-p all', too! */ + if ((ip6info->proto != 0) && + !(ip6info->invflags & IP6T_INV_PROTO)) + return 0; + } + return 1; +} + +/* should be ip6 safe */ +static inline int +ip6_checkentry(const struct ip6t_ip6 *ipv6) +{ + if (ipv6->flags & ~IP6T_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ipv6->flags & ~IP6T_F_MASK); + return 0; + } + if (ipv6->invflags & ~IP6T_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ipv6->invflags & ~IP6T_INV_MASK); + return 0; + } + return 1; +} + +static unsigned int +ip6t_error(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("ip6_tables: error: `%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline +int do_match(struct ip6t_entry_match *m, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int offset, + unsigned int protoff, + int *hotdrop) +{ + /* Stop iteration if it doesn't match */ + if (!m->u.kernel.match->match(skb, in, out, m->data, + offset, protoff, hotdrop)) + return 1; + else + return 0; +} + +static inline struct ip6t_entry * +get_entry(void *base, unsigned int offset) +{ + return (struct ip6t_entry *)(base + offset); +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ip6t_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ip6t_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ]; + int offset = 0; + unsigned int protoff = 0; + int hotdrop = 0; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + void *table_base; + struct ip6t_entry *e, *back; + + /* Initialization */ + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + +#ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ip6t_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ip6t_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ip6t_entry *)table_base)->comefrom, + ((struct ip6t_entry *)table_base)->comefrom); + } + ((struct ip6t_entry *)table_base)->comefrom = 0x57acc001; +#endif + + /* For return from builtin chain */ + back = get_entry(table_base, table->private->underflow[hook]); + + do { + IP_NF_ASSERT(e); + IP_NF_ASSERT(back); + (*pskb)->nfcache |= e->nfcache; + if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, + &protoff, &offset)) { + struct ip6t_entry_target *t; + + if (IP6T_MATCH_ITERATE(e, do_match, + *pskb, in, out, + offset, protoff, &hotdrop) != 0) + goto no_match; + + ADD_COUNTER(e->counters, + ntohs((*pskb)->nh.ipv6h->payload_len) + + IPV6_HDR_LEN, + 1); + + t = ip6t_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct ip6t_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != IP6T_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct ip6t_entry *next + = (void *)e + e->next_offset; + next->comefrom + = (void *)back - table_base; + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + abs. verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif + verdict = t->u.kernel.target->target(pskb, + in, out, + hook, + t->data, + userdata); + +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ip6t_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IP6T_CONTINUE) { + printk("Target %s reentered!\n", + t->u.kernel.target->name); + verdict = NF_DROP; + } + ((struct ip6t_entry *)table_base)->comefrom + = 0x57acc001; +#endif + if (verdict == IP6T_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + + no_match: + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom = 0xdead57ac; +#endif + read_unlock_bh(&table->lock); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* If it succeeds, returns element and locks mutex */ +static inline void * +find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + +#if 1 + duprintf("find_inlist: searching for `%s' in %s.\n", + name, head == &ip6t_target ? "ip6t_target" + : head == &ip6t_match ? "ip6t_match" + : head == &ip6t_tables ? "ip6t_tables" : "UNKNOWN"); +#endif + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + duprintf("find_inlist: loading `%s%s'.\n", prefix, name); + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct ip6t_table * +ip6t_find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_tables, name, "ip6table_", error, mutex); +} + +static inline struct ip6t_match * +find_match_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_match, name, "ip6t_", error, mutex); +} + +static struct ip6t_target * +ip6t_find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_target, name, "ip6t_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ip6t_ip6 *ipv6) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ipv6); i++) + if (((char *)ipv6)[i]) + break; + + return (i == sizeof(*ipv6)); +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ip6t_entry *e + = (struct ip6t_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct ip6t_standard_target *t + = (void *)ip6t_get_target(e); + + if (e->comefrom & (1 << NF_IP6_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP6_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ip6t_entry) + && (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ipv6)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<<NF_IP6_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_IP6_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ip6t_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ip6t_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ip6t_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +standard_check(const struct ip6t_entry_target *t, + unsigned int max_offset) +{ + struct ip6t_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != IP6T_ALIGN(sizeof(struct ip6t_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IP6T_ALIGN(sizeof(struct ip6t_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ip6t_entry)) { + duprintf("ip6t_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ip6t_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ip6t_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + unsigned int *i) +{ + int ret; + struct ip6t_match *match; + + match = find_match_lock(m->u.user.name, &ret, &ip6t_mutex); + if (!match) { + // duprintf("check_match: `%s' not found\n", m->u.name); + return ret; + } + if (!try_module_get(match->me)) { + up(&ip6t_mutex); + return -ENOENT; + } + m->u.kernel.match = match; + up(&ip6t_mutex); + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ipv6, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + module_put(m->u.kernel.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ip6t_target ip6t_standard_target; + +static inline int +check_entry(struct ip6t_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ip6t_entry_target *t; + struct ip6t_target *target; + int ret; + unsigned int j; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ip6t_get_target(e); + target = ip6t_find_target_lock(t->u.user.name, &ret, &ip6t_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto cleanup_matches; + } + if (!try_module_get(target->me)) { + up(&ip6t_mutex); + ret = -ENOENT; + goto cleanup_matches; + } + t->u.kernel.target = target; + up(&ip6t_mutex); + if (!t->u.kernel.target) { + ret = -EBUSY; + goto cleanup_matches; + } + if (t->u.kernel.target == &ip6t_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IP6T_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ip6t_entry *e, + struct ip6t_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 + || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP6_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IP6T_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ip6t_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ip6t_entry *e, unsigned int *i) +{ + struct ip6t_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + t = ip6t_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ip6t_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ip6t_table_info * +replace_table(struct ip6t_table *table, + unsigned int num_counters, + struct ip6t_table_info *newinfo, + int *error) +{ + struct ip6t_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ip6t_entry *table_base; + unsigned int i; + + for (i = 0; i < num_possible_cpus(); i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? */ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ip6t_table_info *t, + struct ip6t_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ip6t_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct ip6t_entry *e; + struct ip6t_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ip6t_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ip6t_entry_match *m; + struct ip6t_entry_target *t; + + e = (struct ip6t_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ip6t_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ip6t_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ip6t_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ip6t_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ip6t_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ip6t_get_entries *entries, + struct ip6t_get_entries __user *uptr) +{ + int ret; + struct ip6t_table *t; + + t = ip6t_find_table_lock(entries->name, &ret, &ip6t_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&ip6t_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct ip6t_table *t; + struct ip6t_table_info *newinfo, *oldinfo; + struct ip6t_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ip6t_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + /* Get a reference in advance, we're not allowed fail later */ + if (!try_module_get(t->me)) { + ret = -EBUSY; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + /* Silent error: too late now. */ + if (copy_to_user(tmp.counters, counters, + sizeof(struct ip6t_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&ip6t_mutex); + return ret; + + put_module: + module_put(t->me); + free_newinfo_counters_untrans_unlock: + up(&ip6t_mutex); + free_newinfo_counters_untrans: + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static inline int +add_counter_to_entry(struct ip6t_entry *e, + const struct ip6t_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct ip6t_counters_info tmp, *paddc; + struct ip6t_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ip6t_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IP6T_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ip6t_mutex); + free: + vfree(paddc); + + return ret; +} + +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: { + char name[IP6T_TABLE_MAXNAMELEN]; + struct ip6t_table *t; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ip6t_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; + t = ip6t_find_table_lock(name, &ret, &ip6t_mutex); + if (t) { + struct ip6t_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + memcpy(info.name, name, sizeof(info.name)); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&ip6t_mutex); + } + } + break; + + case IP6T_SO_GET_ENTRIES: { + struct ip6t_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ip6t_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int +ip6t_register_target(struct ip6t_target *target) +{ + int ret; + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&ip6t_target, target)) { + duprintf("ip6t_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&ip6t_mutex); + return ret; +} + +void +ip6t_unregister_target(struct ip6t_target *target) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_target, target); + up(&ip6t_mutex); +} + +int +ip6t_register_match(struct ip6t_match *match) +{ + int ret; + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&ip6t_match, match)) { + duprintf("ip6t_register_match: `%s' already in list!\n", + match->name); + ret = -EINVAL; + } + up(&ip6t_mutex); + + return ret; +} + +void +ip6t_unregister_match(struct ip6t_match *match) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_match, match); + up(&ip6t_mutex); +} + +int ip6t_register_table(struct ip6t_table *table, + const struct ip6t_replace *repl) +{ + int ret; + struct ip6t_table_info *newinfo; + static struct ip6t_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ip6t_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. */ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&ip6t_tables, table); + + unlock: + up(&ip6t_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ip6t_unregister_table(struct ip6t_table *table) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_tables, table); + up(&ip6t_mutex); + + /* Decrease module usage counts and free resources */ + IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int tcpoff, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + if (!optlen) + return invert; + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, tcpoff + sizeof(struct tcphdr), optlen, + _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct ip6t_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IP6T_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, protoff, + th->doff*4 - sizeof(*th), + tcpinfo->invflags & IP6T_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. */ +static int +tcp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ipv6->proto == IPPROTO_TCP + && !(ipv6->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_tcp)) + && !(tcpinfo->invflags & ~IP6T_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct ip6t_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & IP6T_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & IP6T_INV_PROTO)) { + duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_udp))) { + duprintf("ip6t_udp: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_udp))); + return 0; + } + if (udpinfo->invflags & ~IP6T_UDP_INV_MASK) { + duprintf("ip6t_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static int +icmp6_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct icmp6hdr _icmp, *ic; + const struct ip6t_icmp *icmpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ic = skb_header_pointer(skb, protoff, sizeof(_icmp), &_icmp); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags&IP6T_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +icmp6_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ipv6->proto == IPPROTO_ICMPV6 + && !(ipv6->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_icmp)) + && !(icmpinfo->invflags & ~IP6T_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ip6t_target ip6t_standard_target = { + .name = IP6T_STANDARD_TARGET, +}; + +static struct ip6t_target ip6t_error_target = { + .name = IP6T_ERROR_TARGET, + .target = ip6t_error, +}; + +static struct nf_sockopt_ops ip6t_sockopts = { + .pf = PF_INET6, + .set_optmin = IP6T_BASE_CTL, + .set_optmax = IP6T_SO_SET_MAX+1, + .set = do_ip6t_set_ctl, + .get_optmin = IP6T_BASE_CTL, + .get_optmax = IP6T_SO_GET_MAX+1, + .get = do_ip6t_get_ctl, +}; + +static struct ip6t_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, +}; + +static struct ip6t_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, +}; + +static struct ip6t_match icmp6_matchstruct = { + .name = "icmp6", + .match = &icmp6_match, + .checkentry = &icmp6_checkentry, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const char *i, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", + i + sizeof(struct list_head)); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static inline int print_target(const struct ip6t_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ip6t_standard_target || t == &ip6t_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); +} + +static int ip6t_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_tables, print_name, char *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} + +static int ip6t_get_targets(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_target, print_target, struct ip6t_target *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_match, print_name, char *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = +{ { "ip6_tables_names", ip6t_get_tables }, + { "ip6_tables_targets", ip6t_get_targets }, + { "ip6_tables_matches", ip6t_get_matches }, + { NULL, NULL} }; +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ip6t_mutex); + list_append(&ip6t_target, &ip6t_standard_target); + list_append(&ip6t_target, &ip6t_error_target); + list_append(&ip6t_match, &tcp_matchstruct); + list_append(&ip6t_match, &udp_matchstruct); + list_append(&ip6t_match, &icmp6_matchstruct); + up(&ip6t_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + int i; + + for (i = 0; ip6t_proc_entry[i].name; i++) { + proc = proc_net_create(ip6t_proc_entry[i].name, 0, + ip6t_proc_entry[i].get_info); + if (!proc) { + while (--i >= 0) + proc_net_remove(ip6t_proc_entry[i].name); + nf_unregister_sockopt(&ip6t_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } + } +#endif + + printk("ip6_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ip6t_sockopts); +#ifdef CONFIG_PROC_FS + { + int i; + for (i = 0; ip6t_proc_entry[i].name; i++) + proc_net_remove(ip6t_proc_entry[i].name); + } +#endif +} + +EXPORT_SYMBOL(ip6t_register_table); +EXPORT_SYMBOL(ip6t_unregister_table); +EXPORT_SYMBOL(ip6t_do_table); +EXPORT_SYMBOL(ip6t_register_match); +EXPORT_SYMBOL(ip6t_unregister_match); +EXPORT_SYMBOL(ip6t_register_target); +EXPORT_SYMBOL(ip6t_unregister_target); +EXPORT_SYMBOL(ip6t_ext_hdr); + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c new file mode 100644 index 000000000000..bfc3d0185d19 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -0,0 +1,509 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 2001 Jan Rekorajski <baggins@pld.org.pl> + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/spinlock.h> +#include <linux/icmpv6.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/ipv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>"); +MODULE_DESCRIPTION("IP6 tables LOG target module"); +MODULE_LICENSE("GPL"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +struct in_device; +#include <net/route.h> +#include <linux/netfilter_ipv6/ip6t_LOG.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Use lock to serialize, so printks don't overlap */ +static DEFINE_SPINLOCK(log_lock); + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ip6t_log_info *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h, *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000" */ + printk("SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->saddr)); + printk("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->daddr)); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(u_int32_t *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(u_int32_t *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (info->logflags & IP6T_LOG_IPOPT) + printk("OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr, *fh; + + printk("FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + printk("TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + printk("%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + printk("INCOMPLETE "); + + printk("ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (info->logflags & IP6T_LOG_IPOPT) + printk(")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (info->logflags & IP6T_LOG_IPOPT) { + struct ip_auth_hdr _ahdr, *ah; + + /* Max length: 3 "AH " */ + printk("AH "); + + if (fragment) { + printk(")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + printk("INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + printk("SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (info->logflags & IP6T_LOG_IPOPT) { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 4 "ESP " */ + printk("ESP "); + + if (fragment) { + printk(")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + printk("INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + printk("SPI=0x%x )", ntohl(eh->spi) ); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + printk("Unknown Ext Hdr %u", currenthdr); + return; + } + if (info->logflags & IP6T_LOG_IPOPT) + printk(") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + /* Max length: 10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); + if (th == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (info->logflags & IP6T_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + printk("CWR "); + if (th->ece) + printk("ECE "); + if (th->urg) + printk("URG "); + if (th->ack) + printk("ACK "); + if (th->psh) + printk("PSH "); + if (th->rst) + printk("RST "); + if (th->syn) + printk("SYN "); + if (th->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(th->urg_ptr)); + + if ((info->logflags & IP6T_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + unsigned int optsize = th->doff * 4 + - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, + ptr + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + printk("OPT (TRUNCATED)"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i =0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); + if (uh == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h, *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + printk("PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + printk("["); + dump_packet(info, skb, ptr + sizeof(_icmp6h), + 0); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + printk("MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + printk("PROTO=%u ", currenthdr); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } +} + +static void +ip6t_log_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ip6t_log_info *loginfo, + const char *level_string, + const char *prefix) +{ + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + prefix == NULL ? loginfo->prefix : prefix, + in ? in->name : "", + out ? out->name : ""); + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { + if (skb->dev->type != ARPHRD_SIT){ + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? ' ':':'); + } else { + int i; + unsigned char *p = skb->mac.raw; + if ( p - (ETH_ALEN*2+2) > skb->head ){ + p -= (ETH_ALEN+2); + for (i = 0; i < (ETH_ALEN); i++,p++) + printk("%02x%s", *p, + i == ETH_ALEN-1 ? "->" : ":"); + p -= (ETH_ALEN*2); + for (i = 0; i < (ETH_ALEN); i++,p++) + printk("%02x%c", *p, + i == ETH_ALEN-1 ? ' ' : ':'); + } + + if ((skb->dev->addr_len == 4) && + skb->dev->hard_header_len > 20){ + printk("TUNNEL="); + p = skb->mac.raw + 12; + for (i = 0; i < 4; i++,p++) + printk("%3d%s", *p, + i == 3 ? "->" : "."); + for (i = 0; i < 4; i++,p++) + printk("%3d%c", *p, + i == 3 ? ' ' : '.'); + } + } + } else + printk(" "); + } + + dump_packet(loginfo, skb, (u8*)skb->nh.ipv6h - skb->data, 1); + printk("\n"); + spin_unlock_bh(&log_lock); +} + +static unsigned int +ip6t_log_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + + return IP6T_CONTINUE; +} + +static void +ip6t_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ip6t_log_info loginfo = { + .level = 0, + .logflags = IP6T_LOG_MASK, + .prefix = "" + }; + + ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); +} + +static int ip6t_log_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip6t_log_info *loginfo = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IP6T_ALIGN(sizeof(struct ip6t_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_log_reg = { + .name = "LOG", + .target = ip6t_log_target, + .checkentry = ip6t_log_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ip6t_register_target(&ip6t_log_reg)) + return -EINVAL; + if (nflog) + nf_log_register(PF_INET6, &ip6t_logfn); + + return 0; +} + +static void __exit fini(void) +{ + if (nflog) + nf_log_unregister(PF_INET6, &ip6t_logfn); + ip6t_unregister_target(&ip6t_log_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c new file mode 100644 index 000000000000..d09ceb05013a --- /dev/null +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -0,0 +1,78 @@ +/* This is a module which is used for setting the NFMARK field of an skb. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_MARK.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IP6T_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_mark_reg += { { NULL, NULL }, "MARK", target, checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + printk(KERN_DEBUG "registering ipv6 mark target\n"); + if (ip6t_register_target(&ip6t_mark_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_mark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c new file mode 100644 index 000000000000..d5b94f142bba --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -0,0 +1,208 @@ +/* Kernel module to match AH parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_ah.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 AH match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + DEBUGP("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r = (spi >= min && spi <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS\n" : "FAILED\n"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip_auth_hdr *ah = NULL, _ah; + const struct ip6t_ah *ahinfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + + /*DEBUGP("IPv6 AH entered\n");*/ + /* if (opt->auth == 0) return 0; + * It does not filled on output */ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_ah header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) + break; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) + break; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* AH -> evaluate */ + if (nexthdr == NEXTHDR_AUTH) { + temp |= MASK_AH; + break; + } + + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_ah match: unknown nextheader %u\n",nexthdr); + return 0; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) { + DEBUGP("ipv6_ah: new pointer too large! \n"); + break; + } + } + + /* AH header not found */ + if (temp != MASK_AH) + return 0; + + if (len < sizeof(struct ip_auth_hdr)){ + *hotdrop = 1; + return 0; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); + BUG_ON(ah == NULL); + + DEBUGP("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen); + DEBUGP("RES %04X ", ah->reserved); + DEBUGP("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi)); + + DEBUGP("IPv6 AH spi %02X ", + (spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI)))); + DEBUGP("len %02X %04X %02X ", + ahinfo->hdrlen, hdrlen, + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN))); + DEBUGP("res %02X %04X %02X\n", + ahinfo->hdrres, ah->reserved, + !(ahinfo->hdrres && ah->reserved)); + + return (ah != NULL) + && + (spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI))) + && + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN)) + && + !(ahinfo->hdrres && ah->reserved); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_ah *ahinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_ah))) { + DEBUGP("ip6t_ah: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_ah))); + return 0; + } + if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { + DEBUGP("ip6t_ah: unknown flags %X\n", ahinfo->invflags); + return 0; + } + return 1; +} + +static struct ip6t_match ah_match = { + .name = "ah", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&ah_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&ah_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c new file mode 100644 index 000000000000..540925e4a7a8 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_dst.c @@ -0,0 +1,298 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_opts.h> + +#define HOPBYHOP 0 + +MODULE_LICENSE("GPL"); +#if HOPBYHOP +MODULE_DESCRIPTION("IPv6 HbH match"); +#else +MODULE_DESCRIPTION("IPv6 DST match"); +#endif +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_opt_hdr _optsh, *oh; + const struct ip6t_opts *optinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + u8 _opttype, *tp = NULL; + u8 _optlen, *lp = NULL; + unsigned int optlen; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_opts header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* OPTS -> evaluate */ +#if HOPBYHOP + if (nexthdr == NEXTHDR_HOP) { + temp |= MASK_HOPOPTS; +#else + if (nexthdr == NEXTHDR_DEST) { + temp |= MASK_DSTOPTS; +#endif + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_opts: new pointer is too large! \n"); + break; + } + } + + /* OPTIONS header not found */ +#if HOPBYHOP + if ( temp != MASK_HOPOPTS ) return 0; +#else + if ( temp != MASK_DSTOPTS ) return 0; +#endif + + if (len < (int)sizeof(struct ipv6_opt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Packet smaller than it's length field */ + return 0; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + BUG_ON(oh == NULL); + + DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + DEBUGP("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) + && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if ( !(optinfo->flags & IP6T_OPTS_OPTS) ){ + return ret; + } else if (optinfo->flags & IP6T_OPTS_NSTRICT) { + DEBUGP("Not strict - not implemented"); + } else { + DEBUGP("Strict "); + DEBUGP("#%d ",optinfo->optsnr); + for(temp=0; temp<optinfo->optsnr; temp++){ + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00)>>8){ + DEBUGP("Tbad %02X %02X\n", + *tp, + (optinfo->opts[temp] & 0xFF00)>>8); + return 0; + } else { + DEBUGP("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? */ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + DEBUGP("Lbad %02X %04X\n", *lp, + spec_len); + return 0; + } + DEBUGP("Lok "); + optlen = *lp + 2; + } else { + DEBUGP("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + DEBUGP("len%04X \n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + (temp < optinfo->optsnr - 1)) { + DEBUGP("new pointer is too large! \n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else return 0; + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_opts *optsinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_opts))) { + DEBUGP("ip6t_opts: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_opts))); + return 0; + } + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + DEBUGP("ip6t_opts: unknown flags %X\n", + optsinfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match opts_match = { +#if HOPBYHOP + .name = "hbh", +#else + .name = "dst", +#endif + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&opts_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&opts_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c new file mode 100644 index 000000000000..e39dd236fd8e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -0,0 +1,181 @@ +/* Kernel module to match ESP parameters. */ +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_esp.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 ESP match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + DEBUGP("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS\n" : "FAILED\n"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip_esp_hdr _esp, *eh = NULL; + const struct ip6t_esp *espinfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + /*DEBUGP("ipv6_esp entered \n");*/ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + DEBUGP("ipv6_esp header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) + break; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_esp match: unknown nextheader %u\n",nexthdr); + return 0; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) { + DEBUGP("ipv6_esp: new pointer too large! \n"); + break; + } + } + + /* ESP header not found */ + if (temp != MASK_ESP) + return 0; + + if (len < sizeof(struct ip_esp_hdr)) { + *hotdrop = 1; + return 0; + } + + eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp); + BUG_ON(eh == NULL); + + DEBUGP("IPv6 ESP SPI %u %08X\n", ntohl(eh->spi), ntohl(eh->spi)); + + return (eh != NULL) + && spi_match(espinfo->spis[0], espinfo->spis[1], + ntohl(eh->spi), + !!(espinfo->invflags & IP6T_ESP_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_esp *espinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_esp))) { + DEBUGP("ip6t_esp: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_esp))); + return 0; + } + if (espinfo->invflags & ~IP6T_ESP_INV_MASK) { + DEBUGP("ip6t_esp: unknown flags %X\n", + espinfo->invflags); + return 0; + } + return 1; +} + +static struct ip6t_match esp_match = { + .name = "esp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&esp_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&esp_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c new file mode 100644 index 000000000000..616c2cbcd54d --- /dev/null +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -0,0 +1,101 @@ +/* Kernel module to match EUI64 address parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/if_ether.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_DESCRIPTION("IPv6 EUI64 address checking match"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + + unsigned char eui64[8]; + int i=0; + + if ( !(skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data) + && offset != 0) { + *hotdrop = 1; + return 0; + } + + memset(eui64, 0, sizeof(eui64)); + + if (eth_hdr(skb)->h_proto == ntohs(ETH_P_IPV6)) { + if (skb->nh.ipv6h->version == 0x6) { + memcpy(eui64, eth_hdr(skb)->h_source, 3); + memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3); + eui64[3]=0xff; + eui64[4]=0xfe; + eui64[0] |= 0x02; + + i=0; + while ((skb->nh.ipv6h->saddr.s6_addr[8+i] == + eui64[i]) && (i<8)) i++; + + if ( i == 8 ) + return 1; + } + } + + return 0; +} + +static int +ip6t_eui64_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) | + (1 << NF_IP6_FORWARD))) { + printk("ip6t_eui64: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(int))) + return 0; + + return 1; +} + +static struct ip6t_match eui64_match = { + .name = "eui64", + .match = &match, + .checkentry = &ip6t_eui64_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&eui64_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&eui64_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c new file mode 100644 index 000000000000..4bfa30a9bc80 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -0,0 +1,229 @@ +/* Kernel module to match FRAG parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_frag.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 FRAG match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline int +id_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert) +{ + int r=0; + DEBUGP("frag id_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,id,max); + r=(id >= min && id <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct frag_hdr _frag, *fh = NULL; + const struct ip6t_frag *fraginfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_frag header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* FRAG -> evaluate */ + if (nexthdr == NEXTHDR_FRAGMENT) { + temp |= MASK_FRAGMENT; + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_frag match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_frag: new pointer too large! \n"); + break; + } + } + + /* FRAG header not found */ + if ( temp != MASK_FRAGMENT ) return 0; + + if (len < sizeof(struct frag_hdr)){ + *hotdrop = 1; + return 0; + } + + fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); + BUG_ON(fh == NULL); + + DEBUGP("INFO %04X ", fh->frag_off); + DEBUGP("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); + DEBUGP("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); + DEBUGP("MF %04X ", fh->frag_off & htons(IP6_MF)); + DEBUGP("ID %u %08X\n", ntohl(fh->identification), + ntohl(fh->identification)); + + DEBUGP("IPv6 FRAG id %02X ", + (id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)))); + DEBUGP("res %02X %02X%04X %02X ", + (fraginfo->flags & IP6T_FRAG_RES), fh->reserved, + ntohs(fh->frag_off) & 0x6, + !((fraginfo->flags & IP6T_FRAG_RES) + && (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); + DEBUGP("first %02X %02X %02X ", + (fraginfo->flags & IP6T_FRAG_FST), + ntohs(fh->frag_off) & ~0x7, + !((fraginfo->flags & IP6T_FRAG_FST) + && (ntohs(fh->frag_off) & ~0x7))); + DEBUGP("mf %02X %02X %02X ", + (fraginfo->flags & IP6T_FRAG_MF), + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_MF) + && !((ntohs(fh->frag_off) & IP6_MF)))); + DEBUGP("last %02X %02X %02X\n", + (fraginfo->flags & IP6T_FRAG_NMF), + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_NMF) + && (ntohs(fh->frag_off) & IP6_MF))); + + return (fh != NULL) + && + (id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))) + && + !((fraginfo->flags & IP6T_FRAG_RES) + && (fh->reserved || (ntohs(fh->frag_off) & 0x6))) + && + !((fraginfo->flags & IP6T_FRAG_FST) + && (ntohs(fh->frag_off) & ~0x7)) + && + !((fraginfo->flags & IP6T_FRAG_MF) + && !(ntohs(fh->frag_off) & IP6_MF)) + && + !((fraginfo->flags & IP6T_FRAG_NMF) + && (ntohs(fh->frag_off) & IP6_MF)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_frag *fraginfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_frag))) { + DEBUGP("ip6t_frag: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_frag))); + return 0; + } + if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { + DEBUGP("ip6t_frag: unknown flags %X\n", + fraginfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match frag_match = { + .name = "frag", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&frag_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&frag_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c new file mode 100644 index 000000000000..27f3650d127e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -0,0 +1,298 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_opts.h> + +#define HOPBYHOP 1 + +MODULE_LICENSE("GPL"); +#if HOPBYHOP +MODULE_DESCRIPTION("IPv6 HbH match"); +#else +MODULE_DESCRIPTION("IPv6 DST match"); +#endif +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_opt_hdr _optsh, *oh; + const struct ip6t_opts *optinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + u8 _opttype, *tp = NULL; + u8 _optlen, *lp = NULL; + unsigned int optlen; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_opts header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* OPTS -> evaluate */ +#if HOPBYHOP + if (nexthdr == NEXTHDR_HOP) { + temp |= MASK_HOPOPTS; +#else + if (nexthdr == NEXTHDR_DEST) { + temp |= MASK_DSTOPTS; +#endif + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_opts: new pointer is too large! \n"); + break; + } + } + + /* OPTIONS header not found */ +#if HOPBYHOP + if ( temp != MASK_HOPOPTS ) return 0; +#else + if ( temp != MASK_DSTOPTS ) return 0; +#endif + + if (len < (int)sizeof(struct ipv6_opt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Packet smaller than it's length field */ + return 0; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + BUG_ON(oh == NULL); + + DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + DEBUGP("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) + && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if ( !(optinfo->flags & IP6T_OPTS_OPTS) ){ + return ret; + } else if (optinfo->flags & IP6T_OPTS_NSTRICT) { + DEBUGP("Not strict - not implemented"); + } else { + DEBUGP("Strict "); + DEBUGP("#%d ",optinfo->optsnr); + for(temp=0; temp<optinfo->optsnr; temp++){ + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00)>>8){ + DEBUGP("Tbad %02X %02X\n", + *tp, + (optinfo->opts[temp] & 0xFF00)>>8); + return 0; + } else { + DEBUGP("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? */ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + DEBUGP("Lbad %02X %04X\n", *lp, + spec_len); + return 0; + } + DEBUGP("Lok "); + optlen = *lp + 2; + } else { + DEBUGP("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + DEBUGP("len%04X \n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + (temp < optinfo->optsnr - 1)) { + DEBUGP("new pointer is too large! \n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else return 0; + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_opts *optsinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_opts))) { + DEBUGP("ip6t_opts: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_opts))); + return 0; + } + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + DEBUGP("ip6t_opts: unknown flags %X\n", + optsinfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match opts_match = { +#if HOPBYHOP + .name = "hbh", +#else + .name = "dst", +#endif + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&opts_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&opts_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c new file mode 100644 index 000000000000..0beaff5471dd --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -0,0 +1,80 @@ +/* Hop Limit matching module */ + +/* (C) 2001-2002 Maciej Soltysiak <solt@dns.toxicfilms.tv> + * Based on HW's ttl module + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv6/ip6t_hl.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); +MODULE_DESCRIPTION("IP tables Hop Limit matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_hl_info *info = matchinfo; + const struct ipv6hdr *ip6h = skb->nh.ipv6h; + + switch (info->mode) { + case IP6T_HL_EQ: + return (ip6h->hop_limit == info->hop_limit); + break; + case IP6T_HL_NE: + return (!(ip6h->hop_limit == info->hop_limit)); + break; + case IP6T_HL_LT: + return (ip6h->hop_limit < info->hop_limit); + break; + case IP6T_HL_GT: + return (ip6h->hop_limit > info->hop_limit); + break; + default: + printk(KERN_WARNING "ip6t_hl: unknown mode %d\n", + info->mode); + return 0; + } + + return 0; +} + +static int checkentry(const char *tablename, const struct ip6t_ip6 *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_hl_info))) + return 0; + + return 1; +} + +static struct ip6t_match hl_match = { + .name = "hl", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&hl_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&hl_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c new file mode 100644 index 000000000000..32e67f05845b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -0,0 +1,167 @@ +/* ipv6header match - matches IPv6 packets based + on whether they contain certain headers */ + +/* Original idea: Brad Chapman + * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_ipv6header.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 headers match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +static int +ipv6header_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_ipv6header_info *info = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + temp |= MASK_NONE; + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + temp |= MASK_HOPOPTS; + break; + case NEXTHDR_ROUTING: + temp |= MASK_ROUTING; + break; + case NEXTHDR_FRAGMENT: + temp |= MASK_FRAGMENT; + break; + case NEXTHDR_AUTH: + temp |= MASK_AH; + break; + case NEXTHDR_DEST: + temp |= MASK_DSTOPTS; + break; + default: + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) + break; + } + + if ( (nexthdr != NEXTHDR_NONE ) && (nexthdr != NEXTHDR_ESP) ) + temp |= MASK_PROTO; + + if (info->modeflag) + return !((temp ^ info->matchflags ^ info->invflags) + & info->matchflags); + else { + if (info->invflags) + return temp != info->matchflags; + else + return temp == info->matchflags; + } +} + +static int +ipv6header_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_ipv6header_info *info = matchinfo; + + /* Check for obvious errors */ + /* This match is valid in all hooks! */ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_ipv6header_info))) + return 0; + + /* invflags is 0 or 0xff in hard mode */ + if ((!info->modeflag) && info->invflags != 0x00 + && info->invflags != 0xFF) + return 0; + + return 1; +} + +static struct ip6t_match ip6t_ipv6header_match = { + .name = "ipv6header", + .match = &ipv6header_match, + .checkentry = &ipv6header_checkentry, + .destroy = NULL, + .me = THIS_MODULE, +}; + +static int __init ipv6header_init(void) +{ + return ip6t_register_match(&ip6t_ipv6header_match); +} + +static void __exit ipv6header_exit(void) +{ + ip6t_unregister_match(&ip6t_ipv6header_match); +} + +module_init(ipv6header_init); +module_exit(ipv6header_exit); + diff --git a/net/ipv6/netfilter/ip6t_length.c b/net/ipv6/netfilter/ip6t_length.c new file mode 100644 index 000000000000..e0537d3811d5 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_length.c @@ -0,0 +1,66 @@ +/* Length Match - IPv6 Port */ + +/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv6/ip6t_length.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("IPv6 packet length match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_length_info))) + return 0; + + return 1; +} + +static struct ip6t_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&length_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&length_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_limit.c b/net/ipv6/netfilter/ip6t_limit.c new file mode 100644 index 000000000000..fb782f610be2 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_limit.c @@ -0,0 +1,147 @@ +/* Kernel module to control the rate + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). */ + +/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_limit.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); +MODULE_DESCRIPTION("rate limiting within ip6tables"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To avoid underflow, we multiply by 128 (ie. you get 128 credits per + jiffy). Hence a cost of 2^32-1, means one pass per 32768 seconds + at 1024HZ (or one every 9 hours). A cost of 1 means 12800 passes + per second at 100HZ. */ + +#define CREDITS_PER_JIFFY 128 + +static int +ip6t_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip6t_rateinfo *r = ((struct ip6t_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IP6T_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IP6T_LIMIT_SCALE; +} + +static int +ip6t_limit_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ip6t_rateinfo *r = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Call rusty: overflow in ip6t_limit: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IP6T_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. */ + r->master = r; + + return 1; +} + +static struct ip6t_match ip6t_limit_reg = { + .name = "limit", + .match = ip6t_limit_match, + .checkentry = ip6t_limit_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ip6t_register_match(&ip6t_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&ip6t_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mac.c b/net/ipv6/netfilter/ip6t_mac.c new file mode 100644 index 000000000000..526d43e37234 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mac.c @@ -0,0 +1,80 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> + +#include <linux/netfilter_ipv6/ip6t_mac.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MAC address matching module for IPv6"); +MODULE_AUTHOR("Netfilter Core Teaam <coreteam@netfilter.org>"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data + /* If so, compare... */ + && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ip6t_mac_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) + | (1 << NF_IP6_FORWARD))) { + printk("ip6t_mac: only valid for PRE_ROUTING, LOCAL_IN or" + " FORWARD\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mac_info))) + return 0; + + return 1; +} + +static struct ip6t_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ip6t_mac_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mark.c b/net/ipv6/netfilter/ip6t_mark.c new file mode 100644 index 000000000000..affc3de364fc --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mark.c @@ -0,0 +1,66 @@ +/* Kernel module to match NFMARK values. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv6/ip6t_mark.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables mark match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mark_info))) + return 0; + + return 1; +} + +static struct ip6t_match mark_match = { + .name = "mark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_multiport.c b/net/ipv6/netfilter/ip6t_multiport.c new file mode 100644 index 000000000000..6e3246153fa3 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_multiport.c @@ -0,0 +1,125 @@ +/* Kernel module to match one of a list of TCP/UDP ports: ports are in + the same place so we can treat them as equal. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/udp.h> +#include <linux/skbuff.h> +#include <linux/in.h> + +#include <linux/netfilter_ipv6/ip6t_multiport.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables match for multiple ports"); + +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the port is matched by the test, 0 otherwise. */ +static inline int +ports_match(const u_int16_t *portlist, enum ip6t_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) +{ + unsigned int i; + for (i=0; i<count; i++) { + if (flags != IP6T_MULTIPORT_DESTINATION + && portlist[i] == src) + return 1; + + if (flags != IP6T_MULTIPORT_SOURCE + && portlist[i] == dst) + return 1; + } + + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + u16 _ports[2], *pptr; + const struct ip6t_multiport *multiinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + /* Must be big enough to read ports (both UDP and TCP have + them at the start). */ + pptr = skb_header_pointer(skb, protoff, sizeof(_ports), &_ports[0]); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("ip6t_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return ports_match(multiinfo->ports, + multiinfo->flags, multiinfo->count, + ntohs(pptr[0]), ntohs(pptr[1])); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_multiport *multiinfo = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_multiport))) + return 0; + + /* Must specify proto == TCP/UDP, no unknown flags or bad count */ + return (ip->proto == IPPROTO_TCP || ip->proto == IPPROTO_UDP) + && !(ip->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_multiport)) + && (multiinfo->flags == IP6T_MULTIPORT_SOURCE + || multiinfo->flags == IP6T_MULTIPORT_DESTINATION + || multiinfo->flags == IP6T_MULTIPORT_EITHER) + && multiinfo->count <= IP6T_MULTI_PORTS; +} + +static struct ip6t_match multiport_match = { + .name = "multiport", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&multiport_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&multiport_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c new file mode 100644 index 000000000000..ab0e32d3de46 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -0,0 +1,174 @@ +/* Kernel module to match various things tied to sockets associated with + locally generated outgoing packets. */ + +/* (C) 2000-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/file.h> +#include <net/sock.h> + +#include <linux/netfilter_ipv6/ip6t_owner.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("IP6 tables owner matching module"); +MODULE_LICENSE("GPL"); + +static int +match_pid(const struct sk_buff *skb, pid_t pid) +{ + struct task_struct *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out; + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); +out: + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_sid(const struct sk_buff *skb, pid_t sid) +{ + struct task_struct *g, *p; + struct file *file = skb->sk->sk_socket->file; + int i, found=0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + struct files_struct *files; + if (p->signal->session != sid) + continue; + + task_lock(p); + files = p->files; + if (files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == file) { + found = 1; + break; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + if (found) + goto out; + } while_each_thread(g, p); +out: + read_unlock(&tasklist_lock); + + return found; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_owner_info *info = matchinfo; + + if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) + return 0; + + if(info->match & IP6T_OWNER_UID) { + if((skb->sk->sk_socket->file->f_uid != info->uid) ^ + !!(info->invert & IP6T_OWNER_UID)) + return 0; + } + + if(info->match & IP6T_OWNER_GID) { + if((skb->sk->sk_socket->file->f_gid != info->gid) ^ + !!(info->invert & IP6T_OWNER_GID)) + return 0; + } + + if(info->match & IP6T_OWNER_PID) { + if (!match_pid(skb, info->pid) ^ + !!(info->invert & IP6T_OWNER_PID)) + return 0; + } + + if(info->match & IP6T_OWNER_SID) { + if (!match_sid(skb, info->sid) ^ + !!(info->invert & IP6T_OWNER_SID)) + return 0; + } + + return 1; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { + printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) + return 0; +#ifdef CONFIG_SMP + /* files->file_lock can not be used in a BH */ + if (((struct ip6t_owner_info *)matchinfo)->match + & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { + printk("ip6t_owner: pid and sid matching is broken on SMP.\n"); + return 0; + } +#endif + return 1; +} + +static struct ip6t_match owner_match = { + .name = "owner", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&owner_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&owner_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_physdev.c b/net/ipv6/netfilter/ip6t_physdev.c new file mode 100644 index 000000000000..71515c86ece1 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_physdev.c @@ -0,0 +1,135 @@ +/* Kernel module to match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter_ipv6/ip6t_physdev.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_bridge.h> +#define MATCH 1 +#define NOMATCH 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("iptables bridge physical device match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + int i; + static const char nulldevname[IFNAMSIZ]; + const struct ip6t_physdev_info *info = matchinfo; + unsigned int ret; + const char *indev, *outdev; + struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. */ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && + !(info->invert & IP6T_PHYSDEV_OP_BRIDGED)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN) && + !(info->invert & IP6T_PHYSDEV_OP_ISIN)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_ISOUT) && + !(info->invert & IP6T_PHYSDEV_OP_ISOUT)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_IN) && + !(info->invert & IP6T_PHYSDEV_OP_IN)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_OUT) && + !(info->invert & IP6T_PHYSDEV_OP_OUT)) + return NOMATCH; + return MATCH; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & IP6T_PHYSDEV_OP_BRIDGED))) + return NOMATCH; + + if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISIN))) || + (info->bitmask & IP6T_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISOUT)))) + return NOMATCH; + + if (!(info->bitmask & IP6T_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)indev)[i] + ^ ((const unsigned int *)info->physindev)[i]) + & ((const unsigned int *)info->in_mask)[i]; + } + + if ((ret == 0) ^ !(info->invert & IP6T_PHYSDEV_OP_IN)) + return NOMATCH; + +match_outdev: + if (!(info->bitmask & IP6T_PHYSDEV_OP_OUT)) + return MATCH; + outdev = nf_bridge->physoutdev ? + nf_bridge->physoutdev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)outdev)[i] + ^ ((const unsigned int *)info->physoutdev)[i]) + & ((const unsigned int *)info->out_mask)[i]; + } + + return (ret != 0) ^ !(info->invert & IP6T_PHYSDEV_OP_OUT); +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_physdev_info *info = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_physdev_info))) + return 0; + if (!(info->bitmask & IP6T_PHYSDEV_OP_MASK) || + info->bitmask & ~IP6T_PHYSDEV_OP_MASK) + return 0; + return 1; +} + +static struct ip6t_match physdev_match = { + .name = "physdev", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&physdev_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&physdev_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c new file mode 100644 index 000000000000..a9526b773d28 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -0,0 +1,301 @@ +/* Kernel module to match ROUTING parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ipv6.h> +#include <linux/types.h> +#include <net/checksum.h> +#include <net/ipv6.h> + +#include <asm/byteorder.h> + +#include <linux/netfilter_ipv6/ip6_tables.h> +#include <linux/netfilter_ipv6/ip6t_rt.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 RT match"); +MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline int +segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert) +{ + int r=0; + DEBUGP("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,id,max); + r=(id >= min && id <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_rt_hdr _route, *rh = NULL; + const struct ip6t_rt *rtinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + struct in6_addr *ap, _addr; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_rt header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* ROUTING -> evaluate */ + if (nexthdr == NEXTHDR_ROUTING) { + temp |= MASK_ROUTING; + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_rt match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_rt: new pointer is too large! \n"); + break; + } + } + + /* ROUTING header not found */ + if ( temp != MASK_ROUTING ) return 0; + + if (len < (int)sizeof(struct ipv6_rt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Pcket smaller than its length field */ + return 0; + } + + rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); + BUG_ON(rh == NULL); + + DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); + DEBUGP("TYPE %04X ", rh->type); + DEBUGP("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); + + DEBUGP("IPv6 RT segsleft %02X ", + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS)))); + DEBUGP("type %02X %02X %02X ", + rtinfo->rt_type, rh->type, + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); + DEBUGP("len %02X %04X %02X ", + rtinfo->hdrlen, hdrlen, + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN)))); + DEBUGP("res %02X %02X %02X ", + (rtinfo->flags & IP6T_RT_RES), ((struct rt0_hdr *)rh)->bitmap, + !((rtinfo->flags & IP6T_RT_RES) && (((struct rt0_hdr *)rh)->bitmap))); + + ret = (rh != NULL) + && + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))) + && + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))) + && + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP))); + + if (ret && (rtinfo->flags & IP6T_RT_RES)) { + u_int32_t *bp, _bitmap; + bp = skb_header_pointer(skb, + ptr + offsetof(struct rt0_hdr, bitmap), + sizeof(_bitmap), &_bitmap); + + ret = (*bp == 0); + } + + DEBUGP("#%d ",rtinfo->addrnr); + if ( !(rtinfo->flags & IP6T_RT_FST) ){ + return ret; + } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { + DEBUGP("Not strict "); + if ( rtinfo->addrnr > (unsigned int)((hdrlen-8)/16) ){ + DEBUGP("There isn't enough space\n"); + return 0; + } else { + unsigned int i = 0; + + DEBUGP("#%d ",rtinfo->addrnr); + for(temp=0; temp<(unsigned int)((hdrlen-8)/16); temp++){ + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + + BUG_ON(ap == NULL); + + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { + DEBUGP("i=%d temp=%d;\n",i,temp); + i++; + } + if (i==rtinfo->addrnr) break; + } + DEBUGP("i=%d #%d\n", i, rtinfo->addrnr); + if (i == rtinfo->addrnr) + return ret; + else return 0; + } + } else { + DEBUGP("Strict "); + if ( rtinfo->addrnr > (unsigned int)((hdrlen-8)/16) ){ + DEBUGP("There isn't enough space\n"); + return 0; + } else { + DEBUGP("#%d ",rtinfo->addrnr); + for(temp=0; temp<rtinfo->addrnr; temp++){ + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + BUG_ON(ap == NULL); + + if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) + break; + } + DEBUGP("temp=%d #%d\n", temp, rtinfo->addrnr); + if ((temp == rtinfo->addrnr) && (temp == (unsigned int)((hdrlen-8)/16))) + return ret; + else return 0; + } + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_rt *rtinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_rt))) { + DEBUGP("ip6t_rt: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_rt))); + return 0; + } + if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { + DEBUGP("ip6t_rt: unknown flags %X\n", + rtinfo->invflags); + return 0; + } + if ( (rtinfo->flags & (IP6T_RT_RES|IP6T_RT_FST_MASK)) && + (!(rtinfo->flags & IP6T_RT_TYP) || + (rtinfo->rt_type != 0) || + (rtinfo->invflags & IP6T_RT_INV_TYP)) ) { + DEBUGP("`--rt-type 0' required before `--rt-0-*'"); + return 0; + } + + return 1; +} + +static struct ip6t_match rt_match = { + .name = "rt", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&rt_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&rt_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c new file mode 100644 index 000000000000..4c0028671c20 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -0,0 +1,214 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) + +/* Standard entry. */ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[3]; + struct ip6t_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), + { [NF_IP6_LOCAL_IN] = 0, + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, + { [NF_IP6_LOCAL_IN] = 0, + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_error), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ip6t_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6t_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ip6t_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if 0 + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_FILTER, + }, + { + .hook = ip6t_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_FORWARD, + .priority = NF_IP6_PRI_FILTER, + }, + { + .hook = ip6t_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_FILTER, + }, +}; + +/* Default to forward because I got too much mail already. */ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ip6t_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c new file mode 100644 index 000000000000..85c1e6eada19 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -0,0 +1,287 @@ +/* + * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 + * + * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Extended to all five netfilter hooks by Brad Chapman & Harald Welte + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("ip6tables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | \ + (1 << NF_IP6_LOCAL_IN) | \ + (1 << NF_IP6_FORWARD) | \ + (1 << NF_IP6_LOCAL_OUT) | \ + (1 << NF_IP6_POST_ROUTING)) + +#if 0 +#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args) +#else +#define DEBUGP(x, args...) +#endif + +/* Standard entry. */ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[5]; + struct ip6t_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), + { [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4}, + { [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4}, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_IN */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_error), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ip6t_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6t_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ip6t_local_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + + unsigned long nfmark; + unsigned int ret; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u_int32_t flowlabel; + +#if 0 + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + /* save source/dest address, nfmark, hoplimit, flowlabel, priority, */ + memcpy(&saddr, &(*pskb)->nh.ipv6h->saddr, sizeof(saddr)); + memcpy(&daddr, &(*pskb)->nh.ipv6h->daddr, sizeof(daddr)); + nfmark = (*pskb)->nfmark; + hop_limit = (*pskb)->nh.ipv6h->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h); + + ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); + + if (ret != NF_DROP && ret != NF_STOLEN + && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr)) + || memcmp(&(*pskb)->nh.ipv6h->daddr, &daddr, sizeof(daddr)) + || (*pskb)->nfmark != nfmark + || (*pskb)->nh.ipv6h->hop_limit != hop_limit)) { + + /* something which could affect routing has changed */ + + DEBUGP("ip6table_mangle: we'd need to re-route a packet\n"); + } + + return ret; +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_FORWARD, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_MANGLE, + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ip6t_register_table(&packet_mangler, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ip6t_ops[3]); + if (ret < 0) + goto cleanup_hook2; + + ret = nf_register_hook(&ip6t_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: + nf_unregister_hook(&ip6t_ops[3]); + cleanup_hook2: + nf_unregister_hook(&ip6t_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_mangler); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c new file mode 100644 index 000000000000..71407beaf790 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -0,0 +1,182 @@ +/* + * IPv6 raw table, a port of the IPv4 raw table to IPv6 + * + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + */ +#include <linux/module.h> +#include <linux/netfilter_ipv6/ip6_tables.h> + +#define RAW_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT)) + +#if 0 +#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args) +#else +#define DEBUGP(x, args...) +#endif + +/* Standard entry. */ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[2]; + struct ip6t_error term; +} initial_table __initdata = { + .repl = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error), + .hook_entry = { + [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + }, + .underflow = { + [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_standard), + }, + .target = { + .target = { + .u = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_standard), + }, + .target = { + .target = { + .u = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_error_target)), + .name = IP6T_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ip6t_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6t_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_raw, NULL); +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_hook, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST + }, + { + .hook = ip6t_hook, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_FIRST + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ip6t_register_table(&packet_raw, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_raw); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_raw); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c new file mode 100644 index 000000000000..334a5967831e --- /dev/null +++ b/net/ipv6/proc.c @@ -0,0 +1,303 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. This is very similar to the IPv4 version, + * except it reports the sockets in the INET6 address family. + * + * Version: $Id: proc.c,v 1.17 2002/02/01 22:01:04 davem Exp $ + * + * Authors: David S. Miller (davem@caip.rutgers.edu) + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/socket.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/stddef.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/transp_v6.h> +#include <net/ipv6.h> + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_net_devsnmp6; + +static int fold_prot_inuse(struct proto *proto) +{ + int res = 0; + int cpu; + + for (cpu=0; cpu<NR_CPUS; cpu++) + res += proto->stats[cpu].inuse; + + return res; +} + +static int sockstat6_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "TCP6: inuse %d\n", + fold_prot_inuse(&tcpv6_prot)); + seq_printf(seq, "UDP6: inuse %d\n", + fold_prot_inuse(&udpv6_prot)); + seq_printf(seq, "RAW6: inuse %d\n", + fold_prot_inuse(&rawv6_prot)); + seq_printf(seq, "FRAG6: inuse %d memory %d\n", + ip6_frag_nqueues, atomic_read(&ip6_frag_mem)); + return 0; +} + +static struct snmp_mib snmp6_ipstats_list[] = { +/* ipv6 mib according to RFC 2465 */ + SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), + SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), + SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), + SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), + SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), + SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp6_icmp6_list[] = { +/* icmpv6 mib according to RFC 2466 + + Exceptions: {In|Out}AdminProhibs are removed, because I see + no good reasons to account them separately + of another dest.unreachs. + OutErrs is zero identically. + OutEchos too. + OutRouterAdvertisements too. + OutGroupMembQueries too. + */ + SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), + SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), + SNMP_MIB_ITEM("Icmp6InDestUnreachs", ICMP6_MIB_INDESTUNREACHS), + SNMP_MIB_ITEM("Icmp6InPktTooBigs", ICMP6_MIB_INPKTTOOBIGS), + SNMP_MIB_ITEM("Icmp6InTimeExcds", ICMP6_MIB_INTIMEEXCDS), + SNMP_MIB_ITEM("Icmp6InParmProblems", ICMP6_MIB_INPARMPROBLEMS), + SNMP_MIB_ITEM("Icmp6InEchos", ICMP6_MIB_INECHOS), + SNMP_MIB_ITEM("Icmp6InEchoReplies", ICMP6_MIB_INECHOREPLIES), + SNMP_MIB_ITEM("Icmp6InGroupMembQueries", ICMP6_MIB_INGROUPMEMBQUERIES), + SNMP_MIB_ITEM("Icmp6InGroupMembResponses", ICMP6_MIB_INGROUPMEMBRESPONSES), + SNMP_MIB_ITEM("Icmp6InGroupMembReductions", ICMP6_MIB_INGROUPMEMBREDUCTIONS), + SNMP_MIB_ITEM("Icmp6InRouterSolicits", ICMP6_MIB_INROUTERSOLICITS), + SNMP_MIB_ITEM("Icmp6InRouterAdvertisements", ICMP6_MIB_INROUTERADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6InNeighborSolicits", ICMP6_MIB_INNEIGHBORSOLICITS), + SNMP_MIB_ITEM("Icmp6InNeighborAdvertisements", ICMP6_MIB_INNEIGHBORADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6InRedirects", ICMP6_MIB_INREDIRECTS), + SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), + SNMP_MIB_ITEM("Icmp6OutDestUnreachs", ICMP6_MIB_OUTDESTUNREACHS), + SNMP_MIB_ITEM("Icmp6OutPktTooBigs", ICMP6_MIB_OUTPKTTOOBIGS), + SNMP_MIB_ITEM("Icmp6OutTimeExcds", ICMP6_MIB_OUTTIMEEXCDS), + SNMP_MIB_ITEM("Icmp6OutParmProblems", ICMP6_MIB_OUTPARMPROBLEMS), + SNMP_MIB_ITEM("Icmp6OutEchoReplies", ICMP6_MIB_OUTECHOREPLIES), + SNMP_MIB_ITEM("Icmp6OutRouterSolicits", ICMP6_MIB_OUTROUTERSOLICITS), + SNMP_MIB_ITEM("Icmp6OutNeighborSolicits", ICMP6_MIB_OUTNEIGHBORSOLICITS), + SNMP_MIB_ITEM("Icmp6OutNeighborAdvertisements", ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6OutRedirects", ICMP6_MIB_OUTREDIRECTS), + SNMP_MIB_ITEM("Icmp6OutGroupMembResponses", ICMP6_MIB_OUTGROUPMEMBRESPONSES), + SNMP_MIB_ITEM("Icmp6OutGroupMembReductions", ICMP6_MIB_OUTGROUPMEMBREDUCTIONS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp6_udp6_list[] = { + SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_SENTINEL +}; + +static unsigned long +fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +static inline void +snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist) +{ + int i; + for (i=0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, + fold_field(mib, itemlist[i].entry)); +} + +static int snmp6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_dev *idev = (struct inet6_dev *)seq->private; + + if (idev) { + seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); + snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); + } else { + snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); + snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); + snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); + } + return 0; +} + +static int sockstat6_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sockstat6_seq_show, NULL); +} + +static struct file_operations sockstat6_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int snmp6_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, snmp6_seq_show, PDE(inode)->data); +} + +static struct file_operations snmp6_seq_fops = { + .owner = THIS_MODULE, + .open = snmp6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int snmp6_register_dev(struct inet6_dev *idev) +{ + struct proc_dir_entry *p; + + if (!idev || !idev->dev) + return -EINVAL; + + if (!proc_net_devsnmp6) + return -ENOENT; + + p = create_proc_entry(idev->dev->name, S_IRUGO, proc_net_devsnmp6); + if (!p) + return -ENOMEM; + + p->data = idev; + p->proc_fops = &snmp6_seq_fops; + + idev->stats.proc_dir_entry = p; + return 0; +} + +int snmp6_unregister_dev(struct inet6_dev *idev) +{ + if (!proc_net_devsnmp6) + return -ENOENT; + if (!idev || !idev->stats.proc_dir_entry) + return -EINVAL; + remove_proc_entry(idev->stats.proc_dir_entry->name, + proc_net_devsnmp6); + return 0; +} + +int __init ipv6_misc_proc_init(void) +{ + int rc = 0; + + if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops)) + goto proc_snmp6_fail; + + proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); + if (!proc_net_devsnmp6) + goto proc_dev_snmp6_fail; + + if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops)) + goto proc_sockstat6_fail; +out: + return rc; + +proc_sockstat6_fail: + proc_net_remove("dev_snmp6"); +proc_dev_snmp6_fail: + proc_net_remove("snmp6"); +proc_snmp6_fail: + rc = -ENOMEM; + goto out; +} + +void ipv6_misc_proc_exit(void) +{ + proc_net_remove("sockstat6"); + proc_net_remove("dev_snmp6"); + proc_net_remove("snmp6"); +} + +#else /* CONFIG_PROC_FS */ + + +int snmp6_register_dev(struct inet6_dev *idev) +{ + return 0; +} + +int snmp6_unregister_dev(struct inet6_dev *idev) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +int snmp6_alloc_dev(struct inet6_dev *idev) +{ + int err = -ENOMEM; + + if (!idev || !idev->dev) + return -EINVAL; + + if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp; + + return 0; + +err_icmp: + return err; +} + +int snmp6_free_dev(struct inet6_dev *idev) +{ + snmp6_mib_free((void **)idev->stats.icmpv6); + return 0; +} + + diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c new file mode 100644 index 000000000000..52c1d58b6ca6 --- /dev/null +++ b/net/ipv6/protocol.c @@ -0,0 +1,86 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET6 protocol dispatch tables. + * + * Version: $Id: protocol.c,v 1.10 2001/05/18 02:25:49 davem Exp $ + * + * Authors: Pedro Roque <roque@di.fc.ul.pt> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Vince Laviano (vince@cs.stanford.edu) 16 May 2001 + * - Removed unused variable 'inet6_protocol_base' + * - Modified inet6_del_protocol() to correctly maintain copy bit. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> + +struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; +static DEFINE_SPINLOCK(inet6_proto_lock); + + +int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet6_proto_lock); + + if (inet6_protos[hash]) { + ret = -1; + } else { + inet6_protos[hash] = prot; + ret = 0; + } + + spin_unlock_bh(&inet6_proto_lock); + + return ret; +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet6_proto_lock); + + if (inet6_protos[hash] != prot) { + ret = -1; + } else { + inet6_protos[hash] = NULL; + ret = 0; + } + + spin_unlock_bh(&inet6_proto_lock); + + synchronize_net(); + + return ret; +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c new file mode 100644 index 000000000000..5488ad0de4f6 --- /dev/null +++ b/net/ipv6/raw.c @@ -0,0 +1,1157 @@ +/* + * RAW sockets for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Adapted from linux/net/ipv4/raw.c + * + * $Id: raw.c,v 1.51 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmpv6.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv6.h> +#include <asm/uaccess.h> +#include <asm/ioctls.h> + +#include <net/ip.h> +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/ip6_route.h> +#include <net/ip6_checksum.h> +#include <net/addrconf.h> +#include <net/transp_v6.h> +#include <net/udp.h> +#include <net/inet_common.h> + +#include <net/rawv6.h> +#include <net/xfrm.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; +DEFINE_RWLOCK(raw_v6_lock); + +static void raw_v6_hash(struct sock *sk) +{ + struct hlist_head *list = &raw_v6_htable[inet_sk(sk)->num & + (RAWV6_HTABLE_SIZE - 1)]; + + write_lock_bh(&raw_v6_lock); + sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock_bh(&raw_v6_lock); +} + +static void raw_v6_unhash(struct sock *sk) +{ + write_lock_bh(&raw_v6_lock); + if (sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(&raw_v6_lock); +} + + +/* Grumble... icmp and ip_input want to get at this... */ +struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, + struct in6_addr *loc_addr, struct in6_addr *rmt_addr) +{ + struct hlist_node *node; + int is_multicast = ipv6_addr_is_multicast(loc_addr); + + sk_for_each_from(sk, node) + if (inet_sk(sk)->num == num) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + goto found; + if (is_multicast && + inet6_mc_check(sk, loc_addr, rmt_addr)) + goto found; + continue; + } + goto found; + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +{ + struct icmp6hdr *icmph; + struct raw6_sock *rp = raw6_sk(sk); + + if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { + __u32 *data = &rp->filter.data[0]; + int bit_nr; + + icmph = (struct icmp6hdr *) skb->data; + bit_nr = icmph->icmp6_type; + + return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + } + return 0; +} + +/* + * demultiplex raw sockets. + * (should consider queueing the skb in the sock receive_queue + * without calling rawv6.c) + * + * Caller owns SKB so we must make clones. + */ +void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +{ + struct in6_addr *saddr; + struct in6_addr *daddr; + struct sock *sk; + __u8 hash; + + saddr = &skb->nh.ipv6h->saddr; + daddr = saddr + 1; + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + read_lock(&raw_v6_lock); + sk = sk_head(&raw_v6_htable[hash]); + + /* + * The first socket found will be delivered after + * delivery to transport protocols. + */ + + if (sk == NULL) + goto out; + + sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); + + while (sk) { + if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) + rawv6_rcv(sk, clone); + } + sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); + } +out: + read_unlock(&raw_v6_lock); +} + +/* This cleans up af_inet6 a bit. -DaveM */ +static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; + __u32 v4addr = 0; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); + + /* Raw sockets are IPv6 only */ + if (addr_type == IPV6_ADDR_MAPPED) + return(-EADDRNOTAVAIL); + + lock_sock(sk); + + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE) + goto out; + + /* Check if the address belongs to the host. */ + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another + * one is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + goto out; + + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + err = -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (dev) + dev_put(dev); + goto out; + } + } + if (dev) + dev_put(dev); + } + + inet->rcv_saddr = inet->saddr = v4addr; + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + err = 0; +out: + release_sock(sk); + return err; +} + +void rawv6_err(struct sock *sk, struct sk_buff *skb, + struct inet6_skb_parm *opt, + int type, int code, int offset, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + int err; + int harderr; + + /* Report error on raw socket, if: + 1. User requested recverr. + 2. Socket is connected (otherwise the error indication + is useless without recverr and error is hard. + */ + if (!np->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + harderr = icmpv6_err_convert(type, code, &err); + if (type == ICMPV6_PKT_TOOBIG) + harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); + + if (np->recverr) { + u8 *payload = skb->data; + if (!inet->hdrincl) + payload += offset; + ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); + } + + if (np->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + if ((raw6_sk(sk)->checksum || sk->sk_filter) && + skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + /* Charge it to the socket. */ + if (sock_queue_rcv_skb(sk,skb)<0) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + + return 0; +} + +/* + * This is next to useless... + * if we demultiplex in network layer we don't need the extra call + * just to queue the skb... + * maybe we could have the network decide upon a hint if it + * should call raw_rcv for demultiplexing + */ +int rawv6_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + if (!rp->checksum) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "raw v6 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed == CHECKSUM_NONE) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); + } + + if (inet->hdrincl) { + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + rawv6_rcv_skb(sk, skb); + return 0; +} + + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + struct sk_buff *skb; + size_t copied; + int err; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (addr_len) + *addr_len=sizeof(*sin6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + /* Copy the address. */ + if (sin6) { + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + sock_recv_timestamp(msg, sk, skb); + + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + /* Clear queue. */ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; + /* FIXME: increment a raw6 drops counter here */ + goto out_free; +} + +static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, + struct raw6_sock *rp, int len) +{ + struct sk_buff *skb; + int err = 0; + u16 *csum; + u32 tmp_csum; + + if (!rp->checksum) + goto send; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + if (rp->offset + 1 < len) + csum = (u16 *)(skb->h.raw + rp->offset); + else { + err = -EINVAL; + goto out; + } + + /* should be check HW csum miyazawa */ + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + tmp_csum = skb->csum; + } else { + tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + } + + /* in case cksum was not initialized */ + if (unlikely(*csum)) + tmp_csum = csum_sub(tmp_csum, *csum); + + *csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + + if (*csum == 0) + *csum = -1; +send: + err = ip6_push_pending_frames(sk); +out: + return err; +} + +static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, + struct flowi *fl, struct rt6_info *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + unsigned int hh_len; + int err; + + if (length > rt->u.dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMPV6: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + +static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p = NULL, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct dst_entry *dst = NULL; + struct flowi fl; + int addr_len = msg->msg_namelen; + int hlimit = -1; + u16 proto; + int err; + + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + */ + if (len < 0) + return -EMSGSIZE; + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Get and verify the address. + */ + memset(&fl, 0, sizeof(fl)); + + if (sin6) { + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + /* port is the proto value [0..255] carried in nexthdr */ + proto = ntohs(sin6->sin6_port); + + if (!proto) + proto = inet->num; + else if (proto != inet->num) + return(-EINVAL); + + if (proto > 255) + return(-EINVAL); + + daddr = &sin6->sin6_addr; + if (np->sndflow) { + fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. + */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl.oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + proto = inet->num; + daddr = &np->daddr; + fl.fl6_flowlabel = np->flow_label; + } + + if (ipv6_addr_any(daddr)) { + /* + * unspecified destination address + * treated as error... is this correct ? + */ + fl6_sock_release(flowlabel); + return(-EINVAL); + } + + if (fl.oif == 0) + fl.oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(struct ipv6_txoptions); + + err = datagram_send_ctl(msg, &fl, opt, &hlimit); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + + fl.proto = proto; + rawv6_probe_proto_opt(&fl, msg); + + ipv6_addr_copy(&fl.fl6_dst, daddr); + if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; + +back_from_confirm: + if (inet->hdrincl) { + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags); + } else { + lock_sock(sk); + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + hlimit, opt, &fl, (struct rt6_info*)dst, msg->msg_flags); + + if (err) + ip6_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = rawv6_push_pending_frames(sk, &fl, rp, len); + } +done: + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + + release_sock(sk); +out: + fl6_sock_release(flowlabel); + return err<0?err:len; +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + +static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + + switch (optname) { + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + + +static int rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. */ + if (val > 0 && (val&1)) + return(-EINVAL); + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } + + return 0; + break; + + default: + return(-ENOPROTOOPT); + } +} + +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val, len; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(len,optlen)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (rp->checksum == 0) + val = -1; + else + val = rp->offset; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, sizeof(int), len); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->tail - skb->h.raw; + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: + return -ENOIOCTLCMD; + } +} + +static void rawv6_close(struct sock *sk, long timeout) +{ + if (inet_sk(sk)->num == IPPROTO_RAW) + ip6_ra_control(sk, -1, NULL); + + sk_common_release(sk); +} + +static int rawv6_init_sk(struct sock *sk) +{ + if (inet_sk(sk)->num == IPPROTO_ICMPV6) { + struct raw6_sock *rp = raw6_sk(sk); + rp->checksum = 1; + rp->offset = 2; + } + return(0); +} + +struct proto rawv6_prot = { + .name = "RAWv6", + .owner = THIS_MODULE, + .close = rawv6_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = rawv6_ioctl, + .init = rawv6_init_sk, + .destroy = inet6_destroy_sock, + .setsockopt = rawv6_setsockopt, + .getsockopt = rawv6_getsockopt, + .sendmsg = rawv6_sendmsg, + .recvmsg = rawv6_recvmsg, + .bind = rawv6_bind, + .backlog_rcv = rawv6_rcv_skb, + .hash = raw_v6_hash, + .unhash = raw_v6_unhash, + .obj_size = sizeof(struct raw6_sock), +}; + +#ifdef CONFIG_PROC_FS +struct raw6_iter_state { + int bucket; +}; + +#define raw6_seq_private(seq) ((struct raw6_iter_state *)(seq)->private) + +static struct sock *raw6_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct hlist_node *node; + struct raw6_iter_state* state = raw6_seq_private(seq); + + for (state->bucket = 0; state->bucket < RAWV6_HTABLE_SIZE; ++state->bucket) + sk_for_each(sk, node, &raw_v6_htable[state->bucket]) + if (sk->sk_family == PF_INET6) + goto out; + sk = NULL; +out: + return sk; +} + +static struct sock *raw6_get_next(struct seq_file *seq, struct sock *sk) +{ + struct raw6_iter_state* state = raw6_seq_private(seq); + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != PF_INET6); + + if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { + sk = sk_head(&raw_v6_htable[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *raw6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = raw6_get_first(seq); + if (sk) + while (pos && (sk = raw6_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *raw6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&raw_v6_lock); + return *pos ? raw6_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *raw6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = raw6_get_first(seq); + else + sk = raw6_get_next(seq, v); + ++*pos; + return sk; +} + +static void raw6_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&raw_v6_lock); +} + +static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) +{ + struct ipv6_pinfo *np = inet6_sk(sp); + struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = 0; + srcp = inet_sk(sp)->num; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int raw6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + else + raw6_sock_seq_show(seq, v, raw6_seq_private(seq)->bucket); + return 0; +} + +static struct seq_operations raw6_seq_ops = { + .start = raw6_seq_start, + .next = raw6_seq_next, + .stop = raw6_seq_stop, + .show = raw6_seq_show, +}; + +static int raw6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct raw6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + rc = seq_open(file, &raw6_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations raw6_seq_fops = { + .owner = THIS_MODULE, + .open = raw6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init raw6_proc_init(void) +{ + if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops)) + return -ENOMEM; + return 0; +} + +void raw6_proc_exit(void) +{ + proc_net_remove("raw6"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c new file mode 100644 index 000000000000..59e7c6317872 --- /dev/null +++ b/net/ipv6/reassembly.c @@ -0,0 +1,771 @@ +/* + * IPv6 fragment reassembly + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: reassembly.c,v 1.26 2001/03/07 22:00:57 davem Exp $ + * + * Based on: net/ipv4/ip_fragment.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Fixes: + * Andi Kleen Make it work with multiple hosts. + * More RFC compliance. + * + * Horst von Brand Add missing #include <linux/string.h> + * Alexey Kuznetsov SMP races, threading, cleanup. + * Patrick McHardy LRU queue of frag heads for evictor. + * Mitsuru KANDA @USAGI Register inet6_protocol{}. + * David Stevens and + * YOSHIFUJI,H. @USAGI Always remove fragment header to + * calculate ICV correctly. + */ +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/jiffies.h> +#include <linux/net.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> +#include <linux/jhash.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/rawv6.h> +#include <net/ndisc.h> +#include <net/addrconf.h> + +int sysctl_ip6frag_high_thresh = 256*1024; +int sysctl_ip6frag_low_thresh = 192*1024; + +int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; + +struct ip6frag_skb_cb +{ + struct inet6_skb_parm h; + int offset; +}; + +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) + + +/* + * Equivalent of ipv4 struct ipq + */ + +struct frag_queue +{ + struct frag_queue *next; + struct list_head lru_list; /* lru list member */ + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* expire timer */ + struct sk_buff *fragments; + int len; + int meat; + int iif; + struct timeval stamp; + unsigned int csum; + __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + __u16 nhoffset; + struct frag_queue **pprev; +}; + +/* Hash table. */ + +#define IP6Q_HASHSZ 64 + +static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static DEFINE_RWLOCK(ip6_frag_lock); +static u32 ip6_frag_hash_rnd; +static LIST_HEAD(ip6_frag_lru_list); +int ip6_frag_nqueues = 0; + +static __inline__ void __fq_unlink(struct frag_queue *fq) +{ + if(fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + list_del(&fq->lru_list); + ip6_frag_nqueues--; +} + +static __inline__ void fq_unlink(struct frag_queue *fq) +{ + write_lock(&ip6_frag_lock); + __fq_unlink(fq); + write_unlock(&ip6_frag_lock); +} + +static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + u32 a, b, c; + + a = saddr->s6_addr32[0]; + b = saddr->s6_addr32[1]; + c = saddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += ip6_frag_hash_rnd; + __jhash_mix(a, b, c); + + a += saddr->s6_addr32[3]; + b += daddr->s6_addr32[0]; + c += daddr->s6_addr32[1]; + __jhash_mix(a, b, c); + + a += daddr->s6_addr32[2]; + b += daddr->s6_addr32[3]; + c += id; + __jhash_mix(a, b, c); + + return c & (IP6Q_HASHSZ - 1); +} + +static struct timer_list ip6_frag_secret_timer; +int sysctl_ip6frag_secret_interval = 10 * 60 * HZ; + +static void ip6_frag_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&ip6_frag_lock); + get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); + for (i = 0; i < IP6Q_HASHSZ; i++) { + struct frag_queue *q; + + q = ip6_frag_hash[i]; + while (q) { + struct frag_queue *next = q->next; + unsigned int hval = ip6qhashfn(q->id, + &q->saddr, + &q->daddr); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = ip6_frag_hash[hval]) != NULL) + q->next->pprev = &q->next; + ip6_frag_hash[hval] = q; + q->pprev = &ip6_frag_hash[hval]; + } + + q = next; + } + } + write_unlock(&ip6_frag_lock); + + mod_timer(&ip6_frag_secret_timer, now + sysctl_ip6frag_secret_interval); +} + +atomic_t ip6_frag_mem = ATOMIC_INIT(0); + +/* Memory Tracking Functions. */ +static inline void frag_kfree_skb(struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &ip6_frag_mem); + kfree_skb(skb); +} + +static inline void frag_free_queue(struct frag_queue *fq, int *work) +{ + if (work) + *work -= sizeof(struct frag_queue); + atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem); + kfree(fq); +} + +static inline struct frag_queue *frag_alloc_queue(void) +{ + struct frag_queue *fq = kmalloc(sizeof(struct frag_queue), GFP_ATOMIC); + + if(!fq) + return NULL; + atomic_add(sizeof(struct frag_queue), &ip6_frag_mem); + return fq; +} + +/* Destruction primitives. */ + +/* Complete destruction of fq. */ +static void ip6_frag_destroy(struct frag_queue *fq, int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); + + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + frag_free_queue(fq, work); +} + +static __inline__ void fq_put(struct frag_queue *fq, int *work) +{ + if (atomic_dec_and_test(&fq->refcnt)) + ip6_frag_destroy(fq, work); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct frag_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + } +} + +static void ip6_evictor(void) +{ + struct frag_queue *fq; + struct list_head *tmp; + int work; + + work = atomic_read(&ip6_frag_mem) - sysctl_ip6frag_low_thresh; + if (work <= 0) + return; + + while(work > 0) { + read_lock(&ip6_frag_lock); + if (list_empty(&ip6_frag_lru_list)) { + read_unlock(&ip6_frag_lock); + return; + } + tmp = ip6_frag_lru_list.next; + fq = list_entry(tmp, struct frag_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq, &work); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } +} + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq = (struct frag_queue *) data; + + spin_lock(&fq->lock); + + if (fq->last_in & COMPLETE) + goto out; + + fq_kill(fq); + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + + /* Send error only if the first segment arrived. */ + if (fq->last_in&FIRST_IN && fq->fragments) { + struct net_device *dev = dev_get_by_index(fq->iif); + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. + */ + if (dev) { + fq->fragments->dev = dev; + icmpv6_send(fq->fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, + dev); + dev_put(dev); + } + } +out: + spin_unlock(&fq->lock); + fq_put(fq, NULL); +} + +/* Creation primitives. */ + + +static struct frag_queue *ip6_frag_intern(unsigned int hash, + struct frag_queue *fq_in) +{ + struct frag_queue *fq; + + write_lock(&ip6_frag_lock); +#ifdef CONFIG_SMP + for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && + ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&ip6_frag_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in, NULL); + return fq; + } + } +#endif + fq = fq_in; + + if (!mod_timer(&fq->timer, jiffies + sysctl_ip6frag_time)) + atomic_inc(&fq->refcnt); + + atomic_inc(&fq->refcnt); + if((fq->next = ip6_frag_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + ip6_frag_hash[hash] = fq; + fq->pprev = &ip6_frag_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &ip6_frag_lru_list); + ip6_frag_nqueues++; + write_unlock(&ip6_frag_lock); + return fq; +} + + +static struct frag_queue * +ip6_frag_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; + + if ((fq = frag_alloc_queue()) == NULL) + goto oom; + + memset(fq, 0, sizeof(struct frag_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); + + init_timer(&fq->timer); + fq->timer.function = ip6_frag_expire; + fq->timer.data = (long) fq; + spin_lock_init(&fq->lock); + atomic_set(&fq->refcnt, 1); + + return ip6_frag_intern(hash, fq); + +oom: + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return NULL; +} + +static __inline__ struct frag_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&ip6_frag_lock); + for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + ipv6_addr_equal(src, &fq->saddr) && + ipv6_addr_equal(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + return fq; + } + } + read_unlock(&ip6_frag_lock); + + return ip6_frag_create(hash, id, src, dst); +} + + +static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->last_in & COMPLETE) + goto err; + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw); + return; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0)); + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) + goto err; + fq->last_in |= LAST_IN; + fq->len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, payload_len)); + return; + } + if (end > fq->len) { + /* Some bits beyond end -> corruption. */ + if (fq->last_in & LAST_IN) + goto err; + fq->len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) + goto err; + if (end-offset < skb->len) { + if (pskb_trim(skb, end - offset)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = fq->fragments; next != NULL; next = next->next) { + if (FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && FRAG6_CB(next)->offset < end) { + int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG6_CB(next)->offset += i; /* next fragment */ + fq->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragment is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; + + if (skb->dev) + fq->iif = skb->dev->ifindex; + skb->dev = NULL; + fq->stamp = skb->stamp; + fq->meat += skb->len; + atomic_add(skb->truesize, &ip6_frag_mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->last_in |= FIRST_IN; + } + write_lock(&ip6_frag_lock); + list_move_tail(&fq->lru_list, &ip6_frag_lru_list); + write_unlock(&ip6_frag_lock); + return; + +err: + IP6_INC_STATS(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, + unsigned int *nhoffp, + struct net_device *dev) +{ + struct sk_buff *fp, *head = fq->fragments; + int payload_len; + unsigned int nhoff; + + fq_kill(fq); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG6_CB(head)->offset == 0); + + /* Unfragmented part is taken from the first segment. */ + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_oom; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_oom; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; i<skb_shinfo(head)->nr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &ip6_frag_mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + nhoff = fq->nhoffset; + head->nh.raw[nhoff] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + head->h.raw = head->data; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &ip6_frag_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &ip6_frag_mem); + } + + head->next = NULL; + head->dev = dev; + head->stamp = fq->stamp; + head->nh.ipv6h->payload_len = htons(payload_len); + + *skb_in = head; + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + fq->fragments = NULL; + *nhoffp = nhoff; + return 1; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); +out_fail: + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return -1; +} + +static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct frag_queue *fq; + struct ipv6hdr *hdr; + + hdr = skb->nh.ipv6h; + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + + /* Jumbo payload inhibits frag. header */ + if (hdr->payload_len==0) { + IP6_INC_STATS(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + return -1; + } + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+sizeof(struct frag_hdr))) { + IP6_INC_STATS(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + return -1; + } + + hdr = skb->nh.ipv6h; + fhdr = (struct frag_hdr *)skb->h.raw; + + if (!(fhdr->frag_off & htons(0xFFF9))) { + /* It is not a fragmented frame */ + skb->h.raw += sizeof(struct frag_hdr); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + + *nhoffp = (u8*)fhdr - skb->nh.raw; + return 1; + } + + if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) + ip6_evictor(); + + if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr)) != NULL) { + int ret = -1; + + spin_lock(&fq->lock); + + ip6_frag_queue(fq, skb, fhdr, *nhoffp); + + if (fq->last_in == (FIRST_IN|LAST_IN) && + fq->meat == fq->len) + ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); + + spin_unlock(&fq->lock); + fq_put(fq, NULL); + return ret; + } + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -1; +} + +static struct inet6_protocol frag_protocol = +{ + .handler = ipv6_frag_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_frag_init(void) +{ + if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0) + printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n"); + + ip6_frag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&ip6_frag_secret_timer); + ip6_frag_secret_timer.function = ip6_frag_secret_rebuild; + ip6_frag_secret_timer.expires = jiffies + sysctl_ip6frag_secret_interval; + add_timer(&ip6_frag_secret_timer); +} diff --git a/net/ipv6/route.c b/net/ipv6/route.c new file mode 100644 index 000000000000..183802902c02 --- /dev/null +++ b/net/ipv6/route.c @@ -0,0 +1,2131 @@ +/* + * Linux INET6 implementation + * FIB front-end. + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * YOSHIFUJI Hideaki @USAGI + * reworked default router selection. + * - respect outgoing interface + * - select from (probably) reachable routers (i.e. + * routers in REACHABLE, STALE, DELAY or PROBE states). + * - always select the same router if it is (probably) + * reachable. otherwise, round-robin the list. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/times.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/route.h> +#include <linux/netdevice.h> +#include <linux/in6.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/if_arp.h> + +#ifdef CONFIG_PROC_FS +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#endif + +#include <net/snmp.h> +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/tcp.h> +#include <linux/rtnetlink.h> +#include <net/dst.h> +#include <net/xfrm.h> + +#include <asm/uaccess.h> + +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif + +/* Set to 3 to get tracing. */ +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RDBG(x) printk x +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RDBG(x) +#define RT6_TRACE(x...) do { ; } while (0) +#endif + + +static int ip6_rt_max_size = 4096; +static int ip6_rt_gc_min_interval = HZ / 2; +static int ip6_rt_gc_timeout = 60*HZ; +int ip6_rt_gc_interval = 30*HZ; +static int ip6_rt_gc_elasticity = 9; +static int ip6_rt_mtu_expires = 10*60*HZ; +static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_dst_destroy(struct dst_entry *); +static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +static int ip6_dst_gc(void); + +static int ip6_pkt_discard(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sk_buff *skb); +static void ip6_link_failure(struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); + +static struct dst_ops ip6_dst_ops = { + .family = AF_INET6, + .protocol = __constant_htons(ETH_P_IPV6), + .gc = ip6_dst_gc, + .gc_thresh = 1024, + .check = ip6_dst_check, + .destroy = ip6_dst_destroy, + .ifdown = ip6_dst_ifdown, + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, + .entry_size = sizeof(struct rt6_info), +}; + +struct rt6_info ip6_null_entry = { + .u = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .dev = &loopback_dev, + .obsolete = -1, + .error = -ENETUNREACH, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, + .ops = &ip6_dst_ops, + .path = (struct dst_entry*)&ip6_null_entry, + } + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +struct fib6_node ip6_routing_table = { + .leaf = &ip6_null_entry, + .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, +}; + +/* Protects all the ip6 fib */ + +DEFINE_RWLOCK(rt6_lock); + + +/* allocate dst with ip6_dst_ops */ +static __inline__ struct rt6_info *ip6_dst_alloc(void) +{ + return (struct rt6_info *)dst_alloc(&ip6_dst_ops); +} + +static void ip6_dst_destroy(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + + if (idev != NULL) { + rt->rt6i_idev = NULL; + in6_dev_put(idev); + } +} + +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + + if (dev != &loopback_dev && idev != NULL && idev->dev == dev) { + struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev); + if (loopback_idev != NULL) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } + } +} + +static __inline__ int rt6_check_expired(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & RTF_EXPIRES && + time_after(jiffies, rt->rt6i_expires)); +} + +/* + * Route lookup. Any rt6_lock is implied. + */ + +static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, + int oif, + int strict) +{ + struct rt6_info *local = NULL; + struct rt6_info *sprt; + + if (oif) { + for (sprt = rt; sprt; sprt = sprt->u.next) { + struct net_device *dev = sprt->rt6i_dev; + if (dev->ifindex == oif) + return sprt; + if (dev->flags & IFF_LOOPBACK) { + if (sprt->rt6i_idev == NULL || + sprt->rt6i_idev->dev->ifindex != oif) { + if (strict && oif) + continue; + if (local && (!oif || + local->rt6i_idev->dev->ifindex == oif)) + continue; + } + local = sprt; + } + } + + if (local) + return local; + + if (strict) + return &ip6_null_entry; + } + return rt; +} + +/* + * pointer to the last default router chosen. BH is disabled locally. + */ +static struct rt6_info *rt6_dflt_pointer; +static DEFINE_SPINLOCK(rt6_dflt_lock); + +void rt6_reset_dflt_pointer(struct rt6_info *rt) +{ + spin_lock_bh(&rt6_dflt_lock); + if (rt == NULL || rt == rt6_dflt_pointer) { + RT6_TRACE("reset default router: %p->NULL\n", rt6_dflt_pointer); + rt6_dflt_pointer = NULL; + } + spin_unlock_bh(&rt6_dflt_lock); +} + +/* Default Router Selection (RFC 2461 6.3.6) */ +static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif) +{ + struct rt6_info *match = NULL; + struct rt6_info *sprt; + int mpri = 0; + + for (sprt = rt; sprt; sprt = sprt->u.next) { + struct neighbour *neigh; + int m = 0; + + if (!oif || + (sprt->rt6i_dev && + sprt->rt6i_dev->ifindex == oif)) + m += 8; + + if (rt6_check_expired(sprt)) + continue; + + if (sprt == rt6_dflt_pointer) + m += 4; + + if ((neigh = sprt->rt6i_nexthop) != NULL) { + read_lock_bh(&neigh->lock); + switch (neigh->nud_state) { + case NUD_REACHABLE: + m += 3; + break; + + case NUD_STALE: + case NUD_DELAY: + case NUD_PROBE: + m += 2; + break; + + case NUD_NOARP: + case NUD_PERMANENT: + m += 1; + break; + + case NUD_INCOMPLETE: + default: + read_unlock_bh(&neigh->lock); + continue; + } + read_unlock_bh(&neigh->lock); + } else { + continue; + } + + if (m > mpri || m >= 12) { + match = sprt; + mpri = m; + if (m >= 12) { + /* we choose the last default router if it + * is in (probably) reachable state. + * If route changed, we should do pmtu + * discovery. --yoshfuji + */ + break; + } + } + } + + spin_lock(&rt6_dflt_lock); + if (!match) { + /* + * No default routers are known to be reachable. + * SHOULD round robin + */ + if (rt6_dflt_pointer) { + for (sprt = rt6_dflt_pointer->u.next; + sprt; sprt = sprt->u.next) { + if (sprt->u.dst.obsolete <= 0 && + sprt->u.dst.error == 0 && + !rt6_check_expired(sprt)) { + match = sprt; + break; + } + } + for (sprt = rt; + !match && sprt; + sprt = sprt->u.next) { + if (sprt->u.dst.obsolete <= 0 && + sprt->u.dst.error == 0 && + !rt6_check_expired(sprt)) { + match = sprt; + break; + } + if (sprt == rt6_dflt_pointer) + break; + } + } + } + + if (match) { + if (rt6_dflt_pointer != match) + RT6_TRACE("changed default router: %p->%p\n", + rt6_dflt_pointer, match); + rt6_dflt_pointer = match; + } + spin_unlock(&rt6_dflt_lock); + + if (!match) { + /* + * Last Resort: if no default routers found, + * use addrconf default route. + * We don't record this route. + */ + for (sprt = ip6_routing_table.leaf; + sprt; sprt = sprt->u.next) { + if (!rt6_check_expired(sprt) && + (sprt->rt6i_flags & RTF_DEFAULT) && + (!oif || + (sprt->rt6i_dev && + sprt->rt6i_dev->ifindex == oif))) { + match = sprt; + break; + } + } + if (!match) { + /* no default route. give up. */ + match = &ip6_null_entry; + } + } + + return match; +} + +struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, + int oif, int strict) +{ + struct fib6_node *fn; + struct rt6_info *rt; + + read_lock_bh(&rt6_lock); + fn = fib6_lookup(&ip6_routing_table, daddr, saddr); + rt = rt6_device_match(fn->leaf, oif, strict); + dst_hold(&rt->u.dst); + rt->u.dst.__use++; + read_unlock_bh(&rt6_lock); + + rt->u.dst.lastuse = jiffies; + if (rt->u.dst.error == 0) + return rt; + dst_release(&rt->u.dst); + return NULL; +} + +/* ip6_ins_rt is called with FREE rt6_lock. + It takes new route entry, the addition fails by any reason the + route is freed. In any case, if caller does not hold it, it may + be destroyed. + */ + +int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + + write_lock_bh(&rt6_lock); + err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); + write_unlock_bh(&rt6_lock); + + return err; +} + +/* No rt6_lock! If COW failed, the function returns dead route entry + with dst->error set to errno value. + */ + +static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + int err; + struct rt6_info *rt; + + /* + * Clone the route. + */ + + rt = ip6_rt_copy(ort); + + if (rt) { + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + + if (!(rt->rt6i_flags&RTF_GATEWAY)) + ipv6_addr_copy(&rt->rt6i_gateway, daddr); + + rt->rt6i_dst.plen = 128; + rt->rt6i_flags |= RTF_CACHE; + rt->u.dst.flags |= DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { + ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.plen = 128; + } +#endif + + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + + dst_hold(&rt->u.dst); + + err = ip6_ins_rt(rt, NULL, NULL); + if (err == 0) + return rt; + + rt->u.dst.error = err; + + return rt; + } + dst_hold(&ip6_null_entry.u.dst); + return &ip6_null_entry; +} + +#define BACKTRACK() \ +if (rt == &ip6_null_entry && strict) { \ + while ((fn = fn->parent) != NULL) { \ + if (fn->fn_flags & RTN_ROOT) { \ + dst_hold(&rt->u.dst); \ + goto out; \ + } \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ +} + + +void ip6_route_input(struct sk_buff *skb) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int strict; + int attempts = 3; + + strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); + +relookup: + read_lock_bh(&rt6_lock); + + fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr); + +restart: + rt = fn->leaf; + + if ((rt->rt6i_flags & RTF_CACHE)) { + rt = rt6_device_match(rt, skb->dev->ifindex, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; + } + + rt = rt6_device_match(rt, skb->dev->ifindex, 0); + BACKTRACK(); + + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + struct rt6_info *nrt; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr); + + dst_release(&rt->u.dst); + rt = nrt; + + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. + */ + dst_release(&rt->u.dst); + goto relookup; + } + dst_hold(&rt->u.dst); + +out: + read_unlock_bh(&rt6_lock); +out2: + rt->u.dst.lastuse = jiffies; + rt->u.dst.__use++; + skb->dst = (struct dst_entry *) rt; +} + +struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int strict; + int attempts = 3; + + strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); + +relookup: + read_lock_bh(&rt6_lock); + + fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src); + +restart: + rt = fn->leaf; + + if ((rt->rt6i_flags & RTF_CACHE)) { + rt = rt6_device_match(rt, fl->oif, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; + } + if (rt->rt6i_flags & RTF_DEFAULT) { + if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) + rt = rt6_best_dflt(rt, fl->oif); + } else { + rt = rt6_device_match(rt, fl->oif, strict); + BACKTRACK(); + } + + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + struct rt6_info *nrt; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); + + dst_release(&rt->u.dst); + rt = nrt; + + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. + */ + dst_release(&rt->u.dst); + goto relookup; + } + dst_hold(&rt->u.dst); + +out: + read_unlock_bh(&rt6_lock); +out2: + rt->u.dst.lastuse = jiffies; + rt->u.dst.__use++; + return &rt->u.dst; +} + + +/* + * Destination cache support functions + */ + +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rt6_info *rt; + + rt = (struct rt6_info *) dst; + + if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) + return dst; + + return NULL; +} + +static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + + if (rt) { + if (rt->rt6i_flags & RTF_CACHE) + ip6_del_rt(rt, NULL, NULL); + else + dst_release(dst); + } + return NULL; +} + +static void ip6_link_failure(struct sk_buff *skb) +{ + struct rt6_info *rt; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + + rt = (struct rt6_info *) skb->dst; + if (rt) { + if (rt->rt6i_flags&RTF_CACHE) { + dst_set_expires(&rt->u.dst, 0); + rt->rt6i_flags |= RTF_EXPIRES; + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + rt->rt6i_node->fn_sernum = -1; + } +} + +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info*)dst; + + if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + if (mtu < IPV6_MIN_MTU) { + mtu = IPV6_MIN_MTU; + dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + } + dst->metrics[RTAX_MTU-1] = mtu; + } +} + +/* Protected by rt6_lock. */ +static struct dst_entry *ndisc_dst_gc_list; +static int ipv6_get_mtu(struct net_device *dev); + +static inline unsigned int ipv6_advmss(unsigned int mtu) +{ + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + if (mtu < ip6_rt_min_advmss) + mtu = ip6_rt_min_advmss; + + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. + * IPV6_MAXPLEN is also valid and means: "any MSS, + * rely only on pmtu discovery" + */ + if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) + mtu = IPV6_MAXPLEN; + return mtu; +} + +struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + struct in6_addr *addr, + int (*output)(struct sk_buff *)) +{ + struct rt6_info *rt; + struct inet6_dev *idev = in6_dev_get(dev); + + if (unlikely(idev == NULL)) + return NULL; + + rt = ip6_dst_alloc(); + if (unlikely(rt == NULL)) { + in6_dev_put(idev); + goto out; + } + + dev_hold(dev); + if (neigh) + neigh_hold(neigh); + else + neigh = ndisc_get_neigh(dev, addr); + + rt->rt6i_dev = dev; + rt->rt6i_idev = idev; + rt->rt6i_nexthop = neigh; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.output = output; + +#if 0 /* there's no chance to use these for ndisc */ + rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST + ? DST_HOST + : 0; + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; +#endif + + write_lock_bh(&rt6_lock); + rt->u.dst.next = ndisc_dst_gc_list; + ndisc_dst_gc_list = &rt->u.dst; + write_unlock_bh(&rt6_lock); + + fib6_force_start_gc(); + +out: + return (struct dst_entry *)rt; +} + +int ndisc_dst_gc(int *more) +{ + struct dst_entry *dst, *next, **pprev; + int freed; + + next = NULL; + pprev = &ndisc_dst_gc_list; + freed = 0; + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + freed++; + } else { + pprev = &dst->next; + (*more)++; + } + } + + return freed; +} + +static int ip6_dst_gc(void) +{ + static unsigned expire = 30*HZ; + static unsigned long last_gc; + unsigned long now = jiffies; + + if (time_after(last_gc + ip6_rt_gc_min_interval, now) && + atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size) + goto out; + + expire++; + fib6_run_gc(expire); + last_gc = now; + if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) + expire = ip6_rt_gc_timeout>>1; + +out: + expire -= expire>>ip6_rt_gc_elasticity; + return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size); +} + +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! + */ + +static int ipv6_get_mtu(struct net_device *dev) +{ + int mtu = IPV6_MIN_MTU; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + if (idev) { + mtu = idev->cnf.mtu6; + in6_dev_put(idev); + } + return mtu; +} + +int ipv6_get_hoplimit(struct net_device *dev) +{ + int hoplimit = ipv6_devconf.hop_limit; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + if (idev) { + hoplimit = idev->cnf.hop_limit; + in6_dev_put(idev); + } + return hoplimit; +} + +/* + * + */ + +int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + struct rtmsg *r; + struct rtattr **rta; + struct rt6_info *rt = NULL; + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + int addr_type; + + rta = (struct rtattr **) _rtattr; + + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (rtmsg->rtmsg_src_len) + return -EINVAL; +#endif + if (rtmsg->rtmsg_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(rtmsg->rtmsg_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (rtmsg->rtmsg_metric == 0) + rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; + + rt = ip6_dst_alloc(); + + if (rt == NULL) { + err = -ENOMEM; + goto out; + } + + rt->u.dst.obsolete = -1; + rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info); + if (nlh && (r = NLMSG_DATA(nlh))) { + rt->rt6i_protocol = r->rtm_protocol; + } else { + rt->rt6i_protocol = RTPROT_BOOT; + } + + addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); + + if (addr_type & IPV6_ADDR_MULTICAST) + rt->u.dst.input = ip6_mc_input; + else + rt->u.dst.input = ip6_forward; + + rt->u.dst.output = ip6_output; + + ipv6_addr_prefix(&rt->rt6i_dst.addr, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len); + rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->u.dst.flags = DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_prefix(&rt->rt6i_src.addr, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + rt->rt6i_src.plen = rtmsg->rtmsg_src_len; +#endif + + rt->rt6i_metric = rtmsg->rtmsg_metric; + + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((rtmsg->rtmsg_flags&RTF_REJECT) || + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + /* hold loopback dev/idev if we haven't done so. */ + if (dev != &loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = &loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + rt->u.dst.output = ip6_pkt_discard_out; + rt->u.dst.input = ip6_pkt_discard; + rt->u.dst.error = -ENETUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + goto install_route; + } + + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { + struct in6_addr *gw_addr; + int gwa_type; + + gw_addr = &rtmsg->rtmsg_gateway; + ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway); + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly inhibits using not link-local + addresses as nexthop address. + Otherwise, router will not able to send redirects. + It is very good, but in some (rare!) circumstances + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. --ANK + */ + err = -EINVAL; + if (!(gwa_type&IPV6_ADDR_UNICAST)) + goto out; + + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1); + + err = -EHOSTUNREACH; + if (grt == NULL) + goto out; + if (dev) { + if (dev != grt->rt6i_dev) { + dst_release(&grt->u.dst); + goto out; + } + } else { + dev = grt->rt6i_dev; + idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + if (!(grt->rt6i_flags&RTF_GATEWAY)) + err = 0; + dst_release(&grt->u.dst); + + if (err) + goto out; + } + err = -EINVAL; + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + goto out; + } + + err = -ENODEV; + if (dev == NULL) + goto out; + + if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(rt->rt6i_nexthop)) { + err = PTR_ERR(rt->rt6i_nexthop); + rt->rt6i_nexthop = NULL; + goto out; + } + } + + rt->rt6i_flags = rtmsg->rtmsg_flags; + +install_route: + if (rta && rta[RTA_METRICS-1]) { + int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]); + struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > RTAX_MAX) { + err = -EINVAL; + goto out; + } + rt->u.dst.metrics[flavor-1] = + *(u32 *)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; + if (!rt->u.dst.metrics[RTAX_MTU-1]) + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); + if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.dev = dev; + rt->rt6i_idev = idev; + return ip6_ins_rt(rt, nlh, _rtattr); + +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free((struct dst_entry *) rt); + return err; +} + +int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + + write_lock_bh(&rt6_lock); + + rt6_reset_dflt_pointer(NULL); + + err = fib6_del(rt, nlh, _rtattr); + dst_release(&rt->u.dst); + + write_unlock_bh(&rt6_lock); + + return err; +} + +static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int err = -ESRCH; + + read_lock_bh(&rt6_lock); + + fn = fib6_locate(&ip6_routing_table, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (rtmsg->rtmsg_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) + continue; + if (rtmsg->rtmsg_flags&RTF_GATEWAY && + !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) + continue; + if (rtmsg->rtmsg_metric && + rtmsg->rtmsg_metric != rt->rt6i_metric) + continue; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + return ip6_del_rt(rt, nlh, _rtattr); + } + } + read_unlock_bh(&rt6_lock); + + return err; +} + +/* + * Handle redirects + */ +void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, + struct neighbour *neigh, u8 *lladdr, int on_link) +{ + struct rt6_info *rt, *nrt; + + /* Locate old route to this destination. */ + rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1); + + if (rt == NULL) + return; + + if (neigh->dev != rt->rt6i_dev) + goto out; + + /* + * Current route is on-link; redirect is always invalid. + * + * Seems, previous statement is not true. It could + * be node, which looks for us as on-link (f.e. proxy ndisc) + * But then router serving it might decide, that we should + * know truth 8)8) --ANK (980726). + */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) + goto out; + + /* + * RFC 2461 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way default routers are chosen, this notion + * is a bit fuzzy and one might need to check all default + * routers. + */ + if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) { + if (rt->rt6i_flags & RTF_DEFAULT) { + struct rt6_info *rt1; + + read_lock(&rt6_lock); + for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) { + if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) { + dst_hold(&rt1->u.dst); + dst_release(&rt->u.dst); + read_unlock(&rt6_lock); + rt = rt1; + goto source_ok; + } + } + read_unlock(&rt6_lock); + } + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + "for redirect target\n"); + goto out; + } + +source_ok: + + /* + * We have finally decided to accept it. + */ + + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER)) + ); + + /* + * Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + + /* Duplicate redirect: silently ignore. */ + if (neigh == rt->u.dst.neighbour) + goto out; + + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; + + ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); + nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; + + ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + nrt->rt6i_nexthop = neigh_clone(neigh); + /* Reset pmtu, it may be better */ + nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); + nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); + + if (ip6_ins_rt(nrt, NULL, NULL)) + goto out; + + if (rt->rt6i_flags&RTF_CACHE) { + ip6_del_rt(rt, NULL, NULL); + return; + } + +out: + dst_release(&rt->u.dst); + return; +} + +/* + * Handle ICMP "packet too big" messages + * i.e. Path MTU discovery + */ + +void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, + struct net_device *dev, u32 pmtu) +{ + struct rt6_info *rt, *nrt; + int allfrag = 0; + + rt = rt6_lookup(daddr, saddr, dev->ifindex, 0); + if (rt == NULL) + return; + + if (pmtu >= dst_mtu(&rt->u.dst)) + goto out; + + if (pmtu < IPV6_MIN_MTU) { + /* + * According to RFC2460, PMTU is set to the IPv6 Minimum Link + * MTU (1280) and a fragment header should always be included + * after a node receiving Too Big message reporting PMTU is + * less than the IPv6 Minimum Link MTU. + */ + pmtu = IPV6_MIN_MTU; + allfrag = 1; + } + + /* New mtu received -> path was valid. + They are sent only in response to data packets, + so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + + /* Host route. If it is static, it would be better + not to override it, but add new one, so that + when cache entry will expire old pmtu + would return automatically. + */ + if (rt->rt6i_flags & RTF_CACHE) { + rt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires); + rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; + goto out; + } + + /* Network route. + Two cases are possible: + 1. It is connected route. Action: COW + 2. It is gatewayed route or NONEXTHOP route. Action: clone it. + */ + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + nrt = rt6_cow(rt, daddr, saddr); + if (!nrt->u.dst.error) { + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + /* According to RFC 1981, detecting PMTU increase shouldn't be + happened within 5 mins, the recommended timer is 10 mins. + Here this route expiration time is set to ip6_rt_mtu_expires + which is 10 mins. After 10 mins the decreased pmtu is expired + and detecting PMTU increase will be automatically happened. + */ + dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; + } + dst_release(&nrt->u.dst); + } else { + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr); + nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; + nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES; + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + ip6_ins_rt(nrt, NULL, NULL); + } + +out: + dst_release(&rt->u.dst); +} + +/* + * Misc support functions + */ + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (rt) { + rt->u.dst.input = ort->u.dst.input; + rt->u.dst.output = ort->u.dst.output; + + memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + rt->u.dst.dev = ort->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->u.dst.lastuse = jiffies; + rt->rt6i_expires = 0; + + ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_metric = 0; + + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + } + return rt; +} + +struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev) +{ + struct rt6_info *rt; + struct fib6_node *fn; + + fn = &ip6_routing_table; + + write_lock_bh(&rt6_lock); + for (rt = fn->leaf; rt; rt=rt->u.next) { + if (dev == rt->rt6i_dev && + ipv6_addr_equal(&rt->rt6i_gateway, addr)) + break; + } + if (rt) + dst_hold(&rt->u.dst); + write_unlock_bh(&rt6_lock); + return rt; +} + +struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, + struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr); + rtmsg.rtmsg_metric = 1024; + rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES; + + rtmsg.rtmsg_ifindex = dev->ifindex; + + ip6_route_add(&rtmsg, NULL, NULL); + return rt6_get_dflt_router(gwaddr, dev); +} + +void rt6_purge_dflt_routers(void) +{ + struct rt6_info *rt; + +restart: + read_lock_bh(&rt6_lock); + for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { + dst_hold(&rt->u.dst); + + rt6_reset_dflt_pointer(NULL); + + read_unlock_bh(&rt6_lock); + + ip6_del_rt(rt, NULL, NULL); + + goto restart; + } + } + read_unlock_bh(&rt6_lock); +} + +int ipv6_route_ioctl(unsigned int cmd, void __user *arg) +{ + struct in6_rtmsg rtmsg; + int err; + + switch(cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = copy_from_user(&rtmsg, arg, + sizeof(struct in6_rtmsg)); + if (err) + return -EFAULT; + + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&rtmsg, NULL, NULL); + break; + case SIOCDELRT: + err = ip6_route_del(&rtmsg, NULL, NULL); + break; + default: + err = -EINVAL; + } + rtnl_unlock(); + + return err; + }; + + return -EINVAL; +} + +/* + * Drop the packet on the floor + */ + +int ip6_pkt_discard(struct sk_buff *skb) +{ + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); + kfree_skb(skb); + return 0; +} + +int ip6_pkt_discard_out(struct sk_buff *skb) +{ + skb->dev = skb->dst->dev; + return ip6_pkt_discard(skb); +} + +/* + * Allocate a dst for local (unicast / anycast) address. + */ + +struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + int anycast) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (rt == NULL) + return ERR_PTR(-ENOMEM); + + dev_hold(&loopback_dev); + in6_dev_hold(idev); + + rt->u.dst.flags = DST_HOST; + rt->u.dst.input = ip6_input; + rt->u.dst.output = ip6_output; + rt->rt6i_dev = &loopback_dev; + rt->rt6i_idev = idev; + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; + rt->u.dst.obsolete = -1; + + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + if (!anycast) + rt->rt6i_flags |= RTF_LOCAL; + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + if (rt->rt6i_nexthop == NULL) { + dst_free((struct dst_entry *) rt); + return ERR_PTR(-ENOMEM); + } + + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; + + atomic_set(&rt->u.dst.__refcnt, 1); + + return rt; +} + +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + if (((void*)rt->rt6i_dev == arg || arg == NULL) && + rt != &ip6_null_entry) { + RT6_TRACE("deleted by ifdown %p\n", rt); + return -1; + } + return 0; +} + +void rt6_ifdown(struct net_device *dev) +{ + write_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev); + write_unlock_bh(&rt6_lock); +} + +struct rt6_mtu_change_arg +{ + struct net_device *dev; + unsigned mtu; +}; + +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + struct inet6_dev *idev; + + /* In IPv6 pmtu discovery is not optional, + so that RTAX_MTU lock cannot disable it. + We still use this lock to block changes + caused by addrconf/ndisc. + */ + + idev = __in6_dev_get(arg->dev); + if (idev == NULL) + return 0; + + /* For administrative MTU increase, there is no way to discover + IPv6 PMTU increase, so PMTU increase should be updated here. + Since RFC 1981 doesn't include administrative MTU increase + update PMTU increase is a MUST. (i.e. jumbo frame) + */ + /* + If new MTU is less than route PMTU, this new MTU will be the + lowest MTU in the path, update the route PMTU to reflect PMTU + decreases; if new MTU is greater than route PMTU, and the + old MTU is the lowest MTU in the path, update the route PMTU + to reflect the increase. In this case if the other nodes' MTU + also have the lowest MTU, TOO BIG MESSAGE will be lead to + PMTU discouvery. + */ + if (rt->rt6i_dev == arg->dev && + !dst_metric_locked(&rt->u.dst, RTAX_MTU) && + (dst_mtu(&rt->u.dst) > arg->mtu || + (dst_mtu(&rt->u.dst) < arg->mtu && + dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) + rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu); + return 0; +} + +void rt6_mtu_change(struct net_device *dev, unsigned mtu) +{ + struct rt6_mtu_change_arg arg; + + arg.dev = dev; + arg.mtu = mtu; + read_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg); + read_unlock_bh(&rt6_lock); +} + +static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta, + struct in6_rtmsg *rtmsg) +{ + memset(rtmsg, 0, sizeof(*rtmsg)); + + rtmsg->rtmsg_dst_len = r->rtm_dst_len; + rtmsg->rtmsg_src_len = r->rtm_src_len; + rtmsg->rtmsg_flags = RTF_UP; + if (r->rtm_type == RTN_UNREACHABLE) + rtmsg->rtmsg_flags |= RTF_REJECT; + + if (rta[RTA_GATEWAY-1]) { + if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16); + rtmsg->rtmsg_flags |= RTF_GATEWAY; + } + if (rta[RTA_DST-1]) { + if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3)); + } + if (rta[RTA_SRC-1]) { + if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3)); + } + if (rta[RTA_OIF-1]) { + if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int))) + return -EINVAL; + memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + } + if (rta[RTA_PRIORITY-1]) { + if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + } + return 0; +} + +int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + return ip6_route_del(&rtmsg, nlh, arg); +} + +int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + return ip6_route_add(&rtmsg, nlh, arg); +} + +struct rt6_rtnl_dump_arg +{ + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, + struct in6_addr *dst, + struct in6_addr *src, + int iif, + int type, u32 pid, u32 seq, + struct nlmsghdr *in_nlh, int prefix) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; + + if (prefix) { /* user wants prefix routes only */ + if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } + + if (!pid && in_nlh) { + pid = in_nlh->nlmsg_pid; + } + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET6; + rtm->rtm_dst_len = rt->rt6i_dst.plen; + rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + if (rt->rt6i_flags&RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = rt->rt6i_protocol; + if (rt->rt6i_flags&RTF_DYNAMIC) + rtm->rtm_protocol = RTPROT_REDIRECT; + else if (rt->rt6i_flags & RTF_ADDRCONF) + rtm->rtm_protocol = RTPROT_KERNEL; + else if (rt->rt6i_flags&RTF_DEFAULT) + rtm->rtm_protocol = RTPROT_RA; + + if (rt->rt6i_flags&RTF_CACHE) + rtm->rtm_flags |= RTM_F_CLONED; + + if (dst) { + RTA_PUT(skb, RTA_DST, 16, dst); + rtm->rtm_dst_len = 128; + } else if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES + if (src) { + RTA_PUT(skb, RTA_SRC, 16, src); + rtm->rtm_src_len = 128; + } else if (rtm->rtm_src_len) + RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif + if (iif) + RTA_PUT(skb, RTA_IIF, 4, &iif); + else if (dst) { + struct in6_addr saddr_buf; + if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) + RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + goto rtattr_failure; + if (rt->u.dst.neighbour) + RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex); + RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric); + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + if (rt->rt6i_expires) + ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies); + else + ci.rta_expires = 0; + ci.rta_used = rt->u.dst.__use; + ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); + ci.rta_error = rt->u.dst.error; + ci.rta_id = 0; + ci.rta_ts = 0; + ci.rta_tsage = 0; + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int rt6_dump_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + int prefix; + + if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) { + struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh); + prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; + } else + prefix = 0; + + return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, + NULL, prefix); +} + +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend walking */ + w->leaf = rt; + return 1; + } + BUG_TRAP(res!=0); + } + w->leaf = NULL; + return 0; +} + +static void fib6_dump_end(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[0]; + + if (w) { + cb->args[0] = 0; + fib6_walker_unlink(w); + kfree(w); + } + if (cb->args[1]) { + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; + } +} + +static int fib6_dump_done(struct netlink_callback *cb) +{ + fib6_dump_end(cb); + return cb->done(cb); +} + +int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + int res; + + arg.skb = skb; + arg.cb = cb; + + w = (void*)cb->args[0]; + if (w == NULL) { + /* New dump: + * + * 1. hook callback destructor. + */ + cb->args[1] = (long)cb->done; + cb->done = fib6_dump_done; + + /* + * 2. allocate and initialize walker. + */ + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w == NULL) + return -ENOMEM; + RT6_TRACE("dump<%p", w); + memset(w, 0, sizeof(*w)); + w->root = &ip6_routing_table; + w->func = fib6_dump_node; + w->args = &arg; + cb->args[0] = (long)w; + read_lock_bh(&rt6_lock); + res = fib6_walk(w); + read_unlock_bh(&rt6_lock); + } else { + w->args = &arg; + read_lock_bh(&rt6_lock); + res = fib6_walk_continue(w); + read_unlock_bh(&rt6_lock); + } +#if RT6_DEBUG >= 3 + if (res <= 0 && skb->len == 0) + RT6_TRACE("%p>dump end\n", w); +#endif + res = res < 0 ? res : skb->len; + /* res < 0 is an error. (really, impossible) + res == 0 means that dump is complete, but skb still can contain data. + res > 0 dump is not complete, but frame is full. + */ + /* Destroy walker, if dump of this table is complete. */ + if (res <= 0) + fib6_dump_end(cb); + return res; +} + +int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + int iif = 0; + int err = -ENOBUFS; + struct sk_buff *skb; + struct flowi fl; + struct rt6_info *rt; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto out; + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); + + memset(&fl, 0, sizeof(fl)); + if (rta[RTA_SRC-1]) + ipv6_addr_copy(&fl.fl6_src, + (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1])); + if (rta[RTA_DST-1]) + ipv6_addr_copy(&fl.fl6_dst, + (struct in6_addr*)RTA_DATA(rta[RTA_DST-1])); + + if (rta[RTA_IIF-1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + + if (iif) { + struct net_device *dev; + dev = __dev_get_by_index(iif); + if (!dev) { + err = -ENODEV; + goto out_free; + } + } + + fl.oif = 0; + if (rta[RTA_OIF-1]) + memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + + rt = (struct rt6_info*)ip6_route_output(NULL, &fl); + + skb->dst = &rt->u.dst; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + err = rt6_fill_node(skb, rt, + &fl.fl6_dst, &fl.fl6_src, + iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, nlh, 0); + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + return err; +out_free: + kfree_skb(skb); + goto out; +} + +void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, gfp_any()); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); + return; + } + if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); +} + +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS + +#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) + +struct rt6_proc_arg +{ + char *buffer; + int offset; + int length; + int skip; + int len; +}; + +static int rt6_info_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; + int i; + + if (arg->skip < arg->offset / RT6_INFO_LEN) { + arg->skip++; + return 0; + } + + if (arg->len >= arg->length) + return 0; + + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_dst.addr.s6_addr[i]); + arg->len += 2; + } + arg->len += sprintf(arg->buffer + arg->len, " %02x ", + rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_src.addr.s6_addr[i]); + arg->len += 2; + } + arg->len += sprintf(arg->buffer + arg->len, " %02x ", + rt->rt6i_src.plen); +#else + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000 00 "); + arg->len += 36; +#endif + + if (rt->rt6i_nexthop) { + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_nexthop->primary_key[i]); + arg->len += 2; + } + } else { + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000"); + arg->len += 32; + } + arg->len += sprintf(arg->buffer + arg->len, + " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt), + rt->u.dst.__use, rt->rt6i_flags, + rt->rt6i_dev ? rt->rt6i_dev->name : ""); + return 0; +} + +static int rt6_proc_info(char *buffer, char **start, off_t offset, int length) +{ + struct rt6_proc_arg arg; + arg.buffer = buffer; + arg.offset = offset; + arg.length = length; + arg.skip = 0; + arg.len = 0; + + read_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg); + read_unlock_bh(&rt6_lock); + + *start = buffer; + if (offset) + *start += offset % RT6_INFO_LEN; + + arg.len -= offset % RT6_INFO_LEN; + + if (arg.len > length) + arg.len = length; + if (arg.len < 0) + arg.len = 0; + + return arg.len; +} + +extern struct rt6_statistics rt6_stats; + +static int rt6_stats_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", + rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, + rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, + rt6_stats.fib_rt_cache, + atomic_read(&ip6_dst_ops.entries), + rt6_stats.fib_discarded_routes); + + return 0; +} + +static int rt6_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt6_stats_seq_show, NULL); +} + +static struct file_operations rt6_stats_seq_fops = { + .owner = THIS_MODULE, + .open = rt6_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SYSCTL + +static int flush_delay; + +static +int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay); + return 0; + } else + return -EINVAL; +} + +ctl_table ipv6_route_table[] = { + { + .ctl_name = NET_IPV6_ROUTE_FLUSH, + .procname = "flush", + .data = &flush_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv6_sysctl_rtcache_flush + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_THRESH, + .procname = "gc_thresh", + .data = &ip6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ROUTE_MAX_SIZE, + .procname = "max_size", + .data = &ip6_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL, + .procname = "gc_min_interval", + .data = &ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT, + .procname = "gc_timeout", + .data = &ip6_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL, + .procname = "gc_interval", + .data = &ip6_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY, + .procname = "gc_elasticity", + .data = &ip6_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES, + .procname = "mtu_expires", + .data = &ip6_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS, + .procname = "min_adv_mss", + .data = &ip6_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, + .procname = "gc_min_interval_ms", + .data = &ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { .ctl_name = 0 } +}; + +#endif + +void __init ip6_route_init(void) +{ + struct proc_dir_entry *p; + + ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", + sizeof(struct rt6_info), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ip6_dst_ops.kmem_cachep) + panic("cannot create ip6_dst_cache"); + + fib6_init(); +#ifdef CONFIG_PROC_FS + p = proc_net_create("ipv6_route", 0, rt6_proc_info); + if (p) + p->owner = THIS_MODULE; + + proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops); +#endif +#ifdef CONFIG_XFRM + xfrm6_init(); +#endif +} + +void ip6_route_cleanup(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove("ipv6_route"); + proc_net_remove("rt6_stats"); +#endif +#ifdef CONFIG_XFRM + xfrm6_fini(); +#endif + rt6_ifdown(NULL); + fib6_gc_cleanup(); + kmem_cache_destroy(ip6_dst_ops.kmem_cachep); +} diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c new file mode 100644 index 000000000000..b788f55e139b --- /dev/null +++ b/net/ipv6/sit.c @@ -0,0 +1,833 @@ +/* + * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> + * + * $Id: sit.c,v 1.53 2001/09/25 05:09:53 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Roger Venning <r.venning@telstra.com>: 6to4 support + * Nate Thompson <nate@thebog.net>: 6to4 support + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/icmp.h> +#include <asm/uaccess.h> +#include <linux/init.h> +#include <linux/netfilter_ipv4.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_fib.h> +#include <net/ip6_route.h> +#include <net/ndisc.h> +#include <net/addrconf.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/icmp.h> +#include <net/ipip.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> +#include <net/dsfield.h> + +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip6_fb_tunnel_init(struct net_device *dev); +static int ipip6_tunnel_init(struct net_device *dev); +static void ipip6_tunnel_setup(struct net_device *dev); + +static struct net_device *ipip6_fb_tunnel_dev; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static DEFINE_RWLOCK(ipip6_lock); + +static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &tunnels[prio][h]; +} + +static void ipip6_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipip6_lock); + *tp = t->next; + write_unlock_bh(&ipip6_lock); + break; + } + } +} + +static void ipip6_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipip6_bucket(t); + + t->next = *tp; + write_lock_bh(&ipip6_lock); + *tp = t; + write_unlock_bh(&ipip6_lock); +} + +static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = 0; + int prio = 0; + char name[IFNAMSIZ]; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + goto failed; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "sit%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); + if (dev == NULL) + return NULL; + + nt = dev->priv; + dev->init = ipip6_tunnel_init; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + dev_hold(dev); + + ipip6_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + return NULL; +} + +static void ipip6_tunnel_uninit(struct net_device *dev) +{ + if (dev == ipip6_fb_tunnel_dev) { + write_lock_bh(&ipip6_lock); + tunnels_wc[0] = NULL; + write_unlock_bh(&ipip6_lock); + dev_put(dev); + } else { + ipip6_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); + } +} + + +static void ipip6_err(struct sk_buff *skb, u32 info) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)skb->data; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipip6_lock); + t = ipip6_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipip6_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct ipv6hdr *iph6; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rt6_info *rt6i; + + if (len < hlen + sizeof(struct ipv6hdr)) + return; + iph6 = (struct ipv6hdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMPV6_PARAMPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Too complicated case ... */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. --ANK + */ + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + rel_type = ICMPV6_TIME_EXCEED; + rel_code = ICMPV6_EXC_HOPLIMIT; + break; + } + + /* Prepare fake skb to feed it to icmpv6_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)iph6); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); + if (rt6i && rt6i->rt6i_dev) { + skb2->dev = rt6i->rt6i_dev; + + rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); + + if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { + struct ip_tunnel * t = (struct ip_tunnel*)rt6i->rt6i_dev->priv; + if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + } + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + } + } + kfree_skb(skb2); + return; +#endif +} + +static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) + IP6_ECN_set_ce(skb->nh.ipv6h); +} + +static int ipip6_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + iph = skb->nh.iph; + + read_lock(&ipip6_lock); + if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipip6_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipip6_lock); + return 0; + } + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + read_unlock(&ipip6_lock); +out: + return 0; +} + +/* Returns the embedded IPv4 address if the IPv6 address + comes from 6to4 (RFC 3056) addr space */ + +static inline u32 try_6to4(struct in6_addr *v6dst) +{ + u32 dst = 0; + + if (v6dst->s6_addr16[0] == htons(0x2002)) { + /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ + memcpy(&dst, &v6dst->s6_addr16[1], 4); + } + return dst; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. + */ + +static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + struct ipv6hdr *iph6 = skb->nh.ipv6h; + u8 tos = tunnel->parms.iph.tos; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + struct in6_addr *addr6; + int addr_type; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != htons(ETH_P_IPV6)) + goto tx_error; + + if (!dst) + dst = try_6to4(&iph6->daddr); + + if (!dst) { + struct neighbour *neigh = NULL; + + if (skb->dst) + neigh = skb->dst->neighbour; + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } + + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + } + if (rt->rt_type != RTN_UNICAST) { + ip_rt_put(rt); + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + if (tiph->frag_off) + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (tunnel->parms.iph.daddr && skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + iph6 = skb->nh.ipv6h; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + if (mtu > IPV6_MIN_MTU) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + + iph->protocol = IPPROTO_IPV6; + iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipip6_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = (struct ip_tunnel*)dev->priv; + ipip6_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip6_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipip6_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == ipip6_fb_tunnel_dev->priv) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipip6_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip6_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipip6_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipip6_tunnel_xmit; + dev->get_stats = ipip6_tunnel_get_stats; + dev->do_ioctl = ipip6_tunnel_ioctl; + dev->change_mtu = ipip6_tunnel_change_mtu; + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipip6_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + if (iph->daddr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +int __init ipip6_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = dev->priv; + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + +static struct net_protocol sit_protocol = { + .handler = ipip6_rcv, + .err_handler = ipip6_err, +}; + +void __exit sit_cleanup(void) +{ + inet_del_protocol(&sit_protocol, IPPROTO_IPV6); + unregister_netdev(ipip6_fb_tunnel_dev); +} + +int __init sit_init(void) +{ + int err; + + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + if (inet_add_protocol(&sit_protocol, IPPROTO_IPV6) < 0) { + printk(KERN_INFO "sit init: Can't add protocol\n"); + return -EAGAIN; + } + + ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + ipip6_tunnel_setup); + if (!ipip6_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init; + + if ((err = register_netdev(ipip6_fb_tunnel_dev))) + goto err2; + + out: + return err; + err2: + free_netdev(ipip6_fb_tunnel_dev); + err1: + inet_del_protocol(&sit_protocol, IPPROTO_IPV6); + goto out; +} diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c new file mode 100644 index 000000000000..3a18e0e6ffed --- /dev/null +++ b/net/ipv6/sysctl_net_ipv6.c @@ -0,0 +1,125 @@ +/* + * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. + * + * Changes: + * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/config.h> +#include <linux/in6.h> +#include <linux/ipv6.h> +#include <net/ndisc.h> +#include <net/ipv6.h> +#include <net/addrconf.h> + +extern ctl_table ipv6_route_table[]; +extern ctl_table ipv6_icmp_table[]; + +#ifdef CONFIG_SYSCTL + +static ctl_table ipv6_table[] = { + { + .ctl_name = NET_IPV6_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv6_route_table + }, + { + .ctl_name = NET_IPV6_ICMP, + .procname = "icmp", + .maxlen = 0, + .mode = 0555, + .child = ipv6_icmp_table + }, + { + .ctl_name = NET_IPV6_BINDV6ONLY, + .procname = "bindv6only", + .data = &sysctl_ipv6_bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH, + .procname = "ip6frag_high_thresh", + .data = &sysctl_ip6frag_high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, + .procname = "ip6frag_low_thresh", + .data = &sysctl_ip6frag_low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_TIME, + .procname = "ip6frag_time", + .data = &sysctl_ip6frag_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL, + .procname = "ip6frag_secret_interval", + .data = &sysctl_ip6frag_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV6_MLD_MAX_MSF, + .procname = "mld_max_msf", + .data = &sysctl_mld_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ipv6_sysctl_header; + +static ctl_table ipv6_net_table[] = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = ipv6_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv6_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv6_net_table + }, + { .ctl_name = 0 } +}; + +void ipv6_sysctl_register(void) +{ + ipv6_sysctl_header = register_sysctl_table(ipv6_root_table, 0); +} + +void ipv6_sysctl_unregister(void) +{ + unregister_sysctl_table(ipv6_sysctl_header); +} + +#endif /* CONFIG_SYSCTL */ + + + diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c new file mode 100644 index 000000000000..4760c85e19db --- /dev/null +++ b/net/ipv6/tcp_ipv6.c @@ -0,0 +1,2265 @@ +/* + * TCP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * $Id: tcp_ipv6.c,v 1.144 2002/02/01 22:01:04 davem Exp $ + * + * Based on: + * linux/net/ipv4/tcp.c + * linux/net/ipv4/tcp_input.c + * linux/net/ipv4/tcp_output.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/net.h> +#include <linux/jiffies.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/init.h> +#include <linux/jhash.h> +#include <linux/ipsec.h> +#include <linux/times.h> + +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/random.h> + +#include <net/tcp.h> +#include <net/ndisc.h> +#include <net/ipv6.h> +#include <net/transp_v6.h> +#include <net/addrconf.h> +#include <net/ip6_route.h> +#include <net/ip6_checksum.h> +#include <net/inet_ecn.h> +#include <net/protocol.h> +#include <net/xfrm.h> +#include <net/addrconf.h> +#include <net/snmp.h> +#include <net/dsfield.h> + +#include <asm/uaccess.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +static void tcp_v6_send_reset(struct sk_buff *skb); +static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req); +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); + +static struct tcp_func ipv6_mapped; +static struct tcp_func ipv6_specific; + +/* I have no idea if this is a good hash for v6 or not. -DaveM */ +static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, + struct in6_addr *faddr, u16 fport) +{ + int hashent = (lport ^ fport); + + hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); + hashent ^= hashent>>16; + hashent ^= hashent>>8; + return (hashent & (tcp_ehash_size - 1)); +} + +static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *laddr = &np->rcv_saddr; + struct in6_addr *faddr = &np->daddr; + __u16 lport = inet->num; + __u16 fport = inet->dport; + return tcp_v6_hashfn(laddr, lport, faddr, fport); +} + +static inline int tcp_v6_bind_conflict(struct sock *sk, + struct tcp_bind_bucket *tb) +{ + struct sock *sk2; + struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +/* Grrr, addr_type already calculated by caller, but I don't want + * to add some silly "cookie" argument to this method just for that. + * But it doesn't matter, the recalculation is in the rarest path + * this function ever takes. + */ +static int tcp_v6_get_port(struct sock *sk, unsigned short snum) +{ + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + struct hlist_node *node; + int ret; + + local_bh_disable(); + if (snum == 0) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + do { rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. */ + snum = rover; + } else { + head = &tcp_bhash[tcp_bhashfn(snum)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (tb && !hlist_empty(&tb->owners)) { + if (tb->fastreuse > 0 && sk->sk_reuse && + sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (tcp_v6_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + +success: + if (!tcp_sk(sk)->bind_hash) + tcp_bind_hash(sk, tb, snum); + BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +static __inline__ void __tcp_v6_hash(struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + lock = &tcp_lhash_lock; + tcp_listen_wlock(); + } else { + sk->sk_hashent = tcp_v6_sk_hashfn(sk); + list = &tcp_ehash[sk->sk_hashent].chain; + lock = &tcp_ehash[sk->sk_hashent].lock; + write_lock(lock); + } + + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); +} + + +static void tcp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->af_specific == &ipv6_mapped) { + tcp_prot.hash(sk); + return; + } + local_bh_disable(); + __tcp_v6_hash(sk); + local_bh_enable(); + } +} + +static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) +{ + struct sock *sk; + struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore; + + hiscore=0; + read_lock(&tcp_lhash_lock); + sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if (score == 3) { + result = sk; + break; + } + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + if (result) + sock_hold(result); + read_unlock(&tcp_lhash_lock); + return result; +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ + +static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) +{ + struct tcp_ehash_bucket *head; + struct sock *sk; + struct hlist_node *node; + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + int hash; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + head = &tcp_ehash[hash]; + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + /* FIXME: acme: check this... */ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk->sk_family == PF_INET6) { + if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + goto hit; + } + } + read_unlock(&head->lock); + return NULL; + +hit: + sock_hold(sk); + read_unlock(&head->lock); + return sk; +} + + +static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) +{ + struct sock *sk; + + sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v6_lookup_listener(daddr, hnum, dif); +} + +inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(tcp_v6_lookup); + + +/* + * Open request hash tables. + */ + +static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) +{ + u32 a, b, c; + + a = raddr->s6_addr32[0]; + b = raddr->s6_addr32[1]; + c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32) rport; + __jhash_mix(a, b, c); + + return c & (TCP_SYNQ_HSIZE - 1); +} + +static struct open_request *tcp_v6_search_req(struct tcp_sock *tp, + struct open_request ***prevp, + __u16 rport, + struct in6_addr *raddr, + struct in6_addr *laddr, + int iif) +{ + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + + for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->class->family == AF_INET6 && + ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) && + ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) && + (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); +} + +static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IPV6)) { + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + skb->h.th->dest, + skb->h.th->source); + } else { + return secure_tcp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + skb->h.th->dest, + skb->h.th->source); + } +} + +static int __tcp_v6_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr = &np->rcv_saddr; + struct in6_addr *saddr = &np->daddr; + int dif = sk->sk_bound_dev_if; + u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); + struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct sock *sk2; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + tw = (struct tcp_tw_bucket*)sk2; + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tw->tw_ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - + tw->tw_ts_recent_stamp > 1))) { + /* See comment in tcp_ipv4.c */ + tp->write_seq = tw->tw_snd_nxt + 65535 + 2; + if (!tp->write_seq) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + sock_hold(sk2); + goto unique; + } else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hashent = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + tcp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + tcp_tw_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 tcpv6_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + + return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +static int tcp_v6_hash_connect(struct sock *sk) +{ + unsigned short snum = inet_sk(sk)->num; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + tcpv6_port_offset(sk); + struct hlist_node *node; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &tcp_bhash[tcp_bhashfn(port)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + tb_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v6_check_established(sk, + port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + tcp_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __tcp_v6_hash(sk); + } + spin_unlock(&head->lock); + + if (tw) { + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + } + + ret = 0; + goto out; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = tcp_sk(sk)->bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __tcp_v6_hash(sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __tcp_v6_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static __inline__ int tcp_v6_iif(struct sk_buff *skb) +{ + return IP6CB(skb)->iif; +} + +static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + if (tp->rx_opt.ts_recent_stamp && + !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * TCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = tp->ext_header_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + tp->af_specific = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + + err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + tp->ext_header_len = exthdrlen; + tp->af_specific = &ipv6_specific; + sk->sk_backlog_rcv = tcp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, + (saddr ? saddr : &np->saddr)); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto failure; + } + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + tp->ext_header_len = 0; + if (np->opt) + tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + + tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + + inet->dport = usin->sin6_port; + + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v6_hash_connect(sk); + if (err) + goto late_failure; + + if (!tp->write_seq) + tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); + + err = tcp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + tcp_set_state(sk, TCP_CLOSE); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + struct tcp_sock *tp; + __u32 seq; + + sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + tcp_tw_put((struct tcp_tw_bucket*)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (tp->pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + /* Might be for an open_request */ + switch (sk->sk_state) { + struct open_request *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, + &hdr->saddr, tcp_v6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != req->snt_isn) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + tcp_synq_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + + tcp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int tcp_v6_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff * skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr * final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = req->af.v6_req.iif; + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.srcrt == 2 && + req->af.v6_req.pktopts) { + struct sk_buff *pktopts = req->af.v6_req.pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = tcp_make_synack(sk, dst, req); + if (skb) { + struct tcphdr *th = skb->h.th; + + th->check = tcp_v6_check(th, skb->len, + &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + dst_release(dst); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + +static void tcp_v6_or_free(struct open_request *req) +{ + if (req->af.v6_req.pktopts) + kfree_skb(req->af.v6_req.pktopts); +} + +static struct or_calltable or_ipv6 = { + .family = AF_INET6, + .rtx_syn_ack = tcp_v6_send_synack, + .send_ack = tcp_v6_or_send_ack, + .destructor = tcp_v6_or_free, + .send_reset = tcp_v6_send_reset +}; + +static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && np->rxopt.bits.hopopts) || + ((IPV6_FLOWINFO_MASK&*(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && np->rxopt.bits.srcrt) || + ((opt->dst1 || opt->dst0) && np->rxopt.bits.dstopts)) + return 1; + } + return 0; +} + + +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + if (skb->ip_summed == CHECKSUM_HW) { + th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); + skb->csum = offsetof(struct tcphdr, check); + } else { + th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, + csum_partial((char *)th, th->doff<<2, + skb->csum)); + } +} + + +static void tcp_v6_send_reset(struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; + struct flowi fl; + + if (th->rst) + return; + + if (!ipv6_unicast_destination(skb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + sizeof(struct tcphdr), + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)); + + t1 = (struct tcphdr *) skb_push(buff,sizeof(struct tcphdr)); + + /* Swap the send and the receive. */ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = sizeof(*t1)/4; + t1->rst = 1; + + if(th->ack) { + t1->seq = th->ack_seq; + } else { + t1->ack = 1; + t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff<<2)); + } + + buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*t1), IPPROTO_TCP, + buff->csum); + + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; + + /* sk = NULL, but it is safe for now. RST socket required. */ + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { + + if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) { + dst_release(buff->dst); + return; + } + + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) +{ + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; + struct flowi fl; + int tot_len = sizeof(struct tcphdr); + + if (ts) + tot_len += 3*4; + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); + + t1 = (struct tcphdr *) skb_push(buff,tot_len); + + /* Swap the send and the receive. */ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len/4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); + + if (ts) { + u32 *ptr = (u32*)(t1 + 1); + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tcp_time_stamp); + *ptr = htonl(ts); + } + + buff->csum = csum_partial((char *)t1, tot_len, 0); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + tot_len, IPPROTO_TCP, + buff->csum); + + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; + + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { + if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) { + dst_release(buff->dst); + return; + } + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, + tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + + tcp_tw_put(tw); +} + +static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ + tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); +} + + +static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct open_request *req, **prev; + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *nsk; + + /* Find possible connection requests. */ + req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, + th->source, + &skb->nh.ipv6h->daddr, + ntohs(th->dest), + tcp_v6_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)nsk); + return NULL; + } + +#if 0 /*def CONFIG_SYN_COOKIES*/ + if (!th->rst && !th->syn && th->ack) + sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static void tcp_v6_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd); + + req->sk = NULL; + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +} + + +/* FIXME: this is substantially similar to the ipv4 code. + * Can some kind of merge be done? -- erics + */ +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_options_received tmp_opt; + struct tcp_sock *tp = tcp_sk(sk); + struct open_request *req = NULL; + __u32 isn = TCP_SKB_CB(skb)->when; + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + /* + * There are no SYN attacks on IPv6, yet... + */ + if (tcp_synq_is_full(sk) && !isn) { + if (net_ratelimit()) + printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); + goto drop; + } + + if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.user_mss = tp->rx_opt.user_mss; + + tcp_parse_options(skb, &tmp_opt, 0); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + req->class = &or_ipv6; + ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); + TCP_ECN_create_request(req, skb->h.th); + req->af.v6_req.pktopts = NULL; + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || + np->rxopt.bits.rxhlim) { + atomic_inc(&skb->users); + req->af.v6_req.pktopts = skb; + } + req->af.v6_req.iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL) + req->af.v6_req.iif = tcp_v6_iif(skb); + + if (isn == 0) + isn = tcp_v6_init_sequence(sk,skb); + + req->snt_isn = isn; + + if (tcp_v6_send_synack(sk, req, NULL)) + goto drop; + + tcp_v6_synq_add(sk, req); + + return 0; + +drop: + if (req) + tcp_openreq_free(req); + + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + return 0; /* don't send reset */ +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct tcp6_sock *newtcp6sk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + if (newsk == NULL) + return NULL; + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + newtp = tcp_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + newtp->af_specific = &ipv6_mapped; + newsk->sk_backlog_rcv = tcp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Charge newly allocated IPv6 socket. Though it is mapped, + * it is IPv6 yet. + */ +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); +#endif + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 af_tcp.af_specific. + Sync it now. + */ + tcp_sync_mss(newsk, newtp->pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.srcrt == 2 && + opt == NULL && req->af.v6_req.pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = tcp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* Charge newly allocated IPv6 socket */ +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); +#endif + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr); + ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr); + newsk->sk_bound_dev_if = req->af.v6_req.iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (req->af.v6_req.pktopts) { + newnp->pktoptions = skb_clone(req->af.v6_req.pktopts, + GFP_ATOMIC); + kfree_skb(req->af.v6_req.pktopts); + req->af.v6_req.pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + newtp->ext_header_len = 0; + if (newnp->opt) + newtp->ext_header_len = newnp->opt->opt_nflen + + newnp->opt->opt_flen; + + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(newsk); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __tcp_v6_hash(newsk); + tcp_inherit_port(sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +static int tcp_v6_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,skb->csum)) + return 0; + LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); + } + if (skb->len <= 76) { + if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,0); + } + return 0; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; + return 0; + } + + if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + TCP_CHECK_TIMER(sk); + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; + return 0; + +reset: + tcp_v6_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; + + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. + */ + tp = tcp_sk(sk); + if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + if (np->rxopt.bits.rxinfo) + np->mcast_oif = tcp_v6_iif(opt_skb); + if (np->rxopt.bits.rxhlim) + np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; + if (ipv6_opt_accepted(sk, opt_skb)) { + skb_set_owner_r(opt_skb, sk); + opt_skb = xchg(&np->pktoptions, opt_skb); + } else { + __kfree_skb(opt_skb); + opt_skb = xchg(&np->pktoptions, NULL); + } + } + + if (opt_skb) + kfree_skb(opt_skb); + return 0; +} + +static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct tcphdr *th; + struct sock *sk; + int ret; + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* + * Count it even if it's bad. + */ + TCP_INC_STATS_BH(TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = skb->h.th; + + if (th->doff < sizeof(struct tcphdr)/4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff*4)) + goto discard_it; + + if ((skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v6_checksum_init(skb) < 0)) + goto bad_packet; + + th = skb->h.th; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); + TCP_SKB_CB(skb)->sacked = 0; + + sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, + &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return ret ? -1 : 0; + +no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + } else { + tcp_v6_send_reset(skb); + } + +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_SYN: + { + struct sock *sk2; + + sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + if (sk2 != NULL) { + tcp_tw_deschedule((struct tcp_tw_bucket *)sk); + tcp_tw_put((struct tcp_tw_bucket *)sk); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v6_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +static int tcp_v6_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + dst_release(dst); + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + dst_release(dst); + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +static int tcp_v6_remember_stamp(struct sock *sk) +{ + /* Alas, not yet... */ + return 0; +} + +static struct tcp_func ipv6_specific = { + .queue_xmit = tcp_v6_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = tcp_v6_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .remember_stamp = tcp_v6_remember_stamp, + .net_header_len = sizeof(struct ipv6hdr), + + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = v6_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * TCP over IPv4 via INET6 API + */ + +static struct tcp_func ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = tcp_v4_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .remember_stamp = tcp_v4_remember_stamp, + .net_header_len = sizeof(struct iphdr), + + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = v6_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + + + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v6_init_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + tp->rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache_std = tp->mss_cache = 536; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + tp->af_specific = &ipv6_specific; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + + return 0; +} + +static int tcp_v6_destroy_sock(struct sock *sk) +{ + extern int tcp_v4_destroy_sock(struct sock *sk); + + tcp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +/* Proc filesystem TCPv6 sock list dumping. */ +static void get_openreq6(struct seq_file *seq, + struct sock *sk, struct open_request *req, int i, int uid) +{ + struct in6_addr *dest, *src; + int ttd = req->expires - jiffies; + + if (ttd < 0) + ttd = 0; + + src = &req->af.v6_req.loc_addr; + dest = &req->af.v6_req.rmt_addr; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(inet_sk(sk)->sport), + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + 0, req); +} + +static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) +{ + struct in6_addr *dest, *src; + __u16 destp, srcp; + int timer_active; + unsigned long timer_expires; + struct inet_sock *inet = inet_sk(sp); + struct tcp_sock *tp = tcp_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->dport); + srcp = ntohs(inet->sport); + if (tp->pending == TCP_TIME_RETRANS) { + timer_active = 1; + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { + timer_active = 4; + timer_expires = tp->timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + tp->retransmits, + sock_i_uid(sp), + tp->probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh + ); +} + +static void get_timewait6_sock(struct seq_file *seq, + struct tcp_tw_bucket *tw, int i) +{ + struct in6_addr *dest, *src; + __u16 destp, srcp; + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = &tw->tw_v6_daddr; + src = &tw->tw_v6_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +#ifdef CONFIG_PROC_FS +static int tcp6_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp6_sock(seq, v, st->num); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait6_sock(seq, v, st->num); + break; + } +out: + return 0; +} + +static struct file_operations tcp6_seq_fops; +static struct tcp_seq_afinfo tcp6_seq_afinfo = { + .owner = THIS_MODULE, + .name = "tcp6", + .family = AF_INET6, + .seq_show = tcp6_seq_show, + .seq_fops = &tcp6_seq_fops, +}; + +int __init tcp6_proc_init(void) +{ + return tcp_proc_register(&tcp6_seq_afinfo); +} + +void tcp6_proc_exit(void) +{ + tcp_proc_unregister(&tcp6_seq_afinfo); +} +#endif + +struct proto tcpv6_prot = { + .name = "TCPv6", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v6_connect, + .disconnect = tcp_disconnect, + .accept = tcp_accept, + .ioctl = tcp_ioctl, + .init = tcp_v6_init_sock, + .destroy = tcp_v6_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .sendmsg = tcp_sendmsg, + .recvmsg = tcp_recvmsg, + .backlog_rcv = tcp_v6_do_rcv, + .hash = tcp_v6_hash, + .unhash = tcp_unhash, + .get_port = tcp_v6_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp6_sock), +}; + +static struct inet6_protocol tcpv6_protocol = { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +extern struct proto_ops inet6_stream_ops; + +static struct inet_protosw tcpv6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcpv6_prot, + .ops = &inet6_stream_ops, + .capability = -1, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, +}; + +void __init tcpv6_init(void) +{ + /* register inet6 protocol */ + if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0) + printk(KERN_ERR "tcpv6_init: Could not register protocol\n"); + inet6_register_protosw(&tcpv6_protosw); +} diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c new file mode 100644 index 000000000000..e251d0ba4f39 --- /dev/null +++ b/net/ipv6/udp.c @@ -0,0 +1,1075 @@ +/* + * UDP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque <roque@di.fc.ul.pt> + * + * Based on linux/ipv4/udp.c + * + * $Id: udp.c,v 1.65 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/config.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/sched.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> +#include <linux/init.h> +#include <asm/uaccess.h> + +#include <net/sock.h> +#include <net/snmp.h> + +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/protocol.h> +#include <net/transp_v6.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> +#include <net/ip.h> +#include <net/udp.h> +#include <net/raw.h> +#include <net/inet_common.h> + +#include <net/ip6_checksum.h> +#include <net/xfrm.h> + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); + +/* Grrr, addr_type already calculated by caller, but I don't want + * to add some silly "cookie" argument to this method just for that. + */ +static int udp_v6_get_port(struct sock *sk, unsigned short snum) +{ + struct sock *sk2; + struct hlist_node *node; + + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; + + if (udp_port_rover > sysctl_local_port_range[1] || + udp_port_rover < sysctl_local_port_range[0]) + udp_port_rover = sysctl_local_port_range[0]; + best_size_so_far = 32767; + best = result = udp_port_rover; + for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + int size; + struct hlist_head *list; + + list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + goto gotit; + } + size = 0; + sk_for_each(sk2, node, list) + if (++size >= best_size_so_far) + goto next; + best_size_so_far = size; + best = result; + next:; + } + result = best; + for(;; result += UDP_HTABLE_SIZE) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } +gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, + &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + if (inet_sk(sk2)->num == snum && + sk2 != sk && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuse || !sk->sk_reuse) && + ipv6_rcv_saddr_equal(sk, sk2)) + goto fail; + } + } + + inet_sk(sk)->num = snum; + if (sk_unhashed(sk)) { + sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); + return 0; + +fail: + write_unlock_bh(&udp_hash_lock); + return 1; +} + +static void udp_v6_hash(struct sock *sk) +{ + BUG(); +} + +static void udp_v6_unhash(struct sock *sk) +{ + write_lock_bh(&udp_hash_lock); + if (sk_del_node_init(sk)) { + inet_sk(sk)->num = 0; + sock_prot_dec_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +} + +static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, int dif) +{ + struct sock *sk, *result = NULL; + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; + + read_lock(&udp_hash_lock); + sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + int score = 0; + if (inet->dport) { + if (inet->dport != sport) + continue; + score++; + } + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { + result = sk; + break; + } else if(score > badness) { + result = sk; + badness = score; + } + } + } + if (result) + sock_hold(result); + read_unlock(&udp_hash_lock); + return result; +} + +/* + * + */ + +static void udpv6_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +static int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + size_t copied; + int err; + + if (addr_len) + *addr_len=sizeof(struct sockaddr_in6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + +try_again: + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (msg->msg_name) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = skb->h.uh->source; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set(&sin6->sin6_addr, 0, 0, + htonl(0xffff), skb->nh.iph->saddr); + else { + ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + } + if (skb->protocol == htons(ETH_P_IP)) { + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } else { + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + } + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len - sizeof(struct udphdr); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + /* Clear queue. */ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + skb_free_datagram(sk, skb); + + if (flags & MSG_DONTWAIT) { + UDP6_INC_STATS_USER(UDP_MIB_INERRORS); + return -EAGAIN; + } + goto try_again; +} + +static void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6_pinfo *np; + struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + struct net_device *dev = skb->dev; + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; + struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct sock *sk; + int err; + + sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); + + if (sk == NULL) + return; + + np = inet6_sk(sk); + + if (!icmpv6_err_convert(type, code, &err) && !np->recverr) + goto out; + + if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) + goto out; + + if (np->recverr) + ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (sock_queue_rcv_skb(sk,skb)<0) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; + } + UDP6_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return 0; +} + +static struct sock *udp_v6_mcast_next(struct sock *sk, + u16 loc_port, struct in6_addr *loc_addr, + u16 rmt_port, struct in6_addr *rmt_addr, + int dif) +{ + struct hlist_node *node; + struct sock *s = sk; + unsigned short num = ntohs(loc_port); + + sk_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (inet->num == num && s->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(s); + if (inet->dport) { + if (inet->dport != rmt_port) + continue; + } + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + return s; + continue; + } + if(!inet6_mc_check(s, loc_addr, rmt_addr)) + continue; + return s; + } + } + return NULL; +} + +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ +static void udpv6_mcast_deliver(struct udphdr *uh, + struct in6_addr *saddr, struct in6_addr *daddr, + struct sk_buff *skb) +{ + struct sock *sk, *sk2; + int dif; + + read_lock(&udp_hash_lock); + sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + dif = skb->dev->ifindex; + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { + kfree_skb(skb); + goto out; + } + + sk2 = sk; + while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr, + uh->source, saddr, dif))) { + struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); + if (buff) + udpv6_queue_rcv_skb(sk2, buff); + } + udpv6_queue_rcv_skb(sk, skb); +out: + read_unlock(&udp_hash_lock); +} + +static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct sock *sk; + struct udphdr *uh; + struct net_device *dev = skb->dev; + struct in6_addr *saddr, *daddr; + u32 ulen = 0; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto short_packet; + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + uh = skb->h.uh; + + ulen = ntohs(uh->len); + + /* Check for jumbo payload */ + if (ulen == 0) + ulen = skb->len; + + if (ulen > skb->len || ulen < sizeof(*uh)) + goto short_packet; + + if (uh->check == 0) { + /* RFC 2460 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG( + printk(KERN_INFO "IPv6: udp checksum is 0\n")); + goto discard; + } + + if (ulen < skb->len) { + if (__pskb_trim(skb, ulen)) + goto discard; + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + uh = skb->h.uh; + } + + if (skb->ip_summed==CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); + + /* + * Multicast receive code + */ + if (ipv6_addr_is_multicast(daddr)) { + udpv6_mcast_deliver(uh, saddr, daddr, skb); + return 0; + } + + /* Unicast */ + + /* + * check socket cache ... must talk to Alan about his plans + * for sock caches... i'll skip this for now. + */ + sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); + + if (sk == NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto discard; + UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); + + kfree_skb(skb); + return(0); + } + + /* deliver */ + + udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + return(0); + +short_packet: + if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: %d/%u\n", ulen, skb->len); + +discard: + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); +} +/* + * Throw away all pending data and cancel the corking. Socket is locked. + */ +static void udp_v6_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip6_flush_pending_frames(sk); + } +} + +/* + * Sending + */ + +static int udp_v6_push_pending_frames(struct sock *sk, struct udp_sock *up) +{ + struct sk_buff *skb; + struct udphdr *uh; + struct inet_sock *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; + int err = 0; + + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (sk->sk_no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; + } + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + tmp_csum = csum_partial((char *)uh, + sizeof(struct udphdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, tmp_csum); + uh->check = tmp_csum; + + } + if (uh->check == 0) + uh->check = -1; + +send: + err = ip6_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; +} + +static int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p = NULL, final; + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct flowi *fl = &inet->cork.fl; + struct dst_entry *dst; + int addr_len = msg->msg_namelen; + int ulen = len; + int hlimit = -1; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int err; + + /* destination address check */ + if (sin6) { + if (addr_len < offsetof(struct sockaddr, sa_data)) + return -EINVAL; + + switch (sin6->sin6_family) { + case AF_INET6: + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + daddr = &sin6->sin6_addr; + break; + case AF_INET: + goto do_udp_sendmsg; + case AF_UNSPEC: + msg->msg_name = sin6 = NULL; + msg->msg_namelen = addr_len = 0; + daddr = NULL; + break; + default: + return -EINVAL; + } + } else if (!up->pending) { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } else + daddr = NULL; + + if (daddr) { + if (ipv6_addr_type(daddr) == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_port = sin6 ? sin6->sin6_port : inet->dport; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + msg->msg_name = &sin; + msg->msg_namelen = sizeof(sin); +do_udp_sendmsg: + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + return udp_sendmsg(iocb, sk, msg, len); + } + } + + if (up->pending == AF_INET) + return udp_sendmsg(iocb, sk, msg, len); + + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + */ + if (len > INT_MAX - sizeof(struct udphdr)) + return -EMSGSIZE; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. + */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET6)) { + release_sock(sk); + return -EAFNOSUPPORT; + } + dst = NULL; + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + memset(fl, 0, sizeof(*fl)); + + if (sin6) { + if (sin6->sin6_port == 0) + return -EINVAL; + + fl->fl_ip_dport = sin6->sin6_port; + daddr = &sin6->sin6_addr; + + if (np->sndflow) { + fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. + */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl->oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + fl->fl_ip_dport = inet->dport; + daddr = &np->daddr; + fl->fl6_flowlabel = np->flow_label; + } + + if (!fl->oif) + fl->oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(*opt); + + err = datagram_send_ctl(msg, fl, opt, &hlimit); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + + fl->proto = IPPROTO_UDP; + ipv6_addr_copy(&fl->fl6_dst, daddr); + if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl->fl6_src, &np->saddr); + fl->fl_ip_sport = inet->sport; + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl->fl6_dst); + ipv6_addr_copy(&fl->fl6_dst, rt0->addr); + final_p = &final; + } + + if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) + fl->oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl->fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl->fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + + up->pending = AF_INET6; + +do_append_data: + up->len += ulen; + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), + hlimit, opt, fl, (struct rt6_info*)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_v6_flush_pending_frames(sk); + else if (!corkreq) + err = udp_v6_push_pending_frames(sk, up); + + if (dst) + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: + fl6_sock_release(flowlabel); + if (!err) { + UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS); + return len; + } + return err; + +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int udpv6_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_v6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); + + return 0; +} + +/* + * Socket option code for UDP + */ +static int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ipv6_setsockopt(sk, level, optname, optval, optlen); + + if(optlen<sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch(optname) { + case UDP_CORK: + if (val != 0) { + up->corkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_v6_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + switch (val) { + case 0: + up->encap_type = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ipv6_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +static struct inet6_protocol udpv6_protocol = { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->dport); + srcp = ntohs(inet->sport); + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int udp6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + else + udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + return 0; +} + +static struct file_operations udp6_seq_fops; +static struct udp_seq_afinfo udp6_seq_afinfo = { + .owner = THIS_MODULE, + .name = "udp6", + .family = AF_INET6, + .seq_show = udp6_seq_show, + .seq_fops = &udp6_seq_fops, +}; + +int __init udp6_proc_init(void) +{ + return udp_proc_register(&udp6_seq_afinfo); +} + +void udp6_proc_exit(void) { + udp_proc_unregister(&udp6_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +/* ------------------------------------------------------------------------ */ + +struct proto udpv6_prot = { + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udpv6_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_v6_hash, + .unhash = udp_v6_unhash, + .get_port = udp_v6_get_port, + .obj_size = sizeof(struct udp6_sock), +}; + +extern struct proto_ops inet6_dgram_ops; + +static struct inet_protosw udpv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udpv6_prot, + .ops = &inet6_dgram_ops, + .capability =-1, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, +}; + + +void __init udpv6_init(void) +{ + if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0) + printk(KERN_ERR "udpv6_init: Could not register protocol\n"); + inet6_register_protosw(&udpv6_protosw); +} diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c new file mode 100644 index 000000000000..28c29d78338e --- /dev/null +++ b/net/ipv6/xfrm6_input.c @@ -0,0 +1,150 @@ +/* + * xfrm6_input.c: based on net/ipv4/xfrm4_input.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * YOSHIFUJI Hideaki @USAGI + * IPv6 support + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) +{ + struct sk_buff *skb = *pskb; + int err; + u32 seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + int nexthdr; + unsigned int nhoff; + + nhoff = *nhoffp; + nexthdr = skb->nh.raw[nhoff]; + + seq = 0; + if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) + goto drop; + + do { + struct ipv6hdr *iph = skb->nh.ipv6h; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, nexthdr, AF_INET6); + if (x == NULL) + goto drop; + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + nexthdr = x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb); + if (nexthdr <= 0) + goto drop_unlock; + + skb->nh.raw[nhoff] = nexthdr; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + xfrm_vec[xfrm_nr++].xvec = x; + + if (x->props.mode) { /* XXX */ + if (nexthdr != IPPROTO_IPV6) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) < 0) + goto drop; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + skb->ip_summed = CHECKSUM_NONE; + + if (decaps) { + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return -1; + } else { + return 1; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); + kfree_skb(skb); + return -1; +} + +EXPORT_SYMBOL(xfrm6_rcv_spi); + +int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + return xfrm6_rcv_spi(pskb, nhoffp, 0); +} diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c new file mode 100644 index 000000000000..601a148f60f3 --- /dev/null +++ b/net/ipv6/xfrm6_output.c @@ -0,0 +1,143 @@ +/* + * xfrm6_output.c - Common IPsec encapsulation code for IPv6. + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/icmpv6.h> +#include <net/dsfield.h> +#include <net/inet_ecn.h> +#include <net/ipv6.h> +#include <net/xfrm.h> + +/* Add encapsulation header. + * + * In transport mode, the IP header and mutable extension headers will be moved + * forward to make space for the encapsulation header. + * + * In tunnel mode, the top IP header will be constructed per RFC 2401. + * The following fields in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static void xfrm6_encap(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + if (!x->props.mode) { + u8 *prevhdr; + int hdr_len; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return; + } + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); +} + +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb->dst; + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + ret = -EMSGSIZE; + } + + return ret; +} + +int xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + int err; + + if (skb->ip_summed == CHECKSUM_HW) { + err = skb_checksum_help(skb, 0); + if (err) + goto error_nolock; + } + + if (x->props.mode) { + err = xfrm6_tunnel_check_size(skb); + if (err) + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; + + xfrm6_encap(skb); + + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + skb->nh.raw = skb->data; + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c new file mode 100644 index 000000000000..8a4f37de4d2d --- /dev/null +++ b/net/ipv6/xfrm6_policy.c @@ -0,0 +1,295 @@ +/* + * xfrm6_policy.c: based on xfrm4_policy.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion + * + */ + +#include <linux/config.h> +#include <net/xfrm.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_route.h> + +static struct dst_ops xfrm6_dst_ops; +static struct xfrm_policy_afinfo xfrm6_policy_afinfo; + +static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; + +static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + int err = 0; + *dst = (struct xfrm_dst*)ip6_route_output(NULL, fl); + if (!*dst) + err = -ENETUNREACH; + return err; +} + +static struct dst_entry * +__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + /* Still not clear if we should set fl->fl6_{src,dst}... */ + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + struct in6_addr fl_dst_prefix, fl_src_prefix; + + ipv6_addr_prefix(&fl_dst_prefix, + &fl->fl6_dst, + xdst->u.rt6.rt6i_dst.plen); + ipv6_addr_prefix(&fl_src_prefix, + &fl->fl6_src, + xdst->u.rt6.rt6i_src.plen); + if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && + ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && + xfrm_bundle_ok(xdst, fl, AF_INET6)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static int +__xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rt6_info *rt0 = (struct rt6_info*)(*dst_p); + struct rt6_info *rt = rt0; + struct in6_addr *remote = &fl->fl6_dst; + struct in6_addr *local = &fl->fl6_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip6_u = { + .saddr = *local, + .daddr = *remote + } + } + }; + int i; + int err = 0; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + dst_hold(&rt->u.dst); + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops); + struct xfrm_dst *xdst; + int tunnel = 0; + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + dst_release(&rt->u.dst); + goto error; + } + + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = (struct in6_addr*)&xfrm[i]->id.daddr; + local = (struct in6_addr*)&xfrm[i]->props.saddr; + tunnel = 1; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + + if (tunnel) { + ipv6_addr_copy(&fl_tunnel.fl6_dst, remote); + ipv6_addr_copy(&fl_tunnel.fl6_src, local); + err = xfrm_dst_lookup((struct xfrm_dst **) &rt, + &fl_tunnel, AF_INET6); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); + } + + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; + i = 0; + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + + dst_prev->xfrm = xfrm[i++]; + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); + + /* Copy neighbour for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = xfrm6_output; + /* Sheit... I remember I did this right. Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt6.rt6i_metric = rt0->rt6i_metric; + x->u.rt6.rt6i_node = rt0->rt6i_node; + x->u.rt6.rt6i_gateway = rt0->rt6i_gateway; + memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway)); + x->u.rt6.rt6i_dst = rt0->rt6i_dst; + x->u.rt6.rt6i_src = rt0->rt6i_src; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + + xfrm_init_pmtu(dst); + return 0; + +error: + if (dst) + dst_free(dst); + return err; +} + +static inline void +_decode_session6(struct sk_buff *skb, struct flowi *fl) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + u8 nexthdr = skb->nh.ipv6h->nexthdr; + + memset(fl, 0, sizeof(struct flowi)); + ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr); + ipv6_addr_copy(&fl->fl6_src, &hdr->saddr); + + while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) { + switch (nexthdr) { + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + offset += ipv6_optlen(exthdr); + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { + u16 *ports = (u16 *)exthdr; + + fl->fl_ip_sport = ports[0]; + fl->fl_ip_dport = ports[1]; + } + fl->proto = nexthdr; + return; + + case IPPROTO_ICMPV6: + if (pskb_may_pull(skb, skb->nh.raw + offset + 2 - skb->data)) { + u8 *icmp = (u8 *)exthdr; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + fl->proto = nexthdr; + return; + + /* XXX Why are there these headers? */ + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: + default: + fl->fl_ipsec_spi = 0; + fl->proto = nexthdr; + return; + }; + } +} + +static inline int xfrm6_garbage_collect(void) +{ + read_lock(&xfrm6_policy_afinfo.lock); + xfrm6_policy_afinfo.garbage_collect(); + read_unlock(&xfrm6_policy_afinfo.lock); + return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); +} + +static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static struct dst_ops xfrm6_dst_ops = { + .family = AF_INET6, + .protocol = __constant_htons(ETH_P_IPV6), + .gc = xfrm6_garbage_collect, + .update_pmtu = xfrm6_update_pmtu, + .gc_thresh = 1024, + .entry_size = sizeof(struct xfrm_dst), +}; + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { + .family = AF_INET6, + .lock = RW_LOCK_UNLOCKED, + .type_map = &xfrm6_type_map, + .dst_ops = &xfrm6_dst_ops, + .dst_lookup = xfrm6_dst_lookup, + .find_bundle = __xfrm6_find_bundle, + .bundle_create = __xfrm6_bundle_create, + .decode_session = _decode_session6, +}; + +static void __init xfrm6_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); +} + +static void xfrm6_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); +} + +void __init xfrm6_init(void) +{ + xfrm6_policy_init(); + xfrm6_state_init(); +} + +void xfrm6_fini(void) +{ + //xfrm6_input_fini(); + xfrm6_policy_fini(); + xfrm6_state_fini(); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c new file mode 100644 index 000000000000..bf0d0abc3871 --- /dev/null +++ b/net/ipv6/xfrm6_state.c @@ -0,0 +1,136 @@ +/* + * xfrm6_state.c: based on xfrm4_state.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro <kunihiro@ipinfusion.com> + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include <net/xfrm.h> +#include <linux/pfkeyv2.h> +#include <linux/ipsec.h> +#include <net/ipv6.h> + +static struct xfrm_state_afinfo xfrm6_state_afinfo; + +static void +__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + /* Initialize temporary selector matching only + * to current session. */ + ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); + ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); + x->sel.dport = xfrm_flowi_dport(fl); + x->sel.dport_mask = ~0; + x->sel.sport = xfrm_flowi_sport(fl); + x->sel.sport_mask = ~0; + x->sel.prefixlen_d = 128; + x->sel.prefixlen_s = 128; + x->sel.proto = fl->proto; + x->sel.ifindex = fl->oif; + x->id = tmpl->id; + if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET6; +} + +static struct xfrm_state * +__xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto) +{ + unsigned h = __xfrm6_spi_hash(daddr, spi, proto); + struct xfrm_state *x; + + list_for_each_entry(x, xfrm6_state_afinfo.state_byspi+h, byspi) { + if (x->props.family == AF_INET6 && + spi == x->id.spi && + ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) && + proto == x->id.proto) { + xfrm_state_hold(x); + return x; + } + } + return NULL; +} + +static struct xfrm_state * +__xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create) +{ + struct xfrm_state *x, *x0; + unsigned h = __xfrm6_dst_hash(daddr); + + x0 = NULL; + + list_for_each_entry(x, xfrm6_state_afinfo.state_bydst+h, bydst) { + if (x->props.family == AF_INET6 && + ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) && + mode == x->props.mode && + proto == x->id.proto && + ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) && + reqid == x->props.reqid && + x->km.state == XFRM_STATE_ACQ && + !x->id.spi) { + x0 = x; + break; + } + } + if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) { + ipv6_addr_copy((struct in6_addr *)x0->sel.daddr.a6, + (struct in6_addr *)daddr); + ipv6_addr_copy((struct in6_addr *)x0->sel.saddr.a6, + (struct in6_addr *)saddr); + x0->sel.prefixlen_d = 128; + x0->sel.prefixlen_s = 128; + ipv6_addr_copy((struct in6_addr *)x0->props.saddr.a6, + (struct in6_addr *)saddr); + x0->km.state = XFRM_STATE_ACQ; + ipv6_addr_copy((struct in6_addr *)x0->id.daddr.a6, + (struct in6_addr *)daddr); + x0->id.proto = proto; + x0->props.family = AF_INET6; + x0->props.mode = mode; + x0->props.reqid = reqid; + x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x0); + x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; + add_timer(&x0->timer); + xfrm_state_hold(x0); + list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h); + wake_up(&km_waitq); + } + if (x0) + xfrm_state_hold(x0); + return x0; +} + +static struct xfrm_state_afinfo xfrm6_state_afinfo = { + .family = AF_INET6, + .lock = RW_LOCK_UNLOCKED, + .init_tempsel = __xfrm6_init_tempsel, + .state_lookup = __xfrm6_state_lookup, + .find_acq = __xfrm6_find_acq, +}; + +void __init xfrm6_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm6_state_afinfo); +} + +void xfrm6_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); +} + diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c new file mode 100644 index 000000000000..ffcadd68b951 --- /dev/null +++ b/net/ipv6/xfrm6_tunnel.c @@ -0,0 +1,543 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA <mk@linux-ipv6.org> + * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> + * + * Based on net/ipv4/xfrm4_tunnel.c + * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/xfrm.h> +#include <linux/list.h> +#include <net/ip.h> +#include <net/xfrm.h> +#include <net/ipv6.h> +#include <net/protocol.h> +#include <linux/ipv6.h> +#include <linux/icmpv6.h> + +#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG +# define X6TDEBUG 3 +#else +# define X6TDEBUG 1 +#endif + +#define X6TPRINTK(fmt, args...) printk(fmt, ## args) +#define X6TNOPRINTK(fmt, args...) do { ; } while(0) + +#if X6TDEBUG >= 1 +# define X6TPRINTK1 X6TPRINTK +#else +# define X6TPRINTK1 X6TNOPRINTK +#endif + +#if X6TDEBUG >= 3 +# define X6TPRINTK3 X6TPRINTK +#else +# define X6TPRINTK3 X6TNOPRINTK +#endif + +/* + * xfrm_tunnel_spi things are for allocating unique id ("spi") + * per xfrm_address_t. + */ +struct xfrm6_tunnel_spi { + struct hlist_node list_byaddr; + struct hlist_node list_byspi; + xfrm_address_t addr; + u32 spi; + atomic_t refcnt; +#ifdef XFRM6_TUNNEL_SPI_MAGIC + u32 magic; +#endif +}; + +#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG +# define XFRM6_TUNNEL_SPI_MAGIC 0xdeadbeef +#endif + +static DEFINE_RWLOCK(xfrm6_tunnel_spi_lock); + +static u32 xfrm6_tunnel_spi; + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +static kmem_cache_t *xfrm6_tunnel_spi_kmem; + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; +static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + +#ifdef XFRM6_TUNNEL_SPI_MAGIC +static int x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, + const char *name) +{ + if (unlikely(x6spi->magic != XFRM6_TUNNEL_SPI_MAGIC)) { + X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " + "at %p has corrupted magic %08x " + "(should be %08x)\n", + name, x6spi, x6spi->magic, XFRM6_TUNNEL_SPI_MAGIC); + return -1; + } + return 0; +} +#else +static int inline x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, + const char *name) +{ + return 0; +} +#endif + +#define X6SPI_CHECK_MAGIC(x6spi) x6spi_check_magic((x6spi), __FUNCTION__) + + +static unsigned inline xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) +{ + unsigned h; + + X6TPRINTK3(KERN_DEBUG "%s(addr=%p)\n", __FUNCTION__, addr); + + h = addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]; + h ^= h >> 16; + h ^= h >> 8; + h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; + + X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, h); + + return h; +} + +static unsigned inline xfrm6_tunnel_spi_hash_byspi(u32 spi) +{ + return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; +} + + +static int xfrm6_tunnel_spi_init(void) +{ + int i; + + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + xfrm6_tunnel_spi = 0; + xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!xfrm6_tunnel_spi_kmem) { + X6TPRINTK1(KERN_ERR + "%s(): failed to allocate xfrm6_tunnel_spi_kmem\n", + __FUNCTION__); + return -ENOMEM; + } + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byspi[i]); + return 0; +} + +static void xfrm6_tunnel_spi_fini(void) +{ + int i; + + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { + if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) + goto err; + } + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { + if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) + goto err; + } + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + xfrm6_tunnel_spi_kmem = NULL; + return; +err: + X6TPRINTK1(KERN_ERR "%s(): table is not empty\n", __FUNCTION__); + return; +} + +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + X6SPI_CHECK_MAGIC(x6spi); + X6TPRINTK3(KERN_DEBUG "%s() = %p(%u)\n", __FUNCTION__, x6spi, x6spi->spi); + return x6spi; + } + } + + X6TPRINTK3(KERN_DEBUG "%s() = NULL(0)\n", __FUNCTION__); + return NULL; +} + +u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + read_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(saddr); + spi = x6spi ? x6spi->spi : 0; + read_unlock_bh(&xfrm6_tunnel_spi_lock); + return spi; +} + +EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); + +static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +{ + u32 spi; + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + unsigned index; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; + else + xfrm6_tunnel_spi++; + + for (spi = xfrm6_tunnel_spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = xfrm6_tunnel_spi_hash_byspi(spi); + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + goto try_next_1; + } + xfrm6_tunnel_spi = spi; + goto alloc_spi; +try_next_1:; + } + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tunnel_spi; spi++) { + index = xfrm6_tunnel_spi_hash_byspi(spi); + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + goto try_next_2; + } + xfrm6_tunnel_spi = spi; + goto alloc_spi; +try_next_2:; + } + spi = 0; + goto out; +alloc_spi: + X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, + NIP6(*(struct in6_addr *)saddr)); + x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC); + if (!x6spi) { + X6TPRINTK1(KERN_ERR "%s(): kmem_cache_alloc() failed\n", + __FUNCTION__); + goto out; + } +#ifdef XFRM6_TUNNEL_SPI_MAGIC + x6spi->magic = XFRM6_TUNNEL_SPI_MAGIC; +#endif + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); + x6spi->spi = spi; + atomic_set(&x6spi->refcnt, 1); + + hlist_add_head(&x6spi->list_byspi, &xfrm6_tunnel_spi_byspi[index]); + + index = xfrm6_tunnel_spi_hash_byaddr(saddr); + hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); + X6SPI_CHECK_MAGIC(x6spi); +out: + X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); + return spi; +} + +u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + write_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(saddr); + if (x6spi) { + atomic_inc(&x6spi->refcnt); + spi = x6spi->spi; + } else + spi = __xfrm6_tunnel_alloc_spi(saddr); + write_unlock_bh(&xfrm6_tunnel_spi_lock); + + X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); + + return spi; +} + +EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); + +void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos, *n; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + write_lock_bh(&xfrm6_tunnel_spi_lock); + + hlist_for_each_entry_safe(x6spi, pos, n, + &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) + { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " + "for %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " + "found at %p\n", + __FUNCTION__, + NIP6(*(struct in6_addr *)saddr), + x6spi); + X6SPI_CHECK_MAGIC(x6spi); + if (atomic_dec_and_test(&x6spi->refcnt)) { + hlist_del(&x6spi->list_byaddr); + hlist_del(&x6spi->list_byspi); + kmem_cache_free(xfrm6_tunnel_spi_kmem, x6spi); + break; + } + } + } + write_unlock_bh(&xfrm6_tunnel_spi_lock); +} + +EXPORT_SYMBOL(xfrm6_tunnel_free_spi); + +static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *top_iph; + + top_iph = (struct ipv6hdr *)skb->data; + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm6_tunnel *xfrm6_tunnel_handler; +static DECLARE_MUTEX(xfrm6_tunnel_sem); + +int xfrm6_tunnel_register(struct xfrm6_tunnel *handler) +{ + int ret; + + down(&xfrm6_tunnel_sem); + ret = 0; + if (xfrm6_tunnel_handler != NULL) + ret = -EINVAL; + if (!ret) + xfrm6_tunnel_handler = handler; + up(&xfrm6_tunnel_sem); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_register); + +int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler) +{ + int ret; + + down(&xfrm6_tunnel_sem); + ret = 0; + if (xfrm6_tunnel_handler != handler) + ret = -EINVAL; + if (!ret) + xfrm6_tunnel_handler = NULL; + up(&xfrm6_tunnel_sem); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_deregister); + +static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; + struct ipv6hdr *iph = skb->nh.ipv6h; + u32 spi; + + /* device-like_ip6ip6_handler() */ + if (handler && handler->handler(pskb, nhoffp) == 0) + return 0; + + spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(pskb, nhoffp, spi); +} + +static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; + + /* call here first for device-like ip6ip6 err handling */ + if (handler) { + handler->err_handler(skb, opt, type, code, offset, info); + return; + } + + /* xfrm6_tunnel native err handling */ + switch (type) { + case ICMPV6_DEST_UNREACH: + switch (code) { + case ICMPV6_NOROUTE: + case ICMPV6_ADM_PROHIBITED: + case ICMPV6_NOT_NEIGHBOUR: + case ICMPV6_ADDR_UNREACH: + case ICMPV6_PORT_UNREACH: + default: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Destination Unreach.\n"); + break; + } + break; + case ICMPV6_PKT_TOOBIG: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Packet Too Big.\n"); + break; + case ICMPV6_TIME_EXCEED: + switch (code) { + case ICMPV6_EXC_HOPLIMIT: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Too small Hoplimit.\n"); + break; + case ICMPV6_EXC_FRAGTIME: + default: + break; + } + break; + case ICMPV6_PARAMPROB: + switch (code) { + case ICMPV6_HDR_FIELD: break; + case ICMPV6_UNK_NEXTHDR: break; + case ICMPV6_UNK_OPTION: break; + } + break; + default: + break; + } + return; +} + +static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) +{ + if (!x->props.mode) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct ipv6hdr); + + return 0; +} + +static void xfrm6_tunnel_destroy(struct xfrm_state *x) +{ + xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); +} + +static struct xfrm_type xfrm6_tunnel_type = { + .description = "IP6IP6", + .owner = THIS_MODULE, + .proto = IPPROTO_IPV6, + .init_state = xfrm6_tunnel_init_state, + .destructor = xfrm6_tunnel_destroy, + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, +}; + +static struct inet6_protocol xfrm6_tunnel_protocol = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init(): can't add protocol\n"); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + return -EAGAIN; + } + if (xfrm6_tunnel_spi_init() < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init: failed to initialize spi\n"); + inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit xfrm6_tunnel_fini(void) +{ + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + xfrm6_tunnel_spi_fini(); + if (inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel close: can't remove protocol\n"); + if (xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6) < 0) + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel close: can't remove xfrm type\n"); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_fini); +MODULE_LICENSE("GPL"); |